From 80925331117a93cf804db9a55d4d9ab4f4edb163 Mon Sep 17 00:00:00 2001
From: "Alex Liu (Engrg-Hardware 1)"
Date: Thu, 18 Jan 2024 21:59:24 -0800
Subject: [PATCH] [READY] Deprecate AsyncDataReader, LocalizedOneHot and Hybrid Embedding

---
 .gitlab-ci.yml | 55 -
 HugeCTR/include/collectives/collective.hpp | 53 +
 HugeCTR/include/common.hpp | 46 +-
 HugeCTR/include/data_reader.hpp | 2 +-
 .../async_reader/async_reader.hpp | 74 -
 .../async_reader/async_reader_adapter.hpp | 151 --
 .../async_reader/async_reader_common.hpp | 104 --
 .../async_reader/split_label_dense_sparse.hpp | 26 -
 .../async_reader/thread_async_reader.hpp | 75 -
 .../async_reader_common.hpp} | 61 +-
 .../parquet_data_reader_worker.hpp | 2 +-
 .../data_readers/row_group_reading_thread.hpp | 2 +-
 HugeCTR/include/embedding.hpp | 23 -
 .../embeddings/hybrid_embedding/allocator.hpp | 45 -
 .../hybrid_embedding/calibration_data.hpp | 118 --
 .../hybrid_embedding/communication.hpp | 135 --
 .../embeddings/hybrid_embedding/data.hpp | 97 --
 .../hybrid_embedding/frequent_embedding.hpp | 168 ---
 .../hybrid_embedding/hybrid_indices.hpp | 230 ---
 .../hybrid_embedding/indices_container.hpp | 65 -
 .../hybrid_embedding/infrequent_embedding.hpp | 195 ---
 .../embeddings/hybrid_embedding/model.hpp | 102 --
 .../embeddings/hybrid_embedding/select.cuh | 125 --
 .../hybrid_embedding/statistics.hpp | 131 --
 .../embeddings/hybrid_embedding/update.cuh | 90 --
 .../embeddings/hybrid_embedding/utils.cuh | 33 -
 .../embeddings/hybrid_embedding/utils.hpp | 38 -
 .../embeddings/hybrid_sparse_embedding.hpp | 273 ----
 ...ocalized_slot_sparse_embedding_one_hot.hpp | 485 ------
 .../embeddings/sparse_embedding_functors.hpp | 41 -
 HugeCTR/include/exchange_wgrad.hpp | 11 +-
 HugeCTR/include/parser.hpp | 13 +-
 HugeCTR/include/pybind/common_wrapper.hpp | 25 -
 HugeCTR/include/pybind/model.hpp | 71 +-
 HugeCTR/include/pybind/model_wrapper.hpp | 9 +-
 HugeCTR/include/resource_manager.hpp | 12 +-
 HugeCTR/include/resource_manager_base.hpp | 1 +
 .../resource_manager_core.hpp | 27 +-
 .../resource_manager_ext.hpp | 112 --
 HugeCTR/include/scheduleable.hpp | 18 -
 HugeCTR/src/collectives/collective.cpp | 50 +
 .../async_reader/async_reader.cpp | 253 ----
 .../async_reader/async_reader_adapter.cpp | 514 -------
 .../data_readers/async_reader/broadcast.cu | 88 --
 .../async_reader/split_label_dense_sparse.cu | 226 ---
 .../async_reader/thread_async_reader.cpp | 336 -----
 .../multi_hot/async_data_reader.cpp | 3 +-
 .../hybrid_embedding/calibration_data.cu | 211 ---
 .../hybrid_embedding/communication.cu | 174 ---
 .../src/embeddings/hybrid_embedding/data.cu | 148 --
 .../hybrid_embedding/frequent_embedding.cu | 487 ------
 .../hybrid_embedding/hybrid_indices.cu | 541 -------
 .../hybrid_embedding/indices_container.cu | 66 -
 .../hybrid_embedding/infrequent_embedding.cu | 670 ---------
 .../src/embeddings/hybrid_embedding/model.cu | 154 --
 .../embeddings/hybrid_embedding/statistics.cu | 412 -----
 .../src/embeddings/hybrid_embedding/utils.cu | 127 --
 .../src/embeddings/hybrid_sparse_embedding.cu | 820 ----------
 ...localized_slot_sparse_embedding_one_hot.cu | 1334 -----------------
 .../src/embeddings/update_params_functor.cu | 256 ----
 HugeCTR/src/exchange_wgrad.cpp | 22 +-
 HugeCTR/src/pybind/add_dense_layer.cpp | 22 -
 HugeCTR/src/pybind/add_input.cpp | 146 +-
 HugeCTR/src/pybind/add_sparse_embedding.cpp | 110 +-
 HugeCTR/src/pybind/model.cpp | 1177 +--------------
 HugeCTR/src/pybind/model_compile.cpp | 977 ++++++++++++
 HugeCTR/src/pybind/model_pipeline.cpp | 489 +-----
 HugeCTR/src/resource_manager.cpp | 2 +-
 .../resource_manager_ext.cpp | 76 -
 ci/integration_test/dlrm/benchmark_14node.sub | 8 -
 ci/integration_test/dlrm/benchmark_1node.sub | 8 -
 ci/integration_test/dlrm/dlrm.sub | 8 -
 ci/integration_test/dlrm/ib_nvlink_1node.sub | 10 -
 ci/integration_test/dlrm/ib_nvlink_8node.sub | 10 -
 .../overlapped_pipeline.sub | 5 -
 ci/selene/ci.yml | 48 -
 ci/template.yml | 2 +-
 ci/utest/utest.sub | 1 -
 samples/dlrm/README.md | 142 +-
 samples/dlrm/config_DGXH100_16x8x1056.sh | 32 +
 samples/dlrm/config_DGXH100_1x8x6912.sh | 28 +
 samples/dlrm/config_DGXH100_8x8x2112.sh | 32 +
 samples/dlrm/dgx_a100.py | 229 ---
 samples/dlrm/dgx_a100_14x8x640.py | 248 ---
 samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py | 244 ---
 samples/dlrm/dgx_a100_ib_nvlink.py | 228 ---
 samples/dlrm/mlperf_logger/__init__.py | 3 +
 samples/dlrm/mlperf_logger/callbacks.py | 118 ++
 samples/dlrm/mlperf_logger/param_info.py | 70 +
 .../dlrm/mlperf_logger/utils.py | 21 +-
 samples/dlrm/preprocessing/convert_to_raw.py | 251 ++++
 .../preprocessing/md5sums_raw_dataset.txt | 3 +
 samples/dlrm/requirements.txt | 3 +
 samples/dlrm/run.sub | 140 ++
 samples/dlrm/run_and_time.sh | 68 +
 samples/dlrm/run_with_docker.sh | 95 ++
 samples/dlrm/sharding/__init__.py | 2 +
 samples/dlrm/sharding/generate_plan.py | 131 ++
 samples/dlrm/sharding/planner.py | 327 ++++
 samples/dlrm/train.py | 485 ++++++
 .../sparse_operation_kit/dynamic_variable.py | 1 -
 test/utest/communication/ar_oneshot_test.cu | 11 +-
 .../ib_comms_a2a_v_integ_test.cu | 11 +-
 .../communication/ib_comms_a2a_v_test.cu | 13 +-
 test/utest/communication/ib_comms_ar_test.cu | 11 +-
 .../data_distributor_tests.cpp | 4 +-
 test/utest/data_reader/CMakeLists.txt | 7 -
 .../data_reader_async_adapter_test.cpp | 242 ---
 .../data_reader/data_reader_async_test.cpp | 127 --
 .../data_reader/data_reader_benchmark.cu | 5 +-
 .../data_reader/data_reader_parquet_test.cpp | 12 +-
 .../data_reader/data_reader_v2_async_test.cpp | 4 +-
 .../multi_hot_async_data_reader_test.cpp | 6 +-
 ...ributed_slot_sparse_embedding_hash_test.cu | 8 +-
 .../embedding/hybrid_embedding/data_test.cpp | 181 ---
 .../embedding/hybrid_embedding/data_test.hpp | 62 -
 .../hybrid_embedding/end_to_end_test.cpp | 766 ----------
 .../hybrid_embedding/forward_test.cpp | 475 ------
 .../hybrid_embedding/hybrid_embedding_cpu.cpp | 591 --------
 .../hybrid_embedding/hybrid_embedding_cpu.hpp | 113 --
 .../hybrid_embedding/indices_test.cpp | 445 ------
 .../hybrid_embedding/input_generator.cpp | 296 ----
 .../hybrid_embedding/input_generator.hpp | 93 --
 .../hybrid_embedding/messages_test.cpp | 466 ------
 .../embedding/hybrid_embedding/model_test.cpp | 630 --------
 .../embedding/hybrid_embedding/select_test.cu | 140 --
 .../hybrid_embedding/statistics_test.cpp | 248 ---
 .../hybrid_embedding/statistics_test.hpp | 31 -
 .../hybrid_embedding/test_common.cuh | 242 ---
 .../hybrid_embedding/update_test.cpp | 488 ------
 .../hybrid_sparse_embedding_test.cpp | 215 ---
 ...calized_slot_sparse_embedding_hash_test.cu | 8 +-
 ...ized_slot_sparse_embedding_one_hot_test.cu | 847 -----------
 ...ot_sparse_embedding_one_hot_update_test.cu | 259 ----
 test/utest/embedding/unified_embedding.hpp | 1 -
 .../embedding/unified_embedding_test.cpp | 4 +-
 .../embedding_collection/configuration.hpp | 2 +-
 .../embedding_collection_utils.hpp | 2 +-
 .../test_compress_offset.cpp | 4 +-
 .../test_embedding_collection_load_dump.cpp | 9 +-
 .../test_embedding_collection_v2.cu | 4 +-
 .../test_embedding_table.cpp | 4 +-
 .../test_embedding_table_optimizer.cpp | 4 +-
 test/utest/metrics/auc_test.cpp | 4 +-
 test/utest/metrics/averageloss_test.cpp | 4 +-
 test/utest/network/network_build_test.cpp | 4 +-
 test/utest/pipeline/pipeline_test.cu | 4 +-
 tools/CMakeLists.txt | 1 -
 tools/io_benchmark/main.cpp | 136 --
 149 files changed, 3214 insertions(+), 20266 deletions(-)
 create mode 100644 HugeCTR/include/collectives/collective.hpp
 delete mode 100644 HugeCTR/include/data_readers/async_reader/async_reader.hpp
 delete mode 100644 HugeCTR/include/data_readers/async_reader/async_reader_adapter.hpp
 delete mode 100644 HugeCTR/include/data_readers/async_reader/async_reader_common.hpp
 delete mode 100644 HugeCTR/include/data_readers/async_reader/split_label_dense_sparse.hpp
 delete mode 100644 HugeCTR/include/data_readers/async_reader/thread_async_reader.hpp
 rename HugeCTR/include/data_readers/{async_reader/broadcast.hpp => multi_hot/async_reader_common.hpp} (66%)
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/allocator.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/calibration_data.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/communication.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/data.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/frequent_embedding.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/hybrid_indices.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/indices_container.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/infrequent_embedding.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/model.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/select.cuh
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/statistics.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/update.cuh
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/utils.cuh
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/utils.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_sparse_embedding.hpp
 delete mode 100644 HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp
 delete mode 100644 HugeCTR/include/resource_managers/resource_manager_ext.hpp
 create mode 100644 HugeCTR/src/collectives/collective.cpp
 delete mode 100644 HugeCTR/src/data_readers/async_reader/async_reader.cpp
 delete mode 100644 HugeCTR/src/data_readers/async_reader/async_reader_adapter.cpp
 delete mode 100644 HugeCTR/src/data_readers/async_reader/broadcast.cu
 delete mode 100644 HugeCTR/src/data_readers/async_reader/split_label_dense_sparse.cu
 delete mode 100644 HugeCTR/src/data_readers/async_reader/thread_async_reader.cpp
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/calibration_data.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/communication.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/data.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/frequent_embedding.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/hybrid_indices.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/indices_container.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/infrequent_embedding.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/model.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/statistics.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/utils.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_sparse_embedding.cu
 delete mode 100644 HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu
 delete mode 100644 HugeCTR/src/embeddings/update_params_functor.cu
 create mode 100644 HugeCTR/src/pybind/model_compile.cpp
 delete mode 100644 HugeCTR/src/resource_managers/resource_manager_ext.cpp
 delete mode 100644 ci/integration_test/dlrm/benchmark_14node.sub
 delete mode 100644 ci/integration_test/dlrm/benchmark_1node.sub
 delete mode 100644 ci/integration_test/dlrm/dlrm.sub
 delete mode 100644 ci/integration_test/dlrm/ib_nvlink_1node.sub
 delete mode 100644 ci/integration_test/dlrm/ib_nvlink_8node.sub
 delete mode 100644 ci/integration_test/mlperf_generalization/overlapped_pipeline.sub
 create mode 100644 samples/dlrm/config_DGXH100_16x8x1056.sh
 create mode 100644 samples/dlrm/config_DGXH100_1x8x6912.sh
 create mode 100644 samples/dlrm/config_DGXH100_8x8x2112.sh
 delete mode 100644 samples/dlrm/dgx_a100.py
 delete mode 100644 samples/dlrm/dgx_a100_14x8x640.py
 delete mode 100755 samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py
 delete mode 100755 samples/dlrm/dgx_a100_ib_nvlink.py
 create mode 100644 samples/dlrm/mlperf_logger/__init__.py
 create mode 100644 samples/dlrm/mlperf_logger/callbacks.py
 create mode 100644 samples/dlrm/mlperf_logger/param_info.py
 rename tools/io_benchmark/CMakeLists.txt => samples/dlrm/mlperf_logger/utils.py (56%)
 create mode 100644 samples/dlrm/preprocessing/convert_to_raw.py
 create mode 100644 samples/dlrm/preprocessing/md5sums_raw_dataset.txt
 create mode 100644 samples/dlrm/requirements.txt
 create mode 100755 samples/dlrm/run.sub
 create mode 100755 samples/dlrm/run_and_time.sh
 create mode 100755 samples/dlrm/run_with_docker.sh
 create mode 100644 samples/dlrm/sharding/__init__.py
 create mode 100644 samples/dlrm/sharding/generate_plan.py
 create mode 100644 samples/dlrm/sharding/planner.py
 create mode 100644 samples/dlrm/train.py
 delete mode 100644 test/utest/data_reader/data_reader_async_adapter_test.cpp
 delete mode 100644 test/utest/data_reader/data_reader_async_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/data_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/data_test.hpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/end_to_end_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/forward_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.hpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/indices_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/input_generator.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/input_generator.hpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/messages_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/model_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/select_test.cu
 delete mode 100644 test/utest/embedding/hybrid_embedding/statistics_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/statistics_test.hpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/test_common.cuh
 delete mode 100644 test/utest/embedding/hybrid_embedding/update_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_sparse_embedding_test.cpp
 delete mode 100644 test/utest/embedding/localized_slot_sparse_embedding_one_hot_test.cu
 delete mode 100644 test/utest/embedding/localized_slot_sparse_embedding_one_hot_update_test.cu
 delete mode 100644 tools/io_benchmark/main.cpp

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 757e2393ec..ed2bb603e1 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@
-294,47 +294,6 @@ criteo_multi_node: DGXNNODES: 2 TEST_CMD: ./ci/integration_test/criteo/criteo_multi_node.sub -dlrm_benchmark_14node: - extends: .cluster_test_job_daily - needs: - - build_train_multi_node - variables: - GPFSFOLDER: $LOGDIR/dlrm_benchmark_14node - GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} - CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED - MOUNTS: /raid:/raid - WALLTIME: "00:15:00" - SBATCH_OTHER_PARAMS: --network sharp - DGXNNODES: 14 - TEST_CMD: ./ci/integration_test/dlrm/benchmark_14node.sub - -dlrm_ib_nvlink_1node: - extends: .cluster_test_job_daily - needs: - - build_train_multi_node - variables: - GPFSFOLDER: $LOGDIR/dlrm_ib_nvlink_1node - GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} - CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED - MOUNTS: /raid/datasets/criteo/mlperf/40m.limit_preshuffled:/data - WALLTIME: "00:10:00" - DGXNNODES: 1 - TEST_CMD: ./ci/integration_test/dlrm/ib_nvlink_1node.sub - -dlrm_ib_nvlink_8node: - extends: .cluster_test_job_daily - needs: - - build_train_multi_node - variables: - GPFSFOLDER: $LOGDIR/dlrm_ib_nvlink_8node - GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} - CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED - MOUNTS: /raid/datasets/criteo/mlperf/40m.limit_preshuffled:/data - WALLTIME: "00:10:00" - SBATCH_OTHER_PARAMS: --comment=metrics - DGXNNODES: 8 - TEST_CMD: ./ci/integration_test/dlrm/ib_nvlink_8node.sub - dlrm_dcnv2_benchmark_8node: extends: .cluster_test_job_daily needs: @@ -576,20 +535,6 @@ inference_CPU_Memory_check: DGXNNODES: 1 TEST_CMD: ./ci/post_test/check_cpu_usage.sub -dlrm_14node_check: - # Push logs to gitlab - extends: .cluster_post_test_job_daily - needs: - - dlrm_benchmark_14node - variables: - GPFSFOLDER: $LOGDIR/dlrm_14node_check - GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: $LOGDIR/dlrm_benchmark_14node:/logs - WALLTIME: "00:15:00" - DGXNNODES: 1 - TEST_CMD: ./ci/post_test/check_dlrm_14node.sub - dlrm_dcnv2_8node_check: # Push logs to gitlab extends: .cluster_post_test_job_daily diff --git a/HugeCTR/include/collectives/collective.hpp b/HugeCTR/include/collectives/collective.hpp new file mode 100644 index 0000000000..565b5ee5c8 --- /dev/null +++ b/HugeCTR/include/collectives/collective.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +namespace HugeCTR { + +/** + * @brief GPU resources manager which holds all the resources required by training + * + * An extended GPU Resource manager + */ +class CollectiveManager { + std::shared_ptr core_; + +#ifdef ENABLE_MPI + std::unique_ptr ib_comm_ = NULL; +#endif + std::shared_ptr ar_comm_ = NULL; + + public: + CollectiveManager() = default; + CollectiveManager(const std::shared_ptr& core) : core_(core) {} + + HCTR_DISALLOW_COPY_AND_MOVE(CollectiveManager); + +#ifdef ENABLE_MPI + void init_ib_comm(); + IbComm* get_ib_comm() const { return ib_comm_.get(); } + void set_ready_to_transfer() { + if (ib_comm_) ib_comm_->set_ready_to_transfer(); + } +#endif + void set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision); + AllReduceInPlaceComm* get_ar_comm() const { return ar_comm_.get(); } +}; +} // namespace HugeCTR diff --git a/HugeCTR/include/common.hpp b/HugeCTR/include/common.hpp index 389d00f6bb..00bc55e692 100644 --- a/HugeCTR/include/common.hpp +++ b/HugeCTR/include/common.hpp @@ -64,17 +64,8 @@ namespace HugeCTR { #define WARP_SIZE 32 -namespace hybrid_embedding { - -enum class HybridEmbeddingType; -enum class CommunicationType; - -} // namespace hybrid_embedding - enum class Check_t { Sum, None, Unknown }; -enum class DataReaderSparse_t { Distributed, Localized }; - enum class DataReaderType_t { Norm, Raw, Parquet, RawAsync }; enum class SourceType_t { FileList, Mmap, Parquet }; @@ -154,36 +145,17 @@ enum class Layer_t { enum class Embedding_t { DistributedSlotSparseEmbeddingHash, LocalizedSlotSparseEmbeddingHash, - LocalizedSlotSparseEmbeddingOneHot, - HybridSparseEmbedding, None }; enum class Initializer_t { Default, Uniform, XavierNorm, XavierUniform, Sinusoidal, Zero }; -enum class TrainState_t { - Init, - BottomMLPFprop, - TopMLPFprop, - BottomMLPBprop, - TopMLPBprop, - MLPExchangeWgrad, - MLPUpdate, - Finalize -}; - enum class Distribution_t { Uniform, PowerLaw }; enum class PowerLaw_t { Long, Medium, Short, Specific }; enum class Tensor_t { Train, Evaluate }; -// TODO: Consider to move them into a separate file -struct TrainState { - TrainState_t state = TrainState_t::Init; - cudaEvent_t* event = nullptr; -}; - struct AsyncParam { int num_threads; int num_batches_per_thread; @@ -209,17 +181,6 @@ struct AsyncParam { is_dense_float(is_dense_float) {} }; -struct HybridEmbeddingParam { - size_t max_num_frequent_categories; - int64_t max_num_infrequent_samples; - double p_dup_max; - double max_all_reduce_bandwidth; - double max_all_to_all_bandwidth; - double efficiency_bandwidth_ratio; - hybrid_embedding::CommunicationType communication_type; - hybrid_embedding::HybridEmbeddingType hybrid_embedding_type; -}; - typedef struct DataSetHeader_ { long long error_check; // 0: no error check; 1: check_sum long long number_of_records; // the number of samples in this data file @@ -278,7 +239,6 @@ struct DataReaderSparseParam { std::vector is_slot_fixed_length; int slot_num; - DataReaderSparse_t type; int max_feature_num; int max_nnz; @@ -289,8 +249,7 @@ struct DataReaderSparseParam { nnz_per_slot(nnz_per_slot_), is_fixed_length(is_fixed_length_), is_slot_fixed_length(std::vector(slot_num_, is_fixed_length_)), - slot_num(slot_num_), - type(DataReaderSparse_t::Distributed) { + slot_num(slot_num_) { HCTR_CHECK_HINT(slot_num_ > 0, "Illegal value for slot_num!"); if (static_cast(slot_num_) != nnz_per_slot_.size()) { HCTR_OWN_THROW(Error_t::WrongInput, "slot num != nnz_per_slot.size()."); @@ -312,8 +271,7 @@ struct 
DataReaderSparseParam { nnz_per_slot(slot_num_, nnz_per_slot_), is_fixed_length(is_fixed_length_), is_slot_fixed_length(std::vector(slot_num_, is_fixed_length_)), - slot_num(slot_num_), - type(DataReaderSparse_t::Distributed) { + slot_num(slot_num_) { HCTR_CHECK_HINT(slot_num_ > 0, "Illegal value for slot_num!"); for (size_t i = 0; i < nnz_per_slot.size(); i++) { if (nnz_per_slot[i] == 1) { diff --git a/HugeCTR/include/data_reader.hpp b/HugeCTR/include/data_reader.hpp index db08757b6f..2e7bfc3c45 100644 --- a/HugeCTR/include/data_reader.hpp +++ b/HugeCTR/include/data_reader.hpp @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/HugeCTR/include/data_readers/async_reader/async_reader.hpp b/HugeCTR/include/data_readers/async_reader/async_reader.hpp deleted file mode 100644 index 664680dad3..0000000000 --- a/HugeCTR/include/data_readers/async_reader/async_reader.hpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -class ResourceManager; - -class AsyncReaderImpl { - public: - AsyncReaderImpl(std::string fname, size_t batch_size_bytes, - const ResourceManager* resource_manager, int num_threads, - int num_batches_per_thread, size_t io_block_size, int io_depth, int io_alignment, - bool shuffle = false, bool wait_for_gpu_idle = false); - - bool is_currently_loading(); - size_t get_num_buffers() const; - size_t get_num_batches() const; - void load_async(); - void reset(); - BatchDesc get_batch(); - void finalize_batch(); - void finalize_batch(cudaEvent_t* event); - int get_last_batch_device(); - void wait_for_gpu_events(const std::vector events); - void wait_for_gpu_event(cudaEvent_t* event, int raw_device_id); - ~AsyncReaderImpl(); - - private: - std::string fname_; - size_t batch_size_bytes_; - size_t num_batches_; - const ResourceManager* resource_manager_; - int num_devices_, num_threads_, num_batches_per_thread_; - size_t io_block_size_; - int io_depth_, io_alignment_; - InternalBatchBuffer* last_buffer_ = nullptr; - size_t total_file_size_; - bool wait_for_gpu_idle_; - int queue_id_; - bool loop_ = true; - cudaEvent_t event_success_; - - std::vector batch_ids_; - std::vector> buffers_; - std::vector threads_; - std::vector streams_; - std::vector> thread_batch_ids_; - std::vector> thread_buffer_ids_, gpu_thread_ids_; - std::vector> local_readers_; - - void create_workers(); -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/data_readers/async_reader/async_reader_adapter.hpp b/HugeCTR/include/data_readers/async_reader/async_reader_adapter.hpp deleted file mode 100644 index 65f8700e19..0000000000 --- a/HugeCTR/include/data_readers/async_reader/async_reader_adapter.hpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -namespace HugeCTR { -template -class AsyncReader : public SchedulableDataReader { - using LabelType = float; - using InputType = int; - - public: - // Default params: num_threads = num_local_gpus, io_block_size = 512000, io_depth = 2, - // io_alignment = 512 - AsyncReader(std::string fname, size_t batch_size, size_t label_dim, size_t dense_dim, - std::vector& params, bool mixed_precision, - const std::shared_ptr& resource_manager, int num_threads, - int num_batches_per_thread, size_t io_block_size, int io_depth, int io_alignment, - bool shuffle = false, bool wait_for_gpu_idle = false, - Alignment_t aligned = Alignment_t::None); - - long long read_a_batch_to_device_delay_release() override; - long long get_full_batchsize() const override; - - cudaStream_t get_split_3_way_stream(int raw_device_id) const { - return s3w_streams_.at(raw_device_id); - } - - cudaStream_t get_d2d_stream(int raw_device_id) const { return d2d_streams_.at(raw_device_id); } - - void set_schedule_streams(cudaStream_t s3w_stream, cudaStream_t d2d_stream, - int raw_device_id) override; - - void stream_wait_sparse_tensors(cudaStream_t stream, int raw_device_id, bool from_graph) override; - void stream_wait_dense_tensors(cudaStream_t stream, int raw_device_id, bool from_graph) override; - - /** - * @brief Once the batch is retrieved from the AsyncReaderImpl, the batch needs to be - * split into its respective tensor buffers. This allows us to buffer the last N batches - * with their respective tensors. 
- */ - void set_tensor_buffering(size_t num_batches_to_buffer); - - bool current_batch_incomplete() const override; - void ready_to_collect() override; - long long read_a_batch_to_device() override; - void schedule_split_3_way_here(cudaStream_t stream, int raw_device_id, bool from_graph) override; - void schedule_d2d_here(cudaStream_t stream, int raw_device_id, bool from_graph) override; - void schedule_here(cudaStream_t stream, int raw_device_id) override; - void schedule_here_graph(cudaStream_t stream, int raw_device_id) override; - void update_schedule_graph(int raw_device_id) override; - - size_t get_max_batches_inflight() const; - bool is_mixed_precision(); - // TODO: need to get rid of this, pass the dims directly from Model to the HybridEmbedding - void get_dimensions(size_t& label_dim, size_t& dense_dim, size_t& sparse_dim, - size_t& sample_size_items); - - long long get_current_batchsize_per_device(size_t local_id) override; - long long get_current_batchsize() override { return current_batch_size_; }; - TensorScalarType get_scalar_type() const override; - bool is_started() const override; - void start() override; - - std::vector get_label_tensor23s() const override; - std::vector get_dense_tensor23s() const override; - std::vector get_value_tensor23s() const; - std::vector> get_value_tensors() const; - - bool is_batch_cached() const { return current_batch_cached_; } - size_t get_current_inflight_id() const { return inflight_id_; } // TODO: remove? - - // FIXME: This is a temporary fix to get around the fact that HybridSpaseEmbedding - // needs to be constructed with the SparseTensor buffers - // std::vector> get_value_tensor_buffers() const; - std::vector>> get_value_tensor_buffers() const; - std::vector> get_value_tensor_buffer23s() const; -#ifndef DISABLE_CUDF - void create_drwg_parquet(std::string file_list, bool strict_order_of_batches, - const std::vector slot_offset, - bool start_reading_from_beginning = true, - long long max_samples_per_group = 0, int label_dense_num = 0, - int label_dense_dim = 0) override; -#endif - void set_source(std::string file_list = std::string()) override; - ~AsyncReader(); - - private: - std::vector temp_tensors_; - struct BatchTensors { - size_t tag; - std::vector label_tensors; - std::vector dense_tensors; - std::vector sparse_tensors; - }; - - void assign_dense_and_label_tensors(core23::Tensor& label_tensor, core23::Tensor& dense_tensor, - int raw_device_id, cudaStream_t stream); - - void init_batch_tensors(size_t num_inflight); - - const std::shared_ptr resource_manager_; - std::unique_ptr reader_impl_; - int64_t sample_size_items_, current_batch_size_; - bool mixed_precision_, wait_for_gpu_idle_; - int64_t batch_size_, batch_size_per_dev_; - int64_t label_dim_, dense_dim_, sparse_dim_; - - size_t inflight_id_ = 0; - std::vector inflight_batch_tensors_; // in-flight batches - - std::vector label_tensors_; - std::vector dense_tensors_; - std::vector current_sparse_tensors_; - - bool current_batch_cached_ = false; - - std::vector completion_events_; - std::vector schedule_events_; - std::vector split_schedule_events_; - std::vector d2d_schedule_events_; - - std::vector s3w_streams_; // split_3_way streams - std::vector d2d_streams_; // d2d copy streams - - bool cache_buffers_ = false; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/data_readers/async_reader/async_reader_common.hpp b/HugeCTR/include/data_readers/async_reader/async_reader_common.hpp deleted file mode 100644 index ea70e2d7de..0000000000 --- 
a/HugeCTR/include/data_readers/async_reader/async_reader_common.hpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -// For the tensor bags -#include -#include -#include -#include - -struct iocb; - -namespace HugeCTR { - -enum class BufferStatus : int { - IOReady = 0, - IOInProcess = 1, - UploadInProcess = 2, - UploadSubmitted = 3, - ReadReady = 4, - PermanentlyResident = 5, - Finished = 6 -}; - -struct InternalBatchBuffer { - int64_t id = -1; - size_t size; - int raw_device_id; - - std::vector dev_data; - char* raw_host_ptr = nullptr; - char* host_data; - - std::atomic status; - std::vector io_reqs; - int num_outstanding_reqs; - std::atomic ready_to_upload_event, safe_to_upload_event; - int num_submitted_h2d_chunks; - int num_submitted_broadcasts; - bool preload_done; - cudaEvent_t event; - - // Following the rule of 5 just in case - // Only need the destructor here - InternalBatchBuffer() { status.store(BufferStatus::IOReady); }; - InternalBatchBuffer(InternalBatchBuffer const& other) = delete; - InternalBatchBuffer& operator=(InternalBatchBuffer const& other) = delete; - - InternalBatchBuffer(InternalBatchBuffer&& other) = default; - InternalBatchBuffer& operator=(InternalBatchBuffer&& other) = default; - - ~InternalBatchBuffer() { - for (auto ptr : dev_data) { - HCTR_LIB_CHECK_(cudaFree(ptr)); - } - HCTR_LIB_CHECK_(cudaHostUnregister(raw_host_ptr)); - free(raw_host_ptr); - } -}; - -struct BatchDesc { - size_t size_bytes; - std::vector dev_data; - bool cached; - size_t id; -}; - -class RawPtrWrapper : public TensorBuffer2 { - public: - RawPtrWrapper(void* ptr) : ptr_(ptr) {} - bool allocated() const override { return true; } - void* get_ptr() override { return ptr_; } - - private: - void* ptr_; -}; - -class RawPtrBuffer : public TensorBuffer2 { - public: - RawPtrBuffer(size_t size_bytes) { HCTR_LIB_THROW(cudaMalloc(&ptr_, size_bytes)); } - bool allocated() const override { return true; } - void* get_ptr() override { return ptr_; } - ~RawPtrBuffer() override { cudaFree(ptr_); } - - private: - void* ptr_; -}; - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/data_readers/async_reader/split_label_dense_sparse.hpp b/HugeCTR/include/data_readers/async_reader/split_label_dense_sparse.hpp deleted file mode 100644 index 7d51bf28d7..0000000000 --- a/HugeCTR/include/data_readers/async_reader/split_label_dense_sparse.hpp +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -namespace HugeCTR { -template -void split_3_way(core23::Tensor& label_tensor_per_dev, core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, cudaStream_t stream); - -} // namespace HugeCTR diff --git a/HugeCTR/include/data_readers/async_reader/thread_async_reader.hpp b/HugeCTR/include/data_readers/async_reader/thread_async_reader.hpp deleted file mode 100644 index 98ba775300..0000000000 --- a/HugeCTR/include/data_readers/async_reader/thread_async_reader.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include -#include - -namespace HugeCTR { - -class InternalBatchBuffer; -class ResourceManager; -enum class WorkerStatus : int { OK, Terminate }; - -struct ThreadAsyncReaderParameters { - size_t io_block_size; - int io_alignment, io_depth; - int num_h2d_chunks; - bool wait_for_gpu_idle; - bool loop; -}; - -class ThreadAsyncReader { - public: - ThreadAsyncReader(std::string fname, const ResourceManager* resource_manager, - size_t batch_size_bytes, int device_id, cudaStream_t stream, - std::vector batch_ids, std::vector dest_buffers, - ThreadAsyncReaderParameters params, size_t total_file_size); - - void load(); - void reset(); - - ~ThreadAsyncReader(); - - private: - int fd_; - size_t batch_size_bytes_; - int device_id_; - cudaStream_t stream_; - int num_dest_buffers_; - int max_num_blocks_per_batch_; - size_t total_file_size_; - io_context_t ioctx_; - std::atomic status_; - - std::vector batch_ids_; - std::vector dest_buffers_; - ThreadAsyncReaderParameters params_; - int num_buffers_waiting_io_; - - void try_submit_io(size_t batch_id, int io_id); - void wait_io(); - bool wait_for_gpu_idle(InternalBatchBuffer* buffer); - void try_submit_upload(InternalBatchBuffer* buffer); - void try_submit_p2p(InternalBatchBuffer* buffer); - bool check_completion(InternalBatchBuffer* buffer); -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/data_readers/async_reader/broadcast.hpp b/HugeCTR/include/data_readers/multi_hot/async_reader_common.hpp similarity index 66% rename from HugeCTR/include/data_readers/async_reader/broadcast.hpp rename to HugeCTR/include/data_readers/multi_hot/async_reader_common.hpp index d45f937f86..af3785dd29 100644 --- a/HugeCTR/include/data_readers/async_reader/broadcast.hpp +++ b/HugeCTR/include/data_readers/multi_hot/async_reader_common.hpp @@ -1,25 +1,38 @@ -/* 
- * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace HugeCTR { - -void broadcast(float** dev_pointers, const bool* dev_p2p_accessible, int batch_size_floats, - int num_dests, int src_id, cudaStream_t stream); - +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +// For the tensor bags +#include +#include + +#include "HugeCTR/core23/tensor.hpp" +#include "HugeCTR/include/tensor2.hpp" + +namespace HugeCTR { + +class RawPtrWrapper : public TensorBuffer2 { + public: + RawPtrWrapper(void* ptr) : ptr_(ptr) {} + bool allocated() const override { return true; } + void* get_ptr() override { return ptr_; } + + private: + void* ptr_; +}; } // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/data_readers/parquet_data_reader_worker.hpp b/HugeCTR/include/data_readers/parquet_data_reader_worker.hpp index 28e741c03d..798294935a 100644 --- a/HugeCTR/include/data_readers/parquet_data_reader_worker.hpp +++ b/HugeCTR/include/data_readers/parquet_data_reader_worker.hpp @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/HugeCTR/include/data_readers/row_group_reading_thread.hpp b/HugeCTR/include/data_readers/row_group_reading_thread.hpp index 8391df6619..8684881631 100644 --- a/HugeCTR/include/data_readers/row_group_reading_thread.hpp +++ b/HugeCTR/include/data_readers/row_group_reading_thread.hpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/HugeCTR/include/embedding.hpp b/HugeCTR/include/embedding.hpp index ea6e55e5bf..8f7f46d28d 100644 --- a/HugeCTR/include/embedding.hpp +++ b/HugeCTR/include/embedding.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include #include @@ -93,16 +92,6 @@ struct SparseEmbeddingHashParams { } }; -static size_t get_slot_num(const SparseTensorBag& bag) { - const std::vector& dimension = bag.get_dimensions(); - if (dimension.size() == 2) { - return dimension[1]; - } - HCTR_OWN_THROW(Error_t::IllegalCall, - "slot_num is available when sparse tensor shape is (batchsize, slot_num)"); - return 0; -} - // TODO remove Tensor2 Based BufferBag struct BufferBag { TensorBag2 keys; @@ -127,16 +116,4 @@ struct SparseInput { SparseInput() {} }; -struct BufferBag23 { - core23::Tensor keys; - core23::Tensor slot_id; - core23::Tensor embedding; - std::vector opt_states; - - 
std::vector h_value_tensors; - std::vector h_slot_id_tensors; - std::vector uvm_key_tensor_bags; - std::vector d_value_index_tensors; -}; - } // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/allocator.hpp b/HugeCTR/include/embeddings/hybrid_embedding/allocator.hpp deleted file mode 100644 index afe7f83d42..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/allocator.hpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -class CudaPreAllocator { - void *ptr_; - size_t size_; - - public: - CudaPreAllocator() : ptr_(nullptr), size_(0) {} - - template - void reserve(const std::vector &dimensions) { - size_t s = sizeof(T); - for (size_t dimension : dimensions) { - s *= dimension; - } - size_ += s; - } - - void pre_allocate() { HCTR_LIB_THROW(cudaMalloc(&ptr_, size_)); } - - void *allocate(size_t size) const { - if (size > size_) { - HCTR_OWN_THROW(Error_t::OutOfMemory, "Out of memory"); - } - return ptr_; - } - void deallocate(void *ptr) const { HCTR_LIB_THROW(cudaFree(ptr)); } -}; diff --git a/HugeCTR/include/embeddings/hybrid_embedding/calibration_data.hpp b/HugeCTR/include/embeddings/hybrid_embedding/calibration_data.hpp deleted file mode 100644 index ca513918f0..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/calibration_data.hpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -// depends on : Statistics, Data - -/// -/// This class contains the calibrated measurements for all-to-all and all-reduce -/// for different data sizes. Each calibration consists of two arrays, -/// ._data_size array and the ._time array which represent a mapping. -/// -/// This class will be executed on the cpu instead of the gpu if no -/// gpu memory is allocated for the calibration data. 
-struct CalibrationData { - CalibrationData(size_t num_nodes_in, double p_dup_max_in, double max_all_reduce_bandwidth_in, - double max_all_to_all_bandwidth_in, double efficiency_bandwidth_ratio_in) - : num_nodes(num_nodes_in), - p_dup_max(p_dup_max_in), - max_all_reduce_bandwidth(max_all_reduce_bandwidth_in), - max_all_to_all_bandwidth(max_all_to_all_bandwidth_in), - efficiency_bandwidth_ratio(efficiency_bandwidth_ratio_in){ - // TBD - }; - ~CalibrationData() {} - - size_t num_nodes; - - // Calibration all-to-all : - // the following two arrays map data sizes to all-to-all times / latencies. - std::vector h_all_to_all_data_size; - std::vector h_all_to_all_times; - Tensor2 all_to_all_data_size; // data size of message per gpu - Tensor2 all_to_all_times; // calibrated all-to-all times - - // Calibration all-reduce : - // the following two arrays map data sizes to all-to-all times / latencies. - std::vector h_all_reduce_data_size; - std::vector h_all_reduce_times; - Tensor2 all_reduce_data_size; // data size of message per gpu - Tensor2 all_reduce_times; // calibrated all-reduce times - - // Alternative calibration: (if no calibration provided) - // the threshold for frequent categories is calculated from maximum bandwidths - // for the all-reduce and all-to-all respectively. - // This approximation assumes that the communications are bandwidth limited. - double p_dup_max; - double max_all_reduce_bandwidth; // algorithm bandwidth all-reduce [data size message per gpu in - // bytes / sec] - double max_all_to_all_bandwidth; // algorithm bandwidth all-to-all [data size message per gpu in - // bytes / sec] - double efficiency_bandwidth_ratio; - // cpu functions - double interpolate(const std::vector &calibrated_data_size, - const std::vector &calibrated_times, - const std::vector &data_size, - std::vector &communication_times); - double interpolate_all_reduce(const std::vector &data_size, - std::vector &communication_times); - double interpolate_all_to_all(const std::vector &data_size, - std::vector &communication_times); - - // gpu functions - void interpolate(const Tensor2 &calibrated_data_size, - const Tensor2 &calibrated_times, const Tensor2 &data_size, - Tensor2 &communication_times); - void interpolate_all_reduce(const Tensor2 &data_size, Tensor2 &communication_times); - void interpolate_all_to_all(const Tensor2 &data_size, Tensor2 &communication_times); -}; - -template -class ModelInitializationFunctors { - public: - static double calculate_threshold(const CommunicationType communication_type, double p_dup_max, - double all_to_all_bandwidth, double all_reduce_bandwidth, - double efficiency_bandwidth_ratio, size_t num_nodes, - size_t batch_size, size_t num_networks, size_t num_iterations, - size_t num_tables); - static dtype calculate_num_frequent_categories(const CommunicationType &communication_type, - const size_t num_networks, - const CalibrationData &calibration, - const Statistics &statistics, - const Data &data, dtype *d_num_frequent, - cudaStream_t stream); - static double calculate_frequent_probability(const Statistics &statistics, - const dtype num_frequent, - uint32_t *d_total_frequent_count, - cudaStream_t stream); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/embeddings/hybrid_embedding/communication.hpp b/HugeCTR/include/embeddings/hybrid_embedding/communication.hpp deleted file mode 100644 index 4ed51c9afb..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/communication.hpp +++ /dev/null @@ 
-1,135 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -class Communication { - public: - Communication(size_t width_data_field); - virtual ~Communication() = default; - virtual void communicate(cudaStream_t stream) = 0; - virtual void update_sizes(cudaStream_t stream){}; - virtual void initiate_communication(cudaStream_t stream){}; - virtual void wait_completion(cudaStream_t stream){}; - - protected: - size_t width_data_field_; -}; - -/* - * All to All communications - */ -template -struct AllToAllStorage { - AllToAllStorage(GeneralBuffer2* buf, size_t max_buffer_size) { - buf->reserve({max_buffer_size}, &send_buffer); - buf->reserve({max_buffer_size}, &recv_buffer); - } - Tensor2 send_buffer, recv_buffer; - Tensor2 send_buffer_ptrs; -}; - -template -class AllToAllVComm : public Communication { - public: - AllToAllVComm(Tensor2 send_buffer, Tensor2 recv_buffer, - const uint32_t* send_offsets, const uint32_t* recv_offsets, - const GPUResource* gpu_resource, size_t width_data_field); - - protected: - Tensor2 send_buffer_; - Tensor2 recv_buffer_; - - const uint32_t* send_offsets_; - const uint32_t* recv_offsets_; - - const GPUResource* gpu_resource_; -}; - -template -class AllToAll_Multi_NCCL : public AllToAllVComm { - public: - using AllToAllVComm::AllToAllVComm; - void communicate(cudaStream_t stream) final override; - ~AllToAll_Multi_NCCL() = default; -}; - -// template -// class AllToAll_Single : public AllToAllVComm { -// public: -// using AllToAllVComm::AllToAllVComm; -// void communicate() final override; -// ~AllToAll_Single() = default; -// }; - -/* - * All Reduce communications - */ -template -class AllReduceComm : public Communication { - public: - AllReduceComm(AllReduceInPlaceComm* ar_comm, AllReduceInPlaceComm::Handle ar_handle, - const GPUResource* gpu_resource); - void communicate(cudaStream_t stream) final override; - ~AllReduceComm() = default; - - private: - AllReduceInPlaceComm* ar_comm_; - AllReduceInPlaceComm::Handle ar_handle_; - const GPUResource* gpu_resource_; -}; - -#ifdef ENABLE_MPI -template -class HierAll2Allv_Multi_IB : public Communication { - public: - HierAll2Allv_Multi_IB(uint32_t instance_id, HierA2AvCollHandle coll_handle, size_t** send_sizes, - const GPUResource* gpu_resource, IbComm* ib_comm, cudaStream_t comm_stream); - - void update_sizes(cudaStream_t stream) final override; - void communicate(cudaStream_t stream) final override; - void initiate_communication(cudaStream_t stream) final override; - void wait_completion(cudaStream_t stream) final override; - ~HierAll2Allv_Multi_IB(); - - private: - uint32_t instance_id_; - HierA2AvCollHandle coll_handle_; - size_t** send_sizes_; - const GPUResource* gpu_resource_; - IbComm* ib_comm_; - cudaStream_t comm_stream_; - cudaEvent_t comm_event_; -}; -#endif - -} // namespace 
hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/data.hpp b/HugeCTR/include/embeddings/hybrid_embedding/data.hpp deleted file mode 100644 index 1147f0815a..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/data.hpp +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -struct EmbeddingTableFunctors { - static dtype get_num_categories(const std::vector &table_sizes); - static void get_embedding_offsets(std::vector &embedding_offsets, - const std::vector &table_sizes); - static size_t get_embedding_table_index(const std::vector &table_sizes, dtype category); -}; - -// depends on : data reader - or mock data - -template -struct Data { - std::vector table_sizes; - size_t batch_size; - size_t num_iterations; - size_t num_categories; - - Tensor2 embedding_offsets; - Tensor2 samples; - - Data(Tensor2 samples, const std::vector &table_sizes_in, size_t batch_size_in, - size_t num_iterations_in) - : samples(samples), - table_sizes(table_sizes_in), - batch_size(batch_size_in), - num_iterations(num_iterations_in) { - std::shared_ptr> buf = GeneralBuffer2::create(); - buf->reserve({table_sizes_in.size()}, &embedding_offsets); - buf->allocate(); - - std::vector h_embedding_offsets; - EmbeddingTableFunctors::get_embedding_offsets(h_embedding_offsets, table_sizes); - - num_categories = EmbeddingTableFunctors::get_num_categories(table_sizes); - HCTR_LIB_THROW(cudaMemcpy(embedding_offsets.get_ptr(), h_embedding_offsets.data(), - sizeof(dtype) * h_embedding_offsets.size(), cudaMemcpyHostToDevice)); - } - - Data(const std::vector &table_sizes_in, size_t batch_size_in, size_t num_iterations_in) - : table_sizes(table_sizes_in), batch_size(batch_size_in), num_iterations(num_iterations_in) { - std::shared_ptr> buf = GeneralBuffer2::create(); - reserve(buf); - buf->reserve({table_sizes_in.size()}, &embedding_offsets); - buf->allocate(); - - std::vector h_embedding_offsets; - EmbeddingTableFunctors::get_embedding_offsets(h_embedding_offsets, table_sizes); - - num_categories = EmbeddingTableFunctors::get_num_categories(table_sizes); - HCTR_LIB_THROW(cudaMemcpy(embedding_offsets.get_ptr(), h_embedding_offsets.data(), - sizeof(dtype) * h_embedding_offsets.size(), cudaMemcpyHostToDevice)); - } - - Data() {} - ~Data() {} - - void reserve(std::shared_ptr> buf) { - const size_t num_tables = table_sizes.size(); - buf->reserve({num_iterations * batch_size * num_tables, 1}, &samples); - } - - // convert raw input data such that categories of different - // categorical features have unique indices - void data_to_unique_categories(Tensor2 data, cudaStream_t stream); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/frequent_embedding.hpp 
b/HugeCTR/include/embeddings/hybrid_embedding/frequent_embedding.hpp deleted file mode 100644 index 76ec86fec0..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/frequent_embedding.hpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -// TODO sort out public/private fields -// In order to use it easier in the IndicesContainer -template -class FrequentEmbeddingBase { - public: - const Data *data_ = nullptr; - FrequentEmbeddingCompressionView *indices_view_ = nullptr; - - // Frequent indices and device pointer! - FrequentEmbeddingCompression *indices_; - - void set_current_indices(FrequentEmbeddingCompression *indices); - FrequentEmbeddingBase(); - virtual ~FrequentEmbeddingBase(); -}; - -template -class FrequentEmbeddingData { - public: - // copy of the model parameters and the input data - const Model &model_; - const GPUResource &gpu_resource_; - - // locally stored embedding vectors for the data-parallel part of the embedding for each table - Tensor2 frequent_embedding_vectors_; - - // locally stored reduced gradients into fp32 type - Tensor2 float_frequent_gradients_; - // buffer for communication can have fp16 type instead of fp32: input for all-reduce - Tensor2 frequent_gradients_; - template - using BuffPtr = std::shared_ptr>; - BuffPtr grouped_wgrad_buff_; - std::shared_ptr wgrad_core23_buffer_; - - uint32_t embedding_vec_size_; - size_t max_num_frequent_categories_; - - FrequentEmbeddingData(const Model &model, const GPUResource &gpu_resource, - BuffPtr &grouped_wgrad_buff, uint32_t embedding_vec_size, - size_t max_num_frequent_categories); - ~FrequentEmbeddingData() {} - - void initialize_embedding_vectors(const std::vector &table_sizes, - size_t grouped_wgrad_offset); - template - void forward_network(const vectype *embedding_vectors, emtype *interaction_layer_input, - FrequentEmbeddingBase *base, cudaStream_t stream); - void local_reduce(const emtype *gradients, FrequentEmbeddingBase *base, - cudaStream_t stream); - - template - typename std::enable_if::value, Tensor2>::type &get_gradients() { - return float_frequent_gradients_; - } - - template - typename std::enable_if::value, Tensor2>::type &get_gradients() { - return frequent_gradients_; - } - - class ExternalManagedBuffer : public HugeCTR::TensorBuffer2 { - public: - ExternalManagedBuffer(void *ptr) : ptr_(ptr) {} - bool allocated() const override { return true; } - void *get_ptr() override { return ptr_; } - - private: - void *ptr_; - }; -}; - -template -class FrequentEmbeddingSingleNode : public FrequentEmbeddingBase { - public: - using FrequentEmbeddingBase::data_; - FrequentEmbeddingData frequent_data_; - Tensor2 frequent_embedding_vectors_cache_; - Tensor2 embedding_vectors_cache_pointers_; - Tensor2 partial_gradients_pointers_; - 
template - using BuffPtr = std::shared_ptr>; - - FrequentEmbeddingSingleNode(const Model &model, const GPUResource &gpu_resource, - BuffPtr &grouped_wgrad_buff, uint32_t embedding_vec_size, - size_t max_num_frequent_categories); - - void init(); - void forward_model(cudaStream_t stream); - void forward_model_eval(cudaStream_t stream); - void forward_network(emtype *interaction_layer_input, cudaStream_t stream); - void local_reduce(const emtype *gradients, cudaStream_t stream); - void update_model_direct(float *dev_lr, float scale, cudaStream_t stream); - - template - typename std::enable_if::value, Tensor2>::type - get_embedding_vectors_cache() { - return frequent_data_.frequent_embedding_vectors_; - } - - template - typename std::enable_if::value, Tensor2>::type - get_embedding_vectors_cache() { - return frequent_embedding_vectors_cache_; - } -}; - -template -class FrequentEmbeddingMultiNode : public FrequentEmbeddingBase { - public: - using FrequentEmbeddingBase::data_; - FrequentEmbeddingData frequent_data_; - template - using BuffPtr = std::shared_ptr>; - std::unique_ptr ar_comm_; - - FrequentEmbeddingMultiNode(const Model &model, const GPUResource &gpu_resource, - BuffPtr &grouped_wgrad_buff, uint32_t embedding_vec_size, - size_t max_num_frequent_categories) - : frequent_data_(model, gpu_resource, grouped_wgrad_buff, embedding_vec_size, - max_num_frequent_categories) {} - - void init(); - void init_ar_comm(AllReduceInPlaceComm *ar_comm, AllReduceInPlaceComm::Handle &handle, - int local_id); - void communicate(cudaStream_t stream); - void forward_network(emtype *interaction_layer_input, cudaStream_t stream); - void local_reduce(const emtype *gradients, cudaStream_t stream); - void update_model(float *dev_lr, float scale, cudaStream_t stream); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/hybrid_indices.hpp b/HugeCTR/include/embeddings/hybrid_embedding/hybrid_indices.hpp deleted file mode 100644 index a42f668dff..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/hybrid_indices.hpp +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
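// The FrequentEmbeddingData class above keeps two gradient buffers (an fp32 local-reduction
// buffer and an emtype buffer handed to the all-reduce) and selects between them with the
// classic std::enable_if overload idiom, as does get_embedding_vectors_cache() in
// FrequentEmbeddingSingleNode. A standalone sketch of that idiom, with made-up buffer types
// standing in for Tensor2:
#include <type_traits>
#include <vector>

template <typename emtype>
struct GradientBuffers {
  std::vector<float> float_gradients;  // always-fp32 reduction buffer
  std::vector<emtype> comm_gradients;  // possibly reduced-precision all-reduce input

  // Chosen when the requested type is float: hand back the fp32 buffer.
  template <typename T>
  typename std::enable_if<std::is_same<T, float>::value, std::vector<float>>::type&
  get_gradients() {
    return float_gradients;
  }

  // Chosen for every other type: hand back the communication-precision buffer.
  template <typename T>
  typename std::enable_if<!std::is_same<T, float>::value, std::vector<emtype>>::type&
  get_gradients() {
    return comm_gradients;
  }
};

int main() {
  GradientBuffers<unsigned short> bufs;  // unsigned short stands in for __half, host-only
  bufs.get_gradients<float>().resize(8);           // selects the fp32 buffer
  bufs.get_gradients<unsigned short>().resize(8);  // selects the communication buffer
  return 0;
}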
- */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { -namespace hybrid_embedding { - -// =========================================================================================== -// Frequent Compression -// =========================================================================================== - -template -struct FrequentEmbeddingCompressionView { - const dtype* samples; - bool* cache_masks; - uint32_t *model_cache_indices, *model_cache_indices_offsets; - uint32_t *network_cache_indices, *network_cache_indices_offsets; - uint32_t *d_num_frequent_sample_indices, *frequent_sample_indices; -}; - -template -class FrequentEmbeddingCompression { - void calculate_frequent_sample_indices_temp_storage_bytes(const size_t local_samples_size); - void calculate_model_cache_indices_temp_storage_bytes(const size_t num_frequent); - void calculate_network_cache_indices_temp_storage_bytes(const size_t num_frequent); - - const Model& model_; - const Data& data_; - - FrequentEmbeddingCompressionView* device_indices_view_; - - public: - // Role: - // push from the locally reduced gradient buffer => update embedding vector - // pull embedding vector from the model => update local cache - // - // Def: - // 1 if frequent category is present in this network batch - // [size num_frequent] - Tensor2 cache_masks_; - - // model_cache_indices : list of cache indices of this frequent embedding model instance - // for each mlp deep learning network. - // Definition. - // given the frequent embedding model of frequent embedding vectors - // stored and updated by this instance, i.e. the range in - // frequent_embedding_vectors - // i * num_frequent /num_instances ... (i+1) * num_frequent /num_instances - // - 1 - // for each network n, the range within model_cache_indices specified by - // model_cache_indices_offsets_[n] .. model_cache_indices_offsets_[n] - 1 - // is the list of frequent cache indices that appear in network n. - // - // Role. - // - // 1. Forward-model : cache indices into the frequent_embedding_vector array - // for each send-message-buffer - per mlp network. - // 2. Backward-model : cache indices for each receive-message-buffer - mlp - // - Tensor2 model_cache_indices_; - Tensor2 model_cache_indices_offsets_; - - // network_cache_indices : list of cache indices contained in this network for each - // frequent embedding model instance - // Def. - // Given the mlp deep learning network samples for this instance, - // - network n, sample_ids starting with i * batch_size / num_instances - - // For each embedding model - model_id - list its cache indices that - // are present within network n's samples. The range of these indices is - // given by network_cache_indices_offsets_[i+1] ... - // network_cache_indices_offsets_[i+1] - // Role. - // 1. Forward-network : cache indices into the frequent_embedding_vector array - // for each receive-message-buffer - per frequent embedding model - // 2. 
Backward-network : cache indices into the frequent_gradient_vectors_ - // for each send-message-buffer - mlp - // - Tensor2 network_cache_indices_; - Tensor2 network_cache_indices_offsets_; - - // Role: - // from buffer => interaction layer - // sample gradients => gradient buffer - // - // Def: - // sample id's within this network batch - // containing frequent category [network batch size] - // "Network side" - Tensor2 d_num_frequent_sample_indices_; - Tensor2 frequent_sample_indices_; - - // scratch buffers for index calculations - Tensor2 frequent_sample_indices_temp_storage_; - Tensor2 model_cache_indices_temp_storage_; - Tensor2 network_cache_indices_temp_storage_; - size_t frequent_sample_indices_temp_storage_bytes_; - size_t model_cache_indices_temp_storage_bytes_; - size_t network_cache_indices_temp_storage_bytes_; - - FrequentEmbeddingCompression(size_t max_num_frequent_categories, const Data& data, - const Model& model); - - void calculate_frequent_sample_indices(cudaStream_t stream); - void calculate_model_cache_indices(size_t sm_count, cudaStream_t stream); - void calculate_network_cache_mask(cudaStream_t stream); - void calculate_network_cache_indices(cudaStream_t stream); - void calculate_cache_masks(cudaStream_t stream); - - FrequentEmbeddingCompressionView* get_device_view() { return device_indices_view_; }; - const Data* get_data() { return &data_; } -}; - -// =========================================================================================== -// Infrequent Selection -// =========================================================================================== - -template -struct InfrequentEmbeddingSelectionView { - const dtype* samples; - uint32_t *model_indices, *model_indices_offsets; - uint32_t *network_indices, *network_indices_offsets, *network_indices_src_model_id; -}; - -template -class InfrequentEmbeddingSelection { - void calculate_model_indices_temp_storage_bytes(size_t max_batch_size, size_t table_size); - void calculate_network_indices_temp_storage_bytes(size_t max_batch_size, size_t table_size, - const uint32_t num_instances); - - const Model& model_; - const Data& data_; - InfrequentEmbeddingSelectionView* device_indices_view_; - - public: - // model_indices : list of samples indices of categories for which the embedding vectors are - // stored in this infrequent embedding model instance. - // Sample-id's for entire batch, i.e. sorted by mlp deep learning network. - // Definition. - // Given the infrequent embedding model of infrequent embedding vectors - // stored and updated by this instance, sample indices for categories such - // that - // category_location[2*category] == model_id - // for each network n, the range within model_cache_indices specified by - // model_indices_offsets_[n] .. model_indices_offsets_[n+1] - 1 - // is the list of infrequent sample indices in network n. - // Role. - // 1. Forward-model : indices in the samples array for each send-message-buffer - // - per mlp network. - // 2. Backward-model : indices in the samples array for each receive-message-buffer - // - per mlp network. - Tensor2 model_indices_; - Tensor2 model_indices_offsets_; - // Tensor2 model_indices_sizes_; - // Tensor2 model_indices_sizes_ptrs_; - - // network_indices : list of sample indices of infrequent categories ordered per infrequent - // embedding model - model_id - where they're stored. - // Sample-id's for local batch (i.e sub-batch of this mlp network) - // Definition. 
- // Given the mlp deep learning network samples for this instance, - // - network n, sample_ids starting with i * batch_size / num_instances - - // For each embedding model - model_id - list its sample indices that - // are present within network n's samples. The range of these indices is given - // by - // network_indices_offsets_[n] .. network_indices_offsets_[n+1] - 1 - // Role. - // 1. Forward-network : local sample indices for each receive-message-buffer - // - per infrequent embedding model. - // 2. Backward-network : local sample indices for each send-message-buffer - // - mlp - Tensor2 network_indices_; - Tensor2 network_indices_offsets_; - Tensor2 network_indices_src_model_id_; - // Tensor2 network_indices_sizes_; - // Tensor2 network_indices_sizes_ptrs_; - - // scratch buffers for index calculations - /// TODO: if not overlapping, we can use the same storage - Tensor2 model_indices_temp_storage_; - size_t model_indices_temp_storage_bytes_; - Tensor2 network_indices_temp_storage_; - size_t network_indices_temp_storage_bytes_; - - InfrequentEmbeddingSelection(const Data& data, const Model& model); - - void calculate_model_indices(cudaStream_t stream); - void calculate_network_indices(size_t sm_count, cudaStream_t stream); - - // For now these functions stay in InfreqeuentEmbedding - // since the communications can only use one offsets tensor - // void calculate_model_indices_sizes_from_offsets( size_t embedding_vec_bytes, cudaStream_t - // stream); void calculate_network_indices_sizes_from_offsets(size_t embedding_vec_bytes, - // cudaStream_t stream); - - InfrequentEmbeddingSelectionView* get_device_view() { return device_indices_view_; } - const Data* get_data() { return &data_; } -}; - -// Single-stream version -template -void compute_indices(FrequentEmbeddingCompression& compression, - InfrequentEmbeddingSelection& selection, - CommunicationType communication_type, bool compute_network_cache_indices, - cudaStream_t main_stream, int sm_count); - -} // namespace hybrid_embedding -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/indices_container.hpp b/HugeCTR/include/embeddings/hybrid_embedding/indices_container.hpp deleted file mode 100644 index e0205878db..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/indices_container.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
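// Both index classes above use the same offsets layout: a flat index array plus an offsets
// array of (num_instances + 1) entries, where instance n owns the half-open slice
// [offsets[n], offsets[n+1]), exactly as the model_indices_offsets_ / network_indices_offsets_
// comments describe. A small host-side illustration with made-up numbers:
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical: 3 instances, 7 sample indices in total.
  const std::vector<unsigned int> indices = {0, 4, 9, 2, 5, 1, 8};
  const std::vector<unsigned int> offsets = {0, 3, 5, 7};  // num_instances + 1 entries

  for (size_t n = 0; n + 1 < offsets.size(); ++n) {
    std::printf("instance %zu:", n);
    for (unsigned int i = offsets[n]; i < offsets[n + 1]; ++i) {
      std::printf(" %u", indices[i]);
    }
    std::printf("\n");  // instance 0: 0 4 9 | instance 1: 2 5 | instance 2: 1 8
  }
  return 0;
}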
- */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { -namespace hybrid_embedding { - -template -class BatchIndices { - public: - BatchIndices(std::vector>& models, std::vector> data_source, - std::shared_ptr& resource_manager, size_t batch_size, - std::vector& slot_size_array, size_t max_num_frequent_categories, - CommunicationType communication_type); - - void compute(int raw_device_id, size_t batch_size, cudaStream_t stream); - - FrequentEmbeddingCompression& get_frequent(int raw_device_id) { - return frequent_compression_[raw_device_id]; - } - - InfrequentEmbeddingSelection& get_infrequent(int raw_device_id) { - return infrequent_selection_[raw_device_id]; - } - - private: - size_t num_slots_ = 0; - std::shared_ptr resource_manager_; - CommunicationType communication_type_; - std::vector> data_; - std::vector> frequent_compression_; - std::vector> infrequent_selection_; -}; - -} // namespace hybrid_embedding -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/infrequent_embedding.hpp b/HugeCTR/include/embeddings/hybrid_embedding/infrequent_embedding.hpp deleted file mode 100644 index 80b95d0567..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/infrequent_embedding.hpp +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -// In order to use it easier in the IndicesContainer -template -class InfrequentEmbeddingBase { - protected: - const Data *data_ = nullptr; - InfrequentEmbeddingSelectionView *indices_view_ = nullptr; - - public: - // Infrequent indices and device pointer! 
- InfrequentEmbeddingSelection *indices_; - - void set_current_indices(InfrequentEmbeddingSelection *indices); - InfrequentEmbeddingBase(); - virtual ~InfrequentEmbeddingBase(); - - InfrequentEmbeddingBase(const InfrequentEmbeddingBase &other); - - InfrequentEmbeddingBase &operator=(const InfrequentEmbeddingBase &other) { - if (this == &other) { - return *this; - } - - HCTR_LIB_THROW(cudaMalloc(&indices_view_, sizeof(*indices_view_))); - - HCTR_LIB_THROW(cudaMemcpy(indices_view_, other.indices_view_, sizeof(*indices_view_), - cudaMemcpyDeviceToDevice)); - - return *this; - } -}; - -template -class InfrequentEmbedding_NVLink_SingleNode : public InfrequentEmbeddingBase { - public: - using InfrequentEmbeddingBase::data_; - - // copy of the model parameters and the input data, managed by HybridSparseEmbedding - const Model &model_; - const GPUResource &gpu_resource_; - - // locally stored infrequent embedding vectors for the model-parallel part of the embedding for - // each table - Tensor2 infrequent_embedding_vectors_; - - Tensor2 interaction_layer_input_pointers_train_; - Tensor2 interaction_layer_input_pointers_eval_; - Tensor2 gradients_pointers_; - - // to do, we need to initialize it in the constructor - uint32_t embedding_vec_size_; - - void init_pointers(int local_gpu_count, const cudaStream_t stream, - std::vector &interaction_layer_input_pointers_train, - std::vector &interaction_layer_input_pointers_eval, - std::vector &gradients_pointers); - - InfrequentEmbedding_NVLink_SingleNode(Model &model, GPUResource &gpu_resource, - size_t embedding_vec_size); - - ~InfrequentEmbedding_NVLink_SingleNode() {} - - void initialize_embedding_vectors(const std::vector &table_sizes); - void forward_network_direct(bool is_train, cudaStream_t stream); - void update_model_direct(float *dev_lr, float scale, cudaStream_t stream); -}; - -template -class InfrequentEmbedding_IB_NVLINK : public InfrequentEmbeddingBase { - public: - using InfrequentEmbeddingBase::data_; - - // copy of the model parameters and the input data, managed by HybridSparseEmbedding - const Model &model_; - const GPUResource &gpu_resource_; - - // locally stored infrequent embedding vectors for the model-parallel part of the embedding for - // each table - Tensor2 infrequent_embedding_vectors_; - - // Tensors to be passed to the hierarchical comms - // TODO: move these to the index containers - Tensor2 network_indices_offsets_, model_indices_offsets_; - - // to do, we need to initialize it in the constructor - uint32_t embedding_vec_size_; - - // private: - std::unique_ptr> infrequent_forward_comm_buffers_, - infrequent_backward_comm_buffers_; - std::unique_ptr infrequent_forward_comms_, infrequent_backward_comms_; - - // requires model_ and data_ to be set - InfrequentEmbedding_IB_NVLINK(Model &model, GPUResource &gpu_resource, - size_t embedding_vec_size); - - //~InfrequentEmbedding_IB_NVLINK(){}; - - void init_comms(size_t embedding_vec_size, const GPUResource *gpu_resource, - GeneralBuffer2 *i_buf, size_t max_buf_size); - void initialize_embedding_vectors(const std::vector &table_sizes); - void forward_model(emtype *message_buffer, cudaStream_t stream); - void forward_network(const emtype *message_buffer, emtype *interaction_layer_input, - cudaStream_t stream); - void update_network(const emtype *gradients, emtype *message_buffer, cudaStream_t stream); - void update_model(const emtype *message_buffer, float *dev_lr, float scale, cudaStream_t stream); - - const uint32_t *get_model_indices_offsets_ptr() { return 
model_indices_offsets_.get_ptr(); } - const uint32_t *get_network_indices_offsets_ptr() { return network_indices_offsets_.get_ptr(); } -}; - -template -class InfrequentEmbedding_IB_NVLink_Hier : public InfrequentEmbeddingBase { - public: - using InfrequentEmbeddingBase::data_; - - // copy of the model parameters and the input data, managed by HybridSparseEmbedding - const Model &model_; - const GPUResource &gpu_resource_; - - // locally stored infrequent embedding vectors for the model-parallel part of the embedding for - // each table - Tensor2 infrequent_embedding_vectors_; - - // Communication buffer sizes - dtype max_num_infrequent_per_batch_; - dtype max_num_infrequent_per_train_batch_; - - // Tensors to be passed to the hierarchical comms - // TODO: move these to the index containers - Tensor2 network_indices_sizes_, model_indices_sizes_; - Tensor2 network_indices_sizes_ptrs_, model_indices_sizes_ptrs_; - - // to do, we need to initialize it in the constructor - uint32_t embedding_vec_size_; - - std::unique_ptr> infrequent_forward_comm_buffers_, - infrequent_backward_comm_buffers_; - std::unique_ptr infrequent_forward_comms_, infrequent_backward_comms_; - - // requires model_ and data_ to be set - InfrequentEmbedding_IB_NVLink_Hier(Model &model, GPUResource &gpu_resource, - size_t embedding_vec_size); - //~InfrequentEmbedding_IB_NVLink_Hier(){}; - - void init_comms(int64_t max_num_infrequent_samples, size_t slot_num, size_t embedding_vec_size, - GeneralBuffer2 *buf_ptr, size_t batch_size_true, - size_t batch_size_false, size_t local_gpu_count); - void initialize_embedding_vectors(const std::vector &table_sizes); - void calculate_model_indices_sizes_from_offsets(cudaStream_t stream); - void calculate_network_indices_sizes_from_offsets(cudaStream_t stream); - void fused_intra_forward_model(emtype **message_buffer, cudaStream_t stream); - void hier_forward_network(const emtype *message_buffer, emtype *output_ptr, cudaStream_t stream); - void fused_intra_update_network(const emtype *gradients, emtype **message_buffer, - cudaStream_t stream); - void hier_update_model(const emtype *message_buffer, float *dev_lr, float scale, - cudaStream_t stream); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/model.hpp b/HugeCTR/include/embeddings/hybrid_embedding/model.hpp deleted file mode 100644 index 71184dd492..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/model.hpp +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -// Depends on : Data, Statistics and CalibrationData - -/// -/// This class defines the hybrid embedding model: -/// it indicates which categories are frequent, which are infrequent -/// and it determines where the corresponding embedding vectors are stored. -/// -/// Also the mlp network - nodes topology is defined here: -/// The node_id, instance_id where the current model instance is -/// associated with is stored. However, keep in mind that these are the only -/// differentiating variables inside this class that differ from other -/// instances. As this model describes the same distribution across the nodes -/// and gpu's (networks). -/// -template -struct Model { - public: - uint32_t node_id; - uint32_t instance_id; - uint32_t global_instance_id; - - CommunicationType communication_type; - - Tensor2 d_num_frequent; - Tensor2 d_total_frequent_count; - dtype num_frequent; - dtype num_categories; - double frequent_probability; - - uint32_t num_instances; - std::vector h_num_instances_per_node; - Tensor2 - num_instances_per_node; // number of gpus for each node, .size() == number of nodes - - Tensor2 category_location; // indicator category => location in embedding vector - Tensor2 frequent_categories; - std::vector h_frequent_model_table_offsets; - std::vector h_infrequent_model_table_offsets; - - // constructors: overloaded for convenience / unit tests - // copy constructor - Model(const Model &model); - ~Model(){}; - Model(CommunicationType communication_type_in, uint32_t global_instance_id_in, - const std::vector &num_instances_per_node_in, size_t num_categories_in) { - std::shared_ptr> buf = GeneralBuffer2::create(); - init_params_and_reserve(communication_type_in, global_instance_id_in, num_instances_per_node_in, - num_categories_in, buf); - buf->allocate(); - } - Model(CommunicationType communication_type_in, uint32_t global_instance_id_in, - const std::vector &num_instances_per_node_in, size_t num_categories_in, - std::shared_ptr> buf) { - init_params_and_reserve(communication_type_in, global_instance_id_in, num_instances_per_node_in, - num_categories_in, buf); - } - - void init_params_and_reserve(CommunicationType communication_type_in, - uint32_t global_instance_id_in, - const std::vector &num_instances_per_node_in, - size_t num_categories_in, - std::shared_ptr> buf); - void init_hybrid_model(const CalibrationData &calibration, Statistics &statistics, - const Data &data, Tensor2 &tmp_categories, - cudaStream_t stream); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/embeddings/hybrid_embedding/select.cuh b/HugeCTR/include/embeddings/hybrid_embedding/select.cuh deleted file mode 100644 index 542fcbedd4..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/select.cuh +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
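// category_location in the Model struct above appears to hold two entries per category: the
// comment in hybrid_indices.hpp reads category_location[2 * category] == model_id, and the
// second entry is presumably the category's position inside that instance's embedding-vector
// storage. A toy lookup written under that assumption (all values are invented):
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // (model_id, local index) pairs for 3 hypothetical infrequent categories.
  const std::vector<uint32_t> category_location = {1, 0,   // category 0 -> instance 1, slot 0
                                                   0, 0,   // category 1 -> instance 0, slot 0
                                                   1, 1};  // category 2 -> instance 1, slot 1
  const uint32_t c = 2;
  std::printf("category %u -> instance %u, local index %u\n", c, category_location[2 * c],
              category_location[2 * c + 1]);
  return 0;
}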
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include - -namespace HugeCTR { -namespace DeviceSelect { -namespace detail { - -template -__global__ void pre_select_if(const T *d_input, unsigned short *d_offset, IndexType *d_block_sum, - size_t len, SelectOp op, T *d_num_selected_out = nullptr) { - unsigned short this_thread_sum = 0; - unsigned short this_block_sum = 0; - - unsigned int tid = threadIdx.x; - unsigned int bid = blockIdx.x; - size_t gtid = static_cast(blockIdx.x) * BlockSize + static_cast(threadIdx.x); - // a trick to - if (!gtid) { - *(d_block_sum - 1) = 0; - } - if (gtid < len) { - IndexType in = d_input ? static_cast(d_input[gtid]) : static_cast(gtid); - this_thread_sum = static_cast(op(in)); - } - __syncthreads(); - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - BlockScan(temp_storage).ExclusiveSum(this_thread_sum, this_thread_sum, this_block_sum); - __syncthreads(); - if (tid == 0) { - d_block_sum[bid] = static_cast(this_block_sum); - } - if (gtid < len) { - d_offset[gtid] = this_thread_sum; - } -} -template -__global__ void post_select_if(const T *d_input, const unsigned short *d_offset, - const IndexType *d_block_offset, size_t len, SelectOp Op, T *output, - T *d_num_selected_out) { - int64_t global_index = 0; - __shared__ IndexType src_data[BlockSize]; - - unsigned int tid = threadIdx.x; - unsigned int bid = blockIdx.x; - size_t gtid = static_cast(blockIdx.x) * BlockSize + static_cast(threadIdx.x); - if (gtid < len) { - // d_offset + d_block_offset to get the global index - global_index = static_cast(d_block_offset[bid] + static_cast(d_offset[gtid])); - // vectorized load - IndexType in = d_input ? 
static_cast(d_input[gtid]) : static_cast(gtid); - src_data[tid] = in; - } - __syncthreads(); - // warp divergence - if (gtid < len && Op(src_data[tid])) { - output[global_index] = src_data[tid]; - } - if (!gtid) { - *d_num_selected_out = d_block_offset[gridDim.x]; - } -}; - -} // namespace detail - -template -void If(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT input, T *output, - T *d_num_selected_out, IndexType num_items, SelectOp Op, cudaStream_t stream = 0) { - constexpr unsigned int blocksize = 1024; - unsigned int gridDim = (num_items - 1) / (blocksize) + 1; - using cubCountIt = cub::CountingInputIterator; - const T *input_ptr{nullptr}; - if constexpr (!std::is_same::value) { - input_ptr = reinterpret_cast(input); - } - - if (!d_temp_storage) { - temp_storage_bytes = 0; - temp_storage_bytes += sizeof(IndexType) * (gridDim + 1); - temp_storage_bytes += sizeof(unsigned short) * (num_items); - size_t cub_bytes = 0; - HCTR_LIB_THROW(cub::DeviceScan::InclusiveSum((void *)(nullptr), cub_bytes, - (IndexType *)(nullptr), (IndexType *)(nullptr), - gridDim, stream)); - temp_storage_bytes += cub_bytes; - return; - } - size_t temp_start = reinterpret_cast(d_temp_storage); - IndexType *d_block_sum = reinterpret_cast(temp_start); - temp_start += sizeof(IndexType) * (gridDim + 1); - unsigned short *d_offset = reinterpret_cast(temp_start); - temp_start += sizeof(unsigned short) * (num_items); - size_t cub_bytes = temp_storage_bytes + reinterpret_cast(d_temp_storage) - - reinterpret_cast(temp_start); - detail::pre_select_if<<>>( - input_ptr, d_offset, d_block_sum + 1, (size_t)num_items, Op, d_num_selected_out); - HCTR_LIB_THROW(cub::DeviceScan::InclusiveSum(reinterpret_cast(temp_start), cub_bytes, - d_block_sum + 1, d_block_sum + 1, gridDim, stream)); - detail::post_select_if<<>>( - input_ptr, d_offset, d_block_sum, (size_t)num_items, Op, output, d_num_selected_out); -} - -} // namespace DeviceSelect -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/embeddings/hybrid_embedding/statistics.hpp b/HugeCTR/include/embeddings/hybrid_embedding/statistics.hpp deleted file mode 100644 index 31535b9719..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/statistics.hpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
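// DeviceSelect::If above follows the CUB-style two-phase convention that its own body makes
// explicit: a first call with d_temp_storage == nullptr only reports temp_storage_bytes, and
// a second call performs the selection. A hedged usage sketch; the predicate, the buffer
// names, and the assumption that the deleted select.cuh plus HugeCTR's HCTR_LIB_THROW macro
// are still on the include path are all illustrative, not taken from actual call sites.
#include <cstdint>
#include <cuda_runtime.h>

struct IsEven {
  __host__ __device__ bool operator()(uint32_t x) const { return (x & 1u) == 0; }
};

void select_even(const uint32_t* d_in, uint32_t* d_out, uint32_t* d_num_out,
                 uint32_t num_items, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Phase 1: size query only, no kernels are launched.
  HugeCTR::DeviceSelect::If(nullptr, temp_bytes, d_in, d_out, d_num_out, num_items, IsEven{},
                            stream);
  void* d_temp = nullptr;
  HCTR_LIB_THROW(cudaMalloc(&d_temp, temp_bytes));
  // Phase 2: run the select; d_num_out receives the number of kept elements.
  HugeCTR::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_num_out, num_items, IsEven{},
                            stream);
  HCTR_LIB_THROW(cudaFree(d_temp));
}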
- */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -// depends on : data object -// => allocate(Data data) - -template -struct Statistics { - public: - Statistics() - : num_samples(0), - num_tables(0), - num_instances(0), - num_categories(0), - num_unique_categories(0) {} - ~Statistics() {} - Statistics(dtype num_samples_in, size_t num_tables_in, size_t num_instances_in, - dtype num_categories_in) - : num_samples(num_samples_in), - num_tables(num_tables_in), - num_instances(num_instances_in), - num_categories(num_categories_in), - num_unique_categories(0) { - std::shared_ptr> buf = GeneralBuffer2::create(); - reserve(buf); - buf->allocate(); - } - Statistics(dtype num_samples_in, size_t num_tables_in, size_t num_instances_in, - dtype num_categories_in, std::shared_ptr> buf) - : num_samples(num_samples_in), - num_tables(num_tables_in), - num_instances(num_instances_in), - num_unique_categories(0) { - reserve(buf); - } - Statistics(const Data &data, size_t num_instances_in) - : num_samples(data.batch_size * data.num_iterations * data.table_sizes.size()), - num_tables(data.table_sizes.size()), - num_instances(num_instances_in), - num_categories(std::accumulate(data.table_sizes.begin(), data.table_sizes.end(), (dtype)0)), - num_unique_categories(0) { - std::shared_ptr> buf = GeneralBuffer2::create(); - reserve(buf); - buf->allocate(); - } - Statistics(const Data &data, size_t num_instances_in, - std::shared_ptr> buf) - : num_samples(data.batch_size * data.num_iterations * data.table_sizes.size()), - num_tables(data.table_sizes.size()), - num_instances(num_instances_in), - num_categories(std::accumulate(data.table_sizes.begin(), data.table_sizes.end(), 0)), - num_unique_categories(0) { - reserve(buf); - } - - void reserve(std::shared_ptr> buf) { - buf->reserve({num_samples, 1}, &categories_sorted); - buf->reserve({num_samples, 1}, &counts_sorted); - buf->reserve({num_tables + 1, 1}, &table_offsets); - buf->reserve({num_tables + 1, 1}, &infrequent_model_table_offsets); - buf->reserve({num_instances * (num_tables + 1), 1}, &frequent_model_table_offsets); - reserve_temp_storage(buf); - } - - size_t num_samples; // input - size_t num_tables; - size_t num_instances; - dtype num_categories; - uint32_t num_unique_categories; // to be calculated - - // top categories sorted by count - Tensor2 categories_sorted; - Tensor2 counts_sorted; - Tensor2 table_offsets; // cumulative sum of table_sizes - Tensor2 infrequent_model_table_offsets; - Tensor2 frequent_model_table_offsets; - std::vector> sort_categories_by_count_temp_storages_; - std::vector> calculate_frequent_categories_temp_storages_; - std::vector> calculate_infrequent_categories_temp_storages_; - void reserve_temp_storage(std::shared_ptr> buf); - void sort_categories_by_count(const dtype *samples, size_t num_samples, dtype *categories_sorted, - uint32_t *counts_sorted, uint32_t &num_unique_categories, - cudaStream_t stream); - void sort_categories_by_count(const Tensor2 &samples, cudaStream_t stream); - void calculate_frequent_and_infrequent_categories( - dtype *frequent_categories, dtype *infrequent_categories, dtype *category_location, - const size_t num_frequent, const size_t num_infrequent, cudaStream_t stream); - void calculate_infrequent_model_table_offsets( - std::vector &h_infrequent_model_table_offsets, const dtype *infrequent_categories, - const Tensor2 &category_location, uint32_t global_instance_id, - const dtype num_infrequent, 
cudaStream_t stream); - void calculate_frequent_model_table_offsets(std::vector &h_frequent_model_table_offsets, - const dtype *frequent_categories, - const dtype num_frequent, cudaStream_t stream); - void revoke_temp_storage() { - sort_categories_by_count_temp_storages_.clear(); - calculate_frequent_categories_temp_storages_.clear(); - calculate_infrequent_categories_temp_storages_.clear(); - } -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/update.cuh b/HugeCTR/include/embeddings/hybrid_embedding/update.cuh deleted file mode 100644 index e2ed8acc4f..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/update.cuh +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -namespace { - -template -__global__ void sgd_global_update_kernel(const emtype *__restrict__ gradients, - float *__restrict__ embedding_vectors, - uint32_t embedding_vec_size, - const float *__restrict__ lr_ptr, const float scale) { - int bid = blockIdx.x; // block = one vector - int tid = threadIdx.x; // thread = one element in a vector - - float lr = __ldg(lr_ptr) / scale; - - /// TODO: vectorization possible? 
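 // The statement below is plain SGD applied element-wise: with j = bid * embedding_vec_size
 // + tid, it computes embedding_vectors[j] -= (lr / scale) * convert(gradients[j]); dividing
 // the learning rate by `scale` presumably undoes the gradient/loss scaling applied when
 // emtype is a reduced-precision type.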
- embedding_vectors[bid * embedding_vec_size + tid] -= - lr * TypeConvertFunc::convert(gradients[bid * embedding_vec_size + tid]); -} - -template -__global__ void sgd_atomic_update_kernel(const emtype *__restrict__ gradients, - float *__restrict__ embedding_vectors, - LambdaNum get_num_indices, LambdaIdx get_index, - uint32_t embedding_vec_size, - const float *__restrict__ lr_ptr, const float scale) { - const uint32_t num_indices = get_num_indices(); - - float lr = __ldg(lr_ptr) / scale; - - for (uint32_t i = blockIdx.x; i < num_indices; i += gridDim.x) { - auto index = get_index(i); - - atomicAdd(embedding_vectors + index * embedding_vec_size + threadIdx.x, - -lr * TypeConvertFunc::convert( - gradients[i * embedding_vec_size + threadIdx.x])); - } -} - -} // namespace - -template -void sgd_global_update(const emtype *gradients, float *embedding_vectors, - dtype num_embedding_vectors, uint32_t embedding_vec_size, float *lr_ptr, - float scale, cudaStream_t stream) { - if (num_embedding_vectors < 1) return; - sgd_global_update_kernel<<>>( - gradients, embedding_vectors, embedding_vec_size, lr_ptr, scale); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void sgd_atomic_update(const emtype *gradients, float *embedding_vectors, LambdaNum get_num_indices, - LambdaIdx get_index, uint32_t n_blocks, uint32_t embedding_vec_size, - float *lr_ptr, float scale, cudaStream_t stream) { - // Note: currently taking the number of blocks as an argument but we can also compute it here with - // some heuristics if we think it's better - sgd_atomic_update_kernel<<>>( - gradients, embedding_vectors, get_num_indices, get_index, embedding_vec_size, lr_ptr, scale); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/utils.cuh b/HugeCTR/include/embeddings/hybrid_embedding/utils.cuh deleted file mode 100644 index b62aea92b2..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/utils.cuh +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace HugeCTR { -namespace hybrid_embedding { - -__global__ void offsets_kernel(const uint32_t* indices, uint32_t* indices_offsets, - uint32_t num_instances, uint32_t multiplier); - -__global__ void model_id_kernel(const uint32_t* indices_offsets, uint32_t* src_model_id, - const uint32_t* d_num_elements); - -template -__global__ void modulo_kernel(dtype* buffer, const stype* d_num_elements, dtype divisor); - -} // namespace hybrid_embedding -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/embeddings/hybrid_embedding/utils.hpp b/HugeCTR/include/embeddings/hybrid_embedding/utils.hpp deleted file mode 100644 index 3fdee29842..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/utils.hpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -enum class HybridEmbeddingType { Distributed, Unknown }; -enum class CommunicationType { IB_NVLink_Hier, IB_NVLink, NVLink_SingleNode, Unknown }; -enum class CommunicationDirection { CommunicationForward, CommunicationBackward }; - -template -void download_tensor(std::vector& h_tensor, const Tensor2 tensor, - cudaStream_t stream); - -template -void upload_tensor(const std::vector& h_tensor, Tensor2 tensor, cudaStream_t stream); - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_sparse_embedding.hpp b/HugeCTR/include/embeddings/hybrid_sparse_embedding.hpp deleted file mode 100644 index d35be06b79..0000000000 --- a/HugeCTR/include/embeddings/hybrid_sparse_embedding.hpp +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR::hybrid_embedding; - -namespace HugeCTR { - -struct HybridSparseEmbeddingParams { - size_t train_batch_size; - size_t evaluate_batch_size; - size_t num_iterations_statistics; - size_t max_num_frequent_categories; // max(train_batch_size, eval_batch_size) * # of batches for - // frequent categories - int64_t max_num_infrequent_samples; - double p_dup_max; - size_t embedding_vec_size; - size_t slot_num; // slot number - std::vector slot_size_array; - hybrid_embedding::CommunicationType communication_type; - double max_all_reduce_bandwidth; - double max_all_to_all_bandwidth; - double efficiency_bandwidth_ratio; - hybrid_embedding::HybridEmbeddingType hybrid_embedding_type; - OptParams opt_params; // optimizer params -}; - -/// -/// Interface class for the hybrid embedding to HugeCTR. It is responsible for -/// persistent gpu memory allocation. -/// -template -class HybridSparseEmbedding : public SchedulableEmbeding { - private: - // Embedding models, one instance per frequent and the infrequent embedding - // for each mlp-network in the train session. 
- // - - // data-parallel embedding model - std::vector> frequent_embeddings_single_node_; - std::vector> frequent_embeddings_multi_node_; - - // model-parallel embedding model - std::vector> - infrequent_embeddings_single_node_; - std::vector> infrequent_embeddings_ib_nvlink_; - std::vector> - infrequent_embeddings_ib_nvlink_hier_; - - // Hier A2Av / custom AR impl -#ifdef ENABLE_MPI - std::vector comm_stream_; - IbComm* ib_comm_; - AllReduceInPlaceComm::Handle barrier_handle_; -#endif - std::unique_ptr gpu_barrier_; - - AllReduceInPlaceComm::Handle frequent_embedding_handle_; - Tensors2 d_barrier_store_; - - // model_, data_, calibration_ and statistics_ are replications of the model - // and input data on each gpu. The HybridSparseEmbedding class manages - // it's scope / frees the memory. - std::vector> model_; - std::vector> data_statistics_; - std::vector calibration_; - std::vector> statistics_; - - // added by kefeng - // std::vector pre_alloc_bufs_; - std::vector>> bufs_; - - size_t train_inflight_id_ = 0; /**< Which BatchIndices to use. */ - size_t eval_inflight_id_ = 0; /**< Which BatchIndices to use. */ - HybridSparseEmbeddingParams embedding_params_; - std::shared_ptr resource_manager_; - - Tensors2 train_output_tensors_; /**< The output tensors. */ - Tensors2 evaluate_output_tensors_; /**< The output tensors. */ - template - using BuffPtr = std::shared_ptr>; - std::vector> grouped_wgrad_buff_; - bool grouped_all_reduce_ = false; - - std::vector opt_params_; /**< Optimizer params. */ - - GpuLearningRateSchedulers lr_scheds_; - bool graph_mode_; - - size_t current_train_batch_size_ = - 0; /**< Current batch size (since we need to handle incomplete batch). */ - size_t current_eval_batch_size_ = - 0; /**< Current batch size (since we need to handle incomplete batch). */ - bool current_train_batch_cached_ = false; /**< Used to check if BatchIndices already computed. */ - bool current_eval_batch_cached_ = false; /**< Used to check if BatchIndices already computed. */ - std::vector> train_batch_indices_; /**< Stores indices for Batch. */ - std::vector> eval_batch_indices_; /**< Stores indices for Batch. */ - - // TODO: this parameter is not used by HE at all. 
- // We should be in pursuit of merging SparseEmbeddingHashParams and HybridSparseEmbeddingParams - SparseEmbeddingHashParams dummy_params_; - - FrequentEmbeddingBase& get_frequent_embedding(size_t i) { - if (frequent_embeddings_single_node_.size()) { - return frequent_embeddings_single_node_[i]; - } else { - return frequent_embeddings_multi_node_[i]; - } - } - FrequentEmbeddingData& get_frequent_embedding_data(size_t i) { - if (frequent_embeddings_single_node_.size()) { - return frequent_embeddings_single_node_[i].frequent_data_; - } else { - return frequent_embeddings_multi_node_[i].frequent_data_; - } - } - - InfrequentEmbeddingBase& get_infrequent_embedding(size_t i) { - switch (embedding_params_.communication_type) { - case CommunicationType::NVLink_SingleNode: - return infrequent_embeddings_single_node_[i]; - case CommunicationType::IB_NVLink: - return infrequent_embeddings_ib_nvlink_[i]; - case CommunicationType::IB_NVLink_Hier: - return infrequent_embeddings_ib_nvlink_hier_[i]; - default: - throw std::runtime_error("Unsupported communication type"); - } - } - - protected: - size_t get_batch_size(bool is_train) const { - if (is_train) { - return embedding_params_.train_batch_size; - } else { - return embedding_params_.evaluate_batch_size; - } - } - size_t get_universal_batch_size() const { - return std::max(embedding_params_.train_batch_size, embedding_params_.evaluate_batch_size); - } - size_t get_batch_size_per_gpu(bool is_train) const { - return get_batch_size(is_train) / resource_manager_->get_global_gpu_count(); - } - size_t get_embedding_vec_size() const { return embedding_params_.embedding_vec_size; } - size_t get_slot_num() const { return embedding_params_.slot_num; } - void get_num_instances_per_node(std::vector& num_instances_per_node) { - uint32_t total_gpu_count = resource_manager_->get_global_gpu_count(); - for (uint32_t gid = 0; gid < total_gpu_count; ++gid) { - uint32_t nodeid = resource_manager_->get_process_id_from_gpu_global_id(gid); - num_instances_per_node[nodeid] = num_instances_per_node[nodeid] + 1; - } - return; - } - - GPUResource& get_local_gpu(int i) const { return *resource_manager_->get_local_gpu(i); } - - size_t get_categories_num() { - size_t num_categories = 0; - for (size_t i = 0; i < embedding_params_.slot_size_array.size(); ++i) { - num_categories += embedding_params_.slot_size_array[i]; - } - return num_categories; - } - - public: - HybridSparseEmbedding(const SparseTensors& train_input_tensors, - const SparseTensors& evaluate_input_tensors, - const HybridSparseEmbeddingParams& embedding_params, - const std::vector>& grouped_wgrad_buff, - const GpuLearningRateSchedulers lr_scheds, bool graph_mode, - const std::shared_ptr& resource_manager); - ~HybridSparseEmbedding() = default; - - // TODO: consider to merge it with init_params - void init_model(const SparseTensors& data, size_t& wgrad_offset); - - void setup_buffered_indices(bool is_train, AsyncReader* data_reader); - - void forward(bool is_train) override; - void backward() override; - void update_params() override; - void init_params() override; - void load_parameters(std::string sparse_model) override; - void dump_parameters(std::string sparse_model) const override; - void set_learning_rate(float lr) override; - // TODO: a workaround to enable GPU LR for HE only; need a better way - GpuLearningRateSchedulers get_learning_rate_schedulers() const override; - - size_t get_params_num() const override; - size_t get_vocabulary_size() const override; - size_t get_max_vocabulary_size() const override; 
- - Embedding_t get_embedding_type() const override { return Embedding_t::HybridSparseEmbedding; } - // TODO: implemented the empty virtual functions below and in the corresponding CU file. - void load_parameters(BufferBag& keys, size_t num) override {} - void dump_parameters(BufferBag& keys, size_t* num) const override {} - - void dump_opt_states(std::string sparse_model) override {} - void load_opt_states(std::string read_path) override {} - void reset_optimizer() override {} - void reset() override {} - - const SparseEmbeddingHashParams& get_embedding_params() const override { return dummy_params_; } - void check_overflow() const override {} - void get_forward_results_tf(const bool is_train, const bool on_gpu, - void* const forward_result) override {} - - std::vector get_train_output_tensors() const override; - std::vector get_evaluate_output_tensors() const override; - - cudaError_t update_top_gradients(const bool on_gpu, const void* const top_gradients) override { - throw; - } - - void freeze() override { HCTR_LOG(WARNING, ROOT, "Hybrid embedding cannot be freezed.\n"); } - - void unfreeze() override { - HCTR_LOG(WARNING, ROOT, "Hybrid embedding do not need to be unfreezed.\n"); - } - - bool is_trainable() const override { return true; } - - void assign_input_tensors(bool is_train, size_t batch_size, size_t inflight_id, - bool cached) override; - void index_calculation(bool is_train, int i) override; - void freq_forward(bool is_train, int i, bool is_first_eval_batch) override; - void freq_backward(int i) override; - void freq_update_params(int i) override; - void infreq_model_forward(int i) override; - void infreq_network_forward(bool is_train, int i) override; - void global_barrier(bool is_train, int i) override; - void infreq_network_backward(int i) override; - void infreq_model_backward(int i) override; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp b/HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp deleted file mode 100644 index 675aed0b5d..0000000000 --- a/HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp +++ /dev/null @@ -1,485 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include - -namespace HugeCTR { -/** - * The LocalizedSlotSparseEmbeddingOneHot class inherits from Embedding class, which is the base - * class for implementing all embedding layers. In this class, the slots in the embedding table - * are assigned to a single GPU separately, which are called localized slots. For example, slot-0 on - * GPU-0, slot-1 on GPU-1, slot-2 on GPU-0, slot-3 on GPU-1, etc. This class is very simple to the - * LocalizedSlotSparseEmbeddingHash, but optimized for performance according to the "one-hot" - * feature. 
So, there are several assumptions in this class: 1) The mapping method from keys to - * embedding row_indices is linear, so there is no hashtable in this class; 2) all the features are - * one-hot, while multi-hot is not supported in this class; 3) Implement P2P access in forward prop, - * fused forward_sum+all2all+reorder, so there is no all2all in forward and backward prop, and can - * only support single node. 4) only support SGD optimizer by now. - */ - -template -class LocalizedSlotSparseEmbeddingOneHot : public IEmbedding { - private: - // define tensors - EmbeddingData embedding_data_; - Tensors2 hash_table_value_tensors_; /**< Hash table value. */ - std::vector> value_table_tensors_; - - Tensors2 hash_table_slot_id_tensors_; /**< the tensors for storing slot ids */ - Tensors2 hash_value_index_tensors_; /**< Hash value index. The index is corresponding to - the line number of the value. */ - Tensors2 - embedding_feature_tensors_; /**< the output tensor of the forward(). */ - Tensor2 train_embedding_features_; - Tensor2 evaluate_embedding_features_; - Tensors2 wgrad_tensors_; /**< the input tensor of the backward(). */ - - Tensors2 top_categories_; - std::vector size_top_categories_; - - size_t max_vocabulary_size_; - size_t max_vocabulary_size_per_gpu_; /**< Max vocabulary size for each GPU. */ - std::vector slot_num_per_gpu_; /* slot_num per GPU */ - std::vector slot_size_array_; - - SparseEmbeddingFunctors functors_; - - Tensors2 all2all_tensors_; /**< the temple buffer to store all2all results */ - Tensors2 utest_all2all_tensors_; - Tensors2 utest_reorder_tensors_; - Tensors2 utest_backward_temp_tensors_; - Tensors2 utest_forward_temp_tensors_; - - Tensors2 mapping_offsets_per_gpu_tensors_; - - Tensor2 &get_embedding_features(bool is_train) { - if (is_train) { - return train_embedding_features_; - } else { - return evaluate_embedding_features_; - } - } - - /** - * Calculate the max vocabulary size per GPU. - * @param total_gpu_count total GPU count. - * @param local_gpu_count local GPU count. - * @param slot_sizes an array which stores the size of the slots to be initialized. - * @param device_resources GPU device resources. - */ - static size_t cal_max_voc_size_per_gpu(const std::vector slot_sizes, - const ResourceManager &resource_manager) { - size_t local_gpu_count = resource_manager.get_local_gpu_count(); - size_t total_gpu_count = resource_manager.get_global_gpu_count(); - - size_t max_voc_size = 0; - for (size_t id = 0; id < local_gpu_count; id++) { - size_t global_id = resource_manager.get_local_gpu(id)->get_global_id(); - - size_t total_size = 0; - for (size_t i = 0; i < slot_sizes.size(); i++) { - if ((i % total_gpu_count) == global_id) { - total_size += slot_sizes[i]; - } - } - - if (total_size > max_voc_size) { - max_voc_size = total_size; - } - } - - return max_voc_size; - } - - /** - * Initialize the hash table and embedding table on local GPUs. This function is only used - * by LocalizedSparseEmbeddingHash. - * @param slot_sizes an array which stores the size of the slots to be initialized. - * @param embedding_vec_size embedding vector size. - * @param hash_table_value_tensors embedding table tensors. - * @param hash_table_slot_id_tensors slot ids tensors. - */ - void init_embedding(const std::vector slot_sizes, size_t embedding_vec_size, - std::vector> &hash_table_value_tensors, - Tensors2 &hash_table_slot_id_tensors); - - /** - * load_parameters() for LocalizedSlotSparseEmbeddingOnehot - * @param keys the memory buffer storing keys. 
- * @param slot_id the memory buffer storing slot_id. - * @param embeddings the memory buffer storing embedding vectors. - * @param num the number of unique keys (embedding vectors) in keys (embeddings). - * @param embedding_vec_size embedding vector size. - * @param hash_table_value_tensors the hash table value on multi GPUs. - * @param slot_sizes the size for each slot - * @param mapping_offsets_per_gpu_tensors the mapping offset of each slot on every GPU - */ - void load_parameters(const Tensor2 &keys, const Tensor2 &slot_id, - const Tensor2 &embeddings, size_t num, size_t embedding_vec_size, - Tensors2 &hash_table_value_tensors, - const std::vector &slot_sizes, - const Tensors2 &mapping_offsets_per_gpu_tensors); - - /** - * dump_parameters for LocalizedSlotSparseEmbeddingOnehot. - * @param sparse_model the folder name of sparse model. - * @param embedding_vec_size embedding vector size. - * @param hash_table_value_tensors the hash table value on multi-GPU. - * @param slot_sizes the size for each slot - */ - void dump_parameters(const std::string &sparse_model, size_t embedding_vec_size, - const Tensors2 &hash_table_value_tensors, - const std::vector &slot_sizes) const; - - /** - * dump_parameters for LocalizedSlotSparseEmbeddingOnehot. - * @param keys the memory buffer to store keys. - * @param slot_id the memory buffer to store slot_id. - * @param embeddings the memory buffer to store embedding vectors. - * @param num pointer to store the number of unique keys (embedding vectors). - * @param embedding_vec_size embedding vector size. - * @param hash_table_value_tensors the hash table value on multi-GPU. - * @param slot_sizes the size for each slot - */ - void dump_parameters(Tensor2 &keys, Tensor2 &slot_id, - Tensor2 &embeddings, size_t *num, size_t embedding_vec_size, - const Tensors2 &hash_table_value_tensors, - const std::vector &slot_sizes) const; - - public: - /** - * The constructor of LocalizedSlotSparseEmbeddingOneHot. - * @param row_offsets_tensors row offsets of the input tensor(refer to row offset vector in sparse - * matrix CSR format). - * @param hash_key_tensors hash keys of the input tensor(refer to value vector in sparse matrix - * CSR format). - * @param embedding_params embedding params for initialization. - * @param resource_manager the GPU resource group - */ - LocalizedSlotSparseEmbeddingOneHot(const Tensors2 &train_row_offsets_tensors, - const Tensors2 &train_value_tensors, - const std::vector> &train_nnz_array, - const Tensors2 &evaluate_row_offsets_tensors, - const Tensors2 &evaluate_value_tensors, - const std::vector> &evaluate_nnz_array, - const SparseEmbeddingHashParams &embedding_params, - const std::shared_ptr &resource_manager); - - LocalizedSlotSparseEmbeddingOneHot(const SparseTensors &train_keys, - const SparseTensors &evaluate_keys, - const SparseEmbeddingHashParams &embedding_params, - const std::shared_ptr &resource_manager); - - void filter_keys_per_gpu(bool is_train, size_t id, size_t global_id, size_t global_num); - - void data_to_unique_categories_per_gpu(bool is_train, size_t id); - /** - * The forward propagation of embedding layer. 
- */ - void forward(bool is_train) override { - CudaDeviceContext context; - -#pragma omp parallel for num_threads(embedding_data_.get_resource_manager().get_local_gpu_count()) - for (size_t i = 0; i < embedding_data_.get_resource_manager().get_local_gpu_count(); i++) { - context.set_device( - embedding_data_.get_local_gpu(i).get_device_id()); // set device - // for forward_fuse method - if (embedding_data_.embedding_params_.do_unique_key_flag) { - data_to_unique_categories_per_gpu(is_train, i); - } - if (embedding_data_.embedding_params_.is_data_parallel) { - filter_keys_per_gpu(is_train, i, embedding_data_.get_local_gpu(i).get_global_id(), - embedding_data_.get_resource_manager().get_global_gpu_count()); - } - functors_.forward_mapping_per_gpu( - embedding_data_.embedding_params_.get_batch_size(is_train), slot_num_per_gpu_[i], - embedding_data_.get_value_tensors(is_train)[i], - *embedding_data_.get_nnz_array(is_train)[i], mapping_offsets_per_gpu_tensors_[i], - hash_value_index_tensors_[i], embedding_data_.get_local_gpu(i).get_stream()); - - // fuse forward+all2all+reorder into one kernel - functors_.forward_fuse_per_gpu( - i, embedding_data_.get_resource_manager().get_local_gpu_count(), - embedding_data_.embedding_params_.get_batch_size(is_train), - embedding_data_.get_batch_size_per_gpu(is_train), - embedding_data_.embedding_params_.slot_num, slot_num_per_gpu_[i], - embedding_data_.embedding_params_.embedding_vec_size, - embedding_data_.embedding_params_.combiner, - embedding_data_.get_row_offsets_tensors(is_train)[i], hash_value_index_tensors_[i], - hash_table_value_tensors_[i], get_embedding_features(is_train), - embedding_data_.get_local_gpu(i).get_sm_count(), - embedding_data_.get_local_gpu(i).get_stream()); - } - - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - return; - } - - /** - * The first stage of backward propagation of embedding layer, - * which computes the wgrad by the dgrad from the top layer. - */ - void backward() override { - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - CudaDeviceContext context; - for (size_t i = 0; i < embedding_data_.get_resource_manager().get_local_gpu_count(); i++) { - context.set_device(embedding_data_.get_local_gpu(i).get_device_id()); - - functors_.backward_fuse_per_gpu( - i, embedding_data_.get_resource_manager().get_local_gpu_count(), - embedding_data_.embedding_params_.get_batch_size(true), - embedding_data_.get_batch_size_per_gpu(true), embedding_data_.embedding_params_.slot_num, - slot_num_per_gpu_[i], embedding_data_.embedding_params_.embedding_vec_size, - embedding_data_.embedding_params_.combiner, get_embedding_features(true), - wgrad_tensors_[i], embedding_data_.get_local_gpu(i).get_sm_count(), - embedding_data_.get_local_gpu(i).get_stream()); - } - - return; - } - - /** - * The second stage of backward propagation of embedding layer, which - * updates the hash table by wgrad(from backward()) and optimizer. 
- */ - void update_params() override { - // accumulate times for adam optimizer - embedding_data_.embedding_params_.opt_params.hyperparams.adam.times++; -#pragma omp parallel num_threads(embedding_data_.get_resource_manager().get_local_gpu_count()) - { - size_t id = omp_get_thread_num(); - CudaDeviceContext context(embedding_data_.get_local_gpu(id).get_device_id()); - - // do update params operation: only support SGD - functors_.update_params( - embedding_data_.embedding_params_.embedding_vec_size, - embedding_data_.embedding_params_.opt_params, *embedding_data_.get_nnz_array(true)[id], - hash_value_index_tensors_[id], wgrad_tensors_[id], hash_table_value_tensors_[id], - top_categories_[id], size_top_categories_[id], - embedding_data_.get_local_gpu(id).get_sm_count(), - embedding_data_.get_local_gpu(id).get_stream()); - } - - return; - } - - /** - * Initialize the embedding table - */ - void init_params() override { - // do hash table value initialization - if (slot_size_array_.size() == embedding_data_.embedding_params_.slot_num) { - init_embedding(slot_size_array_, embedding_data_.embedding_params_.embedding_vec_size, - value_table_tensors_, hash_table_slot_id_tensors_); - } else { - throw std::runtime_error( - std::string("[HCDEBUG][ERROR] Runtime error: the size of slot_sizes != slot_num\n")); - } - } - - /** - * Read the hash table from the weight_stream on the host, and - * upload it onto multi-GPUs global memory. - * @param sparse_model the folder name of sparse model. - */ - void load_parameters(std::string sparse_model) override; - void load_parameters(BufferBag &buf_bag, size_t num) override; - /** - * Download the hash table from multi-GPUs global memory to CPU memory - * and write it to the weight_stream on the host. - * @param sparse_model the folder name of sparse model. - */ - void dump_parameters(std::string sparse_model) const override; - void dump_parameters(BufferBag &buf_bag, size_t *num) const override; - - void dump_opt_states(std::string sparse_model) override {} - void load_opt_states(std::string read_path) override {} - void reset_optimizer() override {} - - /** - * Reset the embedding - */ - void reset() override; - - /** - * Get the total size of hash tables on all GPUs. - */ - size_t get_params_num() const override { - return (max_vocabulary_size_ * embedding_data_.embedding_params_.embedding_vec_size); - } - - size_t get_vocabulary_size() const override { return max_vocabulary_size_; } - - size_t get_max_vocabulary_size() const override { return max_vocabulary_size_; } - - // only used for results check - /** - * Get the forward() results from GPUs and copy them to the host pointer - * embedding_feature. This function is only used for unit test. - * @param embedding_feature the host pointer for storing the forward() - * results. - */ - void get_forward_results(bool is_train, Tensor2 &embedding_feature) { - size_t memcpy_size = embedding_data_.get_batch_size_per_gpu(is_train) * - embedding_data_.embedding_params_.slot_num * - embedding_data_.embedding_params_.embedding_vec_size; - - functors_.get_forward_results(memcpy_size, embedding_data_.get_output_tensors(is_train), - embedding_feature, utest_forward_temp_tensors_, - embedding_data_.get_resource_manager()); - - return; - } - - /** - * Get the forward() results from GPUs and copy them to tensorflow's tensor. 
- */ - void get_forward_results_tf(const bool is_train, const bool on_gpu, - void *const forward_result) override { - size_t memcpy_size = embedding_data_.get_batch_size_per_gpu(is_train) * - embedding_data_.embedding_params_.slot_num * - embedding_data_.embedding_params_.embedding_vec_size; - functors_.get_forward_results(memcpy_size, embedding_data_.get_output_tensors(is_train), - forward_result, utest_forward_temp_tensors_, - embedding_data_.get_resource_manager(), on_gpu); - return; - } - - /** - * Get the backward() results from GPUs and copy them to the host pointer - * wgrad. The wgrad on each GPU should be the same. This function is only - * used for unit test. - * @param wgrad the host pointer for storing the backward() results. - * @param devIndex the GPU device id. - */ - void get_backward_results(Tensor2 &wgrad, int devIndex) { - CudaDeviceContext context(embedding_data_.get_local_gpu(0).get_device_id()); - -#ifndef ENABLE_MPI - if (embedding_data_.get_resource_manager().get_global_gpu_count() > 1) { - functors_.all2all_forward(embedding_data_.get_batch_size_per_gpu(true), slot_num_per_gpu_, - embedding_data_.embedding_params_.embedding_vec_size, - wgrad_tensors_, utest_all2all_tensors_, - embedding_data_.get_resource_manager()); - } else { - HCTR_LIB_THROW(cudaMemcpyAsync( - utest_all2all_tensors_[0].get_ptr(), wgrad_tensors_[0].get_ptr(), - embedding_data_.get_batch_size_per_gpu(true) * slot_num_per_gpu_[0] * - embedding_data_.embedding_params_.embedding_vec_size * sizeof(TypeEmbeddingComp), - cudaMemcpyDeviceToDevice, embedding_data_.get_local_gpu(0).get_stream())); - } -#else - if (embedding_data_.get_resource_manager().get_global_gpu_count() > 1) { - functors_.all2all_forward( - embedding_data_.get_batch_size_per_gpu(true), embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size, wgrad_tensors_, - utest_all2all_tensors_, embedding_data_.get_resource_manager()); - } else { - HCTR_LIB_THROW(cudaMemcpyAsync( - utest_all2all_tensors_[0].get_ptr(), wgrad_tensors_[0].get_ptr(), - embedding_data_.get_batch_size_per_gpu(true) * slot_num_per_gpu_[0] * - embedding_data_.embedding_params_.embedding_vec_size * sizeof(TypeEmbeddingComp), - cudaMemcpyDeviceToDevice, embedding_data_.get_local_gpu(0).get_stream())); - } -#endif - - // reorder - functors_.forward_reorder( - embedding_data_.get_batch_size_per_gpu(true), embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size, utest_all2all_tensors_, - utest_reorder_tensors_, embedding_data_.get_resource_manager()); - - // there are batch_size_per_gpu samples' wgard on each GPU - size_t memcpy_size = embedding_data_.get_batch_size_per_gpu(true) * - embedding_data_.embedding_params_.slot_num * - embedding_data_.embedding_params_.embedding_vec_size; - - // nccl gather - functors_.all_gather(memcpy_size, - utest_reorder_tensors_, // send - utest_backward_temp_tensors_, // recv - embedding_data_.get_resource_manager()); - - // memcpy H2D - functors_.get_backward_results( - devIndex, embedding_data_.get_resource_manager().get_global_gpu_count() * memcpy_size, - utest_backward_temp_tensors_, wgrad, embedding_data_.get_resource_manager()); - - return; - } - - /** - * Get the update_params() results(the hash table, including hash_table_keys - * and hash_table_values) from GPUs and copy them to the host pointers. - * This function is only used for unit test. - * @param hash_table_key the host pointer for storing the hash table keys. 
- * @param hash_table_value the host pointer for storing the hash table values. - */ - void get_update_params_results(Tensor2 &hash_table_key, - Tensor2 &hash_table_value) {} - - void check_overflow() const override {} - - /** only used in tf embedding plugin to distribute top_gradients to each GPUs' output tensor. - */ - cudaError_t update_top_gradients(const bool on_gpu, const void *const top_gradients) override { - auto output_tensors = embedding_data_.get_output_tensors(true); - CudaDeviceContext context; - - const auto top_gradients_internel = reinterpret_cast(top_gradients); - cudaMemcpyKind direction = (on_gpu ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice); - - cudaError_t error = cudaError_t::cudaSuccess; - for (size_t dev_id = 0; dev_id < embedding_data_.get_resource_manager().get_local_gpu_count(); - ++dev_id) { - context.set_device(embedding_data_.get_local_gpu(dev_id).get_device_id()); - - error = cudaMemcpyAsync( - output_tensors[dev_id].get_ptr(), - top_gradients_internel + dev_id * output_tensors[dev_id].get_num_elements(), - output_tensors[dev_id].get_size_in_bytes(), direction, - embedding_data_.get_local_gpu(dev_id).get_stream()); - if (error != cudaError_t::cudaSuccess) return error; - } - - for (size_t dev_id = 0; dev_id < embedding_data_.get_resource_manager().get_local_gpu_count(); - ++dev_id) { - context.set_device(embedding_data_.get_local_gpu(dev_id).get_device_id()); - error = cudaStreamSynchronize(embedding_data_.get_local_gpu(dev_id).get_stream()); - if (error != cudaError_t::cudaSuccess) return error; - } - - return cudaError_t::cudaSuccess; - } - - void freeze() override { embedding_data_.is_trainable_ = false; } - - void unfreeze() override { embedding_data_.is_trainable_ = true; } - - bool is_trainable() const override { return embedding_data_.is_trainable_; } - - USE_EMBEDDING_DATA_FUNCTION(embedding_data_) -}; // end of class LocalizedSlotSparseEmbeddingOneHot - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/sparse_embedding_functors.hpp b/HugeCTR/include/embeddings/sparse_embedding_functors.hpp index e00f00b90f..24b422f963 100644 --- a/HugeCTR/include/embeddings/sparse_embedding_functors.hpp +++ b/HugeCTR/include/embeddings/sparse_embedding_functors.hpp @@ -284,47 +284,6 @@ class SparseEmbeddingFunctors { const Tensor2 &embedding_features, Tensor2 &wgrad, size_t sm, cudaStream_t stream); - /** - * update_params for LocalizedSlotSparseEmbeddingOneHot. - * overload for fp16. Only support atomic SGD currently. - * The second step of backward propagation: update embedding tables(weights) - * @param stream cuda stream corresponding to the current GPU. - * @param embedding_vec_size embedding vector size. - * @param opt_params optimizer params. - * @param nnz non-zero feature number in one batch - * @param hash_value_index the pointer of hash value_index - * @param wgrad the pointer of wgrad - * @param hash_table_value the pointer of hash table value, which will be updated - */ - template - void update_params(size_t embedding_vec_size, const OptParams &opt_params, size_t nnz, - const Tensor2 &hash_value_index, - const Tensor2 &wgrad, Tensor2 &hash_table_value, - Tensor2 &top_categories, size_t &size_top_categories, size_t sm_count, - cudaStream_t stream, bool force_stats = false); - - /** - * Atomic cached sgd update. 
- * - * @param num_samples number of samples for which to accumulate the gradient - * @param embedding_vec_size size of the embedding vector per category - * @param hash_value_index - * @param lr - * @param scaler - * @param wgrad - * @param hash_table_value - * @param top_categories - * @param size_top_categories - * @param stream - * - */ - template - static void opt_sgd_atomic_cached(size_t num_samples, size_t embedding_vec_size, - const size_t *hash_value_index, float lr, float scaler, - const TypeEmbeddingComp *wgrad, float *hash_table_value, - size_t *top_categories, size_t &size_top_categories, - cudaStream_t stream, bool force_stats = false); - /** * collection communication: reduce_scatter f or DistributedSlotSparseEmbeddingHash * @param recv_count the count of elements will be received. diff --git a/HugeCTR/include/exchange_wgrad.hpp b/HugeCTR/include/exchange_wgrad.hpp index 9bfb1c01c6..3ced5f4aa7 100644 --- a/HugeCTR/include/exchange_wgrad.hpp +++ b/HugeCTR/include/exchange_wgrad.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -43,14 +44,15 @@ class NetworkExchangeWgrad : public ExchangeWgrad { void init_ar_comm(const std::vector& ptr, size_t size) final; void update_embed_wgrad_size(size_t size) final; void allreduce(size_t device_id, cudaStream_t stream); - NetworkExchangeWgrad(const std::shared_ptr& resource_manager); + NetworkExchangeWgrad(const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager); ~NetworkExchangeWgrad() = default; private: // TODO remove them after hybrid embedding is deprecated BuffPtrs network_wgrad_buffs_; BuffPtrs null_wgrad_buffs_; - std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; AllReduceInPlaceComm::Handle ar_handle_; @@ -67,7 +69,8 @@ class GroupedExchangeWgrad : public ExchangeWgrad { void init_ar_comm(const std::vector& ptr, size_t size) final; void update_embed_wgrad_size(size_t size) final; void allreduce(size_t device_id, cudaStream_t stream); - GroupedExchangeWgrad(const std::shared_ptr& resource_manager); + GroupedExchangeWgrad(const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager); ~GroupedExchangeWgrad() = default; private: @@ -75,7 +78,7 @@ class GroupedExchangeWgrad : public ExchangeWgrad { BuffPtrs network_wgrad_buffs_; BuffPtrs embed_wgrad_buffs_; - std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; AllReduceInPlaceComm::Handle ar_handle_; diff --git a/HugeCTR/include/parser.hpp b/HugeCTR/include/parser.hpp index 51b4e71c50..561b61174d 100644 --- a/HugeCTR/include/parser.hpp +++ b/HugeCTR/include/parser.hpp @@ -199,9 +199,7 @@ const std::map LAYER_TYPE_MAP_MP = { {"SequenceMask", Layer_t::SequenceMask}}; const std::map EMBEDDING_TYPE_MAP = { {"DistributedSlotSparseEmbeddingHash", Embedding_t::DistributedSlotSparseEmbeddingHash}, - {"LocalizedSlotSparseEmbeddingHash", Embedding_t::LocalizedSlotSparseEmbeddingHash}, - {"LocalizedSlotSparseEmbeddingOneHot", Embedding_t::LocalizedSlotSparseEmbeddingOneHot}, - {"HybridSparseEmbedding", Embedding_t::HybridSparseEmbedding}}; + {"LocalizedSlotSparseEmbeddingHash", Embedding_t::LocalizedSlotSparseEmbeddingHash}}; const std::map INITIALIZER_TYPE_MAP = { {"Uniform", Initializer_t::Uniform}, {"XavierNorm", Initializer_t::XavierNorm}, @@ -244,15 +242,6 @@ static const std::map ALIGNED_TYPE_MAP = { {"None", Alignment_t::None}, }; -static const std::map COMMUNICATION_TYPE_MAP = { - {"IB_NVLink_Hierarchical", hybrid_embedding::CommunicationType::IB_NVLink_Hier}, - 
{"IB_NVLink", hybrid_embedding::CommunicationType::IB_NVLink}, - {"NVLink_SingleNode", hybrid_embedding::CommunicationType::NVLink_SingleNode}}; - -static const std::map - HYBRID_EMBEDDING_TYPE_MAP = { - {"Distributed", hybrid_embedding::HybridEmbeddingType::Distributed}}; - inline bool has_key_(const nlohmann::json& j_in, const std::string& key_in) { if (j_in.find(key_in) == j_in.end()) { return false; diff --git a/HugeCTR/include/pybind/common_wrapper.hpp b/HugeCTR/include/pybind/common_wrapper.hpp index 9bca4c31ef..5885e7d579 100644 --- a/HugeCTR/include/pybind/common_wrapper.hpp +++ b/HugeCTR/include/pybind/common_wrapper.hpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -59,10 +58,6 @@ void CommonPybind(pybind11::module& m) { .value("Sum", HugeCTR::Check_t::Sum) .value("Non", HugeCTR::Check_t::None) .export_values(); - pybind11::enum_(m, "DataReaderSparse_t") - .value("Distributed", HugeCTR::DataReaderSparse_t::Distributed) - .value("Localized", HugeCTR::DataReaderSparse_t::Localized) - .export_values(); pybind11::enum_(m, "DataReaderType_t") .value("Norm", HugeCTR::DataReaderType_t::Norm) .value("Raw", HugeCTR::DataReaderType_t::Raw) @@ -90,9 +85,6 @@ void CommonPybind(pybind11::module& m) { HugeCTR::Embedding_t::DistributedSlotSparseEmbeddingHash) .value("LocalizedSlotSparseEmbeddingHash", HugeCTR::Embedding_t::LocalizedSlotSparseEmbeddingHash) - .value("LocalizedSlotSparseEmbeddingOneHot", - HugeCTR::Embedding_t::LocalizedSlotSparseEmbeddingOneHot) - .value("HybridSparseEmbedding", HugeCTR::Embedding_t::HybridSparseEmbedding) .export_values(); pybind11::enum_(m, "Initializer_t") .value("Default", HugeCTR::Initializer_t::Default) @@ -157,15 +149,6 @@ void CommonPybind(pybind11::module& m) { pybind11::arg("io_alignment") = 0, pybind11::arg("shuffle"), pybind11::arg("aligned_type") = Alignment_t::None, pybind11::arg("multi_hot_reader") = true, pybind11::arg("is_dense_float") = true); - pybind11::class_(m, "HybridEmbeddingParam") - .def(pybind11::init(), - pybind11::arg("max_num_frequent_categories"), - pybind11::arg("max_num_infrequent_samples"), pybind11::arg("p_dup_max"), - pybind11::arg("max_all_reduce_bandwidth"), pybind11::arg("max_all_to_all_bandwidth"), - pybind11::arg("efficiency_bandwidth_ratio"), pybind11::arg("communication_type"), - pybind11::arg("hybrid_embedding_type")); pybind11::enum_(m, "LrPolicy_t") .value("fixed", HugeCTR::LrPolicy_t::fixed) .export_values(); @@ -218,14 +201,6 @@ void CommonPybind(pybind11::module& m) { .value("OneShot", HugeCTR::AllReduceAlgo::ONESHOT) .value("NCCL", HugeCTR::AllReduceAlgo::NCCL) .export_values(); - pybind11::enum_(m, "HybridEmbeddingType") - .value("Distributed", HugeCTR::hybrid_embedding::HybridEmbeddingType::Distributed) - .export_values(); - pybind11::enum_(m, "CommunicationType") - .value("IB_NVLink_Hier", HugeCTR::hybrid_embedding::CommunicationType::IB_NVLink_Hier) - .value("IB_NVLink", HugeCTR::hybrid_embedding::CommunicationType::IB_NVLink) - .value("NVLink_SingleNode", HugeCTR::hybrid_embedding::CommunicationType::NVLink_SingleNode) - .export_values(); pybind11::enum_(m, "Distribution_t") .value("Uniform", HugeCTR::Distribution_t::Uniform) .value("PowerLaw", HugeCTR::Distribution_t::PowerLaw) diff --git a/HugeCTR/include/pybind/model.hpp b/HugeCTR/include/pybind/model.hpp index def5a93327..3f1c73e34b 100644 --- a/HugeCTR/include/pybind/model.hpp +++ b/HugeCTR/include/pybind/model.hpp @@ -115,13 +115,7 @@ std::set TRAINABLE_LAYERS = { std::map EMBEDDING_TYPE_TO_STRING = { 
{Embedding_t::DistributedSlotSparseEmbeddingHash, "DistributedSlotSparseEmbeddingHash"}, - {Embedding_t::LocalizedSlotSparseEmbeddingHash, "LocalizedSlotSparseEmbeddingHash"}, - {Embedding_t::LocalizedSlotSparseEmbeddingOneHot, "LocalizedSlotSparseEmbeddingOneHot"}, - {Embedding_t::HybridSparseEmbedding, "HybridSparseEmbedding"}}; - -std::map READER_SPARSE_TYPE_TO_STRING = { - {DataReaderSparse_t::Distributed, "DistributedSlot"}, - {DataReaderSparse_t::Localized, "LocalizedSlot"}}; + {Embedding_t::LocalizedSlotSparseEmbeddingHash, "LocalizedSlotSparseEmbeddingHash"}}; std::map INITIALIZER_TYPE_TO_STRING = { {Initializer_t::Uniform, "Uniform"}, @@ -132,14 +126,6 @@ std::map INITIALIZER_TYPE_TO_STRING = { std::map ALLREDUCE_ALGO_TO_STRING = { {AllReduceAlgo::ONESHOT, "OneShot"}, {AllReduceAlgo::NCCL, "NCCL"}}; -std::map HE_COMM_TYPE_TO_STRING = { - {hybrid_embedding::CommunicationType::IB_NVLink_Hier, "IB_NVLink_Hierarchical"}, - {hybrid_embedding::CommunicationType::IB_NVLink, "IB_NVLink"}, - {hybrid_embedding::CommunicationType::NVLink_SingleNode, "NVLink_SingleNode"}}; - -std::map HE_TYPE_TO_STRING = { - {hybrid_embedding::HybridEmbeddingType::Distributed, "Distributed"}}; - std::map FC_POSITION_TO_STRING = { {FcPosition_t::Head, "Head"}, {FcPosition_t::Body, "Body"}, {FcPosition_t::Tail, "Tail"}, {FcPosition_t::Isolated, "Isolated"}, @@ -206,13 +192,12 @@ struct SparseEmbedding { std::string bottom_name; std::vector slot_size_array; std::shared_ptr embedding_opt_params; - HybridEmbeddingParam hybrid_embedding_param; + SparseEmbedding(Embedding_t embedding_type, size_t workspace_size_per_gpu_in_mb, size_t embedding_vec_size, const std::string& combiner_str, std::string sparse_embedding_name, std::string bottom_name, std::vector& slot_size_array, - std::shared_ptr& embedding_opt_params, - const HybridEmbeddingParam& hybrid_embedding_param); + std::shared_ptr& embedding_opt_params); void initialize_max_vocabulary_size_per_gpu(); }; @@ -339,8 +324,7 @@ void add_input(Input& input, DataReaderParams& reader_params, std::vector>& train_tensor_entities_list, std::vector>& evaluate_tensor_entities_list, std::shared_ptr& train_data_reader, - std::shared_ptr& evaluate_data_reader, - std::shared_ptr& init_data_reader, size_t batch_size, + std::shared_ptr& evaluate_data_reader, size_t batch_size, size_t batch_size_eval, bool use_mixed_precision, bool repeat_dataset, bool train_intra_iteration_overlap, size_t num_iterations_statistics, const std::shared_ptr); @@ -352,6 +336,7 @@ void add_sparse_embedding(SparseEmbedding& sparse_embedding, std::vector>& evaluate_tensor_entities_list, std::vector>& embeddings, const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager, size_t batch_size, size_t batch_size_eval, OptParams& embedding_opt_params, std::shared_ptr& exchange_wgrad, bool use_cuda_graph, @@ -608,6 +593,7 @@ class Model final { std::shared_ptr evaluate_data_reader_; /**< data reader for evaluation. */ std::shared_ptr resource_manager_; /**< GPU resources include handles and streams etc.*/ + std::shared_ptr collective_manager_; std::shared_ptr embedding_para_io_; metrics::Metrics metrics_; /**< evaluation metrics. 
*/ @@ -648,33 +634,6 @@ class Model final { size_t number_of_networks() const; - struct GraphScheduler { - private: - volatile size_t* executed_iter; - size_t launched_iter; - - public: - GraphScheduler(std::shared_ptr resource_manager) : launched_iter(0) { - // set up trickling launch - CudaCPUDeviceContext ctx(resource_manager->get_local_gpu(0)->get_device_id()); - HCTR_LIB_THROW(cudaMallocHost((void**)&executed_iter, sizeof(size_t))); - *executed_iter = 0; - } - ~GraphScheduler() { cudaFreeHost(const_cast(executed_iter)); } - void trickling() { - // this function is called by the only thread, hence no need to specify the rank - while (launched_iter > *(executed_iter) + 1) { - usleep(10); - } - launched_iter++; - } - void record_execution(size_t local_rank, cudaStream_t stream) { - // Only rank 0 needs to do the work - if (local_rank == 0) inc_var(executed_iter, stream); - } - }; - std::unique_ptr graph_scheduler_; - struct Graph { // train and eval can be called directly by user bool is_first_train_batch_ = true; @@ -696,22 +655,10 @@ class Model final { bool is_scheduled_datareader() { return (reader_params_.data_reader_type == DataReaderType_t::RawAsync); } - bool is_scheduled_embedding() { - return (embeddings_.size() == 1 && - embeddings_[0]->get_embedding_type() == Embedding_t::HybridSparseEmbedding); - } - template - void create_train_pipeline(std::vector>& networks); - template - void create_evaluate_pipeline(std::vector>& networks); - template - void create_train_network_pipeline(std::vector>& networks); - template + void create_train_network_pipeline(std::vector>& networks); void create_eval_network_pipeline(std::vector>& networks); - template - void create_train_pipeline_with_ebc(std::vector>& networks); - template - void create_evaluate_pipeline_with_ebc(std::vector>& networks); + void create_train_pipeline_with_ebc(std::vector>& networks); + void create_evaluate_pipeline_with_ebc(std::vector>& networks); bool skip_prefetch_in_last_batch(bool is_train); long long read_a_batch(bool is_train); diff --git a/HugeCTR/include/pybind/model_wrapper.hpp b/HugeCTR/include/pybind/model_wrapper.hpp index e048f88de9..9b842300e9 100644 --- a/HugeCTR/include/pybind/model_wrapper.hpp +++ b/HugeCTR/include/pybind/model_wrapper.hpp @@ -70,17 +70,12 @@ void ModelPybind(pybind11::module &m) { pybind11::class_>( m, "SparseEmbedding") .def(pybind11::init &, std::shared_ptr &, - const HybridEmbeddingParam &>(), + std::string, std::vector &, std::shared_ptr &>(), pybind11::arg("embedding_type"), pybind11::arg("workspace_size_per_gpu_in_mb") = 0, pybind11::arg("embedding_vec_size"), pybind11::arg("combiner"), pybind11::arg("sparse_embedding_name"), pybind11::arg("bottom_name"), pybind11::arg("slot_size_array") = std::vector(), - pybind11::arg("optimizer") = std::shared_ptr(new OptParamsPy()), - pybind11::arg("hybrid_embedding_param") = - HybridEmbeddingParam{1, -1, 0.01, 1.3e11, 2.6e11, 1.0, - hybrid_embedding::CommunicationType::NVLink_SingleNode, - hybrid_embedding::HybridEmbeddingType::Distributed}); + pybind11::arg("optimizer") = std::shared_ptr(new OptParamsPy())); pybind11::class_(m, "DenseLayerComputeConfig") .def(pybind11::init(), pybind11::arg("async_wgrad") = false, pybind11::arg("fuse_wb") = false); diff --git a/HugeCTR/include/resource_manager.hpp b/HugeCTR/include/resource_manager.hpp index 491595ae43..cfa0090c66 100644 --- a/HugeCTR/include/resource_manager.hpp +++ b/HugeCTR/include/resource_manager.hpp @@ -32,9 +32,7 @@ namespace HugeCTR { */ class ResourceManager : public 
ResourceManagerBase { public: - static std::shared_ptr create( - const std::vector>& visible_devices, unsigned long long seed, - DeviceMap::Layout layout = DeviceMap::LOCAL_FIRST); + virtual ~ResourceManager() = default; virtual int get_num_process() const = 0; virtual int get_process_id() const = 0; virtual int get_master_process_id() const = 0; @@ -49,14 +47,6 @@ class ResourceManager : public ResourceManagerBase { virtual const std::shared_ptr& get_device_rmm_device_memory_resource(int local_gpu_id) const = 0; - -#ifdef ENABLE_MPI - virtual void init_ib_comm() = 0; - virtual IbComm* get_ib_comm() const = 0; - virtual void set_ready_to_transfer() = 0; -#endif - virtual void set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision) = 0; - virtual AllReduceInPlaceComm* get_ar_comm() const = 0; }; } // namespace HugeCTR diff --git a/HugeCTR/include/resource_manager_base.hpp b/HugeCTR/include/resource_manager_base.hpp index 735eb5e7be..754b1a67bd 100644 --- a/HugeCTR/include/resource_manager_base.hpp +++ b/HugeCTR/include/resource_manager_base.hpp @@ -27,6 +27,7 @@ namespace HugeCTR { */ class ResourceManagerBase { public: + virtual ~ResourceManagerBase() = default; virtual void set_local_gpu(std::shared_ptr gpu_resource, size_t local_gpu_id) = 0; virtual const std::shared_ptr& get_local_gpu(size_t local_gpu_id) const = 0; virtual const std::shared_ptr& get_local_gpu_from_device_id( diff --git a/HugeCTR/include/resource_managers/resource_manager_core.hpp b/HugeCTR/include/resource_managers/resource_manager_core.hpp index 2118f3c89a..438319e194 100644 --- a/HugeCTR/include/resource_managers/resource_manager_core.hpp +++ b/HugeCTR/include/resource_managers/resource_manager_core.hpp @@ -44,8 +44,11 @@ class ResourceManagerCore : public ResourceManager { public: ResourceManagerCore(int num_process, int process_id, DeviceMap&& device_map, unsigned long long seed); - ResourceManagerCore(const ResourceManagerCore&) = delete; - ResourceManagerCore& operator=(const ResourceManagerCore&) = delete; + static std::shared_ptr create( + const std::vector>& visible_devices, unsigned long long seed, + DeviceMap::Layout layout = DeviceMap::LOCAL_FIRST); + + HCTR_DISALLOW_COPY_AND_MOVE(ResourceManagerCore); ~ResourceManagerCore(); // from ResourceManagerBase @@ -111,25 +114,5 @@ class ResourceManagerCore : public ResourceManager { const std::shared_ptr& get_device_rmm_device_memory_resource( int local_gpu_id) const override; - -#ifdef ENABLE_MPI - void init_ib_comm() override { - HCTR_OWN_THROW(Error_t::IllegalCall, "Error: should not be reached"); - } - IbComm* get_ib_comm() const override { - HCTR_OWN_THROW(Error_t::IllegalCall, "Error: should not be reached"); - return nullptr; - } - void set_ready_to_transfer() override { - HCTR_OWN_THROW(Error_t::IllegalCall, "Error: should not be reached"); - } -#endif - void set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision) override { - HCTR_OWN_THROW(Error_t::IllegalCall, "Error: should not be reached"); - } - AllReduceInPlaceComm* get_ar_comm() const override { - HCTR_OWN_THROW(Error_t::IllegalCall, "Error: should not be reached"); - return nullptr; - } }; } // namespace HugeCTR diff --git a/HugeCTR/include/resource_managers/resource_manager_ext.hpp b/HugeCTR/include/resource_managers/resource_manager_ext.hpp deleted file mode 100644 index 1e68af8c5a..0000000000 --- a/HugeCTR/include/resource_managers/resource_manager_ext.hpp +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
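For illustration: with ResourceManagerExt removed below, callers are expected to construct the resource manager through the ResourceManagerCore::create() factory added above. A minimal sketch under stated assumptions only — the include path follows the header location, the factory still returns std::shared_ptr<ResourceManager> as the removed ResourceManager::create() did, and the device list and seed are placeholders:

// Minimal sketch, not part of this patch; include path, device layout and seed
// are assumptions.
#include <resource_managers/resource_manager_core.hpp>

#include <memory>
#include <vector>

std::shared_ptr<HugeCTR::ResourceManager> make_resource_manager() {
  // Single process with two visible GPUs; ResourceManager::create() no longer
  // exists, so the concrete ResourceManagerCore is created directly.
  std::vector<std::vector<int>> visible_devices{{0, 1}};
  return HugeCTR::ResourceManagerCore::create(visible_devices, /*seed=*/0ULL);
}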
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace HugeCTR { - -/** - * @brief GPU resources manager which holds all the resources required by training - * - * An extended GPU Resource manager - */ -class ResourceManagerExt : public ResourceManager { - std::shared_ptr core_; - -#ifdef ENABLE_MPI - std::unique_ptr ib_comm_ = NULL; -#endif - std::shared_ptr ar_comm_ = NULL; - - ResourceManagerExt(std::shared_ptr core) : core_(core) {} - - public: - static std::shared_ptr create( - const std::vector>& visible_devices, unsigned long long seed, - DeviceMap::Layout layout = DeviceMap::LOCAL_FIRST); - - ResourceManagerExt(const ResourceManagerExt&) = delete; - ResourceManagerExt& operator=(const ResourceManagerExt&) = delete; - - // from ResourceManagerBase - void set_local_gpu(std::shared_ptr gpu_resource, size_t local_gpu_id) override { - core_->set_local_gpu(gpu_resource, local_gpu_id); - } - const std::shared_ptr& get_local_gpu(size_t local_gpu_id) const override { - return core_->get_local_gpu(local_gpu_id); - } - const std::shared_ptr& get_local_gpu_from_device_id( - size_t device_id) const override { - return core_->get_local_gpu_from_device_id(device_id); - } - size_t get_local_gpu_count() const override { return core_->get_local_gpu_count(); } - size_t get_global_gpu_count() const override { return core_->get_global_gpu_count(); } - - // from ResourceManager - int get_num_process() const override { return core_->get_num_process(); } - int get_process_id() const override { return core_->get_process_id(); } - int get_master_process_id() const override { return core_->get_master_process_id(); } - bool is_master_process() const override { return core_->is_master_process(); } - - const std::shared_ptr& get_local_cpu() const override { - return core_->get_local_cpu(); - } - - const std::vector>& get_local_gpus() const override { - return core_->get_local_gpus(); - } - - const std::vector& get_local_gpu_device_id_list() const override { - return core_->get_local_gpu_device_id_list(); - } - - int get_process_id_from_gpu_global_id(size_t global_gpu_id) const override { - return core_->get_process_id_from_gpu_global_id(global_gpu_id); - } - - size_t get_gpu_local_id_from_global_id(size_t global_gpu_id) const override { - return core_->get_gpu_local_id_from_global_id(global_gpu_id); - } - - size_t get_gpu_global_id_from_local_id(size_t local_gpu_id) const override { - return core_->get_gpu_global_id_from_local_id(local_gpu_id); - } - - bool p2p_enabled(int src_dev, int dst_dev) const override { - return core_->p2p_enabled(src_dev, dst_dev); - } - bool all_p2p_enabled() const override { return core_->all_p2p_enabled(); } - - DeviceMap::Layout get_device_layout() const override { return core_->get_device_layout(); } - - const std::shared_ptr& get_device_rmm_device_memory_resource( - int local_gpu_id) const override { - return core_->get_device_rmm_device_memory_resource(local_gpu_id); - } - -#ifdef ENABLE_MPI - void init_ib_comm() override; - 
IbComm* get_ib_comm() const override { return ib_comm_.get(); } - void set_ready_to_transfer() override { - if (ib_comm_) ib_comm_->set_ready_to_transfer(); - } -#endif - void set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision) override; - AllReduceInPlaceComm* get_ar_comm() const override { return ar_comm_.get(); } -}; -} // namespace HugeCTR diff --git a/HugeCTR/include/scheduleable.hpp b/HugeCTR/include/scheduleable.hpp index 6633670afa..1ab96638bc 100644 --- a/HugeCTR/include/scheduleable.hpp +++ b/HugeCTR/include/scheduleable.hpp @@ -45,22 +45,4 @@ class SchedulableDataReader : public IDataReader { virtual std::vector get_label_tensor23s() const = 0; virtual std::vector get_dense_tensor23s() const = 0; }; - -class SchedulableEmbeding : public IEmbedding { - public: - virtual ~SchedulableEmbeding() = default; - - virtual void assign_input_tensors(bool is_train, size_t batch_size, size_t inflight_id, - bool cached) = 0; - virtual void index_calculation(bool is_train, int i) = 0; - virtual void freq_forward(bool is_train, int i, bool is_first_eval_batch = true) = 0; - virtual void freq_backward(int i) = 0; - virtual void freq_update_params(int i) = 0; - virtual void infreq_model_forward(int i) = 0; - virtual void infreq_network_forward(bool is_train, int i) = 0; - virtual void global_barrier(bool is_train, int i) = 0; - virtual void infreq_network_backward(int i) = 0; - virtual void infreq_model_backward(int i) = 0; -}; - } // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/src/collectives/collective.cpp b/HugeCTR/src/collectives/collective.cpp new file mode 100644 index 0000000000..a5869a84be --- /dev/null +++ b/HugeCTR/src/collectives/collective.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace HugeCTR { + +#ifdef ENABLE_MPI +void CollectiveManager::init_ib_comm() { + int num_process = core_->get_num_process(); + if (num_process > 1) { + int process_id = core_->get_process_id(); + ib_comm_ = std::make_unique(); + ib_comm_->init(num_process, core_->get_local_gpu_count(), process_id, + core_->get_local_gpu_device_id_list()); + } +} +#endif + +void CollectiveManager::set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision) { + int num_process = core_->get_num_process(); +#ifdef ENABLE_MPI + IbComm* ib_comm_ptr = nullptr; + if (algo == AllReduceAlgo::ONESHOT) { + init_ib_comm(); + ib_comm_ptr = ib_comm_.get(); + } + ar_comm_ = AllReduceInPlaceComm::create(num_process, algo, use_mixed_precision, + core_->get_local_gpus(), ib_comm_ptr); +#else + ar_comm_ = + AllReduceInPlaceComm::create(num_process, algo, use_mixed_precision, core_->get_local_gpus()); +#endif +} + +} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/async_reader/async_reader.cpp b/HugeCTR/src/data_readers/async_reader/async_reader.cpp deleted file mode 100644 index cc83394614..0000000000 --- a/HugeCTR/src/data_readers/async_reader/async_reader.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
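For illustration, a minimal sketch of how the communicator setup that previously lived in ResourceManagerExt might now be driven through CollectiveManager. Only set_ar_comm()/init_ib_comm() above and the AllReduceAlgo values exposed in common_wrapper.hpp are taken from this patch; the constructor and the include path are assumptions, since collective.hpp itself is not reproduced here:

// Hypothetical wiring; the constructor argument is inferred from the core_
// member used in set_ar_comm()/init_ib_comm() above and is not confirmed by
// this excerpt.
#include <collectives/collective.hpp>

#include <memory>

std::shared_ptr<HugeCTR::CollectiveManager> setup_collectives(
    const std::shared_ptr<HugeCTR::ResourceManager>& resource_manager) {
  auto collective_manager =
      std::make_shared<HugeCTR::CollectiveManager>(resource_manager);
  // set_ar_comm() builds the in-place all-reduce comm; with ENABLE_MPI and the
  // ONESHOT algorithm it first brings up IbComm via init_ib_comm().
  collective_manager->set_ar_comm(HugeCTR::AllReduceAlgo::ONESHOT,
                                  /*use_mixed_precision=*/true);
  // The resulting manager is what NetworkExchangeWgrad / GroupedExchangeWgrad
  // now receive alongside the ResourceManager (see the exchange_wgrad.hpp hunk
  // earlier in this patch).
  return collective_manager;
}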
- */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -AsyncReaderImpl::AsyncReaderImpl(std::string fname, size_t batch_size_bytes, - const ResourceManager* resource_manager, int num_threads, - int num_batches_per_thread, size_t io_block_size, int io_depth, - int io_alignment, bool shuffle, bool wait_for_gpu_idle) - : - - fname_(fname), - batch_size_bytes_(batch_size_bytes), - resource_manager_(resource_manager), - num_devices_(resource_manager_->get_local_gpu_count()), - num_threads_(num_threads), - num_batches_per_thread_(num_batches_per_thread), - io_block_size_(io_block_size), - io_depth_(io_depth), - io_alignment_(io_alignment), - wait_for_gpu_idle_(wait_for_gpu_idle), - queue_id_(0), - thread_batch_ids_(num_threads_), - thread_buffer_ids_(num_threads_), - gpu_thread_ids_(num_devices_), - local_readers_(num_threads_) { - total_file_size_ = std::filesystem::file_size(fname); - num_batches_ = (total_file_size_ + batch_size_bytes_ - 1) / batch_size_bytes; - batch_ids_.resize(num_batches_); - std::iota(batch_ids_.begin(), batch_ids_.end(), 0); - - if (shuffle) { - std::mt19937 gen(resource_manager_->get_local_cpu()->get_replica_uniform_seed()); - std::shuffle(batch_ids_.begin(), batch_ids_.end(), gen); - } - - // Don't allocate more buffers that number of batches in the file - buffers_.resize(std::min((size_t)num_threads_ * num_batches_per_thread, num_batches_)); - for (auto& buf : buffers_) { - buf = std::make_unique(); - buf->dev_data.resize(num_devices_); - for (int id = 0; id < num_devices_; id++) { - auto device_id = resource_manager_->get_local_gpu(id)->get_device_id(); - CudaDeviceContext ctx(device_id); - HCTR_LIB_THROW(cudaMalloc(&buf->dev_data[id], batch_size_bytes_)); - } - } - - streams_.resize(num_devices_); - for (int id = 0; id < num_devices_; id++) { - auto device_id = resource_manager_->get_local_gpu(id)->get_device_id(); - CudaDeviceContext ctx(device_id); - HCTR_LIB_THROW(cudaStreamCreateWithPriority(&streams_[id], cudaStreamNonBlocking, -100)); - } - HCTR_LIB_THROW(cudaEventCreateWithFlags(&event_success_, cudaEventDisableTiming)); - - // For correct perf benchmarking create the thread readers upfront - create_workers(); -} -// create_workers() will be called only once -void AsyncReaderImpl::create_workers() { - // Use round-robin distribution - for (size_t i = 0; i < num_batches_; i++) { - int thid = i % num_threads_; - thread_batch_ids_[thid].push_back(batch_ids_[i]); - } - - for (auto& id : gpu_thread_ids_) { - id.clear(); - } - for (auto& id : thread_buffer_ids_) { - id.clear(); - } - threads_.reserve(num_threads_); - - for (int thid = 0; thid < num_threads_; thid++) { - int raw_id = thid % num_devices_; - int device_id = resource_manager_->get_local_gpu(raw_id)->get_device_id(); - gpu_thread_ids_.at(raw_id).push_back(thid); - - std::vector thread_buffer_ptrs; - for (int i = 0; i < num_batches_per_thread_; i++) { - size_t buf_id = i * num_threads_ + thid; - if (buf_id < buffers_.size()) { - buffers_[buf_id]->raw_device_id = raw_id; - thread_buffer_ptrs.push_back(buffers_[buf_id].get()); - thread_buffer_ids_.at(thid).push_back(buf_id); - } - } - // Use omp parallel is fine as well? 
- threads_.emplace_back(std::thread([thid, raw_id, device_id, thread_buffer_ptrs, this]() { - CudaCPUDeviceContext ctx(device_id); - - local_readers_[thid] = std::make_unique( - fname_, resource_manager_, batch_size_bytes_, raw_id, streams_[raw_id], - thread_batch_ids_[thid], thread_buffer_ptrs, - ThreadAsyncReaderParameters{io_block_size_, io_alignment_, io_depth_, num_devices_, - wait_for_gpu_idle_, loop_}, - total_file_size_); - })); - } - for (auto& thread : threads_) { - thread.join(); - } - // this clear is important - threads_.clear(); -} - -bool AsyncReaderImpl::is_currently_loading() { return !threads_.empty(); } - -size_t AsyncReaderImpl::get_num_buffers() const { return buffers_.size(); } - -size_t AsyncReaderImpl::get_num_batches() const { return num_batches_; } - -void AsyncReaderImpl::load_async() { - if (is_currently_loading()) { - throw std::runtime_error("load_async() is called before the previous load_async finished!"); - } - - for (int thid = 0; thid < num_threads_; thid++) { - threads_.emplace_back(std::thread([thid, this]() { - int raw_id = thid % num_devices_; - int device_id = resource_manager_->get_local_gpu(raw_id)->get_device_id(); - CudaCPUDeviceContext ctx(device_id); - - local_readers_[thid]->load(); - })); - } -} - -BatchDesc AsyncReaderImpl::get_batch() { - if (!is_currently_loading()) { - throw std::runtime_error( - "Requested a batch from a file that is not being loaded. Please call load_async() first!"); - } - - for (size_t attempt = 0; attempt < buffers_.size(); attempt++) { - last_buffer_ = buffers_[queue_id_].get(); - - auto status = last_buffer_->status.load(); - while (status != BufferStatus::Finished) { - if (status == BufferStatus::ReadReady || status == BufferStatus::PermanentlyResident) { - return {last_buffer_->size, last_buffer_->dev_data, - status == BufferStatus::PermanentlyResident, static_cast(last_buffer_->id)}; - } - if (wait_for_gpu_idle_) { - last_buffer_->ready_to_upload_event.store(&event_success_); - } - - status = last_buffer_->status.load(); - } - queue_id_ = (queue_id_ + 1) % buffers_.size(); - } - - return {0, std::vector(0), false, 0}; -} - -void AsyncReaderImpl::wait_for_gpu_events(const std::vector events) { - if (!wait_for_gpu_idle_) { - return; - } - assert(events.size() == (size_t)num_devices_); - - for (int thid = 0; thid < num_threads_; thid++) { - int raw_id = thid % num_devices_; - wait_for_gpu_event(events[raw_id], raw_id); - } -} - -void AsyncReaderImpl::wait_for_gpu_event(cudaEvent_t* event, int raw_device_id) { - if (!wait_for_gpu_idle_) { - return; - } - - for (auto thid : gpu_thread_ids_.at(raw_device_id)) { - for (auto bufid : thread_buffer_ids_.at(thid)) { - if (buffers_[bufid]->status == BufferStatus::UploadInProcess) { - buffers_[bufid]->ready_to_upload_event.store(event); - } - } - } -} - -void AsyncReaderImpl::finalize_batch() { - // Don't update status of finished or resident buffers - BufferStatus expected = BufferStatus::ReadReady; - last_buffer_->status.compare_exchange_strong(expected, BufferStatus::IOReady); - if (loop_ && last_buffer_->id == (int64_t)num_batches_ - 1) { - queue_id_ = 0; - } else { - queue_id_ = (queue_id_ + 1) % buffers_.size(); - } -} - -void AsyncReaderImpl::finalize_batch(cudaEvent_t* event) { - last_buffer_->safe_to_upload_event.store(event); - finalize_batch(); -} - -int AsyncReaderImpl::get_last_batch_device() { - if (last_buffer_) { - return last_buffer_->raw_device_id; - } else { - return buffers_[queue_id_]->raw_device_id; - } -} - -void AsyncReaderImpl::reset() { - for (auto& 
reader : local_readers_) { - reader->reset(); - } - for (auto& thread : threads_) { - thread.join(); - } - threads_.clear(); - queue_id_ = 0; -} - -AsyncReaderImpl::~AsyncReaderImpl() { - reset(); - cudaEventDestroy(event_success_); -} - -} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/async_reader/async_reader_adapter.cpp b/HugeCTR/src/data_readers/async_reader/async_reader_adapter.cpp deleted file mode 100644 index 73ddb2b0e6..0000000000 --- a/HugeCTR/src/data_readers/async_reader/async_reader_adapter.cpp +++ /dev/null @@ -1,514 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -namespace HugeCTR { -template -AsyncReader::AsyncReader(std::string fname, size_t batch_size, size_t label_dim, - size_t dense_dim, std::vector& params, - bool mixed_precision, - const std::shared_ptr& resource_manager, - int num_threads, int num_batches_per_thread, - size_t io_block_size, int io_depth, int io_alignment, - bool shuffle, bool wait_for_gpu_idle, Alignment_t aligned) - : resource_manager_(resource_manager), - mixed_precision_(mixed_precision), - batch_size_(batch_size), - batch_size_per_dev_(batch_size_ / resource_manager->get_global_gpu_count()), - completion_events_(resource_manager->get_local_gpu_count()), - schedule_events_(resource_manager->get_local_gpu_count()), - split_schedule_events_(resource_manager->get_local_gpu_count()), - d2d_schedule_events_(resource_manager->get_local_gpu_count()), - s3w_streams_(resource_manager->get_local_gpu_count()), - d2d_streams_(resource_manager->get_local_gpu_count()), - cache_buffers_(false) { - assert(batch_size_ % resource_manager_->get_global_gpu_count() == 0); - assert(params.size() == 1); - static_assert(sizeof(LabelType) == sizeof(InputType)); - - int64_t dense_dim_align8 = dense_dim; - if (aligned == Alignment_t::Auto) dense_dim_align8 = (dense_dim + 7) / 8 * 8; - int64_t sparse_dim = params[0].slot_num; - sample_size_items_ = label_dim + dense_dim + sparse_dim; - size_t batch_size_bytes = sample_size_items_ * sizeof(InputType) * batch_size; - - label_dim_ = label_dim; - dense_dim_ = dense_dim_align8; - sparse_dim_ = sparse_dim; - reader_impl_ = std::make_unique( - fname, batch_size_bytes, resource_manager.get(), num_threads, num_batches_per_thread, - io_block_size, io_depth, io_alignment, shuffle, wait_for_gpu_idle); - - for (uint32_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - auto local_gpu = resource_manager_->get_local_gpu(i); - auto gpu_id = local_gpu->get_device_id(); - CudaDeviceContext ctx(gpu_id); - HCTR_LIB_THROW(cudaEventCreateWithFlags(&completion_events_[i], cudaEventDisableTiming)); - HCTR_LIB_THROW(cudaEventCreateWithFlags(&schedule_events_[i], cudaEventDisableTiming)); - HCTR_LIB_THROW(cudaEventCreateWithFlags(&split_schedule_events_[i], cudaEventDisableTiming)); - HCTR_LIB_THROW(cudaEventCreateWithFlags(&d2d_schedule_events_[i], 
cudaEventDisableTiming)); - - // set default stream - s3w_streams_[i] = local_gpu->get_stream(); - d2d_streams_[i] = local_gpu->get_stream(); - int64_t bytes = batch_size_per_dev_ * - (label_dim * sizeof(LabelType) + - dense_dim_align8 * (mixed_precision ? sizeof(__half) : sizeof(float))); - - core23::Tensor one_tensor(core23::TensorParams() - .device(core23::Device(core23::DeviceType::GPU, gpu_id)) - .data_type(core23::ScalarType::Char) - .shape({bytes})); - temp_tensors_.push_back(one_tensor); - - label_tensors_.emplace_back(core23::Tensor::bind( - one_tensor.data(), {batch_size_per_dev_, static_cast(label_dim)}, - core23::ToScalarType::value, core23::Device(core23::DeviceType::GPU, gpu_id))); - - dense_tensors_.emplace_back(core23::Tensor::bind( - one_tensor.data() + batch_size_per_dev_ * label_dim, - {batch_size_per_dev_, dense_dim_align8}, - mixed_precision_ ? core23::ScalarType::Half : core23::ScalarType::Float, - core23::Device(core23::DeviceType::GPU, gpu_id))); - } - - // zero-initialization - for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - const auto local_gpu = resource_manager_->get_local_gpu(i); - CudaDeviceContext ctx(local_gpu->get_device_id()); - core23::zeros_sync(dense_tensors_[i]); - } - - set_tensor_buffering(1); -} - -template -void AsyncReader::set_tensor_buffering(size_t num_batches_to_buffer) { - // If the number of buffers exceeds or is equal to number of batches in our dataset, then we - // may as well cache them so we only execute the 'split_3_way' kernel once. - cache_buffers_ = num_batches_to_buffer >= reader_impl_->get_num_batches(); - init_batch_tensors(num_batches_to_buffer); -} - -template -void AsyncReader::init_batch_tensors(size_t num_inflight) { - inflight_batch_tensors_.resize(num_inflight); - for (auto& batch_tensors : inflight_batch_tensors_) { - batch_tensors.tag = SIZE_MAX; // Invalid - - for (uint32_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - auto local_gpu = resource_manager_->get_local_gpu(i); - auto gpu_id = local_gpu->get_device_id(); - CudaDeviceContext ctx(gpu_id); - int64_t bytes = - batch_size_per_dev_ * (label_dim_ * sizeof(LabelType) + - dense_dim_ * (mixed_precision_ ? sizeof(__half) : sizeof(float))); - - core23::Tensor one_tensor(core23::TensorParams() - .device(core23::Device(core23::DeviceType::GPU, gpu_id)) - .data_type(core23::ScalarType::Char) - .shape({bytes})); - temp_tensors_.push_back(one_tensor); - batch_tensors.label_tensors.push_back(core23::Tensor::bind( - one_tensor.data(), {batch_size_per_dev_, static_cast(label_dim_)}, - core23::ToScalarType::value, core23::Device(core23::DeviceType::GPU, gpu_id))); - - batch_tensors.dense_tensors.emplace_back(core23::Tensor::bind( - one_tensor.data() + batch_size_per_dev_ * label_dim_, - {batch_size_per_dev_, dense_dim_}, - mixed_precision_ ? 
core23::ScalarType::Half : core23::ScalarType::Float, - core23::Device(core23::DeviceType::GPU, gpu_id))); - core23::zeros_sync(batch_tensors.dense_tensors.back()); - auto value_tensor = - core23::Tensor(core23::TensorParams() - .device(core23::Device(core23::DeviceType::GPU, gpu_id)) - .data_type(core23::ToScalarType::value) - .shape({batch_size_, sparse_dim_})); - // needs to allocate memory eagerly - value_tensor.data(); - auto dummy_row_offset_tensor = - core23::Tensor(core23::TensorParams() - .data_type(core23::ToScalarType::value) - .device(core23::Device(core23::DeviceType::GPU, gpu_id)) - .shape({4})); - std::shared_ptr dummy_nnz(new size_t(1)); - batch_tensors.sparse_tensors.emplace_back( - SparseTensor23(value_tensor, dummy_row_offset_tensor, dummy_nnz)); - } - } - current_sparse_tensors_ = inflight_batch_tensors_.at(0).sparse_tensors; -} - -template -long long AsyncReader::read_a_batch_to_device_delay_release() { - auto batch = reader_impl_->get_batch(); - if (batch.size_bytes == 0) { - reader_impl_->reset(); - reader_impl_->load_async(); - batch = reader_impl_->get_batch(); - } - - if (cache_buffers_) { - // TODO: replace with cache policy like LRU when number of batches exceeds what we can store - inflight_id_ = batch.id; - } else { - inflight_id_ = (inflight_id_ + 1) % inflight_batch_tensors_.size(); // FIFO - } - - BatchTensors& batch_tensors = inflight_batch_tensors_.at(inflight_id_); - - size_t current_batch_id = static_cast(batch.id); - current_batch_size_ = batch.size_bytes / (sample_size_items_ * sizeof(InputType)); - current_sparse_tensors_ = batch_tensors.sparse_tensors; - current_batch_cached_ = (current_batch_id == batch_tensors.tag) && cache_buffers_; - - int num_local_gpus = resource_manager_->get_local_gpu_count(); -#pragma omp parallel for num_threads(num_local_gpus) - for (int i = 0; i < num_local_gpus; i++) { - auto local_gpu = resource_manager_->get_local_gpu(i); - auto gpu_id = local_gpu->get_device_id(); - - CudaCPUDeviceContext ctx(gpu_id); - auto global_dev_id = resource_manager_->get_gpu_global_id_from_local_id(i); - - const cudaStream_t& stream = s3w_streams_[i]; - - // schedule at correct place in iteration - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, split_schedule_events_[i])); - - if (!current_batch_cached_) { // data can be cached for eval - - auto ptr_wrap = - std::make_shared(reinterpret_cast(batch.dev_data[i])); - // To save memory we're going to use the space in the Data for the unprocessed - // sparse features, and then run to_unique_categories essentially in place - // auto current_batch_size = batch.size_bytes / (sample_size_items_ * sizeof(dtype)); - // auto in_place_tensor = my_data.samples; - // in_place_tensor.reset_shape({current_batch_size, sparse_dim_}); - if (mixed_precision_) { - split_3_way<__half, SparseType>( - batch_tensors.label_tensors[i], batch_tensors.dense_tensors[i], - batch_tensors.sparse_tensors[i].get_value_tensor(), - core23::Tensor::bind(reinterpret_cast(ptr_wrap->get_ptr()), - {current_batch_size_, static_cast(sample_size_items_)}, - core23::ToScalarType::value, - core23::Device(core23::DeviceType::GPU, gpu_id)), - global_dev_id * batch_size_per_dev_, (global_dev_id + 1) * batch_size_per_dev_, stream); - } else { - split_3_way( - batch_tensors.label_tensors[i], batch_tensors.dense_tensors[i], - batch_tensors.sparse_tensors[i].get_value_tensor(), - core23::Tensor::bind(reinterpret_cast(ptr_wrap->get_ptr()), - {current_batch_size_, static_cast(sample_size_items_)}, - core23::ToScalarType::value, - 
core23::Device(core23::DeviceType::GPU, gpu_id)), - global_dev_id * batch_size_per_dev_, (global_dev_id + 1) * batch_size_per_dev_, stream); - } - } - - auto sparse_ready_event = local_gpu->get_event("sparse_tensors_ready"); - HCTR_LIB_THROW(cudaEventRecord(sparse_ready_event, stream)); - - auto d2d_stream = d2d_streams_[i]; - - // Need result from split-3-way - HCTR_LIB_THROW(cudaStreamWaitEvent(d2d_stream, sparse_ready_event)); - - // we are safe to overwrite - HCTR_LIB_THROW(cudaStreamWaitEvent(d2d_stream, d2d_schedule_events_[i])); - - // batch.dev_data can be reused - HCTR_LIB_THROW(cudaEventRecord(completion_events_[i], d2d_stream)); - - // isn't part of hybrid embedding - assign_dense_and_label_tensors(batch_tensors.label_tensors[i], batch_tensors.dense_tensors[i], - i, d2d_stream); - - auto tensors_ready_event = local_gpu->get_event("bottom_MLP_tensors_ready"); - HCTR_LIB_THROW(cudaEventRecord(tensors_ready_event, d2d_stream)); - } - - batch_tensors.tag = current_batch_id; - return current_batch_size_; -} - -template -void AsyncReader::set_schedule_streams(cudaStream_t s3w_stream, cudaStream_t d2d_stream, - int raw_device_id) { - s3w_streams_[raw_device_id] = s3w_stream; - d2d_streams_[raw_device_id] = d2d_stream; -} - -template -void AsyncReader::assign_dense_and_label_tensors(core23::Tensor& label_tensor, - core23::Tensor& dense_tensor, - int raw_device_id, - cudaStream_t stream) { - auto& dst_label_tensor = label_tensors_[raw_device_id]; - auto& dst_dense_tensor = dense_tensors_[raw_device_id]; - // TODO: allocate tensors together - if ((char*)dst_label_tensor.data() + dst_label_tensor.num_bytes() == - (char*)dst_dense_tensor.data()) { - HCTR_LIB_THROW(cudaMemcpyAsync(dst_label_tensor.data(), label_tensor.data(), - dst_label_tensor.num_bytes() + dense_tensor.num_bytes(), - cudaMemcpyDeviceToDevice, stream)); - } else { - HCTR_LIB_THROW(cudaMemcpyAsync(dst_label_tensor.data(), label_tensor.data(), - dst_label_tensor.num_bytes(), cudaMemcpyDeviceToDevice, stream)); - - HCTR_LIB_THROW(cudaMemcpyAsync(dst_dense_tensor.data(), dense_tensor.data(), - dst_dense_tensor.num_bytes(), cudaMemcpyDeviceToDevice, stream)); - } -} - -template -long long AsyncReader::get_full_batchsize() const { - return batch_size_; -} - -template -void AsyncReader::stream_wait_sparse_tensors(cudaStream_t stream, int raw_device_id, - bool from_graph) { - auto gpu = resource_manager_->get_local_gpu(raw_device_id); - const auto flags = from_graph ? cudaEventWaitExternal : 0; - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, gpu->get_event("sparse_tensors_ready"), flags)); -} - -template -void AsyncReader::stream_wait_dense_tensors(cudaStream_t stream, int raw_device_id, - bool from_graph) { - auto gpu = resource_manager_->get_local_gpu(raw_device_id); - const auto flags = from_graph ? 
cudaEventWaitExternal : 0; - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, gpu->get_event("bottom_MLP_tensors_ready"), flags)); -} - -template -bool AsyncReader::current_batch_incomplete() const { - return current_batch_size_ != batch_size_; -} - -template -void AsyncReader::ready_to_collect() { - auto raw_device_id = reader_impl_->get_last_batch_device(); - auto local_gpu = resource_manager_->get_local_gpu(raw_device_id); - CudaDeviceContext ctx(local_gpu->get_device_id()); - - reader_impl_->finalize_batch(&completion_events_[raw_device_id]); -} - -template -long long AsyncReader::read_a_batch_to_device() { - auto result = read_a_batch_to_device_delay_release(); - ready_to_collect(); - return result; -} - -template -void AsyncReader::schedule_split_3_way_here(cudaStream_t stream, int raw_device_id, - bool from_graph) { - unsigned int flags = from_graph ? cudaEventRecordExternal : 0; - HCTR_LIB_THROW(cudaEventRecordWithFlags(split_schedule_events_[raw_device_id], stream, flags)); -} - -template -void AsyncReader::schedule_d2d_here(cudaStream_t stream, int raw_device_id, - bool from_graph) { - unsigned int flags = from_graph ? cudaEventRecordExternal : 0; - HCTR_LIB_THROW(cudaEventRecordWithFlags(d2d_schedule_events_[raw_device_id], stream, flags)); -} - -template -void AsyncReader::schedule_here(cudaStream_t stream, int raw_device_id) { - HCTR_LIB_THROW(cudaEventRecord(schedule_events_[raw_device_id], stream)); - reader_impl_->wait_for_gpu_event(&schedule_events_[raw_device_id], raw_device_id); -} - -template -void AsyncReader::schedule_here_graph(cudaStream_t stream, int raw_device_id) { - HCTR_LIB_THROW( - cudaEventRecordWithFlags(schedule_events_[raw_device_id], stream, cudaEventRecordExternal)); -} - -template -void AsyncReader::update_schedule_graph(int raw_device_id) { - reader_impl_->wait_for_gpu_event(&schedule_events_[raw_device_id], raw_device_id); -} - -template -size_t AsyncReader::get_max_batches_inflight() const { - return reader_impl_->get_num_buffers(); -} - -template -bool AsyncReader::is_mixed_precision() { - return mixed_precision_; -} - -template -void AsyncReader::get_dimensions(size_t& label_dim, size_t& dense_dim, - size_t& sparse_dim, size_t& sample_size_items) { - label_dim = label_dim_; - dense_dim = dense_dim_; - sparse_dim = sparse_dim_; - sample_size_items = sample_size_items_; -} - -template -long long AsyncReader::get_current_batchsize_per_device(size_t local_id) { - long long batchsize_per_device = batch_size_ / resource_manager_->get_global_gpu_count(); - size_t global_id = resource_manager_->get_gpu_global_id_from_local_id(local_id); - long long remain_samples = current_batch_size_ - global_id * batchsize_per_device; - if (remain_samples >= batchsize_per_device) { - return batchsize_per_device; - } else if (remain_samples > 0) { - return remain_samples; - } else { - return 0; - } -} - -template -TensorScalarType AsyncReader::get_scalar_type() const { - return TensorScalarTypeFunc::get_type(); -}; -template -bool AsyncReader::is_started() const { - return reader_impl_->is_currently_loading(); -} -template -void AsyncReader::start() { - if (!this->is_started()) { - reader_impl_->load_async(); - } -} - -template -std::vector AsyncReader::get_label_tensor23s() const { - return label_tensors_; -} - -template -std::vector AsyncReader::get_dense_tensor23s() const { - return dense_tensors_; -} -// TODO remove after hybrid embedding deprecation -template -SparseTensors AsyncReader::get_value_tensors() const { - SparseTensors tmp_tensors; - // convert from 
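For reference, get_current_batchsize_per_device() above simply clamps each GPU's share of a possibly incomplete batch. A tiny host-side sketch of the same arithmetic, with made-up numbers:

#include <algorithm>
#include <cstdio>

long long batchsize_on_device(long long full_batch, long long current_batch,
                              long long num_gpus, long long global_id) {
  const long long per_device = full_batch / num_gpus;
  const long long remaining = current_batch - global_id * per_device;
  return std::clamp(remaining, 0LL, per_device);
}

int main() {
  // Full batch of 1024 on 8 GPUs (128 each), but the last batch has only 300 samples:
  for (long long id = 0; id < 8; ++id)
    printf("gpu %lld gets %lld samples\n", id, batchsize_on_device(1024, 300, 8, id));
  // prints 128, 128, 44, 0, 0, 0, 0, 0
  return 0;
}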
SparseTensor23 to SparseTensor - // offset is negligible - for (const auto& sparse23 : current_sparse_tensors_) { - core23::Tensor value_tensor = sparse23.get_value_tensor(); - core23::Tensor off_tensor = sparse23.get_rowoffset_tensor(); - auto shape = value_tensor.shape(); - std::vector dimensions(shape.data(), shape.data() + shape.dims()); - - auto value_buffer = PreallocatedBuffer2::create(value_tensor.data(), dimensions); - // dummy row_offset tensor - auto rowoffset_buffer = PreallocatedBuffer2::create(off_tensor.data(), {1}); - - std::shared_ptr> value_tensor2(new Tensor2); - std::shared_ptr> off_tensor2(new Tensor2); - // dummy nnz - SparseTensor current_sparse(dimensions, value_buffer, rowoffset_buffer, - sparse23.get_nnz_ptr(), 1); - tmp_tensors.push_back(current_sparse); - } - return tmp_tensors; -} -template -std::vector AsyncReader::get_value_tensor23s() const { - return current_sparse_tensors_; -} - -// TODO remove after hybrid embedding deprecation -template -std::vector>> -AsyncReader::get_value_tensor_buffers() const { - std::vector>> ret; - // std::vector> tensors; - for (const auto& batch_tensor : inflight_batch_tensors_) { - // std::vector gpu_tensors; - std::vector> gpu_tensors; - for (const auto& sparse23 : batch_tensor.sparse_tensors) { - // gpu_tensors.emplace_back(sparse_tensor); - core23::Tensor value_tensor = sparse23.get_value_tensor(); - core23::Tensor off_tensor = sparse23.get_rowoffset_tensor(); - auto shape = value_tensor.shape(); - std::vector dimensions(shape.data(), shape.data() + shape.dims()); - - auto value_buffer = PreallocatedBuffer2::create(value_tensor.data(), dimensions); - // dummy row_offset tensor - auto rowoffset_buffer = PreallocatedBuffer2::create(off_tensor.data(), {1}); - - std::shared_ptr> value_tensor2(new Tensor2); - std::shared_ptr> off_tensor2(new Tensor2); - // dummy nnz - SparseTensor current_sparse(dimensions, value_buffer, rowoffset_buffer, - sparse23.get_nnz_ptr(), 1); - gpu_tensors.push_back(current_sparse); - } - ret.emplace_back(gpu_tensors); - } - // return tensors; - return ret; -} -template -std::vector> AsyncReader::get_value_tensor_buffer23s() - const { - std::vector> ret; - for (const auto& batch_tensor : inflight_batch_tensors_) { - ret.push_back(batch_tensor.sparse_tensors); - } - return ret; -} - -#ifndef DISABLE_CUDF -template -void AsyncReader::create_drwg_parquet(std::string file_list, - bool strict_order_of_batches, - const std::vector slot_offset, - bool start_reading_from_beginning, - long long max_samples_per_group, - int label_dense_num, int label_dense_dim) {} -#endif -template -void AsyncReader::set_source(std::string file_list) {} - -template -AsyncReader::~AsyncReader() { - // Underlying reader mush be destroyed BEFORE the events - reader_impl_.reset(nullptr); - for (auto& e : completion_events_) { - cudaEventDestroy(e); - } - for (auto& e : schedule_events_) { - cudaEventDestroy(e); - } -} - -template class AsyncReader; -template class AsyncReader; -} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/async_reader/broadcast.cu b/HugeCTR/src/data_readers/async_reader/broadcast.cu deleted file mode 100644 index 71534f95cc..0000000000 --- a/HugeCTR/src/data_readers/async_reader/broadcast.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -namespace HugeCTR { - -constexpr int copy_width = 4; - -namespace { - -inline __device__ float4 read4(const float* src, int n) { - if (n == copy_width) { - return *((float4*)src); - } else { - float4 res; - if (n > 0) res.x = src[0]; - if (n > 1) res.y = src[1]; - if (n > 2) res.z = src[2]; - return res; - } -} - -inline __device__ void write4(float* dst, int n, float4 val) { - if (n == copy_width) { - *((float4*)dst) = val; - } else { - if (n > 0) dst[0] = val.x; - if (n > 1) dst[1] = val.y; - if (n > 2) dst[2] = val.z; - } -} - -__global__ void broadcast_kernel(float** addrs, const bool* p2p_accessible, int batch_size_floats, - int num_dests, int src_id) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int idx4 = idx * copy_width; - int num_elems = min(batch_size_floats - idx4, copy_width); - - float4 src_val = read4(addrs[src_id] + idx4, num_elems); - for (int i = 1; i < num_dests; i++) { - int dst_id = (src_id + i) % num_dests; - if (p2p_accessible[dst_id]) { - write4(addrs[dst_id] + idx4, num_elems, src_val); - } - } -} - -} // namespace - -void broadcast(float** dev_pointers, const bool* dev_p2p_accessible, int batch_size_floats, - int num_dests, int src_id, cudaStream_t stream) { - int block_size = 128; - int grid_size = (batch_size_floats + copy_width * block_size - 1) / block_size; - - constexpr bool use_kernel = false; - - for (int i = 1; i < num_dests; i++) { - int dst_id = (src_id + i) % num_dests; - if (!dev_p2p_accessible[dst_id] || (!use_kernel)) { - HCTR_LIB_THROW(cudaMemcpyAsync(dev_pointers[dst_id], dev_pointers[src_id], - batch_size_floats * sizeof(float), cudaMemcpyDeviceToDevice, - stream)); - } - } - - if (use_kernel) { - broadcast_kernel<<>>(dev_pointers, dev_p2p_accessible, - batch_size_floats, num_dests, src_id); - } -} - -} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/async_reader/split_label_dense_sparse.cu b/HugeCTR/src/data_readers/async_reader/split_label_dense_sparse.cu deleted file mode 100644 index e629e6b9c1..0000000000 --- a/HugeCTR/src/data_readers/async_reader/split_label_dense_sparse.cu +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
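Since use_kernel is hard-coded to false, the deleted broadcast() above effectively replicates the source GPU's staging buffer with plain cudaMemcpyAsync calls. A trimmed sketch of that fallback path follows; the pointer and stream setup are assumed to exist elsewhere, and this is an illustration rather than the original helper.

#include <cuda_runtime.h>
#include <vector>

void broadcast_to_peers(const std::vector<float*>& dev_ptrs,  // one pointer per GPU
                        size_t num_floats, int src_id, cudaStream_t stream) {
  const int num_dests = static_cast<int>(dev_ptrs.size());
  for (int i = 1; i < num_dests; ++i) {
    const int dst_id = (src_id + i) % num_dests;  // same rotation as the deleted code
    // Device-to-device copies work across GPUs when peer access (or the driver's
    // staging fallback) is available.
    cudaMemcpyAsync(dev_ptrs[dst_id], dev_ptrs[src_id], num_floats * sizeof(float),
                    cudaMemcpyDeviceToDevice, stream);
  }
}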
- */ - -#include -#include -#include - -namespace HugeCTR { - -// Sparse pointer should be casted to int* when calling this kernel -template -__global__ void split_kernel_3_way(int batch_size, float* label_ptr, int label_dim, - DenseType* dense_ptr, int dense_dim, int dense_dim_no_align, - SparseType* sparse_ptr, int sparse_dim, - const int* label_dense_sparse, int sample_size_int, - size_t local_idx_start, size_t local_idx_end) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - - if (idx < batch_size * sample_size_int) { - const int in_col = idx % sample_size_int; - const int in_row = idx / sample_size_int; - const int out_row = in_row; - if (in_col < label_dim) { - const int out_col = in_col; - int label = label_dense_sparse[idx]; - if (local_idx_start <= out_row && out_row < local_idx_end) { - label_ptr[(out_row - local_idx_start) * label_dim + out_col] = label; - } - } else if (in_col < label_dim + dense_dim_no_align) { - const int out_col = in_col - label_dim; - int dense = label_dense_sparse[idx]; - if (local_idx_start <= out_row && out_row < local_idx_end) { - dense_ptr[(out_row - local_idx_start) * dense_dim + out_col] = - logf(dense + 1.f); // TODO : FIXME move to data preprocessing - } - } else { - const int out_col = in_col - label_dim - dense_dim_no_align; - sparse_ptr[out_row * sparse_dim + out_col] = label_dense_sparse[idx]; - } - } - return; -} - -template -__global__ void split_kernel_3_way_read4_write4(int batch_size, float* label_ptr, int label_dim, - DenseType* dense_ptr, int dense_dim, - int dense_dim_no_align, int* sparse_ptr, - int sparse_dim, const int* label_dense_sparse, - int sample_size_int, size_t local_idx4_start, - size_t local_idx4_end) { - using DenseType4 = typename std::conditional<(sizeof(DenseType) == 4), int4, int2>::type; - extern __shared__ int label_dense_sparse_s[]; - constexpr int vec_size = sizeof(int4) / sizeof(int); - static_assert(samples_per_cta % vec_size == 0, - "Number of samples per block has to respect divisibility constraints"); - assert(blockDim.x >= 3 * warpSize); - - const int idx_l = threadIdx.x; - const int warp_id = threadIdx.x / warpSize; - const int lane_id = threadIdx.x % warpSize; - - const int my_cta_samples = min(samples_per_cta, batch_size - samples_per_cta * blockIdx.x); - if (my_cta_samples <= 0) { - return; - } - assert(my_cta_samples % vec_size == 0); - - int4* label_dense_sparse_s_align4 = reinterpret_cast(label_dense_sparse_s); - const int4* label_dense_sparse_align4 = reinterpret_cast(label_dense_sparse); - - float* label_s = - reinterpret_cast(label_dense_sparse_s + sample_size_int * samples_per_cta); - DenseType* dense_s = reinterpret_cast(label_s + label_dim * samples_per_cta); - SparseType* sparse_s = reinterpret_cast((int*)dense_s + dense_dim * samples_per_cta); - - // read with int4 - const int src_base = samples_per_cta * sample_size_int / vec_size * blockIdx.x; - for (int id = idx_l; id < my_cta_samples * sample_size_int / vec_size; id += blockDim.x) { - label_dense_sparse_s_align4[id] = label_dense_sparse_align4[src_base + id]; - } - - for (int id = idx_l; id < samples_per_cta * dense_dim; id += blockDim.x) { - dense_s[id] = 0; - } - - __syncthreads(); - - // transpose - for (int id = idx_l; id < samples_per_cta * sample_size_int; id += blockDim.x) { - const int in_col = id % sample_size_int; - const int in_row = id / sample_size_int; - const int out_row = in_row; - if (in_col < label_dim) { - const int out_col = in_col; - label_s[out_row * label_dim + out_col] = label_dense_sparse_s[id]; - } else if 
(in_col < label_dim + dense_dim_no_align) { - const int out_col = in_col - label_dim; - int dense = label_dense_sparse_s[id]; - dense_s[out_row * dense_dim + out_col] = - logf(dense + 1.f); // TODO : FIXME move to data preprocessing - } else { - const int out_col = in_col - label_dim - dense_dim_no_align; - sparse_s[out_row * sparse_dim + out_col] = label_dense_sparse_s[id]; - } - } - __syncthreads(); - - float4* label_s_align4 = reinterpret_cast(label_s); - DenseType4* dense_s_align4 = reinterpret_cast(dense_s); - int4* sparse_s_align4 = reinterpret_cast(sparse_s); - float4* label_align4 = reinterpret_cast(label_ptr); - DenseType4* dense_align4 = reinterpret_cast(dense_ptr); - int4* sparse_align4 = reinterpret_cast(sparse_ptr); - - const int label_size_int4_per_cta = label_dim * samples_per_cta / vec_size; - const int dense_size_int4_per_cta = dense_dim * samples_per_cta / vec_size; - const int sparse_size_int4_per_cta = sparse_dim * samples_per_cta / vec_size; - - if (warp_id == 0) { - for (int id = lane_id; id < label_dim * my_cta_samples / vec_size; id += warpSize) { - size_t local_idx4 = id + blockIdx.x * label_size_int4_per_cta; - if (label_dim * local_idx4_start <= local_idx4 && local_idx4 < label_dim * local_idx4_end) { - label_align4[local_idx4 - label_dim * local_idx4_start] = label_s_align4[id]; - } - } - } - if (warp_id == 1) { - for (int id = lane_id; id < dense_dim * my_cta_samples / vec_size; id += warpSize) { - size_t local_idx4 = id + blockIdx.x * dense_size_int4_per_cta; - if (dense_dim * local_idx4_start <= local_idx4 && local_idx4 < dense_dim * local_idx4_end) { - dense_align4[local_idx4 - dense_dim * local_idx4_start] = dense_s_align4[id]; - } - } - } - if (warp_id == 2) { - for (int id = lane_id; id < sparse_dim * my_cta_samples / vec_size; id += warpSize) { - sparse_align4[id + blockIdx.x * sparse_size_int4_per_cta] = sparse_s_align4[id]; - } - } -} - -template -void split_3_way(core23::Tensor& label_tensor_per_dev, core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, cudaStream_t stream) { - if (label_dense_sparse_buffer.dims() > 0) { - assert(label_tensor_per_dev.size(0) == dense_tensor_per_dev.size(0)); - assert(label_tensor_per_dev.size(0) == local_idx_end - local_idx_start); - - const int batch_size = label_dense_sparse_buffer.size(0); - const int label_dim = label_tensor_per_dev.size(1); - const int dense_dim = dense_tensor_per_dev.size(1); - const int sparse_dim = sparse_tensor.size(1); - const int sample_size_int = label_dense_sparse_buffer.size(1); - cudaPointerAttributes attributes_src, attributes_dst; - - int dense_dim_no_align = sample_size_int - label_dim - sparse_dim; - - constexpr int block_dim = 128; - constexpr int samples_per_cta = 24; - - int vec_width = sizeof(int4) / sizeof(int); - if (sizeof(SparseType) == 4 && batch_size % vec_width == 0 && - local_idx_start % vec_width == 0 && local_idx_end % vec_width == 0 && - samples_per_cta * sample_size_int * sizeof(int) <= 24 * 1024) { - const int grid_dim = (batch_size + samples_per_cta - 1) / samples_per_cta; - const int shmem = 2 * samples_per_cta * (label_dim + dense_dim + sparse_dim) * sizeof(int); - - split_kernel_3_way_read4_write4 - <<>>( - batch_size, label_tensor_per_dev.data(), label_dim, - dense_tensor_per_dev.data(), dense_dim, dense_dim_no_align, - sparse_tensor.data(), sparse_dim, label_dense_sparse_buffer.data(), - sample_size_int, local_idx_start / vec_width, local_idx_end / 
vec_width); - } else { - const int grid_dim = (label_dense_sparse_buffer.num_elements() - 1) / block_dim + 1; - split_kernel_3_way<<>>( - batch_size, label_tensor_per_dev.data(), label_dim, - dense_tensor_per_dev.data(), dense_dim, dense_dim_no_align, - sparse_tensor.data(), sparse_dim, label_dense_sparse_buffer.data(), - sample_size_int, local_idx_start, local_idx_end); - } - - HCTR_LIB_THROW(cudaPeekAtLastError()); - } -} - -template void split_3_way(core23::Tensor& label_tensor_per_dev, - core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, - const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, - cudaStream_t stream); -template void split_3_way<__half, uint32_t>(core23::Tensor& label_tensor_per_dev, - core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, - const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, - cudaStream_t stream); - -template void split_3_way(core23::Tensor& label_tensor_per_dev, - core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, - const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, - cudaStream_t stream); -template void split_3_way<__half, long long>(core23::Tensor& label_tensor_per_dev, - core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, - const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, - cudaStream_t stream); -} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/async_reader/thread_async_reader.cpp b/HugeCTR/src/data_readers/async_reader/thread_async_reader.cpp deleted file mode 100644 index 36c5665517..0000000000 --- a/HugeCTR/src/data_readers/async_reader/thread_async_reader.cpp +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
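As a plain host-side reference, the 3-way split performed by the kernels above can be written as a loop over samples: labels and dense features are kept only for the local device's slice of the batch, dense values get the log(x + 1) transform, and the sparse keys are kept for the full batch. This sketch ignores the alignment padding (it assumes dense_dim equals dense_dim_no_align) and is not part of the library.

#include <cmath>
#include <cstdint>
#include <vector>

void split_3_way_reference(const std::vector<int>& raw,  // batch_size x sample_size ints
                           int batch_size, int label_dim, int dense_dim, int sparse_dim,
                           size_t local_start, size_t local_end,  // this device's rows
                           std::vector<float>& label_out,         // local rows x label_dim
                           std::vector<float>& dense_out,         // local rows x dense_dim
                           std::vector<uint32_t>& sparse_out) {   // batch_size x sparse_dim
  const int sample_size = label_dim + dense_dim + sparse_dim;
  label_out.assign((local_end - local_start) * label_dim, 0.f);
  dense_out.assign((local_end - local_start) * dense_dim, 0.f);
  sparse_out.assign(static_cast<size_t>(batch_size) * sparse_dim, 0);

  for (int row = 0; row < batch_size; ++row) {
    const int* sample = raw.data() + static_cast<size_t>(row) * sample_size;
    const bool local = local_start <= static_cast<size_t>(row) &&
                       static_cast<size_t>(row) < local_end;
    for (int c = 0; c < label_dim; ++c)
      if (local) label_out[(row - local_start) * label_dim + c] = static_cast<float>(sample[c]);
    for (int c = 0; c < dense_dim; ++c)
      if (local)
        dense_out[(row - local_start) * dense_dim + c] =
            std::log(sample[label_dim + c] + 1.f);  // same transform as the kernel
    for (int c = 0; c < sparse_dim; ++c)
      sparse_out[static_cast<size_t>(row) * sparse_dim + c] =
          static_cast<uint32_t>(sample[label_dim + dense_dim + c]);
  }
}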
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -namespace HugeCTR { - -ThreadAsyncReader::ThreadAsyncReader(std::string fname, const ResourceManager* resource_mananager, - size_t batch_size_bytes, int device_id, cudaStream_t stream, - std::vector batch_ids, - std::vector dest_buffers, - ThreadAsyncReaderParameters params, size_t total_file_size) - : batch_size_bytes_(batch_size_bytes), - device_id_(device_id), - stream_(stream), - total_file_size_(total_file_size), - batch_ids_(batch_ids), - dest_buffers_(dest_buffers), - params_(params), - num_buffers_waiting_io_(0) { -#if (__cplusplus >= 201703L) - static_assert(std::atomic::is_always_lock_free && - std::atomic::is_always_lock_free, - "Compiler cannot use atomic enum class, need to change to int type"); -#endif - HCTR_CHECK_HINT(params_.io_block_size % params_.io_alignment == 0, - " params_.io_block_size % params_.io_alignment != 0"); - - num_dest_buffers_ = dest_buffers_.size(); - - fd_ = open(fname.c_str(), O_RDONLY | O_DIRECT); - if (fd_ == -1) { - int errnum = errno; - if (errnum == ENOENT) { - throw std::runtime_error("No such file: " + fname); - } else if (errnum == EINVAL) { - HCTR_LOG(WARNING, ROOT, - "Current filesystem does not support O_DIRECT open(), use " - "general open() instead\n"); - fd_ = open(fname.c_str(), O_RDONLY); - } - if (fd_ == -1) { - throw std::runtime_error("Open " + fname + " fails due to uncertain reason"); - } - }; - - max_num_blocks_per_batch_ = batch_size_bytes_ / params_.io_block_size + 2; - size_t pinned_size = 0; - for (auto buf : dest_buffers_) { - buf->raw_host_ptr = (char*)aligned_alloc(params_.io_alignment, - max_num_blocks_per_batch_ * params_.io_block_size); - HCTR_LIB_THROW( - cudaHostRegister(buf->raw_host_ptr, max_num_blocks_per_batch_ * params_.io_block_size, 0)); - assert((size_t)buf->raw_host_ptr % params_.io_alignment == 0); - - HCTR_LIB_THROW(cudaEventCreateWithFlags(&buf->event, cudaEventDisableTiming)); - - buf->io_reqs.resize(max_num_blocks_per_batch_); - for (auto& req : buf->io_reqs) { - req = new iocb; - } - pinned_size += max_num_blocks_per_batch_ * params_.io_block_size; - } - for (auto buf : dest_buffers_) { - buf->status.store(BufferStatus::IOReady); - } -} - -void ThreadAsyncReader::load() { - size_t num_batches = batch_ids_.size(); - size_t processed = 0; - std::vector id_per_host_buffer(num_dest_buffers_); - std::iota(id_per_host_buffer.begin(), id_per_host_buffer.end(), 0); - - status_.store(WorkerStatus::OK); - for (auto buf : dest_buffers_) { - buf->safe_to_upload_event.store(nullptr); - buf->ready_to_upload_event.store(nullptr); - buf->preload_done = false; - } - - ioctx_ = 0; - if (io_queue_init(params_.io_depth, &ioctx_) < 0) { - HCTR_OWN_THROW(Error_t::UnspecificError, "io_setup failed"); - } - - while (status_.load() != WorkerStatus::Terminate) { - // bool all_resident = true; - // for (auto buf : dest_buffers_) { - // if (buf->status != BufferStatus::PermanentlyResident) { - // all_resident = false; - // break; - // } - // } - // if (all_resident){ - // return; - // } - - for (int i = 0; i < num_dest_buffers_; i++) { - if (id_per_host_buffer[i] < num_batches) { - try_submit_io(batch_ids_[id_per_host_buffer[i]], i); - } - } - wait_io(); - for (int i = 0; i < num_dest_buffers_; i++) { - if (id_per_host_buffer[i] < num_batches) { - try_submit_p2p(dest_buffers_[i]); - } - } - for (int i = 0; i < num_dest_buffers_; i++) { - if (id_per_host_buffer[i] < num_batches) { - try_submit_upload(dest_buffers_[i]); - } - } - for 
(int i = 0; i < num_dest_buffers_; i++) { - if (id_per_host_buffer[i] < num_batches) { - if (check_completion(dest_buffers_[i])) { - processed++; - id_per_host_buffer[i] += num_dest_buffers_; - if (params_.loop && id_per_host_buffer[i] >= num_batches) { - id_per_host_buffer[i] = i; - } - } - } - } - usleep(10); - if (!params_.loop && processed >= num_batches) { - break; - } - } - - if (io_destroy(ioctx_) < 0) { - throw std::runtime_error("io_destroy failed"); - } - - HCTR_LIB_THROW(cudaStreamSynchronize(stream_)); - - if (status_.load() != WorkerStatus::Terminate) { - for (int i = 0; i < num_dest_buffers_; i++) { - BufferStatus expected = BufferStatus::IOReady; - while (!dest_buffers_[i]->status.compare_exchange_weak(expected, BufferStatus::Finished)) { - expected = BufferStatus::IOReady; - } - } - } -} - -void ThreadAsyncReader::try_submit_io(size_t batch_id, int io_id) { - auto& buffer = dest_buffers_[io_id]; - if (buffer->status.load() != BufferStatus::IOReady) { - return; - } - // Maybe we have already loaded this batch before?! - if (buffer->id == (int64_t)batch_id) { - buffer->status.store(BufferStatus::PermanentlyResident); - return; - } - - buffer->status.store(BufferStatus::IOInProcess); - - size_t req_beg_offset = batch_id * batch_size_bytes_; - size_t req_end_offset = std::min((batch_id + 1) * batch_size_bytes_, total_file_size_); - size_t raw_beg_offset = (req_beg_offset / params_.io_block_size) * params_.io_block_size; - size_t raw_end_offset = ((req_end_offset + params_.io_block_size - 1) / params_.io_block_size) * - params_.io_block_size; - size_t num_blocks = (raw_end_offset - raw_beg_offset) / params_.io_block_size; - assert(num_blocks <= (size_t)max_num_blocks_per_batch_); - - buffer->id = batch_id; - buffer->num_outstanding_reqs = num_blocks; - buffer->num_submitted_h2d_chunks = 0; - buffer->num_submitted_broadcasts = 0; - buffer->size = req_end_offset - req_beg_offset; - buffer->host_data = buffer->raw_host_ptr + (req_beg_offset - raw_beg_offset); - assert(buffer->size % sizeof(float) == 0); - - for (size_t block = 0; block < num_blocks; block++) { - auto req = buffer->io_reqs[block]; - - io_prep_pread(req, fd_, buffer->raw_host_ptr + params_.io_block_size * block, - params_.io_block_size, raw_beg_offset + params_.io_block_size * block); - req->data = (void*)buffer; - } - - int ret = io_submit(ioctx_, num_blocks, buffer->io_reqs.data()); - num_buffers_waiting_io_ += 1; - if (ret < 0) { - HCTR_OWN_THROW(Error_t::UnspecificError, "io_submit failed"); - } -} - -void ThreadAsyncReader::wait_io() { - timespec timeout = {0, 10'000l}; - - io_event events[max_num_blocks_per_batch_]; - int num_completed = - io_getevents(ioctx_, max_num_blocks_per_batch_, max_num_blocks_per_batch_, events, &timeout); - if (num_completed < 0) { - HCTR_OWN_THROW(Error_t::UnspecificError, "io_getevents failed"); - } - - for (int b = 0; b < num_completed; b++) { - auto req = events[b].obj; - if ((events[b].res < 0 || events[b].res2 != 0)) { - HCTR_OWN_THROW(Error_t::UnspecificError, "io_getevents returned failed event"); - } - auto buffer = (InternalBatchBuffer*)req->data; - buffer->num_outstanding_reqs--; - assert(buffer->num_outstanding_reqs >= 0); - if (buffer->num_outstanding_reqs == 0) { - num_buffers_waiting_io_ -= 1; - buffer->status.store(BufferStatus::UploadInProcess); - if (params_.wait_for_gpu_idle) { - buffer->ready_to_upload_event.store(nullptr); - } - } - } -} - -bool ThreadAsyncReader::wait_for_gpu_idle(InternalBatchBuffer* buffer) { - if (params_.wait_for_gpu_idle && 
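ThreadAsyncReader above is built on O_DIRECT file access plus Linux libaio (io_queue_init, io_prep_pread, io_submit, io_getevents). A minimal stand-alone example of that I/O pattern is sketched below; the alignment value and file name are assumptions, error handling is reduced to messages, and it links with -laio.

#include <fcntl.h>
#include <libaio.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>

int main() {
  const size_t kAlign = 4096;         // assumed logical block size for O_DIRECT
  const size_t kBytes = 16 * kAlign;  // O_DIRECT also wants an aligned length

  int fd = open("raw_batch.bin", O_RDONLY | O_DIRECT);  // O_DIRECT is a GNU extension
  if (fd < 0) { perror("open"); return 1; }

  void* buf = aligned_alloc(kAlign, kBytes);  // O_DIRECT needs an aligned buffer

  io_context_t ctx = 0;
  if (io_queue_init(8 /* queue depth */, &ctx) < 0) {
    fprintf(stderr, "io_queue_init failed\n"); return 1;
  }

  iocb cb;
  iocb* cbs[1] = {&cb};
  io_prep_pread(&cb, fd, buf, kBytes, /* aligned offset = */ 0);
  if (io_submit(ctx, 1, cbs) != 1) { fprintf(stderr, "io_submit failed\n"); return 1; }

  io_event ev;
  // Block until the one outstanding request completes (no timeout).
  if (io_getevents(ctx, 1, 1, &ev, nullptr) != 1) {
    fprintf(stderr, "io_getevents failed\n"); return 1;
  }
  printf("read %ld bytes\n", static_cast<long>(ev.res));

  io_destroy(ctx);
  free(buf);
  close(fd);
  return 0;
}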
buffer->preload_done) { - auto event_ptr = buffer->ready_to_upload_event.load(); - if (event_ptr == nullptr) { - return false; - } else { - buffer->ready_to_upload_event.store(nullptr); - HCTR_LIB_THROW(cudaStreamWaitEvent(stream_, *event_ptr)); - } - } - return true; -} - -void ThreadAsyncReader::try_submit_upload(InternalBatchBuffer* buffer) { - if (buffer->status.load() != BufferStatus::UploadInProcess || - buffer->num_submitted_h2d_chunks >= params_.num_h2d_chunks) { - return; - } - if (!wait_for_gpu_idle(buffer)) { - return; - } - - // H2D upload - // Wait until the buffers are consumed (one event after a barrier) - if (buffer->num_submitted_h2d_chunks == 0 && buffer->safe_to_upload_event != nullptr) { - HCTR_LIB_THROW(cudaStreamWaitEvent(stream_, *buffer->safe_to_upload_event)); - } - - size_t chunk_size = (buffer->size + params_.num_h2d_chunks - 1) / params_.num_h2d_chunks; - size_t beg_offset = chunk_size * buffer->num_submitted_h2d_chunks; - size_t end_offset = std::min(buffer->size, chunk_size * (buffer->num_submitted_h2d_chunks + 1)); - - HCTR_LIB_THROW(cudaMemcpyAsync(buffer->dev_data[device_id_] + beg_offset, - buffer->host_data + beg_offset, end_offset - beg_offset, - cudaMemcpyHostToDevice, stream_)); - buffer->num_submitted_h2d_chunks++; -} - -void ThreadAsyncReader::try_submit_p2p(InternalBatchBuffer* buffer) { - if (buffer->status.load() != BufferStatus::UploadInProcess || - buffer->num_submitted_h2d_chunks < params_.num_h2d_chunks) { - return; - } - if (!wait_for_gpu_idle(buffer)) { - return; - } - - // Broadcast to the other GPUs - if (buffer->num_submitted_broadcasts != (int)buffer->dev_data.size()) { - if (device_id_ != buffer->num_submitted_broadcasts) { - HCTR_LIB_THROW(cudaMemcpyAsync(buffer->dev_data[buffer->num_submitted_broadcasts], - buffer->dev_data[device_id_], buffer->size, cudaMemcpyDefault, - stream_)); - } - buffer->num_submitted_broadcasts++; - return; - } - - // Here we've submitted everything - // There is no real need to make eventRecord atomic (wrt stream) with the - // rest, - // we only care that eventRecord is AFTER the H2D and the broadcast - buffer->preload_done = true; - buffer->num_submitted_h2d_chunks = 0; - buffer->num_submitted_broadcasts = 0; - HCTR_LIB_THROW(cudaEventRecord(buffer->event, stream_)); - buffer->status.store(BufferStatus::UploadSubmitted); -} - -bool ThreadAsyncReader::check_completion(InternalBatchBuffer* buffer) { - if (buffer->status.load() != BufferStatus::UploadSubmitted) { - return false; - } - - auto res = cudaEventQuery(buffer->event); - if (res == cudaSuccess) { - buffer->status.store(BufferStatus::ReadReady); - return true; - } - if (res == cudaErrorNotReady) { - return false; - } - HCTR_LIB_THROW(res); - return false; -} - -void ThreadAsyncReader::reset() { - status_.store(WorkerStatus::Terminate); - for (auto buf : dest_buffers_) { - buf->status.store(BufferStatus::IOReady); - } -} - -ThreadAsyncReader::~ThreadAsyncReader() = default; - -} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/multi_hot/async_data_reader.cpp b/HugeCTR/src/data_readers/multi_hot/async_data_reader.cpp index 606fe785b1..2a1e90f1c2 100644 --- a/HugeCTR/src/data_readers/multi_hot/async_data_reader.cpp +++ b/HugeCTR/src/data_readers/multi_hot/async_data_reader.cpp @@ -17,9 +17,8 @@ #include #include #include -#include -#include #include +#include #include #include #include diff --git a/HugeCTR/src/embeddings/hybrid_embedding/calibration_data.cu b/HugeCTR/src/embeddings/hybrid_embedding/calibration_data.cu deleted file mode 100644 
index c2e7448433..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/calibration_data.cu +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -namespace calibration_data_kernels { - -template -__global__ void binary_threshold_search(const CountsT *__restrict__ counts, ThresholdT threshold, - IdxT *out_idx, IdxT n_elem) { - if (threadIdx.x == 0) { - IdxT start = 0; - IdxT end = n_elem; - while (start < end) { - IdxT mid = (start + end) / 2; - CountsT count = counts[mid]; - - if (count >= threshold) - start = mid + 1; - else - end = mid; - } - - *out_idx = start; - } -} - -template -__global__ void sum_counts(const CountsT *__restrict__ counts, CountsT *result, size_t n_elem) { - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - CountsT val = 0; - if (tid < n_elem) { - val = counts[tid]; - } - CountsT local_res = blockReduceSum(val); - if (threadIdx.x == 0) { - atomicAdd(result, local_res); - } -} - -} // namespace calibration_data_kernels - -/// -/// interpolate data_size using the two calibration data -/// calibrated_data_size, calibrated_times -/// return communication_times -/// -void CalibrationData::interpolate(const Tensor2 &calibrated_data_size, - const Tensor2 &calibrated_times, - const Tensor2 &data_size, - Tensor2 &communication_times) { - // TODO: implement this -} - -/// -/// Convenience function for interpolating all-to-all communication times from -/// calibrated data -/// -void CalibrationData::interpolate_all_reduce(const Tensor2 &data_size, - Tensor2 &communication_times) { - interpolate(all_reduce_data_size, all_reduce_times, data_size, communication_times); -} - -/// -/// Convenience function for interpolating all-to-all communication times from -/// calibrated data -/// -void CalibrationData::interpolate_all_to_all(const Tensor2 &data_size, - Tensor2 &communication_times) { - interpolate(all_to_all_data_size, all_to_all_times, data_size, communication_times); -} - -// Calculate threshold such that for the worst case distribution there will -// be one duplication per network on average -template -double ModelInitializationFunctors::calculate_threshold( - const CommunicationType communication_type, double p_dup_max, double all_to_all_bandwidth, - double all_reduce_bandwidth, double efficiency_bandwidth_ratio, size_t num_nodes, - size_t batch_size, size_t num_networks, size_t num_iterations, size_t num_tables) { - double count_threshold = 1.; - - // for NVLink capture effectively all duplications with number of categories - double M = (double)batch_size / (double)num_networks; - // double p_dup_max = 1.0 / 100.; // maximum 1 % of samples the category will be duplicated - switch (communication_type) { - case CommunicationType::IB_NVLink: - count_threshold = (double)num_iterations * (double)num_networks * 
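binary_threshold_search above runs a single-thread binary search over counts sorted in descending order and returns how many leading categories meet the threshold. The host-side equivalent with the standard library, on a toy array:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned long> counts_sorted = {900, 512, 300, 120, 42, 7, 7, 1};
  const double threshold = 100.0;

  // The predicate is true for a prefix of a descending array, so partition_point
  // finds the first entry below the threshold.
  auto it = std::partition_point(counts_sorted.begin(), counts_sorted.end(),
                                 [&](unsigned long c) { return c >= threshold; });
  size_t num_frequent = static_cast<size_t>(it - counts_sorted.begin());
  printf("%zu categories meet the threshold\n", num_frequent);  // prints 4
  return 0;
}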
all_to_all_bandwidth / - all_reduce_bandwidth * efficiency_bandwidth_ratio * (double)num_networks / - ((double)num_networks - 1.); - break; - case CommunicationType::IB_NVLink_Hier: - count_threshold = (double)num_iterations * (double)num_networks * all_to_all_bandwidth / - all_reduce_bandwidth * efficiency_bandwidth_ratio * (double)num_nodes / - ((double)num_nodes - 1.); - break; - case CommunicationType::NVLink_SingleNode: - // count threshold such that the probability of duplication is less than p_dup_max - // even if there are batch size number of categories that occur more often, - // there will be a duplication at most once every iteration per gpu - // - // p_duplication(category) \approx 1/2 M (M-1) \left( \frac{count}{batch_size x - // num_iterations} \right)^2 - count_threshold = - (double)batch_size * (double)num_iterations * sqrt(2.0 * p_dup_max / (M * (M - 1))); - break; - default: - HCTR_OWN_THROW(Error_t::WrongInput, - "Unknown communication type, expecting IB_NVLink or NVLink"); - } - - return count_threshold; -} - -/// -/// Calculate the number of frequent categories from data -/// -template -dtype ModelInitializationFunctors::calculate_num_frequent_categories( - const CommunicationType &communication_type, const size_t num_networks, - const CalibrationData &calibration, const Statistics &statistics, - const Data &data, dtype *d_num_frequent, cudaStream_t stream) { - dtype num_frequent; - dtype num_top_categories = (dtype)statistics.num_unique_categories; - - if (calibration.all_to_all_times.get_size_in_bytes() > 0) { - // calibration is given, perform fully optimized hybrid model - HCTR_OWN_THROW(Error_t::WrongInput, - "initialization hybrid model from communication calibration not available yet"); - } else { - size_t num_nodes = calibration.num_nodes; - size_t batch_size = data.batch_size; - size_t num_iterations = data.num_iterations; - size_t num_tables = data.table_sizes.size(); - - // Use threshold to determine number of frequent categories, - // calculates optimal number of frequent categories when the all-to-all - // and all-reduce are both bandwidth limited. 
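The NVLink_SingleNode branch above inverts the duplication estimate p_dup(c) ~= 1/2 * M * (M - 1) * (count / (batch_size * num_iterations))^2 at p_dup = p_dup_max, where M is the per-network batch size. For reference, that branch transcribed as a free function:

#include <cmath>

// Count threshold such that a category's duplication probability stays below
// p_dup_max, following the comment in the deleted NVLink_SingleNode case.
double nvlink_single_node_threshold(double p_dup_max, size_t batch_size,
                                    size_t num_networks, size_t num_iterations) {
  const double M = static_cast<double>(batch_size) / static_cast<double>(num_networks);
  return static_cast<double>(batch_size) * static_cast<double>(num_iterations) *
         std::sqrt(2.0 * p_dup_max / (M * (M - 1.0)));
}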
- double count_threshold = ModelInitializationFunctors::calculate_threshold( - communication_type, calibration.p_dup_max, calibration.max_all_to_all_bandwidth, - calibration.max_all_reduce_bandwidth, calibration.efficiency_bandwidth_ratio, num_nodes, - batch_size, num_networks, num_iterations, num_tables); - - calibration_data_kernels::binary_threshold_search<<<1, 1, 0, stream>>>( - statistics.counts_sorted.get_ptr(), count_threshold, d_num_frequent, - (dtype)num_top_categories); - - HCTR_LIB_THROW(cudaMemcpyAsync(&num_frequent, d_num_frequent, sizeof(dtype), - cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - } - if (num_frequent > 0) { - num_frequent = ((num_frequent - 1) / num_networks + 1) * num_networks; - } - if (num_frequent > num_top_categories) { - num_frequent -= num_networks; - } - return num_frequent; -} - -/// -/// Calculate the number of frequent categories from data -/// -template -double ModelInitializationFunctors::calculate_frequent_probability( - const Statistics &statistics, const dtype num_frequent, uint32_t *d_total_frequent_count, - cudaStream_t stream) { - uint32_t total_frequent_count; - - HCTR_LIB_THROW(cudaMemsetAsync(d_total_frequent_count, 0, sizeof(uint32_t), stream)); - calibration_data_kernels::sum_counts<<>>( - statistics.counts_sorted.get_ptr(), d_total_frequent_count, num_frequent); - HCTR_LIB_THROW(cudaMemcpyAsync(&total_frequent_count, d_total_frequent_count, sizeof(dtype), - cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - return (double)total_frequent_count / (double)statistics.num_samples; -} - -template class ModelInitializationFunctors; -template class ModelInitializationFunctors; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/communication.cu b/HugeCTR/src/embeddings/hybrid_embedding/communication.cu deleted file mode 100644 index 36175747e4..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/communication.cu +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include -#include - -namespace { - -template -ncclDataType_t get_nccl_type(); -template <> -ncclDataType_t get_nccl_type() { - return ncclInt32; -} -template <> -ncclDataType_t get_nccl_type() { - return ncclUint32; -} -template <> -ncclDataType_t get_nccl_type() { - return ncclUint64; -} -template <> -ncclDataType_t get_nccl_type() { - return ncclFloat32; -} -template <> -ncclDataType_t get_nccl_type<__half>() { - return ncclFloat16; -} - -} // namespace - -namespace HugeCTR { - -namespace hybrid_embedding { - -Communication::Communication(size_t width_data_field) : width_data_field_(width_data_field) {} - -/* - * All to All communications - */ -template -AllToAllVComm::AllToAllVComm(Tensor2 send_buffer, Tensor2 recv_buffer, - const uint32_t* send_offsets, const uint32_t* recv_offsets, - const GPUResource* gpu_resource, size_t width_data_field) - : Communication(width_data_field), - send_buffer_(send_buffer), - recv_buffer_(recv_buffer), - send_offsets_(send_offsets), - recv_offsets_(recv_offsets), - gpu_resource_(gpu_resource) {} - -template -void AllToAll_Multi_NCCL::communicate(cudaStream_t stream) { - auto& comm = this->gpu_resource_->get_nccl(); - auto type = get_nccl_type(); - - int num_global_gpus; - HCTR_LIB_THROW(ncclCommCount(comm, &num_global_gpus)); - - HCTR_LIB_THROW(ncclGroupStart()); - for (int i = 0; i < num_global_gpus; i++) { - HCTR_LIB_THROW( - ncclSend(this->send_buffer_.get_ptr() + this->send_offsets_[i] * this->width_data_field_, - (this->send_offsets_[i + 1] - this->send_offsets_[i]) * this->width_data_field_, - type, i, comm, stream)); - HCTR_LIB_THROW( - ncclRecv(this->recv_buffer_.get_ptr() + this->recv_offsets_[i] * this->width_data_field_, - (this->recv_offsets_[i + 1] - this->recv_offsets_[i]) * this->width_data_field_, - type, i, comm, stream)); - } - HCTR_LIB_THROW(ncclGroupEnd()); -} - -/* - * All Reduce communications - */ -template -AllReduceComm::AllReduceComm(AllReduceInPlaceComm* ar_comm, - AllReduceInPlaceComm::Handle ar_handle, - const GPUResource* gpu_resource) - : Communication(0), ar_comm_(ar_comm), ar_handle_(ar_handle), gpu_resource_(gpu_resource) {} - -template -void AllReduceComm::communicate(cudaStream_t stream) { - ar_comm_->all_reduce(ar_handle_, stream, gpu_resource_->get_local_id()); -} - -#ifdef ENABLE_MPI -template -HierAll2Allv_Multi_IB::HierAll2Allv_Multi_IB(uint32_t instance_id, - HierA2AvCollHandle coll_handle, - size_t** send_sizes, - const GPUResource* gpu_resource, - IbComm* ib_comm, cudaStream_t comm_stream) - : Communication(sizeof(commtype)), - instance_id_(instance_id), - coll_handle_(coll_handle), - send_sizes_(send_sizes), - gpu_resource_(gpu_resource), - ib_comm_(ib_comm), - comm_stream_(comm_stream) { - HCTR_LIB_THROW(cudaEventCreate(&comm_event_)); -} - -template -void HierAll2Allv_Multi_IB::update_sizes(cudaStream_t stream) { - ib_comm_->pre_intra_update_a2a_coll_sizes(coll_handle_, send_sizes_, stream, instance_id_); -} - -template -void HierAll2Allv_Multi_IB::communicate(cudaStream_t stream) { - ib_comm_->post_send_command_a2a(coll_handle_, stream, instance_id_); - HCTR_LIB_THROW(cudaEventRecord(comm_event_, comm_stream_)); - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, comm_event_)); - // ib_comm_->wait_global_recv_async(coll_handle_, instance_id_); -} - -template -void HierAll2Allv_Multi_IB::initiate_communication(cudaStream_t stream) { - ib_comm_->post_a2a_send_command(coll_handle_, stream, instance_id_); - HCTR_LIB_THROW(cudaEventRecord(comm_event_, 
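AllToAll_Multi_NCCL::communicate() above uses the standard NCCL all-to-all-v idiom: per-peer ncclSend/ncclRecv pairs fused inside a ncclGroupStart()/ncclGroupEnd() pair. A trimmed sketch with float payloads and no error handling; the offset arrays are assumed to hold num_ranks + 1 element offsets.

#include <nccl.h>

void all_to_all_v(const float* send_buf, float* recv_buf,
                  const size_t* send_offsets,  // num_ranks + 1 entries, in elements
                  const size_t* recv_offsets,  // num_ranks + 1 entries, in elements
                  ncclComm_t comm, cudaStream_t stream) {
  int num_ranks = 0;
  ncclCommCount(comm, &num_ranks);

  ncclGroupStart();
  for (int peer = 0; peer < num_ranks; ++peer) {
    ncclSend(send_buf + send_offsets[peer],
             send_offsets[peer + 1] - send_offsets[peer], ncclFloat32, peer, comm, stream);
    ncclRecv(recv_buf + recv_offsets[peer],
             recv_offsets[peer + 1] - recv_offsets[peer], ncclFloat32, peer, comm, stream);
  }
  ncclGroupEnd();
}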
comm_stream_)); - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, comm_event_)); -} - -template -void HierAll2Allv_Multi_IB::wait_completion(cudaStream_t stream) { - ib_comm_->blocking_wait(coll_handle_, stream, instance_id_); - HCTR_LIB_THROW(cudaEventRecord(comm_event_, comm_stream_)); - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, comm_event_)); - // ib_comm_->wait_global_recv_async(coll_handle_, instance_id_); -} - -template -HierAll2Allv_Multi_IB::~HierAll2Allv_Multi_IB() { - cudaEventDestroy(comm_event_); -} -#endif - -template class AllToAllVComm; -template class AllToAllVComm<__half>; -template class AllReduceComm; -template class AllReduceComm<__half>; - -template class AllToAll_Multi_NCCL; -template class AllToAll_Multi_NCCL<__half>; -#ifdef ENABLE_MPI -template class HierAll2Allv_Multi_IB; -template class HierAll2Allv_Multi_IB<__half>; -#endif - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/data.cu b/HugeCTR/src/embeddings/hybrid_embedding/data.cu deleted file mode 100644 index 9509899d9f..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/data.cu +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -size_t EmbeddingTableFunctors::get_embedding_table_index( - const std::vector& table_sizes, dtype category) { - size_t embedding = 0; - dtype next_offset = (dtype)table_sizes[embedding]; - for (embedding = 0; embedding < table_sizes.size() - 1 && category >= next_offset; ++embedding) - next_offset += table_sizes[embedding + 1]; - return embedding; -} - -template -void EmbeddingTableFunctors::get_embedding_offsets(std::vector& embedding_offsets, - const std::vector& table_sizes) { - const size_t num_tables = table_sizes.size(); - embedding_offsets.resize(num_tables); - dtype embedding_offset = (dtype)0; - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - embedding_offsets[embedding] = embedding_offset; - embedding_offset += static_cast(table_sizes[embedding]); - } -} - -template -dtype EmbeddingTableFunctors::get_num_categories(const std::vector& table_sizes) { - dtype num_categories = (dtype)0; - for (size_t i = 0; i < table_sizes.size(); ++i) - num_categories += static_cast(table_sizes[i]); - return num_categories; -} - -template -__global__ void data_to_unique_categories_kernel(dtype* data, dtype* embedding_offsets, - int num_tables, int num_data, dtype* samples, - int num_valid_data, dtype pad_val) { - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < num_data; - idx += blockDim.x * gridDim.x) { - samples[idx] = - idx >= num_valid_data ? 
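The EmbeddingTableFunctors above make category ids globally unique by shifting each table's local ids with an exclusive prefix sum of the table sizes; the owning table can then be recovered with a binary search over those offsets. A small host-side illustration (toy sizes, not library code):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<size_t> table_sizes = {1000, 10, 500};

  // Exclusive prefix sum -> per-table offsets: {0, 1000, 1010}
  std::vector<size_t> offsets(table_sizes.size());
  std::exclusive_scan(table_sizes.begin(), table_sizes.end(), offsets.begin(), size_t{0});

  // Global id of "local id 3 in table 2" is offsets[2] + 3 = 1013.
  size_t category = offsets[2] + 3;

  // Reverse lookup: the last offset that is <= category.
  size_t table = std::upper_bound(offsets.begin(), offsets.end(), category) - offsets.begin() - 1;
  printf("category %zu lives in table %zu\n", category, table);  // table 2
  return 0;
}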
pad_val : data[idx] + embedding_offsets[idx % num_tables]; - } -} - -template -__global__ void data_to_unique_categories_align4_kernel(dtype* data, dtype* embedding_offsets, - int num_tables, int num_data, - dtype* samples, int num_valid_data, - dtype pad_val) { - auto data4 = reinterpret_cast(data); - auto samples4 = reinterpret_cast(samples); - for (int idx4 = threadIdx.x + blockIdx.x * blockDim.x; idx4 < num_data / 4; - idx4 += blockDim.x * gridDim.x) { - uint4 load_data = data4[idx4]; - uint4 load_embedding_offsets; - - int idx = idx4 * 4; - load_data.x += embedding_offsets[(idx) % num_tables]; - load_data.y += embedding_offsets[(idx + 1) % num_tables]; - load_data.z += embedding_offsets[(idx + 2) % num_tables]; - load_data.w += embedding_offsets[(idx + 3) % num_tables]; - - load_data.x = idx >= num_valid_data ? pad_val : load_data.x; - load_data.y = idx + 1 >= num_valid_data ? pad_val : load_data.y; - load_data.z = idx + 2 >= num_valid_data ? pad_val : load_data.z; - load_data.w = idx + 3 >= num_valid_data ? pad_val : load_data.w; - - samples4[idx4] = load_data; - } -} - -/// data_to_unique_categories converts the argument 'data' and stores -/// the result in member variable 'samples'. -/// Per network, the columns corresponding to embedding tables -/// are concatenated and categories get an unique index / label. -template -void Data::data_to_unique_categories(Tensor2 data, cudaStream_t stream) { - HCTR_LIB_THROW(cudaPeekAtLastError()); - /// === TODO: PERFORM ON GPU === - /// ============================ - // HCTR_LOG_S(WARNING, WORLD) << "data_to_unique_categories() needs to be placed on the GPU!" << - // std::endl; - // TODO : perform conversion by kernel (before start of iteration ? => see below) - // for batch_size = 55*1024 - // batch_size * 26 * 4 / 1600e9 = 3.67 microseconds, - // - // Remark: - // Doesn't need to be before start of kernel. - // Would be nice to have just before calculating indices, since - // those would be in L2 cache already. - size_t current_batch_size = data.get_dimensions()[0]; - size_t block_size = 256; - size_t grid_size = - std::min(static_cast(4096), - (table_sizes.size() * batch_size * num_iterations - 1) / block_size + 1); - size_t num_samples = table_sizes.size() * batch_size * num_iterations; - // Not all samples in a batch may be valid. I.e last iteration of evaluation may be incomplete. 
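A host-side reference of the conversion the kernels above implement: keys are stored sample-major and table-minor, each key is shifted by its table's offset, and positions past the last valid key are padded with the null category so that incomplete batches stay well defined. Names mirror the kernel arguments; this sketch is not part of the library.

#include <cstdint>
#include <vector>

std::vector<uint32_t> to_unique_categories(const std::vector<uint32_t>& data,
                                           const std::vector<uint32_t>& embedding_offsets,
                                           size_t num_valid_keys, uint32_t null_category) {
  const size_t num_tables = embedding_offsets.size();
  std::vector<uint32_t> samples(data.size());
  for (size_t idx = 0; idx < data.size(); ++idx) {
    // idx % num_tables selects the table because keys are interleaved per sample.
    samples[idx] = idx >= num_valid_keys ? null_category
                                         : data[idx] + embedding_offsets[idx % num_tables];
  }
  return samples;
}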
- size_t num_valid_samples = table_sizes.size() * current_batch_size; - assert(num_valid_samples > 0 && "Batch contained 0 valid samples"); - auto null_category = static_cast(num_categories); - if (num_samples % 4 == 0 && sizeof(dtype) == 4) { - data_to_unique_categories_align4_kernel<<>>( - data.get_ptr(), embedding_offsets.get_ptr(), table_sizes.size(), num_samples, - samples.get_ptr(), num_valid_samples, null_category); - } else { - data_to_unique_categories_kernel<<>>( - data.get_ptr(), embedding_offsets.get_ptr(), table_sizes.size(), (int)num_samples, - samples.get_ptr(), (int)num_valid_samples, null_category); - } - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template class Data; -template class Data; - -template struct EmbeddingTableFunctors; -template struct EmbeddingTableFunctors; -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/frequent_embedding.cu b/HugeCTR/src/embeddings/hybrid_embedding/frequent_embedding.cu deleted file mode 100644 index 75d9cbcf8d..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/frequent_embedding.cu +++ /dev/null @@ -1,487 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -namespace frequent_embedding_kernels { - -template -__global__ void reset_relevant_gradients(float* __restrict__ gradients, uint32_t embedding_vec_size, - FrequentEmbeddingCompressionView* indices, - uint32_t num_instances) { - const uint32_t num_network_cache_indices = indices->network_cache_indices_offsets[num_instances]; - for (uint32_t i = blockIdx.x; i < num_network_cache_indices; i += gridDim.x) - gradients[indices->network_cache_indices[i] * embedding_vec_size + threadIdx.x] = 0.0f; -} - -template -__global__ void frequent_local_reduce(const emtype* __restrict__ gradients_in, - float* __restrict__ gradients_out, - size_t local_samples_offset, - const dtype* __restrict__ category_location, - uint32_t embedding_vec_size, - FrequentEmbeddingCompressionView* indices) { - const uint32_t num_frequent_sample_indices = *indices->d_num_frequent_sample_indices; - - for (uint32_t i = blockIdx.x; i < num_frequent_sample_indices; i += gridDim.x) { - uint32_t local_sample_index = indices->frequent_sample_indices[i]; - dtype category = indices->samples[local_samples_offset + local_sample_index]; - dtype frequent_index = category_location[2 * category + 1]; - - atomicAdd(gradients_out + frequent_index * embedding_vec_size + threadIdx.x, - TypeConvertFunc::convert( - gradients_in[local_sample_index * embedding_vec_size + threadIdx.x])); - } -} - -template -__forceinline__ __device__ void update_model_direct_common( - const emtype* const* __restrict__ gradients_pointers, float* __restrict__ embedding_vectors, - 
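frequent_local_reduce above accumulates per-sample gradients into a float master buffer with atomicAdd, one block per touched sample and one thread per embedding dimension. A simplified, self-contained kernel in the same spirit; the precomputed sample-to-slot map stands in for the category_location lookup and is an assumption of this sketch.

#include <cstdint>
#include <cuda_fp16.h>

__global__ void local_reduce_sketch(const __half* __restrict__ grads_in,  // per-sample gradients
                                    float* __restrict__ grads_out,        // per-slot accumulators
                                    const uint32_t* __restrict__ sample_to_slot,
                                    uint32_t num_samples, uint32_t vec_size) {
  // Launch with blockDim.x == vec_size; each block strides over the samples.
  for (uint32_t s = blockIdx.x; s < num_samples; s += gridDim.x) {
    const uint32_t slot = sample_to_slot[s];
    // Several samples may map to the same slot, hence the atomic accumulation
    // into a float master buffer even when gradients arrive as fp16.
    atomicAdd(grads_out + slot * vec_size + threadIdx.x,
              __half2float(grads_in[s * vec_size + threadIdx.x]));
  }
}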
const uint32_t* __restrict__ model_cache_indices, - const uint32_t* __restrict__ model_cache_indices_offsets, uint32_t num_instances, - uint32_t model_id, uint32_t num_frequent_per_model, uint32_t embedding_vec_size, float lr) {} - -template -__global__ void update_model_direct(const emtype* const* __restrict__ gradients_pointers, - float* __restrict__ embedding_vectors, - FrequentEmbeddingCompressionView* indices, - uint32_t num_instances, uint32_t model_id, - uint32_t num_frequent_per_model, uint32_t embedding_vec_size, - const float* __restrict__ lr_ptr, const float scale) { - float lr = __ldg(lr_ptr) / scale; - const uint32_t offset = indices->model_cache_indices_offsets[model_id + 1]; - const uint32_t num_model_cache_indices = indices->model_cache_indices_offsets[num_instances]; - - for (uint32_t i = blockIdx.x; i < num_model_cache_indices; i += gridDim.x) { - int vid = (i + offset) % num_model_cache_indices; - - uint32_t frequent_index = indices->model_cache_indices[vid]; - uint32_t network_id; - for (network_id = 0; - network_id < num_instances && indices->model_cache_indices_offsets[network_id + 1] <= vid; - network_id++) - ; - - const emtype* gradients = gradients_pointers[network_id]; - - uint32_t cache_location = frequent_index * embedding_vec_size + threadIdx.x; - atomicAdd(embedding_vectors + cache_location, - -lr * TypeConvertFunc::convert(gradients[cache_location])); - } -} - -} // namespace frequent_embedding_kernels - -template -FrequentEmbeddingBase::FrequentEmbeddingBase() {} - -template -FrequentEmbeddingBase::~FrequentEmbeddingBase() {} - -template -void FrequentEmbeddingBase::set_current_indices( - FrequentEmbeddingCompression* indices) { - indices_ = indices; - data_ = indices->get_data(); - indices_view_ = indices->get_device_view(); -} - -template -FrequentEmbeddingData::FrequentEmbeddingData(const Model& model, - const GPUResource& gpu_resource, - BuffPtr& grouped_wgrad_buff, - uint32_t embedding_vec_size, - size_t max_num_frequent_categories) - : model_(model), - gpu_resource_(gpu_resource), - grouped_wgrad_buff_(grouped_wgrad_buff), - wgrad_core23_buffer_(nullptr), - embedding_vec_size_(embedding_vec_size), - max_num_frequent_categories_(max_num_frequent_categories) { - std::shared_ptr> buf = GeneralBuffer2::create(); - buf->reserve({max_num_frequent_categories, embedding_vec_size_}, &frequent_embedding_vectors_); - if (sizeof(emtype) != sizeof(float)) { - buf->reserve({max_num_frequent_categories, embedding_vec_size_}, &float_frequent_gradients_); - } - - auto& gradients = get_gradients(); - if (grouped_wgrad_buff == NULL) { - buf->reserve({max_num_frequent_categories, embedding_vec_size_}, &gradients); - } else { - core23::BufferParams buffer_params = {}; - // TODO this has to be consistent with add_dense_layer.cpp:2126 - buffer_params.channel = std::is_same_v ? 
"TRAIN_WGRAD" : "TRAIN_WGRAD_HALF"; - core23::Device device(core23::DeviceType::GPU, gpu_resource.get_device_id()); - core23::Shape shape{static_cast(max_num_frequent_categories), - static_cast(embedding_vec_size_)}; - - core23::TensorParams t_params = core23::TensorParams() - .data_type(core23::ToScalarType::value) - .shape(shape) - .device(device) - .buffer_params(buffer_params) - .alignment(256); // default 256 Byte - core23::Tensor grad_tensor(t_params); - wgrad_core23_buffer_ = std::make_shared(grad_tensor); - gradients = - Tensor2({max_num_frequent_categories, embedding_vec_size_}, wgrad_core23_buffer_); - } - - buf->allocate(); -} - -template -FrequentEmbeddingSingleNode::FrequentEmbeddingSingleNode( - const Model& model, const GPUResource& gpu_resource, BuffPtr& grouped_wgrad_buff, - uint32_t embedding_vec_size, size_t max_num_frequent_categories) - : frequent_data_(model, gpu_resource, grouped_wgrad_buff, embedding_vec_size, - max_num_frequent_categories) { - std::shared_ptr> buf = GeneralBuffer2::create(); - - buf->reserve({model.num_instances, 1}, &embedding_vectors_cache_pointers_); - buf->reserve({model.num_instances, 1}, &partial_gradients_pointers_); - if (sizeof(emtype) != sizeof(float)) { - buf->reserve({max_num_frequent_categories, embedding_vec_size}, - &frequent_embedding_vectors_cache_); - } - buf->allocate(); -} - -template -void FrequentEmbeddingMultiNode::init_ar_comm(AllReduceInPlaceComm* ar_comm, - AllReduceInPlaceComm::Handle& handle, - int local_id) { - auto& local_gpu = frequent_data_.gpu_resource_; - CudaDeviceContext context(local_gpu.get_device_id()); - - auto& gradients = frequent_data_.get_gradients(); - ar_comm->set_coll_buf(handle, gradients.get_ptr(), gradients.get_size_in_bytes(), local_id); - ar_comm_ = std::make_unique>(ar_comm, handle, &local_gpu); -} - -template -void FrequentEmbeddingData::initialize_embedding_vectors( - const std::vector& table_sizes, size_t grouped_wgrad_offset_in_bytes) { - CudaDeviceContext context(gpu_resource_.get_device_id()); - - const size_t num_tables = table_sizes.size(); - for (size_t model_id = 0; model_id < model_.num_instances; ++model_id) { - for (size_t embedding = 0; embedding < num_tables; embedding++) { - float up_bound = sqrt(1.f / table_sizes[embedding]); - size_t offset = - embedding_vec_size_ * - model_.h_frequent_model_table_offsets[model_id * (num_tables + 1) + embedding]; - size_t num_elements = - embedding_vec_size_ * - (model_.h_frequent_model_table_offsets[model_id * (num_tables + 1) + embedding + 1] - - model_.h_frequent_model_table_offsets[model_id * (num_tables + 1) + embedding]); - UniformGenerator::fill(frequent_embedding_vectors_.get_ptr() + offset, num_elements, - -up_bound, up_bound, gpu_resource_.get_sm_count(), - gpu_resource_.get_replica_uniform_curand_generator(), - gpu_resource_.get_stream()); - } - } - - if (wgrad_core23_buffer_) { - // update wgrad tensors - size_t grad_size = model_.num_frequent * embedding_vec_size_; - if (sizeof(float) != sizeof(emtype)) { - frequent_gradients_ = Tensor2({grad_size}, wgrad_core23_buffer_); - } else { - float_frequent_gradients_ = Tensor2({grad_size}, wgrad_core23_buffer_); - } - } else if (grouped_wgrad_buff_) { - // update wgrad tensors - size_t grad_size = model_.num_frequent * embedding_vec_size_; - if (sizeof(float) != sizeof(emtype)) { - auto buf = std::make_shared( - (char*)grouped_wgrad_buff_->as_tensor().get_ptr() + grouped_wgrad_offset_in_bytes); - frequent_gradients_ = Tensor2({grad_size}, buf); - } else { - auto buf = std::make_shared( - 
(char*)grouped_wgrad_buff_->as_tensor().get_ptr() + grouped_wgrad_offset_in_bytes); - float_frequent_gradients_ = Tensor2({grad_size}, buf); - } - } -} - -/* Single-node: refresh needed vectors in the cache of each network - * Note: each network pulls from the models */ -template -void FrequentEmbeddingSingleNode::forward_model(cudaStream_t stream) { - const uint32_t num_instances = frequent_data_.model_.num_instances; - const uint32_t model_id = frequent_data_.model_.global_instance_id; - - auto embedding_vectors_cache_pointers = embedding_vectors_cache_pointers_.get_ptr(); - auto frequent_embedding_vectors = frequent_data_.frequent_embedding_vectors_.get_ptr(); - auto indices = this->indices_view_; - auto embedding_vec_size = frequent_data_.embedding_vec_size_; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->model_cache_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - const uint32_t offset = indices->model_cache_indices_offsets[model_id + 1]; - const uint32_t num_model_cache_indices = - indices->model_cache_indices_offsets[num_instances]; - int vid = (i + offset) % num_model_cache_indices; - uint32_t frequent_index = indices->model_cache_indices[vid]; - - uint32_t network_id; - for (network_id = 0; network_id < num_instances && - indices->model_cache_indices_offsets[network_id + 1] <= vid; - network_id++) - ; - emtype* embedding_vectors_out = embedding_vectors_cache_pointers[network_id]; - - const float* src_ptr = frequent_embedding_vectors + frequent_index * embedding_vec_size; - emtype* dst_ptr = embedding_vectors_out + frequent_index * embedding_vec_size; - - return { - src_ptr, {dst_ptr}, {static_cast(src_ptr) != static_cast(dst_ptr)}}; - }); - - shuffle(copy_desc, stream, frequent_data_.model_.num_frequent / 4); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -/* Single-node: refresh all vectors in the cache of each network */ -template -void FrequentEmbeddingSingleNode::forward_model_eval(cudaStream_t stream) { - const uint32_t num_instances = frequent_data_.model_.num_instances; - const uint32_t model_id = frequent_data_.model_.global_instance_id; - - emtype** embedding_vectors_cache_pointers = embedding_vectors_cache_pointers_.get_ptr(); - const float* frequent_embedding_vectors = frequent_data_.frequent_embedding_vectors_.get_ptr(); - size_t embedding_vec_size = frequent_data_.embedding_vec_size_; - const uint32_t num_frequent = frequent_data_.model_.num_frequent; - const uint32_t num_frequent_per_model = num_frequent / num_instances; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, [=] __device__() { return num_frequent; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - // Shift pattern - uint32_t shifted_i = (i + (model_id + 1) * num_frequent_per_model) % num_frequent; - uint32_t network_id = shifted_i / num_frequent_per_model; - uint32_t frequent_index = - model_id * num_frequent_per_model + shifted_i % num_frequent_per_model; - - emtype* embedding_vectors_out = embedding_vectors_cache_pointers[network_id]; - - const float* src_ptr = frequent_embedding_vectors + frequent_index * embedding_vec_size; - emtype* dst_ptr = embedding_vectors_out + frequent_index * embedding_vec_size; - - return { - src_ptr, {dst_ptr}, {static_cast(src_ptr) != static_cast(dst_ptr)}}; - }); - - shuffle(copy_desc, stream, num_frequent); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -template -void FrequentEmbeddingData::forward_network( - 
const vectype* embedding_vectors, emtype* interaction_layer_input, - FrequentEmbeddingBase* base, cudaStream_t stream) { - uint32_t samples_per_instance = - base->data_->samples.get_num_elements() / this->model_.num_instances; - uint32_t global_sample_index_base = model_.global_instance_id * samples_per_instance; - - auto indices = base->indices_view_; - auto category_location = this->model_.category_location.get_ptr(); - auto embedding_vec_size = this->embedding_vec_size_; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() -> size_t { return *indices->d_num_frequent_sample_indices; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - auto index = indices->frequent_sample_indices[i]; - auto category = indices->samples[index + global_sample_index_base]; - auto frequent_index = category_location[2 * category + 1]; - - return { - embedding_vectors + frequent_index * embedding_vec_size, - {interaction_layer_input + indices->frequent_sample_indices[i] * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, samples_per_instance); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -/* Concatenate the embedding vectors into the buffer for top-mlp input */ -template -void FrequentEmbeddingSingleNode::forward_network(emtype* interaction_layer_input, - cudaStream_t stream) { - frequent_data_.forward_network(get_embedding_vectors_cache().get_ptr(), interaction_layer_input, - this, stream); -} - -template -void FrequentEmbeddingMultiNode::forward_network(emtype* interaction_layer_input, - cudaStream_t stream) { - frequent_data_.forward_network(frequent_data_.frequent_embedding_vectors_.get_ptr(), - interaction_layer_input, this, stream); -} - -/* Reduce gradients on each network */ -template -void FrequentEmbeddingData::local_reduce(const emtype* gradients, - FrequentEmbeddingBase* base, - cudaStream_t stream) { - const auto num_instances = model_.num_instances; - const auto network_id = model_.global_instance_id; - size_t local_samples_size = - ceildiv(base->data_->batch_size, num_instances) * base->data_->table_sizes.size(); - - int n_blocks = 16 * gpu_resource_.get_sm_count(); - auto embedding_vec_size = embedding_vec_size_; - - frequent_embedding_kernels::frequent_local_reduce<<>>( - gradients, float_frequent_gradients_.get_ptr(), network_id * local_samples_size, - model_.category_location.get_ptr(), embedding_vec_size, base->indices_view_); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - if (sizeof(emtype) != sizeof(float)) { - convert_array<<<1000, 128, 0, stream>>>(frequent_gradients_.get_ptr(), - float_frequent_gradients_.get_ptr(), - model_.num_frequent * embedding_vec_size); - HCTR_LIB_THROW(cudaPeekAtLastError()); - } -} - -template -void FrequentEmbeddingSingleNode::local_reduce(const emtype* gradients, - cudaStream_t stream) { - auto num_instances = frequent_data_.model_.num_instances; - int n_blocks = 16 * frequent_data_.gpu_resource_.get_sm_count(); - auto embedding_vec_size = frequent_data_.embedding_vec_size_; - - /* Set to zero the gradients of categories that appear in the batch */ - frequent_embedding_kernels::reset_relevant_gradients<<>>( - frequent_data_.float_frequent_gradients_.get_ptr(), embedding_vec_size, this->indices_view_, - num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - frequent_data_.local_reduce(gradients, this, stream); -} - -template -void FrequentEmbeddingMultiNode::local_reduce(const emtype* gradients, - cudaStream_t stream) { - /* Set to zero all the gradients */ - if 
(frequent_data_.model_.num_frequent > 0) { - HCTR_LIB_THROW(cudaMemsetAsync( - frequent_data_.float_frequent_gradients_.get_ptr(), 0, - frequent_data_.model_.num_frequent * frequent_data_.embedding_vec_size_ * sizeof(float), - stream)); - } - - frequent_data_.local_reduce(gradients, this, stream); -} - -template -void FrequentEmbeddingMultiNode::update_model(float* dev_lr, float scale, - cudaStream_t stream) { - sgd_global_update(frequent_data_.get_gradients().get_ptr(), - frequent_data_.frequent_embedding_vectors_.get_ptr(), - frequent_data_.model_.num_frequent, frequent_data_.embedding_vec_size_, dev_lr, - scale, stream); -} - -/* Update model for single-node: direct write in category "owner"'s table, lr is a device variable - */ -template -void FrequentEmbeddingSingleNode::update_model_direct(float* dev_lr, float scale, - cudaStream_t stream) { - const uint32_t& num_instances = frequent_data_.model_.num_instances; - const uint32_t& model_id = frequent_data_.model_.global_instance_id; - const uint32_t num_frequent_per_model = frequent_data_.model_.num_frequent / num_instances; - - int num_sm = frequent_data_.gpu_resource_.get_sm_count(); - int n_blocks = 8 * num_sm; // TODO: better heuristics - - /* Update models */ - frequent_embedding_kernels:: - update_model_direct<<>>( - partial_gradients_pointers_.get_ptr(), - frequent_data_.frequent_embedding_vectors_.get_ptr(), this->indices_view_, num_instances, - model_id, num_frequent_per_model, frequent_data_.embedding_vec_size_, dev_lr, scale); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void FrequentEmbeddingMultiNode::communicate(cudaStream_t stream) { - ar_comm_->communicate(stream); -} - -template class FrequentEmbeddingBase; -template class FrequentEmbeddingBase; - -template class FrequentEmbeddingData; -template class FrequentEmbeddingData; -template class FrequentEmbeddingData; -template class FrequentEmbeddingData; - -template class FrequentEmbeddingSingleNode; -template class FrequentEmbeddingSingleNode; -template class FrequentEmbeddingSingleNode; -template class FrequentEmbeddingSingleNode; - -template class FrequentEmbeddingMultiNode; -template class FrequentEmbeddingMultiNode; -template class FrequentEmbeddingMultiNode; -template class FrequentEmbeddingMultiNode; - -template void FrequentEmbeddingData::forward_network<__half>( - const __half*, __half*, FrequentEmbeddingBase*, cudaStream_t); -template void FrequentEmbeddingData::forward_network( - const float*, __half*, FrequentEmbeddingBase*, cudaStream_t); -template void FrequentEmbeddingData::forward_network( - const float*, float*, FrequentEmbeddingBase*, cudaStream_t); -template void FrequentEmbeddingData::forward_network<__half>( - const __half*, __half*, FrequentEmbeddingBase*, cudaStream_t); -template void FrequentEmbeddingData::forward_network( - const float*, __half*, FrequentEmbeddingBase*, cudaStream_t); -template void FrequentEmbeddingData::forward_network( - const float*, float*, FrequentEmbeddingBase*, cudaStream_t); -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/hybrid_indices.cu b/HugeCTR/src/embeddings/hybrid_embedding/hybrid_indices.cu deleted file mode 100644 index e52199c233..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/hybrid_indices.cu +++ /dev/null @@ -1,541 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace indices_kernels { - -template -__global__ void fused_cache_masks(const dtype* __restrict__ samples, - const dtype* __restrict__ category_location, - bool* __restrict__ model_cache_mask, - bool* __restrict__ network_cache_mask, uint32_t offset, - uint32_t samples_size, uint32_t local_samples_size, - uint32_t num_frequent, uint32_t num_frequent_per_model, - uint32_t model_id, uint32_t num_instances) { - uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; - - if (tid < samples_size) { - dtype category = __ldg(samples + tid); - dtype frequent_loc = __ldg(category_location + 2 * category); - dtype frequent_index = __ldg(category_location + (2 * category + 1)); - - if (frequent_loc == num_instances && frequent_index / num_frequent_per_model == model_id) - model_cache_mask[(tid / local_samples_size) * num_frequent_per_model + - frequent_index % num_frequent_per_model] = true; - } - - if (tid < local_samples_size) { - dtype category = __ldg(samples + offset + tid); - dtype frequent_loc = __ldg(category_location + 2 * category); - dtype frequent_index = __ldg(category_location + (2 * category + 1)); - - if (frequent_loc == num_instances) network_cache_mask[frequent_index] = true; - } -} - -__global__ void mask_indices_to_buffer_indices( - uint32_t* __restrict__ model_cache_indices, - const uint32_t* __restrict__ model_cache_indices_offsets, uint32_t num_instances, - uint32_t num_frequent_per_model, uint32_t model_id) { - const uint32_t num_selected = __ldg(model_cache_indices_offsets + num_instances); - - for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_selected; - i += blockDim.x * gridDim.x) - model_cache_indices[i] = - model_cache_indices[i] % num_frequent_per_model + num_frequent_per_model * model_id; -} - -template -__global__ void calculate_network_indices_mask(const dtype* __restrict__ local_samples, - const dtype* __restrict__ category_location, - bool* mask, uint32_t local_samples_size, - uint32_t num_instances) { - for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < local_samples_size; - i += gridDim.x * blockDim.x) { - dtype category = local_samples[i]; - uint32_t model_id = static_cast(category_location[2 * category]); - for (uint32_t section_id = 0; section_id < num_instances; section_id++) { - mask[local_samples_size * section_id + i] = (model_id == section_id); - } - } -} - -} // namespace indices_kernels - -namespace HugeCTR { -namespace hybrid_embedding { - -// =========================================================================================== -// Frequent Compression -// =========================================================================================== - -template -FrequentEmbeddingCompression::FrequentEmbeddingCompression( - size_t max_num_frequent_categories, const Data& data, const Model& model) - : data_(data), model_(model) { - const int num_tables = data_.table_sizes.size(); - - std::shared_ptr> buf = GeneralBuffer2::create(); - buf->reserve({max_num_frequent_categories, 1}, &model_cache_indices_); - buf->reserve({model.num_instances + 1, 1}, 
&model_cache_indices_offsets_); - buf->reserve({max_num_frequent_categories, 1}, &network_cache_indices_); - buf->reserve({model.num_instances + 1, 1}, &network_cache_indices_offsets_); - buf->reserve({2 * max_num_frequent_categories, 1}, &cache_masks_); - buf->reserve({ceildiv(data_.batch_size, model.num_instances) * num_tables, 1}, - &frequent_sample_indices_); - buf->reserve({1}, &d_num_frequent_sample_indices_); - - // Temporary storage - calculate_frequent_sample_indices_temp_storage_bytes((data_.batch_size / model.num_instances) * - num_tables); - calculate_model_cache_indices_temp_storage_bytes(max_num_frequent_categories); - calculate_network_cache_indices_temp_storage_bytes(max_num_frequent_categories); - buf->reserve({frequent_sample_indices_temp_storage_bytes_, 1}, - &frequent_sample_indices_temp_storage_); - buf->reserve({model_cache_indices_temp_storage_bytes_, 1}, &model_cache_indices_temp_storage_); - buf->reserve({network_cache_indices_temp_storage_bytes_, 1}, - &network_cache_indices_temp_storage_); - buf->allocate(); - - FrequentEmbeddingCompressionView view = {data_.samples.get_ptr(), - cache_masks_.get_ptr(), - model_cache_indices_.get_ptr(), - model_cache_indices_offsets_.get_ptr(), - network_cache_indices_.get_ptr(), - network_cache_indices_offsets_.get_ptr(), - d_num_frequent_sample_indices_.get_ptr(), - frequent_sample_indices_.get_ptr()}; - - HCTR_LIB_THROW(cudaMalloc(&device_indices_view_, sizeof(view))); - HCTR_LIB_THROW(cudaMemcpy(device_indices_view_, &view, sizeof(view), cudaMemcpyHostToDevice)); -} - -template -struct FrequentSampleIndicesSelectOp { - const dtype* samples; - const dtype* category_location; - uint32_t offset; - dtype num_instances; - __host__ __device__ __forceinline__ FrequentSampleIndicesSelectOp(const dtype* samples, - const dtype* category_location, - uint32_t offset, - dtype num_instances) - : samples(samples), - category_location(category_location), - offset(offset), - num_instances(num_instances) {} - __device__ __forceinline__ bool operator()(const uint32_t& idx) const { - dtype category = __ldg(samples + offset + idx); - dtype frequent_location = __ldg(category_location + 2 * category); - return frequent_location == num_instances; - } -}; - -template -void FrequentEmbeddingCompression::calculate_frequent_sample_indices_temp_storage_bytes( - const size_t local_samples_size) { - cub::CountingInputIterator counting(0); - FrequentSampleIndicesSelectOp select_op(nullptr, nullptr, 0, 0); - cub::DeviceSelect::If(nullptr, frequent_sample_indices_temp_storage_bytes_, counting, - (uint32_t*)nullptr, (uint32_t*)nullptr, local_samples_size, select_op, 0); -} - -template -void FrequentEmbeddingCompression::calculate_model_cache_indices_temp_storage_bytes( - const size_t num_frequent) { - size_t select_bytes = 0; - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged(nullptr, select_bytes, counting, (bool*)nullptr, (uint32_t*)nullptr, - (uint32_t*)nullptr, num_frequent, 0); - - constexpr uint32_t align = 256; - model_cache_indices_temp_storage_bytes_ = alignTo(num_frequent, align) + select_bytes; -} - -template -void FrequentEmbeddingCompression::calculate_network_cache_indices_temp_storage_bytes( - const size_t num_frequent) { - size_t select_bytes = (size_t)0; - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged(nullptr, select_bytes, counting, (bool*)nullptr, (uint32_t*)nullptr, - (uint32_t*)nullptr, num_frequent, 0); - - network_cache_indices_temp_storage_bytes_ = select_bytes; -} - -template -void 
FrequentEmbeddingCompression::calculate_frequent_sample_indices(cudaStream_t stream) { - const size_t num_networks = model_.num_instances; - size_t local_samples_size = (data_.batch_size / num_networks) * data_.table_sizes.size(); - - // Select indices of frequent categories appearing in the local MLP batch - cub::CountingInputIterator counting(0); - FrequentSampleIndicesSelectOp select_op( - data_.samples.get_ptr(), model_.category_location.get_ptr(), - model_.global_instance_id * local_samples_size, model_.num_instances); - cub::DeviceSelect::If( - reinterpret_cast(frequent_sample_indices_temp_storage_.get_ptr()), - frequent_sample_indices_temp_storage_bytes_, counting, frequent_sample_indices_.get_ptr(), - d_num_frequent_sample_indices_.get_ptr(), local_samples_size, select_op, stream); -} - -template -void FrequentEmbeddingCompression::calculate_model_cache_indices(size_t sm_count, - cudaStream_t stream) { - const size_t num_instances = model_.num_instances; - const size_t num_frequent = model_.num_frequent; - const size_t samples_size = data_.batch_size * data_.table_sizes.size(); - size_t local_samples_size = - ceildiv(data_.batch_size, num_instances) * data_.table_sizes.size(); - - // Note: we assume that the number of frequent categories is a - // multiple of the number of models! - const size_t num_frequent_per_model = num_frequent / num_instances; - - /** - * Explanation of the mask: - * The model owns num_frequent_per_model categories. For each network, - * we want to know the categories that appear in their local batch and - * belong to this model. The mask is the concatenation of num_network - * sections of size num_frequent_per_model. - * It has a size num_frequent but does not represent all the frequent - * categories, only num_networks repetitions of the same categories. 
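 *
 * Worked example: with num_networks = 2 and num_frequent_per_model = 4 (so num_frequent = 8),
 * the mask has 8 entries. Entries 0..3 flag which of this model's 4 frequent categories
 * appear in network 0's local batch, and entries 4..7 flag the same 4 categories for
 * network 1's local batch.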
- */ - - // Temporary storage - char* scratch_ptr = model_cache_indices_temp_storage_.get_ptr(); - void* d_temp_storage = reinterpret_cast(scratch_ptr); - size_t temp_storage_bytes = model_cache_indices_temp_storage_bytes_; - - const bool* d_model_cache_mask = cache_masks_.get_ptr() + num_frequent; - - /* Select categories according to the mask */ - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, counting, d_model_cache_mask, - model_cache_indices_.get_ptr(), - model_cache_indices_offsets_.get_ptr() + num_instances, num_frequent, - stream); - - /* Compute offsets */ - constexpr size_t TPB_offsets = 256; - size_t n_blocks = ceildiv(num_instances, TPB_offsets); - offsets_kernel<<>>(model_cache_indices_.get_ptr(), - model_cache_indices_offsets_.get_ptr(), - num_instances, num_frequent_per_model); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - /* Convert to buffer indices */ - - constexpr size_t TPB_convert = 256; - n_blocks = sm_count; - indices_kernels::mask_indices_to_buffer_indices<<>>( - model_cache_indices_.get_ptr(), model_cache_indices_offsets_.get_ptr(), num_instances, - num_frequent_per_model, model_.global_instance_id); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void FrequentEmbeddingCompression::calculate_cache_masks(cudaStream_t stream) { - const size_t num_instances = model_.num_instances; - const size_t num_frequent = model_.num_frequent; - size_t samples_size = data_.batch_size * data_.table_sizes.size(); - size_t local_samples_size = ceildiv(samples_size, num_instances); - const size_t num_frequent_per_model = num_frequent / num_instances; - - bool* d_network_cache_mask = cache_masks_.get_ptr(); - bool* d_model_cache_mask = cache_masks_.get_ptr() + num_frequent; - - /* Initialize the masks to false */ - HCTR_LIB_THROW(cudaMemsetAsync(cache_masks_.get_ptr(), 0, 2 * num_frequent, stream)); - - /* Compute the model cache mask */ - constexpr size_t TPB_mask = 256; - size_t n_blocks = ceildiv(samples_size, TPB_mask); - indices_kernels::fused_cache_masks<<>>( - data_.samples.get_ptr(), model_.category_location.get_ptr(), d_model_cache_mask, - d_network_cache_mask, model_.global_instance_id * local_samples_size, samples_size, - local_samples_size, num_frequent, num_frequent_per_model, model_.global_instance_id, - model_.num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void FrequentEmbeddingCompression::calculate_network_cache_indices(cudaStream_t stream) { - const size_t num_instances = model_.num_instances; - const size_t num_frequent = model_.num_frequent; - size_t local_samples_size = - ceildiv(data_.batch_size, num_instances) * data_.table_sizes.size(); - - // Note: we assume that the number of frequent categories is a - // multiple of the number of models! 
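// --------------------------------------------------------------------------------------------
// Illustrative sketch (not from the original sources; all names below are hypothetical): the
// two-phase cub::DeviceSelect idiom that the cache-index calculations here rely on. The first
// call, made with a null temp-storage pointer, only reports the scratch size (this is what the
// calculate_*_temp_storage_bytes helpers do at construction time); the second call compacts
// the positions whose mask entry is true, mirroring the Flagged calls on the cache masks.
// The sketch allocates its own scratch with cudaMalloc for brevity, whereas the original code
// pre-reserves it in a GeneralBuffer2.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

inline void select_flagged_positions(const bool* d_mask, uint32_t* d_selected,
                                     uint32_t* d_num_selected, uint32_t num_items,
                                     cudaStream_t stream) {
  cub::CountingInputIterator<uint32_t> counting(0);

  // Phase 1: size query only, no selection is performed.
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, counting, d_mask, d_selected,
                             d_num_selected, num_items, stream);

  // Phase 2: run the compaction with the allocated scratch buffer.
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, counting, d_mask, d_selected,
                             d_num_selected, num_items, stream);
  cudaFree(d_temp_storage);
}
// --------------------------------------------------------------------------------------------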
- const size_t num_frequent_per_model = num_frequent / num_instances; - - // Temporary storage - char* scratch_ptr = network_cache_indices_temp_storage_.get_ptr(); - void* d_temp_storage = reinterpret_cast(scratch_ptr); - size_t temp_storage_bytes = network_cache_indices_temp_storage_bytes_; - - const bool* d_network_cache_mask = cache_masks_.get_ptr(); - - /* Select categories according to the mask */ - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, counting, d_network_cache_mask, - network_cache_indices_.get_ptr(), - network_cache_indices_offsets_.get_ptr() + num_instances, num_frequent, - stream); - - /* Compute offsets */ - constexpr size_t TPB_offsets = 256; - size_t n_blocks = ceildiv(num_instances, TPB_offsets); - offsets_kernel<<>>(network_cache_indices_.get_ptr(), - network_cache_indices_offsets_.get_ptr(), - num_instances, num_frequent_per_model); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -// =========================================================================================== -// Inrequent Selection -// =========================================================================================== - -template -InfrequentEmbeddingSelection::InfrequentEmbeddingSelection(const Data& data, - const Model& model) - : data_(data), model_(model) { - const size_t num_tables = data_.table_sizes.size(); - - auto buf = GeneralBuffer2::create(); - - buf->reserve({data_.batch_size, num_tables}, &model_indices_); - buf->reserve({ceildiv(data_.batch_size, model.num_instances), num_tables}, - &network_indices_); - buf->reserve({ceildiv(data_.batch_size, model.num_instances), num_tables}, - &network_indices_src_model_id_); - - // buf->reserve({model.num_instances}, &model_indices_sizes_); - // buf->reserve({model.num_instances}, &model_indices_sizes_ptrs_); - // buf->reserve({model.num_instances}, &network_indices_sizes_); - // buf->reserve({model.num_instances}, &network_indices_sizes_ptrs_); - - // Temporary storage - calculate_model_indices_temp_storage_bytes(data_.batch_size, num_tables); - calculate_network_indices_temp_storage_bytes(data_.batch_size, num_tables, model.num_instances); - buf->reserve({model_indices_temp_storage_bytes_, 1}, &model_indices_temp_storage_); - buf->reserve({network_indices_temp_storage_bytes_, 1}, &network_indices_temp_storage_); - - buf->allocate(); - - auto managed_buf = GeneralBuffer2::create(); - managed_buf->reserve({model.num_instances + 1, 1}, &model_indices_offsets_); - managed_buf->reserve({model.num_instances + 1, 1}, &network_indices_offsets_); - managed_buf->allocate(); - // int current_device; - // HCTR_LIB_THROW(cudaGetDevice(¤t_device)); - // HCTR_LIB_THROW(cudaMemAdvise(managed_buf->get_ptr(), managed_buf->get_size_in_bytes(), - // cudaMemAdviseSetReadMostly, current_device)); - - InfrequentEmbeddingSelectionView view = {data_.samples.get_ptr(), - model_indices_.get_ptr(), - model_indices_offsets_.get_ptr(), - network_indices_.get_ptr(), - network_indices_offsets_.get_ptr(), - network_indices_src_model_id_.get_ptr()}; - - HCTR_LIB_THROW(cudaMalloc(&device_indices_view_, sizeof(view))); - HCTR_LIB_THROW(cudaMemcpy(device_indices_view_, &view, sizeof(view), cudaMemcpyHostToDevice)); -} - -template -struct ModelIndicesSelectOp { - const dtype* samples; - const dtype* category_location; - uint32_t my_model_id; - __host__ __device__ __forceinline__ ModelIndicesSelectOp(const dtype* samples, - const dtype* category_location, - uint32_t my_model_id) - : samples(samples), 
category_location(category_location), my_model_id(my_model_id) {} - __device__ __forceinline__ bool operator()(const uint32_t& idx) const { - dtype category = __ldg(samples + idx); - dtype model_id = __ldg(category_location + 2 * category); - return model_id == my_model_id; - } -}; - -template -void InfrequentEmbeddingSelection::calculate_model_indices_temp_storage_bytes( - size_t max_batch_size, size_t table_size) { - cub::CountingInputIterator counting(0); - ModelIndicesSelectOp select_op(nullptr, nullptr, 0); - cub::DeviceSelect::If(nullptr, model_indices_temp_storage_bytes_, counting, (uint32_t*)nullptr, - (uint32_t*)nullptr, max_batch_size * table_size, select_op, 0); -} - -template -void InfrequentEmbeddingSelection::calculate_network_indices_temp_storage_bytes( - size_t max_batch_size, size_t table_size, const uint32_t num_instances) { - uint32_t samples_size = max_batch_size * table_size; - uint32_t local_samples_size = ceildiv(samples_size, num_instances); - - // Calculate select bytes - size_t select_bytes = 0; - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged(nullptr, select_bytes, counting, (bool*)nullptr, (uint32_t*)nullptr, - (uint32_t*)nullptr, samples_size, 0); - - // Total size - constexpr uint32_t align = 256; - network_indices_temp_storage_bytes_ = - alignTo(sizeof(bool) * samples_size, align) + select_bytes; -} - -template -void InfrequentEmbeddingSelection::calculate_model_indices(cudaStream_t stream) { - const uint32_t& num_instances = model_.num_instances; - - size_t local_batch_size = ceildiv(data_.batch_size, num_instances); - - // Select indices of infrequent categories belonging to this model - cub::CountingInputIterator counting(0); - ModelIndicesSelectOp select_op(data_.samples.get_ptr(), model_.category_location.get_ptr(), - model_.global_instance_id); - cub::DeviceSelect::If(reinterpret_cast(model_indices_temp_storage_.get_ptr()), - model_indices_temp_storage_bytes_, counting, model_indices_.get_ptr(), - model_indices_offsets_.get_ptr() + num_instances, - data_.batch_size * data_.table_sizes.size(), select_op, stream); - - // Compute offsets - constexpr size_t TPB = 256; - const size_t n_blocks = ceildiv(num_instances, TPB); - offsets_kernel<<>>(model_indices_.get_ptr(), - model_indices_offsets_.get_ptr(), num_instances, - local_batch_size * data_.table_sizes.size()); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbeddingSelection::calculate_network_indices(size_t sm_count, - cudaStream_t stream) { - const uint32_t num_instances = model_.num_instances; - uint32_t samples_size = data_.batch_size * data_.table_sizes.size(); - uint32_t local_samples_size = ceildiv(samples_size, num_instances); - - // Temporary storage - constexpr uint32_t align = 256; - char* scratch_ptr = network_indices_temp_storage_.get_ptr(); - size_t scratch_offset = 0; - bool* d_mask = reinterpret_cast(scratch_ptr + scratch_offset); - scratch_offset += alignTo(sizeof(bool) * samples_size, align); - void* d_temp_storage = reinterpret_cast(scratch_ptr + scratch_offset); - size_t temp_storage_bytes = network_indices_temp_storage_bytes_ - scratch_offset; - - // Compute mask (for each source GPU, whether each element in the batch is located there) - constexpr uint32_t TPB_mask = 256; - uint32_t n_blocks_mask = ceildiv(local_samples_size, TPB_mask); - indices_kernels::calculate_network_indices_mask<<>>( - data_.samples.get_ptr() + model_.global_instance_id * local_samples_size, - model_.category_location.get_ptr(), d_mask, local_samples_size, 
num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - // Select indices according to the mask - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged( - d_temp_storage, temp_storage_bytes, counting, d_mask, network_indices_.get_ptr(), - network_indices_offsets_.get_ptr() + num_instances, samples_size, stream); - - // Compute offsets - constexpr uint32_t TPB_offsets = 256; - uint32_t n_blocks_offsets = ceildiv(num_instances, TPB_offsets); - offsets_kernel<<>>(network_indices_.get_ptr(), - network_indices_offsets_.get_ptr(), - num_instances, local_samples_size); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - // Re-map indices between 0 and local_samples_size - 1 - uint32_t TPB_remap = 256; - uint32_t n_blocks_remap = sm_count; - modulo_kernel<<>>( - network_indices_.get_ptr(), network_indices_offsets_.get_ptr() + num_instances, - local_samples_size); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - // Figure out the model id for each indices - model_id_kernel<<>>( - network_indices_offsets_.get_ptr(), network_indices_src_model_id_.get_ptr(), - network_indices_offsets_.get_ptr() + num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -// template -// void InfrequentEmbeddingSelection::calculate_model_indices_sizes_from_offsets( -// size_t embedding_vec_bytes, cudaStream_t stream) { -// constexpr size_t TPB = 256; -// const size_t n_blocks = ceildiv(model_.num_instances, TPB); -// offsets_to_sizes<<>>( -// model_indices_sizes_.get_ptr(), model_indices_offsets_.get_ptr(), -// embedding_vec_bytes, model_.num_instances); -// } - -// template -// void InfrequentEmbeddingSelection::calculate_network_indices_sizes_from_offsets( -// size_t embedding_vec_bytes, cudaStream_t stream) { -// constexpr size_t TPB = 256; -// const size_t n_blocks = ceildiv(model_.num_instances, TPB); -// offsets_to_sizes<<>>( -// network_indices_sizes_.get_ptr(), network_indices_offsets_.get_ptr(), -// embedding_vec_bytes, model_.num_instances); -// } - -template -void compute_indices(FrequentEmbeddingCompression& compression, - InfrequentEmbeddingSelection& selection, - CommunicationType communication_type, bool compute_network_cache_indices, - cudaStream_t stream, int sm_count) { - compression.calculate_frequent_sample_indices(stream); - selection.calculate_model_indices(stream); - - if (communication_type != CommunicationType::NVLink_SingleNode) { - selection.calculate_network_indices(sm_count, stream); - } else { - compression.calculate_cache_masks(stream); - if (compute_network_cache_indices) { - compression.calculate_network_cache_indices(stream); - } - compression.calculate_model_cache_indices(sm_count, stream); - } -} - -template void compute_indices(FrequentEmbeddingCompression& compression, - InfrequentEmbeddingSelection& selection, - CommunicationType communication_type, - bool compute_network_cache_indices, cudaStream_t stream, - int sm_count); - -template void compute_indices(FrequentEmbeddingCompression& compression, - InfrequentEmbeddingSelection& selection, - CommunicationType communication_type, - bool compute_network_cache_indices, cudaStream_t stream, - int sm_count); - -template class FrequentEmbeddingCompression; -template class FrequentEmbeddingCompression; -template class InfrequentEmbeddingSelection; -template class InfrequentEmbeddingSelection; - -} // namespace hybrid_embedding -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/indices_container.cu b/HugeCTR/src/embeddings/hybrid_embedding/indices_container.cu deleted file mode 100644 index 
d1641bd6b7..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/indices_container.cu +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace HugeCTR { -namespace hybrid_embedding { - -template -BatchIndices::BatchIndices(std::vector>& models, - std::vector> data_sources, - std::shared_ptr& resource_manager, - size_t batch_size, std::vector& slot_size_array, - size_t max_num_frequent_categories, - CommunicationType communication_type) - : num_slots_(slot_size_array.size()), - resource_manager_(resource_manager), - communication_type_(communication_type) { - for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); ++i) { - CudaDeviceContext ctx(resource_manager_->get_local_gpu(i)->get_device_id()); - data_.emplace_back(data_sources[i].get_value_tensor(), slot_size_array, batch_size, 1); - } - - for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - CudaDeviceContext ctx(resource_manager_->get_local_gpu(i)->get_device_id()); - - frequent_compression_.emplace_back(max_num_frequent_categories, data_[i], models[i]); - infrequent_selection_.emplace_back(data_[i], models[i]); - } -} - -template -void BatchIndices::compute(int raw_device_id, size_t batch_size, cudaStream_t stream) { - auto& local_gpu = resource_manager_->get_local_gpu(raw_device_id); - auto& my_data = data_[raw_device_id]; - - auto samples = my_data.samples; - samples.reset_shape({batch_size, num_slots_}); - - my_data.data_to_unique_categories(samples, stream); - - compute_indices(frequent_compression_[raw_device_id], infrequent_selection_[raw_device_id], - communication_type_, true, stream, local_gpu->get_sm_count()); -} - -template class BatchIndices; -template class BatchIndices; - -} // namespace hybrid_embedding -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/infrequent_embedding.cu b/HugeCTR/src/embeddings/hybrid_embedding/infrequent_embedding.cu deleted file mode 100644 index 9020bcd2e2..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/infrequent_embedding.cu +++ /dev/null @@ -1,670 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -namespace infrequent_embedding_kernels { - -template -__global__ void hier_update_model(InfrequentEmbeddingSelectionView* indices, - const dtype* __restrict__ category_location, - const emtype* __restrict__ gradients, - float* __restrict__ embedding_vectors, - uint32_t embedding_vec_size, uint32_t num_instances, - uint32_t local_samples_size, uint32_t local_comm_buff_size, - const float* __restrict__ lr_ptr, const float scale) { - float lr = __ldg(lr_ptr) / scale; - const uint32_t num_indices = indices->model_indices_offsets[num_instances]; - - // Load offset only when the network_id changes - uint32_t previous_network_id = 0; - uint32_t offset = 0; - - for (uint32_t i = blockIdx.x; i < num_indices; i += gridDim.x) { - uint32_t index = indices->model_indices[i]; - dtype category = indices->samples[index]; - dtype location = category_location[2 * category + 1]; - uint32_t network_id = index / local_samples_size; - if (network_id != previous_network_id) { - offset = indices->model_indices_offsets[network_id]; - previous_network_id = network_id; - } - atomicAdd( - embedding_vectors + location * embedding_vec_size + threadIdx.x, - -lr * TypeConvertFunc::convert( - gradients[embedding_vec_size * (network_id * local_comm_buff_size + i - offset) + - threadIdx.x])); - } -} - -template -__global__ void infrequent_update_model_direct( - const emtype* const* __restrict__ gradients_pointers, float* embedding_vectors, - InfrequentEmbeddingSelectionView* indices, const dtype* __restrict__ category_location, - uint32_t num_instances, uint32_t model_id, uint32_t embedding_vec_size, - uint32_t local_samples_size, const float* __restrict__ lr_ptr, const float scale) { - float lr = __ldg(lr_ptr) / scale; - // Shift pattern - const uint32_t offset = indices->model_indices_offsets[model_id + 1]; - const uint32_t num_model_indices = indices->model_indices_offsets[num_instances]; - - for (uint32_t i = blockIdx.x; i < num_model_indices; i += gridDim.x) { - uint32_t vid = (i + offset) % num_model_indices; - - uint32_t index = indices->model_indices[vid]; - uint32_t network_id = index / local_samples_size; - uint32_t local_index = index % local_samples_size; - dtype category = indices->samples[index]; - uint32_t location = category_location[2 * category + 1]; - - const emtype* gradients = gradients_pointers[network_id]; - - atomicAdd(embedding_vectors + location * embedding_vec_size + threadIdx.x, - -lr * TypeConvertFunc::convert( - gradients[local_index * embedding_vec_size + threadIdx.x])); - } -} - -// template -// __global__ void calculate_network_indices_mask(const dtype* __restrict__ local_samples, -// const dtype* __restrict__ category_location, -// bool* mask, uint32_t local_samples_size, -// uint32_t num_instances) { -// for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < local_samples_size; -// i += gridDim.x * blockDim.x) { -// dtype category = local_samples[i]; -// uint32_t model_id = static_cast(category_location[2 * category]); -// for (uint32_t section_id = 0; section_id < num_instances; section_id++) { -// mask[local_samples_size * section_id + i] = (model_id == section_id); -// } -// } -// } - -template -static __global__ void offsets_to_sizes(size_t* sizes, LambdaPtr get_offsets_ptr, - size_t element_size, uint32_t num_instances) { - uint32_t* offsets = 
get_offsets_ptr(); - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < num_instances; - t += gridDim.x * blockDim.x) { - sizes[t] = (offsets[t + 1] - offsets[t]) * element_size; - } -} - -} // namespace infrequent_embedding_kernels - -template -InfrequentEmbeddingBase::InfrequentEmbeddingBase() {} - -template -InfrequentEmbeddingBase::~InfrequentEmbeddingBase() {} - -template -InfrequentEmbeddingBase::InfrequentEmbeddingBase(const InfrequentEmbeddingBase& other) { - HCTR_LIB_THROW(cudaMalloc(&indices_view_, sizeof(*indices_view_))); - - HCTR_LIB_THROW(cudaMemcpy(indices_view_, other.indices_view_, sizeof(*indices_view_), - cudaMemcpyDeviceToDevice)); -} - -template -void InfrequentEmbeddingBase::set_current_indices( - InfrequentEmbeddingSelection* indices) { - indices_ = indices; - data_ = indices->get_data(); - indices_view_ = indices->get_device_view(); -} - -template -InfrequentEmbedding_NVLink_SingleNode::InfrequentEmbedding_NVLink_SingleNode( - Model& model, GPUResource& gpu_resource, size_t embedding_vec_size) - : model_(model), gpu_resource_(gpu_resource), embedding_vec_size_(embedding_vec_size) { - auto buf = GeneralBuffer2::create(); - buf->reserve({ceildiv(model.num_categories, model.num_instances), embedding_vec_size_}, - &infrequent_embedding_vectors_); - buf->reserve({model.num_instances, 1}, &interaction_layer_input_pointers_train_); - buf->reserve({model.num_instances, 1}, &interaction_layer_input_pointers_eval_); - buf->reserve({model.num_instances, 1}, &gradients_pointers_); - buf->allocate(); -} - -template -void InfrequentEmbedding_NVLink_SingleNode::init_pointers( - int local_gpu_count, const cudaStream_t stream, - std::vector& interaction_layer_input_pointers_train, - std::vector& interaction_layer_input_pointers_eval, - std::vector& gradients_pointers) { - HCTR_LIB_THROW(cudaMemcpyAsync(interaction_layer_input_pointers_train_.get_ptr(), - interaction_layer_input_pointers_train.data(), - local_gpu_count * sizeof(emtype*), cudaMemcpyHostToDevice, - stream)); - HCTR_LIB_THROW(cudaMemcpyAsync(interaction_layer_input_pointers_eval_.get_ptr(), - interaction_layer_input_pointers_eval.data(), - local_gpu_count * sizeof(emtype*), cudaMemcpyHostToDevice, - stream)); - HCTR_LIB_THROW(cudaMemcpyAsync(gradients_pointers_.get_ptr(), gradients_pointers.data(), - local_gpu_count * sizeof(emtype*), cudaMemcpyHostToDevice, - stream)); -} - -/** Forward network for single GPU (no communications) */ -template -void InfrequentEmbedding_NVLink_SingleNode::forward_network_direct( - bool is_train, cudaStream_t stream) { - const uint32_t num_instances = model_.num_instances; - const uint32_t model_id = model_.global_instance_id; - uint32_t local_samples_size = - ceildiv(data_->batch_size, num_instances) * data_->table_sizes.size(); - - auto interaction_layer_input_pointers = is_train - ? 
interaction_layer_input_pointers_train_.get_ptr() - : interaction_layer_input_pointers_eval_.get_ptr(); - auto indices = this->indices_view_; - auto category_location = model_.category_location.get_ptr(); - auto model_table = infrequent_embedding_vectors_.get_ptr(); - auto embedding_vec_size = embedding_vec_size_; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->model_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - const uint32_t offset = indices->model_indices_offsets[model_id + 1]; - const uint32_t num_model_indices = indices->model_indices_offsets[num_instances]; - const uint32_t vid = (i + offset) % num_model_indices; - const uint32_t index = indices->model_indices[vid]; - - const dtype category = indices->samples[index]; - const dtype location = category_location[2 * category + 1]; - - const uint32_t network_id = index / local_samples_size; - const uint32_t local_index = index % local_samples_size; - - emtype* interaction_layer_input = interaction_layer_input_pointers[network_id]; - - return {model_table + location * embedding_vec_size, - {interaction_layer_input + local_index * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, local_samples_size / 10); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_NVLink_SingleNode::update_model_direct( - float* dev_lr, float scale, cudaStream_t stream) { - const uint32_t& num_instances = model_.num_instances; - uint32_t local_samples_size = - ceildiv(data_->batch_size, num_instances) * data_->table_sizes.size(); - - int num_sm = gpu_resource_.get_sm_count(); - int n_blocks = 8 * num_sm; // TODO: better heuristics - - /* Each model reads from the gradients of each network */ - infrequent_embedding_kernels:: - infrequent_update_model_direct<<>>( - gradients_pointers_.get_ptr(), infrequent_embedding_vectors_.get_ptr(), - this->indices_view_, model_.category_location.get_ptr(), model_.num_instances, - model_.global_instance_id, embedding_vec_size_, local_samples_size, dev_lr, scale); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -InfrequentEmbedding_IB_NVLINK::InfrequentEmbedding_IB_NVLINK( - Model& model, GPUResource& gpu_resource, size_t embedding_vec_size) - : model_(model), gpu_resource_(gpu_resource), embedding_vec_size_(embedding_vec_size) { - auto buf = GeneralBuffer2::create(); - - buf->reserve({ceildiv(model.num_categories, model.num_instances), embedding_vec_size_}, - &infrequent_embedding_vectors_); - buf->allocate(); - - auto managed_buf = GeneralBuffer2::create(); - managed_buf->reserve({model.num_instances + 1, 1}, &model_indices_offsets_); - managed_buf->reserve({model.num_instances + 1, 1}, &network_indices_offsets_); - managed_buf->allocate(); - // int current_device; - // HCTR_LIB_THROW(cudaGetDevice(¤t_device)); - // HCTR_LIB_THROW(cudaMemAdvise(managed_buf->get_ptr(), managed_buf->get_size_in_bytes(), - // cudaMemAdviseSetReadMostly, current_device)); -} - -template -void InfrequentEmbedding_IB_NVLINK::init_comms(size_t embedding_vec_size, - const GPUResource* gpu_resource, - GeneralBuffer2* i_buf, - size_t max_buf_size) { - infrequent_forward_comm_buffers_ = std::make_unique>(i_buf, max_buf_size); - infrequent_backward_comm_buffers_ = - std::make_unique>(i_buf, max_buf_size); - infrequent_forward_comms_ = std::make_unique>( - infrequent_forward_comm_buffers_->send_buffer, infrequent_forward_comm_buffers_->recv_buffer, - get_model_indices_offsets_ptr(), 
get_network_indices_offsets_ptr(), gpu_resource, - embedding_vec_size); - infrequent_backward_comms_ = std::make_unique>( - infrequent_backward_comm_buffers_->send_buffer, - infrequent_backward_comm_buffers_->recv_buffer, get_network_indices_offsets_ptr(), - get_model_indices_offsets_ptr(), gpu_resource, embedding_vec_size); -} - -template -void InfrequentEmbedding_IB_NVLINK::forward_model(emtype* message_buffer, - cudaStream_t stream) { - HCTR_LIB_THROW(cudaMemcpyAsync( - model_indices_offsets_.get_ptr(), this->indices_->model_indices_offsets_.get_ptr(), - model_indices_offsets_.get_size_in_bytes(), cudaMemcpyDeviceToDevice, stream)); - - HCTR_LIB_THROW(cudaMemcpyAsync( - network_indices_offsets_.get_ptr(), this->indices_->network_indices_offsets_.get_ptr(), - network_indices_offsets_.get_size_in_bytes(), cudaMemcpyDeviceToDevice, stream)); - - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - auto indices = this->indices_view_; - auto category_location = model_.category_location.get_ptr(); - auto infrequent_embedding_vectors = infrequent_embedding_vectors_.get_ptr(); - auto embedding_vec_size = embedding_vec_size_; - auto num_instances = model_.num_instances; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->model_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t index = indices->model_indices[i]; - dtype category = indices->samples[index]; - dtype location = category_location[2 * category + 1]; - - return {infrequent_embedding_vectors + location * embedding_vec_size, - {message_buffer + i * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLINK::forward_network(const emtype* message_buffer, - emtype* output_ptr, - cudaStream_t stream) { - auto indices = this->indices_view_; - auto embedding_vec_size = embedding_vec_size_; - auto num_instances = model_.num_instances; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->network_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t index = indices->network_indices[i]; - return {message_buffer + i * embedding_vec_size, - {output_ptr + index * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLINK::update_network(const emtype* gradients, - emtype* message_buffer, - cudaStream_t stream) { - auto indices = this->indices_view_; - auto embedding_vec_size = embedding_vec_size_; - auto num_instances = model_.num_instances; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->network_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t index = indices->network_indices[i]; - - return {gradients + index * embedding_vec_size, - {message_buffer + i * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLINK::update_model(const emtype* message_buffer, - float* dev_lr, float scale, - cudaStream_t stream) { - auto indices 
= this->indices_view_; - const dtype* __restrict__ category_location = model_.category_location.get_ptr(); - auto num_instances = model_.num_instances; - - uint32_t n_blocks = gpu_resource_.get_sm_count(); - - sgd_atomic_update( - message_buffer, infrequent_embedding_vectors_.get_ptr(), - [indices, num_instances] __device__() { - return indices->model_indices_offsets[num_instances]; - }, - [indices, category_location] __device__(uint32_t i) { - uint32_t index = indices->model_indices[i]; - dtype category = indices->samples[index]; - return category_location[2 * category + 1]; - }, - n_blocks, embedding_vec_size_, dev_lr, scale, stream); -} - -template -InfrequentEmbedding_IB_NVLink_Hier::InfrequentEmbedding_IB_NVLink_Hier( - Model& model, GPUResource& gpu_resource, size_t embedding_vec_size) - : model_(model), gpu_resource_(gpu_resource), embedding_vec_size_(embedding_vec_size) { - auto buf = GeneralBuffer2::create(); - buf->reserve({ceildiv(model.num_categories, model.num_instances), embedding_vec_size_}, - &infrequent_embedding_vectors_); - buf->reserve({model_.num_instances}, &model_indices_sizes_); - buf->reserve({model_.num_instances}, &model_indices_sizes_ptrs_); - buf->reserve({model_.num_instances}, &network_indices_sizes_); - buf->reserve({model_.num_instances}, &network_indices_sizes_ptrs_); - buf->allocate(); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::init_comms( - int64_t max_num_infrequent_samples, size_t slot_num, size_t embedding_vec_size, - GeneralBuffer2* buf_ptr, size_t batch_size_true, size_t batch_size_false, - size_t local_gpu_count) { - double p_infrequent_samples = 1.0; - if (max_num_infrequent_samples >= 0) { - p_infrequent_samples = - (double)max_num_infrequent_samples / ((double)batch_size_true * slot_num); - } - auto align = [this](size_t val) { - auto alignment = model_.num_instances; - return ((val + alignment - 1) / alignment) * alignment; - }; - - max_num_infrequent_per_batch_ = - align(std::max(batch_size_true, batch_size_false) * slot_num * p_infrequent_samples); - - max_num_infrequent_per_train_batch_ = align(batch_size_true * slot_num * p_infrequent_samples); - - size_t max_buf_size = embedding_vec_size * max_num_infrequent_per_batch_; - size_t max_back_buf_size = embedding_vec_size * max_num_infrequent_per_train_batch_; - - HCTR_LOG_S(INFO, ROOT) << "Allocating A2A buffers for infrequent categories. 
For training : " - << max_num_infrequent_per_train_batch_ - << ", for evaluation: " << max_num_infrequent_per_batch_ << std::endl; - - infrequent_backward_comm_buffers_ = - std::make_unique>(buf_ptr, max_back_buf_size); - infrequent_forward_comm_buffers_ = - std::make_unique>(buf_ptr, max_buf_size); - // TODO: need to check the correctness - buf_ptr->reserve({local_gpu_count}, &infrequent_forward_comm_buffers_->send_buffer_ptrs); - buf_ptr->reserve({local_gpu_count}, &infrequent_backward_comm_buffers_->send_buffer_ptrs); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::fused_intra_forward_model( - emtype** message_buffer, cudaStream_t stream) { - auto indices = this->indices_view_; - auto category_location = model_.category_location.get_ptr(); - auto infrequent_embedding_vectors = infrequent_embedding_vectors_.get_ptr(); - size_t embedding_vec_size = embedding_vec_size_; - auto local_instance_id = model_.instance_id; - auto num_instances = model_.num_instances; - auto per_node_instances = num_instances / model_.h_num_instances_per_node.size(); - uint32_t local_samples_size = - ceildiv(data_->batch_size, num_instances) * data_->table_sizes.size(); - - uint32_t local_comm_buff_size = - ceildiv(max_num_infrequent_per_batch_, model_.num_instances); - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->model_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t num_selected = indices->model_indices_offsets[num_instances]; - uint32_t vid = - (i + indices->model_indices_offsets[(local_instance_id + 1) % per_node_instances]) % - num_selected; - uint32_t index = indices->model_indices[vid]; - uint32_t network_id = (index / local_samples_size); - dtype category = indices->samples[index]; - dtype location = category_location[2 * category + 1]; - uint32_t local_network_id = (network_id % per_node_instances); - emtype* output_ptr = - &message_buffer[local_network_id][(network_id - local_network_id + local_instance_id) * - local_comm_buff_size * embedding_vec_size]; - - return { - infrequent_embedding_vectors + location * embedding_vec_size, - {output_ptr + (vid - indices->model_indices_offsets[network_id]) * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::hier_forward_network( - const emtype* message_buffer, emtype* output_ptr, cudaStream_t stream) { - auto indices = this->indices_view_; - auto embedding_vec_size = embedding_vec_size_; - auto num_instances = model_.num_instances; - uint32_t local_samples_size = - ceildiv(data_->batch_size, model_.num_instances) * data_->table_sizes.size(); - uint32_t local_comm_buff_size = - ceildiv(max_num_infrequent_per_batch_, model_.num_instances); - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->network_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t index = indices->network_indices[i]; - uint32_t model_id = indices->network_indices_src_model_id[i]; - uint32_t offset = indices->network_indices_offsets[model_id]; - - return { - message_buffer + (model_id * local_comm_buff_size + i - offset) * embedding_vec_size, - {output_ptr + index * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / 
model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::fused_intra_update_network( - const emtype* gradients, emtype** message_buffer, cudaStream_t stream) { - auto indices = this->indices_view_; - size_t embedding_vec_size = embedding_vec_size_; - auto local_instance_id = model_.instance_id; - auto num_instances = model_.num_instances; - auto per_node_instances = num_instances / model_.h_num_instances_per_node.size(); - uint32_t local_comm_buff_size = - ceildiv(max_num_infrequent_per_train_batch_, model_.num_instances); - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->network_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t num_selected = indices->network_indices_offsets[num_instances]; - uint32_t vid = - (i + indices->network_indices_offsets[(local_instance_id + 1) % per_node_instances]) % - num_selected; - uint32_t index = indices->network_indices[vid]; - - uint32_t model_id = indices->network_indices_src_model_id[vid]; - - uint32_t local_model_id = (model_id % per_node_instances); - emtype* output_ptr = - &message_buffer[local_model_id][(model_id - local_model_id + local_instance_id) * - local_comm_buff_size * embedding_vec_size]; - - return { - gradients + index * embedding_vec_size, - {output_ptr + (vid - indices->network_indices_offsets[model_id]) * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::hier_update_model( - const emtype* message_buffer, float* dev_lr, float scale, cudaStream_t stream) { - const uint32_t& num_instances = model_.num_instances; - uint32_t local_samples_size = - ceildiv(data_->batch_size, num_instances) * data_->table_sizes.size(); - uint32_t local_comm_buff_size = - ceildiv(max_num_infrequent_per_train_batch_, model_.num_instances); - - int num_sm = gpu_resource_.get_sm_count(); - int n_blocks = 16 * num_sm; // TODO: better heuristics - - infrequent_embedding_kernels::hier_update_model<<>>( - this->indices_view_, model_.category_location.get_ptr(), - // infrequent_backward_comm_buffers_.back().recv_buffer.get_ptr(), - message_buffer, infrequent_embedding_vectors_.get_ptr(), embedding_vec_size_, - model_.num_instances, local_samples_size, local_comm_buff_size, dev_lr, scale); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::calculate_model_indices_sizes_from_offsets( - cudaStream_t stream) { - auto indices = this->indices_view_; - constexpr size_t TPB = 256; - const size_t n_blocks = ceildiv(model_.num_instances, TPB); - infrequent_embedding_kernels::offsets_to_sizes<<>>( - model_indices_sizes_.get_ptr(), [=] __device__() { return indices->model_indices_offsets; }, - embedding_vec_size_ * sizeof(emtype), model_.num_instances); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier< - dtype, emtype>::calculate_network_indices_sizes_from_offsets(cudaStream_t stream) { - auto indices = this->indices_view_; - constexpr size_t TPB = 256; - const size_t n_blocks = ceildiv(model_.num_instances, TPB); - infrequent_embedding_kernels::offsets_to_sizes<<>>( - network_indices_sizes_.get_ptr(), - [=] __device__() { return indices->network_indices_offsets; }, - embedding_vec_size_ * sizeof(emtype), model_.num_instances); -} - -template -void 
InfrequentEmbedding_NVLink_SingleNode::initialize_embedding_vectors( - const std::vector& table_sizes) { - CudaDeviceContext context(gpu_resource_.get_device_id()); - - const size_t num_tables = table_sizes.size(); - for (size_t i = 0; i < num_tables; i++) { - float up_bound = sqrt(1.f / table_sizes[i]); - - const size_t offset = embedding_vec_size_ * model_.h_infrequent_model_table_offsets[i]; - const size_t number_of_vectors = - model_.h_infrequent_model_table_offsets[i + 1] - model_.h_infrequent_model_table_offsets[i]; - UniformGenerator::fill( - infrequent_embedding_vectors_.get_ptr() + offset, embedding_vec_size_ * number_of_vectors, - -up_bound, up_bound, gpu_resource_.get_sm_count(), - gpu_resource_.get_replica_variant_curand_generator(), gpu_resource_.get_stream()); - } -} - -template -void InfrequentEmbedding_IB_NVLINK::initialize_embedding_vectors( - const std::vector& table_sizes) { - CudaDeviceContext context(gpu_resource_.get_device_id()); - - const size_t num_tables = table_sizes.size(); - for (size_t i = 0; i < num_tables; i++) { - float up_bound = sqrt(1.f / table_sizes[i]); - - const size_t offset = embedding_vec_size_ * model_.h_infrequent_model_table_offsets[i]; - const size_t number_of_vectors = - model_.h_infrequent_model_table_offsets[i + 1] - model_.h_infrequent_model_table_offsets[i]; - UniformGenerator::fill( - infrequent_embedding_vectors_.get_ptr() + offset, embedding_vec_size_ * number_of_vectors, - -up_bound, up_bound, gpu_resource_.get_sm_count(), - gpu_resource_.get_replica_variant_curand_generator(), gpu_resource_.get_stream()); - } -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::initialize_embedding_vectors( - const std::vector& table_sizes) { - CudaDeviceContext context(gpu_resource_.get_device_id()); - - const size_t num_tables = table_sizes.size(); - for (size_t i = 0; i < num_tables; i++) { - float up_bound = sqrt(1.f / table_sizes[i]); - - const size_t offset = embedding_vec_size_ * model_.h_infrequent_model_table_offsets[i]; - const size_t number_of_vectors = - model_.h_infrequent_model_table_offsets[i + 1] - model_.h_infrequent_model_table_offsets[i]; - UniformGenerator::fill( - infrequent_embedding_vectors_.get_ptr() + offset, embedding_vec_size_ * number_of_vectors, - -up_bound, up_bound, gpu_resource_.get_sm_count(), - gpu_resource_.get_replica_variant_curand_generator(), gpu_resource_.get_stream()); - } -} - -template class InfrequentEmbeddingBase; -template class InfrequentEmbeddingBase; - -// NVLink_SingleNode -template class InfrequentEmbedding_NVLink_SingleNode; -template class InfrequentEmbedding_NVLink_SingleNode; -template class InfrequentEmbedding_NVLink_SingleNode; -template class InfrequentEmbedding_NVLink_SingleNode; - -// IB_NVLINK -template class InfrequentEmbedding_IB_NVLINK; -template class InfrequentEmbedding_IB_NVLINK; -template class InfrequentEmbedding_IB_NVLINK; -template class InfrequentEmbedding_IB_NVLINK; - -// IB_NVLink_Hier -template class InfrequentEmbedding_IB_NVLink_Hier; -template class InfrequentEmbedding_IB_NVLink_Hier; -template class InfrequentEmbedding_IB_NVLink_Hier; -template class InfrequentEmbedding_IB_NVLink_Hier; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/model.cu b/HugeCTR/src/embeddings/hybrid_embedding/model.cu deleted file mode 100644 index 2564a8aff6..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/model.cu +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-namespace hybrid_embedding {
-
-template <typename dtype>
-Model<dtype>::Model(const Model<dtype> &model) {
-  node_id = model.node_id;
-  instance_id = model.instance_id;
-  global_instance_id = model.global_instance_id;
-  communication_type = model.communication_type;
-  d_num_frequent = model.d_num_frequent;
-  d_total_frequent_count = model.d_total_frequent_count;
-  num_frequent = model.num_frequent;
-  num_categories = model.num_categories;
-  num_instances = model.num_instances;
-  if (model.h_num_instances_per_node.size() > 0) {
-    h_num_instances_per_node.resize(model.h_num_instances_per_node.size());
-    for (size_t i = 0; i < model.h_num_instances_per_node.size(); ++i) {
-      h_num_instances_per_node[i] = model.h_num_instances_per_node[i];
-    }
-  }
-  num_instances_per_node = model.num_instances_per_node;
-  category_location = model.category_location;
-  frequent_categories = model.frequent_categories;
-  if (model.h_frequent_model_table_offsets.size() > 0) {
-    h_frequent_model_table_offsets = model.h_frequent_model_table_offsets;
-  }
-  if (model.h_infrequent_model_table_offsets.size() > 0) {
-    h_infrequent_model_table_offsets = model.h_infrequent_model_table_offsets;
-  }
-}
-
-template <typename dtype>
-void Model<dtype>::init_params_and_reserve(CommunicationType communication_type_in,
-                                           uint32_t global_instance_id_in,
-                                           const std::vector<uint32_t> &num_instances_per_node_in,
-                                           size_t num_categories_in,
-                                           std::shared_ptr<GeneralBuffer2<CudaAllocator>> buf) {
-  // initialize model parameters and reserve memory
-  communication_type = communication_type_in;
-  global_instance_id = global_instance_id_in;
-  h_num_instances_per_node = num_instances_per_node_in;
-  num_categories = num_categories_in;
-  num_instances = 0;
-  for (size_t i = 0; i < h_num_instances_per_node.size(); ++i)
-    num_instances += h_num_instances_per_node[i];
-
-  const size_t num_nodes = h_num_instances_per_node.size();
-  assert(num_nodes > 0);
-  uint32_t sum_instances = (uint32_t)0;
-  for (node_id = 0; node_id < num_nodes && global_instance_id >= sum_instances; ++node_id)
-    sum_instances += h_num_instances_per_node[node_id];
-  node_id--;
-
-  // instance id within node
-  instance_id = global_instance_id - (sum_instances - h_num_instances_per_node[node_id]);
-  buf->reserve({1, 1}, &d_num_frequent);
-  buf->reserve({1, 1}, &d_total_frequent_count);
-  buf->reserve({h_num_instances_per_node.size(), 1}, &num_instances_per_node);
-  size_t cate_len = (static_cast<size_t>(num_categories) + 1) << 1;
-  buf->reserve({cate_len, 1}, &category_location);  // +1 for NULL category
-}
-
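In the removed model, init_hybrid_model (below) splits the embedding categories into two groups: the most frequently occurring categories are replicated on every GPU, while the remaining infrequent categories are distributed round-robin across the model instances, and category_location records where each category lives. A minimal host-side sketch of that placement, assuming the unique category ids are already sorted by decreasing count (the names and the uint32_t types here are illustrative, not HugeCTR APIs; the per-instance/per-slot reordering of the frequent section performed by the statistics kernels is omitted):

#include <cstdint>
#include <vector>

// Sketch: build a category -> location table analogous to category_location below.
// Frequent category:   {num_instances, index into the replicated frequent section}
// Infrequent category: {owning instance (round-robin), local row on that instance}
std::vector<uint32_t> make_category_location(uint32_t num_categories,
                                             const std::vector<uint32_t>& categories_by_count,
                                             uint32_t num_frequent, uint32_t num_instances) {
  // Default value num_categories marks "no location assigned" (as in the fill kernel),
  // with one extra slot pair reserved for the NULL category.
  std::vector<uint32_t> category_location(2 * (num_categories + 1), num_categories);
  for (uint32_t i = 0; i < categories_by_count.size(); ++i) {
    const uint32_t category = categories_by_count[i];
    if (i < num_frequent) {
      category_location[2 * category] = num_instances;  // marker: replicated on all instances
      category_location[2 * category + 1] = i;          // position in the frequent section
    } else {
      const uint32_t j = i - num_frequent;                       // rank among infrequent categories
      category_location[2 * category] = j % num_instances;       // owner model instance
      category_location[2 * category + 1] = j / num_instances;   // row in that instance's table
    }
  }
  return category_location;
}

With such a table, one lookup tells whether a category is served from the locally replicated frequent section or must be fetched from the instance that owns it, which is what the frequent/infrequent forward and backward paths above rely on.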
-/// init_model calculates the optimal number of frequent categories
-/// given the calibration of the all-to-all and all-reduce.
-template <typename dtype>
-void Model<dtype>::init_hybrid_model(const CalibrationData &calibration,
-                                     Statistics<dtype> &statistics, const Data<dtype> &data,
-                                     Tensor2<dtype> &tmp_categories, cudaStream_t stream) {
-  dtype *frequent_categories_ptr = tmp_categories.get_ptr();  // tmp_categories.get_ptr();
-  // list the top categories sorted by count
-  const Tensor2<dtype> &samples = data.samples;
-  statistics.sort_categories_by_count(samples, stream);
-
-  /* Calculate table offsets, i.e cumulative sum of the table sizes */
-  std::vector<dtype> h_table_offsets(data.table_sizes.size() + 1);
-  h_table_offsets[0] = 0;
-  for (size_t i = 0; i < data.table_sizes.size(); i++) {
-    h_table_offsets[i + 1] = h_table_offsets[i] + (dtype)data.table_sizes[i];
-  }
-  upload_tensor(h_table_offsets, statistics.table_offsets, stream);
-
-  // from the sorted count, determine the number of frequent categories
-  //
-  // If the calibration data is present, this is used to calculate the number
-  // of frequent categories. Otherwise use the threshold required by the
-  // communication type.
-  num_frequent = ModelInitializationFunctors::calculate_num_frequent_categories(
-      communication_type, num_instances, calibration, statistics, data, d_num_frequent.get_ptr(),
-      stream);
-  std::shared_ptr<GeneralBuffer2<CudaAllocator>> buf = GeneralBuffer2<CudaAllocator>::create();
-  buf->reserve({(size_t)num_frequent, 1}, &this->frequent_categories);
-  buf->allocate();
-  frequent_probability = ModelInitializationFunctors::calculate_frequent_probability(
-      statistics, num_frequent, d_total_frequent_count.get_ptr(), stream);
-
-  dtype num_infrequent = num_categories - num_frequent;
-  dtype *infrequent_categories_ptr = frequent_categories_ptr + num_frequent;
-  /* The categories are organized:
-   *  - per instance (round-robin)
-   *  - then per slot
-   *  - and finally in decreasing order of frequency
-   */
-  statistics.calculate_frequent_and_infrequent_categories(
-      frequent_categories_ptr, infrequent_categories_ptr, category_location.get_ptr(),
-      num_frequent, num_infrequent, stream);
-  HCTR_LIB_THROW(cudaMemcpyAsync(this->frequent_categories.get_ptr(), frequent_categories_ptr,
-                                 num_frequent * sizeof(dtype), cudaMemcpyDeviceToDevice, stream));
-  /* Calculate frequent and infrequent table offsets */
-  statistics.calculate_frequent_model_table_offsets(h_frequent_model_table_offsets,
-                                                    frequent_categories_ptr, num_frequent, stream);
-  statistics.calculate_infrequent_model_table_offsets(h_infrequent_model_table_offsets,
-                                                      infrequent_categories_ptr, category_location,
-                                                      global_instance_id, num_infrequent, stream);
-  // statistics.revoke_temp_storage();
-  /* A synchronization is necessary to ensure that the host arrays have been copied */
-  HCTR_LIB_THROW(cudaStreamSynchronize(stream));
-}
-
-template class Model;
-template class Model;
-
-}  // namespace hybrid_embedding
-
-}  // namespace HugeCTR
diff --git a/HugeCTR/src/embeddings/hybrid_embedding/statistics.cu b/HugeCTR/src/embeddings/hybrid_embedding/statistics.cu
deleted file mode 100644
index 4116436f43..0000000000
--- a/HugeCTR/src/embeddings/hybrid_embedding/statistics.cu
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { -namespace hybrid_embedding { - -namespace statistics_kernels { - -/** Compute keys to sort the frequent embedding tables. - * The categories are organized: - * - per instance (round-robin) - * - then per slot - * - and finally in decreasing order of frequency - * - * The sort is stable, so the keys only need to be: instance_id * num_tables + table_id - */ -template -static __global__ void category_to_frequent_section(const dtype *__restrict__ categories_sorted, - uint32_t *keys, - const dtype *__restrict__ table_offsets, - size_t num_frequent, size_t num_tables, - size_t num_instances) { - size_t tid = static_cast(blockIdx.x) * static_cast(blockDim.x) + threadIdx.x; - if (tid < num_frequent) { - dtype category = categories_sorted[tid]; - - uint32_t table_id = 0; - for (table_id = 0; table_id < num_tables - 1 && category >= table_offsets[table_id + 1]; - ++table_id) { - } - - uint32_t instance_id = tid % num_instances; - - keys[tid] = instance_id * num_tables + table_id; - } -} - -template -static __global__ void fill(T *__restrict__ array, T val, IdxT n_elem) { - IdxT tid = static_cast(blockIdx.x) * static_cast(blockDim.x) + - static_cast(threadIdx.x); - if (tid < n_elem) array[tid] = val; -} - -template -static __global__ void calculate_category_location_frequent( - const dtype *__restrict__ frequent_categories, dtype *category_location, size_t num_frequent, - size_t num_instances) { - size_t tid = static_cast(blockIdx.x) * static_cast(blockDim.x) + threadIdx.x; - if (tid < num_frequent) { - dtype category = frequent_categories[tid]; - category_location[2 * (size_t)category] = num_instances; - category_location[2 * (size_t)category + 1] = tid; - } -} - -template -static __global__ void calculate_category_location_infrequent( - const dtype *__restrict__ infrequent_categories, dtype *category_location, - size_t num_infrequent, size_t num_models) { - size_t tid = static_cast(blockIdx.x) * static_cast(blockDim.x) + threadIdx.x; - if (tid < num_infrequent) { - dtype category = infrequent_categories[tid]; - category_location[2 * (size_t)category] = tid % num_models; - category_location[2 * (size_t)category + 1] = tid / num_models; - } -} - -template -static __global__ void calculate_infrequent_model_table_offsets( - const dtype *__restrict__ categories, const dtype *__restrict__ category_location, - const dtype *__restrict__ table_offsets, dtype *offsets, size_t n_tables, dtype n_elem, - dtype n_model_elem, uint32_t global_instance_id) { - const size_t table_id = threadIdx.x; - if (table_id > n_tables) { - return; - } - // Find first category id belonging to that table (not necessarily in this model!) 
- dtype category = table_offsets[table_id]; - - // Step 1: binary search of the category - dtype start = 0; - dtype end = n_elem; - while (start < end) { - dtype mid = start + (end - start) / 2; - dtype value = categories[mid]; - if (value < category) - start = mid + 1; - else { - end = mid; - } - } - - // Step 2: increment until the model id matches - while (start < n_elem && category_location[2 * (size_t)categories[start]] != global_instance_id) { - start++; - } - - // Step 3: lookup location and write the offset - if (start == n_elem) { - // If we are at the end of the array, write the number of elements belonging to this model - offsets[table_id] = n_model_elem; - } else { - // Else, write the location of the first category from this table belonging to this model - offsets[table_id] = category_location[2 * (size_t)categories[start] + 1]; - } -} - -template -static __global__ void calculate_frequent_model_table_offsets( - const dtype *__restrict__ categories, const dtype *__restrict__ table_offsets, dtype *offsets, - size_t n_divs, size_t n_tables, dtype n_elem) { - const size_t div_id = blockIdx.x; - const size_t table_id = threadIdx.x; - - const dtype n_elem_per_div = n_elem / n_divs; // Note: num_instances divides num_frequent - - // Find first category id belonging to that table - dtype category = table_offsets[table_id]; - - // Setup start and end to the bounds of this division - dtype start = div_id * n_elem_per_div; - dtype end = (div_id + 1) * n_elem_per_div; - - // Binary search - while (start < end) { - dtype mid = (start + end) / 2; - dtype value = categories[mid]; - - if (value < category) - start = mid + 1; - else - end = mid; - } - - // Write offset - offsets[div_id * (n_tables + 1) + table_id] = start; -} - -} // namespace statistics_kernels - -/// -/// Perform count of categories within the samples and sort the categories by count -/// -template -void Statistics::sort_categories_by_count(const Tensor2 &samples, - cudaStream_t stream) { - const dtype *d_samples = samples.get_ptr(); - size_t num_samples = samples.get_size_in_bytes() / sizeof(dtype); - dtype *d_categories = categories_sorted.get_ptr(); - uint32_t *d_counts = counts_sorted.get_ptr(); - sort_categories_by_count(d_samples, num_samples, d_categories, d_counts, num_unique_categories, - stream); // Kefengs' function - categories_sorted.reset_shape({num_unique_categories, 1}); - counts_sorted.reset_shape({num_unique_categories, 1}); -} - -template -struct InfrequentSelectOp { - const dtype *category_location; - const dtype num_categories; - __host__ __device__ __forceinline__ InfrequentSelectOp(const dtype *category_location, - const dtype num_categories) - : category_location(category_location), num_categories(num_categories) {} - __device__ __forceinline__ bool operator()(const dtype &category) const { - return category_location[2 * (size_t)category + 1] == num_categories; - } -}; - -template -void Statistics::reserve_temp_storage(std::shared_ptr> buf) { - size_t size_sort_keys_temp = 0; - sort_categories_by_count_temp_storages_.resize(7); - HCTR_LIB_THROW(cub::DeviceRadixSort::SortKeys((void *)nullptr, size_sort_keys_temp, - (dtype *)nullptr, (dtype *)nullptr, - (int)num_samples, 0, sizeof(dtype) * 8, 0)); - buf->reserve({size_sort_keys_temp, 1}, &sort_categories_by_count_temp_storages_[0]); - buf->reserve({num_samples * sizeof(dtype), 1}, &sort_categories_by_count_temp_storages_[1]); - size_t size_unique_categories_temp = 0; - HCTR_LIB_THROW(cub::DeviceRunLengthEncode::Encode( - (void *)nullptr, 
size_unique_categories_temp, (dtype *)nullptr, (dtype *)nullptr, - (uint32_t *)nullptr, (uint32_t *)nullptr, (int)num_samples, 0)); - - buf->reserve({size_unique_categories_temp, 1}, &sort_categories_by_count_temp_storages_[2]); - buf->reserve({num_samples * sizeof(dtype), 1}, &sort_categories_by_count_temp_storages_[3]); - buf->reserve({num_samples * sizeof(uint32_t), 1}, &sort_categories_by_count_temp_storages_[4]); - buf->reserve({sizeof(uint32_t), 1}, &sort_categories_by_count_temp_storages_[5]); - - size_t size_sort_pairs_temp = 0; - HCTR_LIB_THROW(cub::DeviceRadixSort::SortPairsDescending( - (void *)nullptr, size_sort_pairs_temp, (uint32_t *)nullptr, (uint32_t *)nullptr, - (dtype *)nullptr, (dtype *)nullptr, (int)num_samples, 0, sizeof(uint32_t) * 8, 0)); - buf->reserve({size_sort_pairs_temp, 1}, &sort_categories_by_count_temp_storages_[6]); - - /// TODO: reuse temp storage for operations that can't run concurrently! - - calculate_frequent_categories_temp_storages_.resize(3); - size_t size_sort_temp = 0; - int bit_width = 1; - for (uint32_t i = num_instances * num_tables - 1; i >>= 1;) bit_width++; - HCTR_LIB_THROW(cub::DeviceRadixSort::SortPairs( - (void *)nullptr, size_sort_temp, (uint32_t *)nullptr, (uint32_t *)nullptr, (dtype *)nullptr, - (dtype *)nullptr, (int)num_samples, 0, bit_width, 0)); - - buf->reserve({num_samples * sizeof(uint32_t), 1}, - &calculate_frequent_categories_temp_storages_[0]); - buf->reserve({num_samples * sizeof(uint32_t), 1}, - &calculate_frequent_categories_temp_storages_[1]); - buf->reserve({size_sort_temp, 1}, &calculate_frequent_categories_temp_storages_[2]); - - calculate_infrequent_categories_temp_storages_.resize(2); - size_t size_select_temp = 0; - cub::CountingInputIterator counting(0); - InfrequentSelectOp select_op(nullptr, 0); - if (static_cast(num_categories) < (1ul << 31)) { - HCTR_LIB_THROW(cub::DeviceSelect::If((void *)nullptr, size_select_temp, counting, - (dtype *)nullptr, (dtype *)nullptr, num_categories, - select_op, 0)); - } else { - HugeCTR::DeviceSelect::If((void *)nullptr, size_select_temp, counting, (dtype *)nullptr, - (dtype *)nullptr, static_cast(num_categories), select_op, 0); - } - buf->reserve({size_select_temp, 1}, &calculate_infrequent_categories_temp_storages_[0]); - buf->reserve({sizeof(dtype), 1}, &calculate_infrequent_categories_temp_storages_[1]); -}; - -template -void Statistics::sort_categories_by_count(const dtype *samples, size_t num_samples, - dtype *categories_sorted, uint32_t *counts_sorted, - uint32_t &num_unique_categories, - cudaStream_t stream) { - if (num_samples > 0x7fffffff) { - HCTR_LOG_S(ERROR, WORLD) << "Num samples: " << std::hex << num_samples << std::dec << std::endl; - HCTR_OWN_THROW(Error_t::WrongInput, "num_samples is too large, overflow for int type"); - } - void *p_sort_keys_temp = - reinterpret_cast(sort_categories_by_count_temp_storages_[0].get_ptr()); // void* - dtype *p_sort_keys_out = - reinterpret_cast(sort_categories_by_count_temp_storages_[1].get_ptr()); // dtype* - void *p_unique_categories_temp = - reinterpret_cast(sort_categories_by_count_temp_storages_[2].get_ptr()); // void* - dtype *p_unique_categories_out = - reinterpret_cast(sort_categories_by_count_temp_storages_[3].get_ptr()); // dtype* - uint32_t *p_unique_categories_counts = reinterpret_cast( - sort_categories_by_count_temp_storages_[4].get_ptr()); // uint32_t* - uint32_t *p_num_unique_categories = reinterpret_cast( - sort_categories_by_count_temp_storages_[5].get_ptr()); // uint32* - void *p_sort_pairs_temp = - 
reinterpret_cast(sort_categories_by_count_temp_storages_[6].get_ptr()); // void* - - size_t temp_size = sort_categories_by_count_temp_storages_[0].get_size_in_bytes(); - HCTR_LIB_THROW(cub::DeviceRadixSort::SortKeys(p_sort_keys_temp, temp_size, samples, - p_sort_keys_out, (int)num_samples, 0, - sizeof(dtype) * 8, stream)); - size_t sorted_len = (size_t)num_samples; - temp_size = sort_categories_by_count_temp_storages_[2].get_size_in_bytes(); - HCTR_LIB_THROW(cub::DeviceRunLengthEncode::Encode( - p_unique_categories_temp, temp_size, p_sort_keys_out, p_unique_categories_out, - p_unique_categories_counts, p_num_unique_categories, (int)num_samples, stream)); - HCTR_LIB_THROW(cudaMemcpyAsync((void *)&num_unique_categories, (void *)p_num_unique_categories, - sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - temp_size = sort_categories_by_count_temp_storages_[6].get_size_in_bytes(); - HCTR_LIB_THROW(cub::DeviceRadixSort::SortPairsDescending( - p_sort_pairs_temp, temp_size, p_unique_categories_counts, counts_sorted, - p_unique_categories_out, categories_sorted, (int)num_unique_categories, 0, - sizeof(uint32_t) * 8, stream)); -} - -template -void Statistics::calculate_frequent_and_infrequent_categories( - dtype *frequent_categories, dtype *infrequent_categories, dtype *category_location, - const size_t num_frequent, const size_t num_infrequent, cudaStream_t stream) { - // Fill with default value1 - constexpr size_t TPB_fill = 256; - const size_t total_num_categories = num_categories + 1; // Add NULL category - const size_t n_blocks_fill = ceildiv(2 * total_num_categories, TPB_fill); - statistics_kernels::fill<<>>( - category_location, (dtype)num_categories, 2 * total_num_categories); - HCTR_LIB_THROW(cudaPeekAtLastError()); - // Frequent category generation - if (num_frequent > 0) { - uint32_t *p_keys_in = reinterpret_cast( - calculate_frequent_categories_temp_storages_[0].get_ptr()); // uint32_t* - uint32_t *p_keys_out = reinterpret_cast( - calculate_frequent_categories_temp_storages_[1].get_ptr()); // uint32_t* - void *p_sort_temp = reinterpret_cast( - calculate_frequent_categories_temp_storages_[2].get_ptr()); // void* - size_t sort_temp_size = calculate_frequent_categories_temp_storages_[2].get_size_in_bytes(); - - // Generate keys - constexpr size_t TPB_keys = 256; - const size_t n_blocks_keys = ceildiv(num_frequent, TPB_keys); - statistics_kernels::category_to_frequent_section<<>>( - categories_sorted.get_ptr(), p_keys_in, table_offsets.get_ptr(), num_frequent, num_tables, - num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - // Sort - int bit_width = 1; - for (uint32_t i = num_instances * num_tables - 1; i >>= 1;) bit_width++; - HCTR_LIB_THROW(cub::DeviceRadixSort::SortPairs( - p_sort_temp, sort_temp_size, p_keys_in, p_keys_out, categories_sorted.get_ptr(), - frequent_categories, (int)num_frequent, 0, bit_width, stream)); - constexpr size_t TPB_loc = 256; - const size_t n_blocks_loc_freq = (size_t)ceildiv(num_frequent, TPB_loc); - statistics_kernels:: - calculate_category_location_frequent<<>>( - frequent_categories, category_location, num_frequent, num_instances); - } - // Infrequent category generation - if (num_infrequent > 0) { - // TODO: combine select and writing to category_location with a custom output iterator - void *p_select_temp = reinterpret_cast( - calculate_infrequent_categories_temp_storages_[0].get_ptr()); // void* - dtype *p_num_selected = reinterpret_cast( - 
calculate_infrequent_categories_temp_storages_[1].get_ptr()); // dtype* - size_t select_temp_size = calculate_infrequent_categories_temp_storages_[0].get_size_in_bytes(); - - cub::CountingInputIterator counting(0); - InfrequentSelectOp select_op(category_location, num_categories); - if (static_cast(num_categories) < (1ul << 31)) { - HCTR_LIB_THROW(cub::DeviceSelect::If(p_select_temp, select_temp_size, counting, - infrequent_categories, p_num_selected, num_categories, - select_op, stream)); - } else { - HugeCTR::DeviceSelect::If(p_select_temp, select_temp_size, counting, infrequent_categories, - p_num_selected, static_cast(num_categories), select_op, - stream); - } - - constexpr size_t TPB_loc = 256; - const size_t n_blocks_loc_infreq = (size_t)ceildiv(num_infrequent, TPB_loc); - statistics_kernels:: - calculate_category_location_infrequent<<>>( - infrequent_categories, category_location, num_infrequent, num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); - } -} - -template -void Statistics::calculate_infrequent_model_table_offsets( - std::vector &h_infrequent_model_table_offsets, const dtype *infrequent_categories, - const Tensor2 &category_location, uint32_t global_instance_id, - const dtype num_infrequent, cudaStream_t stream) { - dtype num_model_infrequent = num_infrequent / num_instances + - (global_instance_id < num_infrequent % num_instances ? 1 : 0); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - statistics_kernels::calculate_infrequent_model_table_offsets<<<1, 64, 0, stream>>>( - infrequent_categories, category_location.get_ptr(), table_offsets.get_ptr(), - infrequent_model_table_offsets.get_ptr(), num_tables, num_infrequent, num_model_infrequent, - global_instance_id); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - h_infrequent_model_table_offsets.resize(num_tables + 1); - HCTR_LIB_THROW(cudaMemcpyAsync(h_infrequent_model_table_offsets.data(), - infrequent_model_table_offsets.get_ptr(), - (num_tables + 1) * sizeof(dtype), cudaMemcpyDeviceToHost, stream)); -} - -template -void Statistics::calculate_frequent_model_table_offsets( - std::vector &h_frequent_model_table_offsets, const dtype *frequent_categories, - const dtype num_frequent, cudaStream_t stream) { - statistics_kernels:: - calculate_frequent_model_table_offsets<<>>( - frequent_categories, table_offsets.get_ptr(), frequent_model_table_offsets.get_ptr(), - num_instances, num_tables, num_frequent); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - h_frequent_model_table_offsets.resize(num_instances * (num_tables + 1)); - HCTR_LIB_THROW(cudaMemcpyAsync( - h_frequent_model_table_offsets.data(), frequent_model_table_offsets.get_ptr(), - num_instances * (num_tables + 1) * sizeof(dtype), cudaMemcpyDeviceToHost, stream)); -} - -template class Statistics; -template class Statistics; -template class Statistics; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/utils.cu b/HugeCTR/src/embeddings/hybrid_embedding/utils.cu deleted file mode 100644 index 5c450f1337..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/utils.cu +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -void download_tensor(std::vector& h_tensor, const Tensor2 tensor, - cudaStream_t stream) { - size_t tensor_size = tensor.get_num_elements(); - h_tensor.resize(tensor_size); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - HCTR_LIB_THROW(cudaMemcpyAsync(h_tensor.data(), tensor.get_ptr(), tensor.get_size_in_bytes(), - cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); -} - -template -void upload_tensor(const std::vector& h_tensor, Tensor2 tensor, cudaStream_t stream) { - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - assert(tensor.get_num_elements() >= h_tensor.size()); - HCTR_LIB_THROW(cudaMemcpyAsync(tensor.get_ptr(), h_tensor.data(), h_tensor.size() * sizeof(dtype), - cudaMemcpyHostToDevice, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); -} - -__global__ void offsets_kernel(const uint32_t* indices, uint32_t* indices_offsets, - uint32_t num_instances, uint32_t multiplier) { - uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; - - if (tid < num_instances) { - uint32_t searched_value = multiplier * tid; - uint32_t num_selected = indices_offsets[num_instances]; - - // Binary search - uint32_t i = 0; - uint32_t j = num_selected; - while (i < j) { - uint32_t m = (i + j) / 2; - uint32_t value = __ldg(indices + m); - - if (value < searched_value) - i = m + 1; - else - j = m; - } - - // Write offset - indices_offsets[tid] = i; - } -} - -template -__global__ void modulo_kernel(dtype* buffer, const stype* d_num_elements, dtype divisor) { - const stype num_elements = __ldg(d_num_elements); - for (stype i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elements; - i += blockDim.x * gridDim.x) - buffer[i] %= divisor; -} - -__global__ void model_id_kernel(const uint32_t* indices_offsets, uint32_t* src_model_id, - const uint32_t* d_num_elements) { - // Find model id - uint32_t num_elements = __ldg(d_num_elements); - for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elements; - i += blockDim.x * gridDim.x) { - uint32_t model_id = 0; - uint32_t next_offset = indices_offsets[1]; - while (next_offset <= i) { - model_id++; - next_offset = indices_offsets[model_id + 1]; - } - src_model_id[i] = model_id; - } -} - -template void download_tensor(std::vector& h_tensor, - const Tensor2 tensor, cudaStream_t stream); -template void download_tensor(std::vector& h_tensor, - const Tensor2 tensor, cudaStream_t stream); -template void download_tensor(std::vector& h_tensor, - const Tensor2 tensor, cudaStream_t stream); -template void download_tensor<__half>(std::vector<__half>& h_tensor, const Tensor2<__half> tensor, - cudaStream_t stream); -template void download_tensor(std::vector& h_tensor, const Tensor2 tensor, - cudaStream_t stream); -template void upload_tensor(const std::vector& h_tensor, - Tensor2 tensor, cudaStream_t stream); -template void upload_tensor(const std::vector& h_tensor, - Tensor2 tensor, cudaStream_t stream); -template void upload_tensor(const std::vector& 
h_tensor, - Tensor2 tensor, cudaStream_t stream); - -template void upload_tensor<__half>(const std::vector<__half>& h_tensor, Tensor2<__half> tensor, - cudaStream_t stream); -template void upload_tensor(const std::vector& h_tensor, Tensor2 tensor, - cudaStream_t stream); - -template __global__ void modulo_kernel(uint32_t* buffer, const uint32_t* d_num_elements, - uint32_t divisor); -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/src/embeddings/hybrid_sparse_embedding.cu b/HugeCTR/src/embeddings/hybrid_sparse_embedding.cu deleted file mode 100644 index 3cc1f37c1f..0000000000 --- a/HugeCTR/src/embeddings/hybrid_sparse_embedding.cu +++ /dev/null @@ -1,820 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { -template -HybridSparseEmbedding::HybridSparseEmbedding( - const SparseTensors &train_input_tensors, - const SparseTensors &evaluate_input_tensors, - const HybridSparseEmbeddingParams &embedding_params, - const std::vector> &grouped_wgrad_buff, - const GpuLearningRateSchedulers lr_scheds, bool graph_mode, - const std::shared_ptr &resource_manager) - : embedding_params_(embedding_params), - resource_manager_(resource_manager), - grouped_wgrad_buff_(grouped_wgrad_buff), - grouped_all_reduce_(grouped_wgrad_buff[0] != NULL), - lr_scheds_(lr_scheds), - graph_mode_(graph_mode), - current_train_batch_size_(get_batch_size(true)), - current_eval_batch_size_(get_batch_size(false)) { - try { - // 0. Error check - if (embedding_params_.train_batch_size < 1 || embedding_params_.evaluate_batch_size < 1 || - embedding_params_.slot_num < 1 || embedding_params_.embedding_vec_size < 1) { - HCTR_OWN_THROW(Error_t::WrongInput, - "batchsize < 1 || slot_num < 1 || embedding_vec_size < 1"); - } - - if (embedding_params_.embedding_vec_size > 1024) { - HCTR_OWN_THROW(Error_t::WrongInput, - "the embedding_vec_size can not be more than 1024 in embedding layer"); - } - - size_t total_gpu_count = resource_manager_->get_global_gpu_count(); - size_t local_gpu_count = resource_manager_->get_local_gpu_count(); - - if (train_input_tensors.size() != local_gpu_count || - evaluate_input_tensors.size() != local_gpu_count) { - HCTR_OWN_THROW(Error_t::WrongInput, - "either train_input_tensors.size() or evaluate_input_tensors.size() isn't " - "local_gpu_count_"); - } - - HCTR_LOG_S(INFO, ROOT) << "Using Hybrid Embedding with train batch " << get_batch_size(true) - << " and eval batch " << get_batch_size(false) << std::endl; - - // 1. 
initialize optimizer - for (size_t id = 0; id < local_gpu_count; id++) { - OptParams opt_params; - opt_params.optimizer = embedding_params_.opt_params.optimizer; - opt_params.lr = embedding_params_.opt_params.lr; - opt_params.update_type = embedding_params_.opt_params.update_type; - opt_params.scaler = embedding_params_.opt_params.scaler; - opt_params_.emplace_back(opt_params); - } - // 2. reserve buffers for different tensors - data_statistics_.reserve(local_gpu_count); - model_.reserve(local_gpu_count); - calibration_.reserve(local_gpu_count); - statistics_.reserve(local_gpu_count); - train_output_tensors_.reserve(local_gpu_count); - evaluate_output_tensors_.reserve(local_gpu_count); - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - frequent_embeddings_single_node_.reserve(local_gpu_count); - } else { - frequent_embeddings_multi_node_.reserve(local_gpu_count); - } - - infrequent_embeddings_single_node_.reserve(local_gpu_count); - infrequent_embeddings_ib_nvlink_.reserve(local_gpu_count); - infrequent_embeddings_ib_nvlink_hier_.reserve(local_gpu_count); - - assert(bufs_.empty()); - CudaDeviceContext context; - // 2.1. construct data - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - - data_statistics_.emplace_back(embedding_params_.slot_size_array, get_batch_size(true), - embedding_params_.num_iterations_statistics); - } - - // 2.2 construct model - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - - std::vector num_instances_per_node(resource_manager_->get_num_process(), 0); - get_num_instances_per_node(num_instances_per_node); - model_.emplace_back(embedding_params_.communication_type, - resource_manager_->get_local_gpu(i)->get_global_id(), - num_instances_per_node, get_categories_num()); - } - - // 2.3 construct calibration - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - calibration_.emplace_back(resource_manager_->get_num_process(), embedding_params_.p_dup_max, - embedding_params_.max_all_reduce_bandwidth, - embedding_params_.max_all_to_all_bandwidth, - embedding_params_.efficiency_bandwidth_ratio); - } - - // 2.4 construct Statistics - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - const size_t num_samples_statistics = embedding_params_.num_iterations_statistics * - get_batch_size(true) * embedding_params_.slot_num; - statistics_.emplace_back((dtype)num_samples_statistics, embedding_params_.slot_num, - model_[i].num_instances, get_categories_num()); - } - - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - std::shared_ptr> buf = GeneralBuffer2::create(); - bufs_.emplace_back(buf); - // 2.5. 
reserve for train output/ evaluate output tensors - Tensor2 tensor; - buf->reserve({get_batch_size_per_gpu(true), get_slot_num(), get_embedding_vec_size()}, - &tensor); - train_output_tensors_.emplace_back(tensor); - buf->reserve({get_batch_size_per_gpu(false), get_slot_num(), get_embedding_vec_size()}, - &tensor); - evaluate_output_tensors_.emplace_back(tensor); - - // 2.6 construct frequent embedding - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - frequent_embeddings_single_node_.emplace_back( - model_[i], get_local_gpu(i), grouped_wgrad_buff_[i], get_embedding_vec_size(), - embedding_params_.max_num_frequent_categories); - } else { - frequent_embeddings_multi_node_.emplace_back( - model_[i], get_local_gpu(i), grouped_wgrad_buff_[i], get_embedding_vec_size(), - embedding_params_.max_num_frequent_categories); - } - - // 2.7 construct infrequent embedding - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - infrequent_embeddings_single_node_.emplace_back(model_[i], get_local_gpu(i), - get_embedding_vec_size()); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_.emplace_back(model_[i], get_local_gpu(i), - get_embedding_vec_size()); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_.emplace_back(model_[i], get_local_gpu(i), - get_embedding_vec_size()); - } - - // 2.8 construct communication - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - size_t max_buf_size = embedding_params_.embedding_vec_size * - std::max(get_batch_size(true), get_batch_size(false)) * - embedding_params_.slot_num; - infrequent_embeddings_ib_nvlink_.back().init_comms( - embedding_params_.embedding_vec_size, &get_local_gpu(i), buf.get(), max_buf_size); - } - - // Construct comm buffers - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[i].init_comms( - embedding_params_.max_num_infrequent_samples, embedding_params_.slot_num, - embedding_params_.embedding_vec_size, buf.get(), get_batch_size(true), - get_batch_size(false), local_gpu_count); - } - - // For global barrier in eval - { - Tensor2 tensor; - buf->reserve({1}, &tensor); - d_barrier_store_.push_back(tensor); - } - buf->allocate(); - } - - // Frequent AR comm init - if ((embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) || - (embedding_params_.communication_type == CommunicationType::IB_NVLink)) { - if (!grouped_all_reduce_) { - // Do your own all-reduce - auto ar_comm = resource_manager_->get_ar_comm(); - frequent_embedding_handle_ = ar_comm->register_coll(); - // Frequent all reduce comm - for (uint32_t i = 0; i < local_gpu_count; i++) { - frequent_embeddings_multi_node_[i].init_ar_comm(ar_comm, frequent_embedding_handle_, i); - } - ar_comm->register_coll_buf(frequent_embedding_handle_); - } - } - - // Init after buffer allocation - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { -#ifdef ENABLE_MPI - ib_comm_ = resource_manager_->get_ib_comm(); - if (!ib_comm_) { - resource_manager_->init_ib_comm(); - ib_comm_ = resource_manager_->get_ib_comm(); - } - comm_stream_.resize(local_gpu_count); - - std::vector h_model_indices_sizes_ptrs(local_gpu_count); - std::vector h_network_indices_sizes_ptrs(local_gpu_count); - std::vector h_fwd_send_buffer_ptrs(local_gpu_count); - std::vector 
h_bwd_send_buffer_ptrs(local_gpu_count); - for (uint32_t i = 0; i < local_gpu_count; i++) { - h_model_indices_sizes_ptrs[i] = - infrequent_embeddings_ib_nvlink_hier_[i].model_indices_sizes_.get_ptr(); - h_network_indices_sizes_ptrs[i] = - infrequent_embeddings_ib_nvlink_hier_[i].network_indices_sizes_.get_ptr(); - h_fwd_send_buffer_ptrs[i] = infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->send_buffer.get_ptr(); - h_bwd_send_buffer_ptrs[i] = infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->send_buffer.get_ptr(); - } - - // Forward coll init - auto infrequent_forward_coll_handle = ib_comm_->register_hier_a2a_v_coll(true); - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - - // download pointers - HCTR_LIB_THROW(cudaMemcpyAsync( - infrequent_embeddings_ib_nvlink_hier_[i].model_indices_sizes_ptrs_.get_ptr(), - h_model_indices_sizes_ptrs.data(), sizeof(size_t *) * local_gpu_count, - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - - HCTR_LIB_THROW(cudaMemcpyAsync( - infrequent_embeddings_ib_nvlink_hier_[i].network_indices_sizes_ptrs_.get_ptr(), - h_network_indices_sizes_ptrs.data(), sizeof(size_t *) * local_gpu_count, - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - - HCTR_LIB_THROW( - cudaMemcpyAsync(infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->send_buffer_ptrs.get_ptr(), - h_fwd_send_buffer_ptrs.data(), sizeof(emtype *) * local_gpu_count, - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - - HCTR_LIB_THROW( - cudaMemcpyAsync(infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->send_buffer_ptrs.get_ptr(), - h_bwd_send_buffer_ptrs.data(), sizeof(emtype *) * local_gpu_count, - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - - HCTR_LIB_THROW(cudaStreamSynchronize(get_local_gpu(i).get_stream())); - - // Initialize IB comm - HCTR_LIB_THROW(cudaStreamCreateWithPriority(&comm_stream_[i], cudaStreamNonBlocking, -100)); - ib_comm_->set_a2a_coll_stream(infrequent_forward_coll_handle, comm_stream_[i], i); - - ib_comm_->set_a2a_coll_buf( - infrequent_forward_coll_handle, - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->send_buffer.get_ptr(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->send_buffer.get_size_in_bytes(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->recv_buffer.get_ptr(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->recv_buffer.get_size_in_bytes(), - i); - - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_forward_comms_ = - std::make_unique>( - i, infrequent_forward_coll_handle, - infrequent_embeddings_ib_nvlink_hier_[i].model_indices_sizes_ptrs_.get_ptr(), - &get_local_gpu(i), ib_comm_, comm_stream_[i]); - } - ib_comm_->register_a2a_coll_buf(infrequent_forward_coll_handle); - - // Backward coll init - auto infrequent_backward_coll_handle = ib_comm_->register_hier_a2a_v_coll(true); - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - - ib_comm_->set_a2a_coll_stream(infrequent_backward_coll_handle, comm_stream_[i], i); - ib_comm_->set_a2a_coll_buf( - infrequent_backward_coll_handle, - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->send_buffer.get_ptr(), - 
infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->send_buffer.get_size_in_bytes(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->recv_buffer.get_ptr(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->recv_buffer.get_size_in_bytes(), - i); - - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_backward_comms_ = - std::make_unique>( - i, infrequent_backward_coll_handle, - infrequent_embeddings_ib_nvlink_hier_[i].network_indices_sizes_ptrs_.get_ptr(), - &get_local_gpu(i), ib_comm_, comm_stream_[i]); - } - ib_comm_->register_a2a_coll_buf(infrequent_backward_coll_handle); -#else - HCTR_OWN_THROW(Error_t::WrongInput, "MPI is not enabled but trying to use IB_NVLink_Hier"); -#endif - } - - // 2.9 Single-node: copy some pointers arrays to device - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - // Initialize GPU barrier - gpu_barrier_ = std::make_unique(resource_manager_->get_local_gpu_count(), - resource_manager_->get_local_gpu_device_id_list(), - graph_mode_); - - std::vector frequent_vectors_cache_pointers(local_gpu_count); - std::vector interaction_layer_input_pointers_train(local_gpu_count); - std::vector interaction_layer_input_pointers_eval(local_gpu_count); - std::vector gradients_pointers(local_gpu_count); - std::vector frequent_partial_gradients_pointers(local_gpu_count); - - for (uint32_t i = 0; i < local_gpu_count; i++) { - frequent_vectors_cache_pointers[i] = - frequent_embeddings_single_node_[i].get_embedding_vectors_cache().get_ptr(); - interaction_layer_input_pointers_train[i] = train_output_tensors_[i].get_ptr(); - gradients_pointers[i] = train_output_tensors_[i].get_ptr(); - interaction_layer_input_pointers_eval[i] = evaluate_output_tensors_[i].get_ptr(); - frequent_partial_gradients_pointers[i] = - frequent_embeddings_single_node_[i].frequent_data_.get_gradients().get_ptr(); - } - - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - - HCTR_LIB_THROW(cudaMemcpyAsync( - frequent_embeddings_single_node_[i].embedding_vectors_cache_pointers_.get_ptr(), - frequent_vectors_cache_pointers.data(), local_gpu_count * sizeof(float *), - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - - infrequent_embeddings_single_node_[i].init_pointers( - local_gpu_count, get_local_gpu(i).get_stream(), interaction_layer_input_pointers_train, - interaction_layer_input_pointers_eval, gradients_pointers); - HCTR_LIB_THROW(cudaMemcpyAsync( - frequent_embeddings_single_node_[i].partial_gradients_pointers_.get_ptr(), - frequent_partial_gradients_pointers.data(), local_gpu_count * sizeof(emtype *), - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - } - } - - // Setup default indices - train_batch_indices_.emplace_back(model_, train_input_tensors, resource_manager_, - get_batch_size(true), embedding_params_.slot_size_array, - embedding_params_.max_num_frequent_categories, - embedding_params_.communication_type); - - eval_batch_indices_.emplace_back(model_, evaluate_input_tensors, resource_manager_, - get_batch_size(false), embedding_params_.slot_size_array, - embedding_params_.max_num_frequent_categories, - embedding_params_.communication_type); - - } catch (const std::runtime_error &rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -template -void HybridSparseEmbedding::init_model(const SparseTensors &data, - size_t &wgrad_offset_in_bytes) 
{ - size_t local_gpu_count = resource_manager_->get_local_gpu_count(); -#pragma omp parallel for num_threads(local_gpu_count) - for (size_t id = 0; id < local_gpu_count; ++id) { - int cur_device = get_local_gpu(id).get_device_id(); - CudaDeviceContext context(cur_device); - std::shared_ptr> buf = GeneralBuffer2::create(); - Tensor2 tmp_categories; - buf->reserve({(size_t)statistics_[id].num_categories, 1}, &tmp_categories); - buf->allocate(); - auto stream = get_local_gpu(id).get_stream(); - data_statistics_[id].data_to_unique_categories(data[id].get_value_tensor(), stream); - model_[id].init_hybrid_model(calibration_[id], statistics_[id], data_statistics_[id], - tmp_categories, stream); - get_frequent_embedding_data(id).initialize_embedding_vectors(data_statistics_[id].table_sizes, - wgrad_offset_in_bytes); - - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - infrequent_embeddings_single_node_[id].initialize_embedding_vectors( - data_statistics_[id].table_sizes); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_[id].initialize_embedding_vectors( - data_statistics_[id].table_sizes); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[id].initialize_embedding_vectors( - data_statistics_[id].table_sizes); - } - - if (embedding_params_.max_num_frequent_categories < (size_t)model_[id].num_frequent) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "Found too many frequent categories, please increase 'max_num_frequent_categories'"); - } - } - // free statistics_ memory - // statistics_.clear(); - data_statistics_.clear(); - - HCTR_LOG_S(INFO, ROOT) << "Initialized hybrid model with " << model_[0].num_frequent - << " frequent categories, probability of being frequent is " - << model_[0].frequent_probability << std::endl; - - size_t avg_train_infrequent = (1 - model_[0].frequent_probability) * - embedding_params_.slot_size_array.size() * get_batch_size(true); - size_t avg_evaluate_infrequent = (1 - model_[0].frequent_probability) * - embedding_params_.slot_size_array.size() * get_batch_size(false); - - HCTR_LOG_S(INFO, ROOT) << "Estimated number of infrequent categories per train batch: " - << avg_train_infrequent << ", eval batch: " << avg_evaluate_infrequent - << std::endl; - - if ((embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) || - (embedding_params_.communication_type == CommunicationType::IB_NVLink)) { - size_t wgrad_size = - model_[0].num_frequent * embedding_params_.embedding_vec_size * sizeof(emtype); - - if (!grouped_all_reduce_) { - // Manage your own all-reduce - auto ar_comm = resource_manager_->get_ar_comm(); - ar_comm->update_size(frequent_embedding_handle_, wgrad_size); - } else { - wgrad_offset_in_bytes += wgrad_size; - } - } -} - -template -void HybridSparseEmbedding::setup_buffered_indices(bool is_train, - AsyncReader *data_reader) { - if (is_train) { - // Double buffering for overlapping indices calculation between iterations - data_reader->set_tensor_buffering(2); - } else { - // If get_max_batches_inflight() is > than the number of eval batches in the dataset, - // this will cause the batch tensors to be cached. We need the tensors to be cached in order - // for the indices to be cached because the index calculation is done in place in these - // tensors. 
- // TODO: if OOM then eval_data_reader->set_tensor_buffering(2) - data_reader->set_tensor_buffering(data_reader->get_max_batches_inflight()); - } - - const auto data_tensors = data_reader->get_value_tensor_buffers(); - auto &batch_indices = is_train ? train_batch_indices_ : eval_batch_indices_; - batch_indices.clear(); // remove default - for (size_t i = 0; i < data_tensors.size(); ++i) { - batch_indices.emplace_back(model_, data_tensors.at(i), resource_manager_, - get_batch_size(is_train), embedding_params_.slot_size_array, - embedding_params_.max_num_frequent_categories, - embedding_params_.communication_type); - } -} - -template -void HybridSparseEmbedding::forward(bool is_train) { - size_t local_gpu_count = resource_manager_->get_local_gpu_count(); - -// Index calculations -#pragma omp parallel for num_threads(local_gpu_count) - for (size_t i = 0; i < local_gpu_count; i++) { - auto &gpu = get_local_gpu(i); - CudaDeviceContext context(gpu.get_device_id()); - - index_calculation(is_train, i); - infreq_model_forward(i); - freq_forward(is_train, i, true); - infreq_network_forward(is_train, i); - } -} - -template -void HybridSparseEmbedding::backward() { - size_t local_gpu_count = resource_manager_->get_local_gpu_count(); - -#pragma omp parallel for num_threads(local_gpu_count) - for (size_t i = 0; i < local_gpu_count; i++) { - auto cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - - freq_backward(i); - infreq_network_backward(i); - infreq_model_backward(i); - } -} - -template -void HybridSparseEmbedding::update_params() { - size_t local_gpu_count = resource_manager_->get_local_gpu_count(); - -#pragma omp parallel for num_threads(local_gpu_count) - for (size_t i = 0; i < local_gpu_count; i++) { - auto cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - - freq_update_params(i); - } -} - -template -void HybridSparseEmbedding::init_params() { - // TODO: create init_params() -} - -template -void HybridSparseEmbedding::load_parameters(std::string sparse_model) { - // TODO: create load_parameters() -} - -template -void HybridSparseEmbedding::dump_parameters(std::string sparse_model) const { - // TODO: create dump_parameters() -} - -template -void HybridSparseEmbedding::set_learning_rate(float lr) { - HCTR_OWN_THROW(Error_t::WrongInput, "HybridSparseEmbedding only supports GPU LR scheduler"); -} - -template -GpuLearningRateSchedulers HybridSparseEmbedding::get_learning_rate_schedulers() - const { - return lr_scheds_; -} - -template -size_t HybridSparseEmbedding::get_params_num() const { - return 0; -} - -template -size_t HybridSparseEmbedding::get_vocabulary_size() const { - // TODO: create get_vocabulary_size() - return 0; -} - -template -size_t HybridSparseEmbedding::get_max_vocabulary_size() const { - // TODO: create get_max_vocabulary_size() - return 0; -} - -template -std::vector HybridSparseEmbedding::get_train_output_tensors() const { - return tensors_to_bags(train_output_tensors_); -} - -template -std::vector HybridSparseEmbedding::get_evaluate_output_tensors() const { - return tensors_to_bags(evaluate_output_tensors_); -} - -template -void HybridSparseEmbedding::assign_input_tensors(bool is_train, size_t batch_size, - size_t inflight_id, bool cached) { - if (is_train) { - train_inflight_id_ = inflight_id; - current_train_batch_size_ = batch_size; - current_train_batch_cached_ = cached; - } else { - eval_inflight_id_ = inflight_id; - current_eval_batch_size_ = batch_size; - current_eval_batch_cached_ = cached; - 
} -} - -template -void HybridSparseEmbedding::index_calculation(bool is_train, int i) { - int cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - auto &gpu = get_local_gpu(i); - cudaStream_t stream = gpu.get_stream(); - - auto &batch_indices = is_train ? train_batch_indices_.at(train_inflight_id_) - : eval_batch_indices_.at(eval_inflight_id_); - - if (is_train) { - if (!current_train_batch_cached_) { - batch_indices.compute(i, current_train_batch_size_, stream); - } - } else { // eval - if (!current_eval_batch_cached_) { - batch_indices.compute(i, current_eval_batch_size_, stream); - } - } - - // We don't copy the sparse tensor since all the required data are already in the - // Data type and indices - get_frequent_embedding(i).set_current_indices(&batch_indices.get_frequent(i)); - get_infrequent_embedding(i).set_current_indices(&batch_indices.get_infrequent(i)); -} - -template -void HybridSparseEmbedding::freq_forward(bool is_train, int i, - bool is_first_eval_batch) { - int cur_device = get_local_gpu(i).get_device_id(); - auto &gpu = get_local_gpu(i); - CudaDeviceContext context(cur_device); - cudaStream_t stream = gpu.get_stream(); - - auto &output = (is_train) ? train_output_tensors_[i] : evaluate_output_tensors_[i]; - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - if (is_train) { - frequent_embeddings_single_node_[i].forward_model(stream); - } else { - if (is_first_eval_batch) { - frequent_embeddings_single_node_[i].forward_model_eval(stream); - } - } - gpu_barrier_->sync_all_gpus(stream, i); - - frequent_embeddings_single_node_[i].forward_network(output.get_ptr(), stream); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink || - embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - frequent_embeddings_multi_node_[i].forward_network(output.get_ptr(), stream); - } -} - -template -void HybridSparseEmbedding::freq_backward(int i) { - int cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - auto &gpu = get_local_gpu(i); - cudaStream_t stream = gpu.get_stream(); - - if (frequent_embeddings_single_node_.size()) { - frequent_embeddings_single_node_[i].local_reduce(train_output_tensors_[i].get_ptr(), stream); - } else { - frequent_embeddings_multi_node_[i].local_reduce(train_output_tensors_[i].get_ptr(), stream); - if (!grouped_all_reduce_) { - frequent_embeddings_multi_node_[i].communicate(stream); - } - } -} - -template -void HybridSparseEmbedding::freq_update_params(int i) { - int cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - float *dev_lr = lr_scheds_[i]->get_learning_rate(); - float scale = opt_params_[i].scaler; - auto &gpu = get_local_gpu(i); - cudaStream_t stream = gpu.get_stream(); - - if (embedding_params_.communication_type != CommunicationType::NVLink_SingleNode) { - frequent_embeddings_multi_node_[i].update_model(dev_lr, scale, stream); - } -} - -template -void HybridSparseEmbedding::infreq_model_forward(int i) { - int cur_device = get_local_gpu(i).get_device_id(); - auto &gpu = get_local_gpu(i); - CudaDeviceContext context(cur_device); - cudaStream_t stream = gpu.get_stream(); - - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_[i].forward_model( - infrequent_embeddings_ib_nvlink_[i].infrequent_forward_comm_buffers_->send_buffer.get_ptr(), - stream); - } else if (embedding_params_.communication_type == 
CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[i].calculate_model_indices_sizes_from_offsets(stream); - infrequent_embeddings_ib_nvlink_hier_[i].calculate_network_indices_sizes_from_offsets(stream); - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_forward_comms_->update_sizes(stream); - infrequent_embeddings_ib_nvlink_hier_[i].fused_intra_forward_model( - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->send_buffer_ptrs.get_ptr(), - stream); - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_forward_comms_->initiate_communication( - stream); - } -} - -template -void HybridSparseEmbedding::infreq_network_forward(bool is_train, int i) { - int cur_device = get_local_gpu(i).get_device_id(); - auto &gpu = get_local_gpu(i); - CudaDeviceContext context(cur_device); - cudaStream_t stream = gpu.get_stream(); - - auto &output = (is_train) ? train_output_tensors_[i] : evaluate_output_tensors_[i]; - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_[i].infrequent_forward_comms_->communicate(stream); - infrequent_embeddings_ib_nvlink_[i].forward_network( - infrequent_embeddings_ib_nvlink_[i].infrequent_forward_comm_buffers_->recv_buffer.get_ptr(), - output.get_ptr(), stream); - } else if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_forward_comms_->wait_completion(stream); - infrequent_embeddings_ib_nvlink_hier_[i].hier_forward_network( - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->recv_buffer.get_ptr(), - output.get_ptr(), stream); - } else { - infrequent_embeddings_single_node_[i].forward_network_direct(is_train, stream); - } -} - -template -void HybridSparseEmbedding::global_barrier(bool is_train, int i) { - int cur_device = get_local_gpu(i).get_device_id(); - auto &gpu = get_local_gpu(i); - CudaDeviceContext context(cur_device); - cudaStream_t stream = gpu.get_stream(); - - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - if (!is_train) { - HCTR_LIB_THROW(ncclAllReduce((const void *)d_barrier_store_[i].get_ptr(), - d_barrier_store_[i].get_ptr(), sizeof(uint32_t), - NcclDataType::getType(), ncclSum, - get_local_gpu(i).get_nccl(), stream)); - } - } -} - -template -void HybridSparseEmbedding::infreq_network_backward(int i) { - int cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - auto &gpu = get_local_gpu(i); - cudaStream_t stream = gpu.get_stream(); - - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_[i].update_network( - train_output_tensors_[i].get_ptr(), - infrequent_embeddings_ib_nvlink_[i] - .infrequent_backward_comm_buffers_->send_buffer.get_ptr(), - stream); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_backward_comms_->update_sizes(stream); - infrequent_embeddings_ib_nvlink_hier_[i].fused_intra_update_network( - train_output_tensors_[i].get_ptr(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->send_buffer_ptrs.get_ptr(), - stream); - } -} - -// Everything that involves network and can be better overlapped with compute -template -void HybridSparseEmbedding::infreq_model_backward(int i) { - int cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - auto &gpu 
= get_local_gpu(i); - cudaStream_t stream = gpu.get_stream(); - float *dev_lr = lr_scheds_[i]->get_learning_rate(); - float scale = opt_params_[i].scaler; - - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_[i].infrequent_backward_comms_->communicate(stream); - infrequent_embeddings_ib_nvlink_[i].update_model( - infrequent_embeddings_ib_nvlink_[i] - .infrequent_backward_comm_buffers_->recv_buffer.get_ptr(), - dev_lr, scale, stream); - } - - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_backward_comms_->communicate(stream); - - infrequent_embeddings_ib_nvlink_hier_[i].hier_update_model( - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->recv_buffer.get_ptr(), - dev_lr, scale, stream); - } - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - // Synchronize all GPUs before pulling the reduced gradients - gpu_barrier_->sync_all_gpus(stream, i); - - float *dev_lr = lr_scheds_[i]->get_learning_rate(); - float scale = opt_params_[i].scaler; - frequent_embeddings_single_node_[i].update_model_direct(dev_lr, scale, stream); - - infrequent_embeddings_single_node_[i].update_model_direct(dev_lr, scale, stream); - } -} - -template class HybridSparseEmbedding; -template class HybridSparseEmbedding; -template class HybridSparseEmbedding; -template class HybridSparseEmbedding; - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu b/HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu deleted file mode 100644 index 6fce1afb63..0000000000 --- a/HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu +++ /dev/null @@ -1,1334 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#ifdef ENABLE_MPI -#include -#endif - -#include -#include - -namespace HugeCTR { -namespace localized_onehot_filter_keys_kernel { - -template -__global__ void select_value_by_slot_id_kernel(const TypeKey *value, size_t num, - TypeKey *filter_value, size_t slot_num_per_gpu, - size_t slot_num, size_t global_id, - size_t global_num) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int batch_size = tid / slot_num; - int slot_id = tid % slot_num; - if (slot_id % global_num == global_id) { - int res_slot_id = slot_id / global_num; - filter_value[batch_size * slot_num_per_gpu + res_slot_id] = __ldg(value + tid); - } - } -} -} // namespace localized_onehot_filter_keys_kernel - -template -void LocalizedSlotSparseEmbeddingOneHot::filter_keys_per_gpu( - bool is_train, size_t id, size_t global_id, size_t global_num) { - const SparseTensor &all_gather_key = embedding_data_.get_input_keys(is_train)[id]; - auto &local_gpu = embedding_data_.get_local_gpu(id); - Tensor2 value_tensor = embedding_data_.get_value_tensors(is_train)[id]; - std::shared_ptr nnz_ptr = embedding_data_.get_nnz_array(is_train)[id]; - - if (all_gather_key.get_dimensions().size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "localized embedding all gather key dimension != 2"); - } - - size_t batch_size = embedding_data_.embedding_params_.get_batch_size(is_train); - size_t slot_num_per_gpu = slot_num_per_gpu_[id]; - size_t slot_num = (all_gather_key.rowoffset_count() - 1) / batch_size; - - constexpr size_t block_size = 256; - size_t grid_size = (all_gather_key.nnz() - 1) / block_size + 1; - localized_onehot_filter_keys_kernel:: - select_value_by_slot_id_kernel<<>>( - all_gather_key.get_value_ptr(), all_gather_key.nnz(), value_tensor.get_ptr(), - slot_num_per_gpu, slot_num, global_id, global_num); - - *nnz_ptr = (all_gather_key.nnz() / slot_num) * slot_num_per_gpu; -} - -template -void LocalizedSlotSparseEmbeddingOneHot< - TypeHashKey, TypeEmbeddingComp>::data_to_unique_categories_per_gpu(bool is_train, size_t id) { - SparseTensor &all_gather_key = embedding_data_.get_input_keys(is_train)[id]; - auto &local_gpu = embedding_data_.get_local_gpu(id); - - if (all_gather_key.get_dimensions().size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "localized embedding all gather key dimension != 2"); - } - - size_t batch_size = embedding_data_.embedding_params_.get_batch_size(is_train); - size_t nnz = all_gather_key.nnz(); - size_t slot_num = (all_gather_key.rowoffset_count() - 1) / batch_size; - - data_to_unique_categories(all_gather_key.get_value_ptr(), - embedding_data_.embedding_offsets_[id].get_ptr(), slot_num, nnz, - local_gpu.get_stream()); -} - -namespace { - -template -__global__ void upload_value_tensor_kernel(value_type *value_buf, size_t *index_buf, - value_type *dst_tensor, int emb_vec_size, size_t len) { - size_t gid = blockIdx.x * blockDim.x + threadIdx.x; - if (gid < len) { - size_t src_offset = gid * emb_vec_size; - size_t dst_offset = index_buf[gid] * emb_vec_size; - for (int i = 0; i < emb_vec_size; i++) { - dst_tensor[dst_offset + i] = value_buf[src_offset + i]; - } - } -} - -} // namespace - -template -LocalizedSlotSparseEmbeddingOneHot:: - LocalizedSlotSparseEmbeddingOneHot( - const Tensors2 &train_row_offsets_tensors, - const Tensors2 &train_value_tensors, - const std::vector> &train_nnz_array, - const Tensors2 &evaluate_row_offsets_tensors, - const Tensors2 &evaluate_value_tensors, - const std::vector> &evaluate_nnz_array, - const 
SparseEmbeddingHashParams &embedding_params, - const std::shared_ptr &resource_manager) - : embedding_data_(train_row_offsets_tensors, train_value_tensors, train_nnz_array, - evaluate_row_offsets_tensors, evaluate_value_tensors, evaluate_nnz_array, - Embedding_t::LocalizedSlotSparseEmbeddingOneHot, embedding_params, - resource_manager), - slot_size_array_(embedding_params.slot_size_array) { - embedding_data_.embedding_params_.is_data_parallel = - false; // this ctor is only used for embedding plugin - try { - max_vocabulary_size_ = 0; - for (size_t slot_size : slot_size_array_) { - max_vocabulary_size_ += slot_size; - } - - max_vocabulary_size_per_gpu_ = - cal_max_voc_size_per_gpu(slot_size_array_, embedding_data_.get_resource_manager()); - - HCTR_LOG_S(INFO, ROOT) << "max_vocabulary_size_per_gpu_=" << max_vocabulary_size_per_gpu_ - << std::endl; - - CudaDeviceContext context; - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - size_t gid = embedding_data_.get_local_gpu(id).get_global_id(); - size_t slot_num_per_gpu = - embedding_data_.embedding_params_.slot_num / - embedding_data_.get_resource_manager().get_global_gpu_count() + - ((gid < embedding_data_.embedding_params_.slot_num % - embedding_data_.get_resource_manager().get_global_gpu_count()) - ? 1 - : 0); - slot_num_per_gpu_.push_back(slot_num_per_gpu); - - // new GeneralBuffer objects - const std::shared_ptr> &buf = embedding_data_.get_buffer(id); - - // new hash table value vectors - { - const std::shared_ptr> &block = buf->create_block(); - Tensors2 tensors; - for (size_t i = 0; i < slot_size_array_.size(); i++) { - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == gid) { - Tensor2 tensor; - block->reserve( - {slot_size_array_[i], embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - tensors.push_back(tensor); - } - } - value_table_tensors_.push_back(tensors); - hash_table_value_tensors_.push_back(block->as_tensor()); - } - - // list of top categories, from single iteration worth of data, so max size is same as - // hash_table_value_index_ array - { - HCTR_LOG_S(INFO, WORLD) << "Initializing size_top_categories_ and top_categories.." 
- << std::endl; - Tensor2 tensor; - buf->reserve({1, embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - size_top_categories_.push_back(0); - top_categories_.push_back(tensor); - // HCTR_LOG_S(INFO, WORLD) << "top_categories size : " << Base::get_universal_batch_size() * - // Base::get_max_feature_num() << std::endl; - } - - // new hash table value_index that get() from HashTable - { - Tensor2 tensor; - buf->reserve({1, embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - hash_value_index_tensors_.push_back(tensor); - } - - // new embedding features reduced by hash table values(results of forward) - { - Tensor2 tensor; - buf->reserve( - {embedding_data_.embedding_params_.get_universal_batch_size() * slot_num_per_gpu, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - embedding_feature_tensors_.push_back(tensor); - } - - // new wgrad used by backward - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true) * slot_num_per_gpu, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - wgrad_tensors_.push_back(tensor); - } - - // new optimizer params used by update_params - switch (embedding_data_.embedding_params_.opt_params.optimizer) { - case Optimizer_t::SGD: - break; - - default: - throw std::runtime_error( - std::string("[HCDEBUG][ERROR] Runtime error: Invalid optimizer type\n")); - } - - // the tenosrs for storing slot ids - // TODO: init to -1 ? - { - Tensor2 tensor; - buf->reserve({max_vocabulary_size_per_gpu_, 1}, &tensor); - hash_table_slot_id_tensors_.push_back(tensor); - } - - // temp tensors for all2all - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_universal_batch_size_per_gpu() * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - all2all_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_forward_temp_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_batch_size_per_gpu(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_all2all_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_batch_size_per_gpu(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_reorder_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_backward_temp_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({1, slot_num_per_gpu}, &tensor); - mapping_offsets_per_gpu_tensors_.push_back(tensor); - } - -// init GenenralBuffers to do real allocation -#ifndef NDEBUG - HCTR_LOG_S(DEBUG, WORLD) << " max_feature_num_:" - << embedding_data_.embedding_params_.max_feature_num << std::endl; -#endif - - } // end of for(int id = 0; id < embedding_data_.get_local_gpu_count(); id++) - -#pragma omp parallel num_threads(embedding_data_.get_resource_manager().get_local_gpu_count()) - { - 
size_t id = omp_get_thread_num(); - CudaDeviceContext context(embedding_data_.get_local_gpu(id).get_device_id()); - embedding_data_.get_buffer(id)->allocate(); - HCTR_LIB_THROW(cudaStreamSynchronize(embedding_data_.get_local_gpu(id).get_stream())); - } - - // get the mapping table between local value_index and input value_index - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - uint32_t slot_sizes_prefix_sum = 0; - uint32_t slot_sizes_prefix_sum_local = 0; - int slot_num = 0; - for (size_t i = 0; i < slot_size_array_.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - size_t slot_size = slot_size_array_[i]; - if (i % embedding_data_.get_resource_manager().get_global_gpu_count() == global_id) { - uint32_t mapping_offset = slot_sizes_prefix_sum - slot_sizes_prefix_sum_local; - HCTR_LIB_THROW(cudaMemcpy(&((mapping_offsets_per_gpu_tensors_[id].get_ptr())[slot_num]), - &mapping_offset, sizeof(uint32_t), cudaMemcpyHostToDevice)); - slot_sizes_prefix_sum_local += slot_size; - slot_num++; - } - slot_sizes_prefix_sum += slot_size; - } - } - - // Check whether the P2P access can be enabled - if (embedding_data_.get_resource_manager().get_local_gpu_count() > 1 && - !embedding_data_.get_resource_manager().all_p2p_enabled()) { - throw std::runtime_error( - std::string("[HCDEBUG][ERROR] Runtime error: Localized_slot_sparse_embedding_one_hot " - "cannot be used on machine without GPU peer2peer access support. \n")); - } -#ifdef ENABLE_MPI - { - const int num_processor{core23::MpiInitService::get().world_size()}; - if (num_processor > 1) { - throw std::runtime_error( - std::string("[HCDEBUG][ERROR] Runtime error: Localized_slot_sparse_embedding_one_hot " - "cannot support multi-node currently. 
\n")); - } - } -#endif - - std::shared_ptr> unified_buf = - GeneralBuffer2::create(); - unified_buf->reserve({embedding_data_.get_resource_manager().get_local_gpu_count()}, - &train_embedding_features_); - unified_buf->reserve({embedding_data_.get_resource_manager().get_local_gpu_count()}, - &evaluate_embedding_features_); - unified_buf->allocate(); - - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - train_embedding_features_.get_ptr()[id] = - embedding_data_.get_output_tensors(true)[id].get_ptr(); - evaluate_embedding_features_.get_ptr()[id] = - embedding_data_.get_output_tensors(false)[id].get_ptr(); - } - - } catch (const std::runtime_error &rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } - - return; -} - -template -LocalizedSlotSparseEmbeddingOneHot:: - LocalizedSlotSparseEmbeddingOneHot(const SparseTensors &train_keys, - const SparseTensors &evaluate_keys, - const SparseEmbeddingHashParams &embedding_params, - const std::shared_ptr &resource_manager) - : embedding_data_(Embedding_t::LocalizedSlotSparseEmbeddingOneHot, train_keys, evaluate_keys, - embedding_params, resource_manager), - slot_size_array_(embedding_params.slot_size_array) { - try { - max_vocabulary_size_ = 0; - for (size_t slot_size : slot_size_array_) { - max_vocabulary_size_ += slot_size; - } - - max_vocabulary_size_per_gpu_ = - cal_max_voc_size_per_gpu(slot_size_array_, embedding_data_.get_resource_manager()); - - HCTR_LOG_S(INFO, ROOT) << "max_vocabulary_size_per_gpu_=" << max_vocabulary_size_per_gpu_ - << std::endl; - - CudaDeviceContext context; - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - size_t gid = embedding_data_.get_local_gpu(id).get_global_id(); - size_t slot_num_per_gpu = - embedding_data_.embedding_params_.slot_num / - embedding_data_.get_resource_manager().get_global_gpu_count() + - ((gid < embedding_data_.embedding_params_.slot_num % - embedding_data_.get_resource_manager().get_global_gpu_count()) - ? 
1 - : 0); - slot_num_per_gpu_.push_back(slot_num_per_gpu); - - // new GeneralBuffer objects - const std::shared_ptr> &buf = embedding_data_.get_buffer(id); - - // new hash table value vectors - { - const std::shared_ptr> &block = buf->create_block(); - Tensors2 tensors; - for (size_t i = 0; i < slot_size_array_.size(); i++) { - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == gid) { - Tensor2 tensor; - block->reserve( - {slot_size_array_[i], embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - tensors.push_back(tensor); - } - } - value_table_tensors_.push_back(tensors); - hash_table_value_tensors_.push_back(block->as_tensor()); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true), - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - embedding_data_.train_value_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(false), - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - embedding_data_.evaluate_value_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true) * - embedding_data_.embedding_params_.slot_num + - 1}, - &tensor); - embedding_data_.train_row_offsets_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(false) * - embedding_data_.embedding_params_.slot_num + - 1}, - &tensor); - embedding_data_.evaluate_row_offsets_tensors_.push_back(tensor); - } - { embedding_data_.train_nnz_array_.push_back(std::make_shared(0)); } - { embedding_data_.evaluate_nnz_array_.push_back(std::make_shared(0)); } - - // list of top categories, from single iteration worth of data, so max size is same as - { - HCTR_LOG_S(INFO, WORLD) << "Initializing size_top_categories_ and top_categories.." - << std::endl; - Tensor2 tensor; - buf->reserve({1, embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - size_top_categories_.push_back(0); - top_categories_.push_back(tensor); - } - - // new hash table value_index that get() from HashTable - { - Tensor2 tensor; - buf->reserve({1, embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - hash_value_index_tensors_.push_back(tensor); - } - - // new embedding features reduced by hash table values(results of forward) - { - Tensor2 tensor; - buf->reserve( - {embedding_data_.embedding_params_.get_universal_batch_size() * slot_num_per_gpu, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - embedding_feature_tensors_.push_back(tensor); - } - - // new wgrad used by backward - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true) * slot_num_per_gpu, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - wgrad_tensors_.push_back(tensor); - } - - // new optimizer params used by update_params - switch (embedding_data_.embedding_params_.opt_params.optimizer) { - case Optimizer_t::SGD: - break; - - default: - throw std::runtime_error( - std::string("[HCDEBUG][ERROR] Runtime error: Invalid optimizer type\n")); - } - - // the tenosrs for storing slot ids - // TODO: init to -1 ? 
- { - Tensor2 tensor; - buf->reserve({max_vocabulary_size_per_gpu_, 1}, &tensor); - hash_table_slot_id_tensors_.push_back(tensor); - } - - // temp tensors for all2all - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_universal_batch_size_per_gpu() * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - all2all_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_forward_temp_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_batch_size_per_gpu(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_all2all_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_batch_size_per_gpu(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_reorder_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_backward_temp_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({1, slot_num_per_gpu}, &tensor); - mapping_offsets_per_gpu_tensors_.push_back(tensor); - } - -// init GenenralBuffers to do real allocation -#ifndef NDEBUG - HCTR_LOG_S(DEBUG, WORLD) << " max_feature_num_:" - << embedding_data_.embedding_params_.max_feature_num << std::endl; -#endif - - } // end of for(int id = 0; id < embedding_data_.get_local_gpu_count(); id++) - -#pragma omp parallel for num_threads(embedding_data_.get_resource_manager().get_local_gpu_count()) - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); ++id) { - CudaDeviceContext context(embedding_data_.get_local_gpu(id).get_device_id()); - embedding_data_.get_buffer(id)->allocate(); - - // filling rowoffset and slot_size_array - cudaStream_t stream = embedding_data_.get_local_gpu(id).get_stream(); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - } - - { - std::vector embedding_offsets; - TypeHashKey slot_sizes_prefix_sum = 0; - for (size_t i = 0; i < slot_size_array_.size(); i++) { - embedding_offsets.push_back(slot_sizes_prefix_sum); - slot_sizes_prefix_sum += slot_size_array_[i]; - } - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); ++id) { - CudaDeviceContext context(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LIB_THROW( - cudaMemcpy(embedding_data_.embedding_offsets_[id].get_ptr(), embedding_offsets.data(), - embedding_offsets.size() * sizeof(TypeHashKey), cudaMemcpyHostToDevice)); - - size_t slot_num_per_gpu = slot_num_per_gpu_[id]; - { - std::vector rowoffset_host( - embedding_data_.embedding_params_.get_batch_size(true) * - embedding_data_.embedding_params_.slot_num + - 1); - std::iota(rowoffset_host.begin(), rowoffset_host.end(), 0); - HCTR_LIB_THROW(cudaMemcpy( - embedding_data_.train_row_offsets_tensors_[id].get_ptr(), rowoffset_host.data(), - rowoffset_host.size() * sizeof(TypeHashKey), cudaMemcpyHostToDevice)); - } - { - std::vector rowoffset_host( - embedding_data_.embedding_params_.get_batch_size(false) * - embedding_data_.embedding_params_.slot_num + - 1); - 
std::iota(rowoffset_host.begin(), rowoffset_host.end(), 0); - HCTR_LIB_THROW(cudaMemcpy( - embedding_data_.evaluate_row_offsets_tensors_[id].get_ptr(), rowoffset_host.data(), - rowoffset_host.size() * sizeof(TypeHashKey), cudaMemcpyHostToDevice)); - } - } - } - - // get the mapping table between local value_index and input value_index - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - uint32_t slot_sizes_prefix_sum = 0; - uint32_t slot_sizes_prefix_sum_local = 0; - int slot_num = 0; - for (size_t i = 0; i < slot_size_array_.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - size_t slot_size = slot_size_array_[i]; - if (i % embedding_data_.get_resource_manager().get_global_gpu_count() == global_id) { - uint32_t mapping_offset = slot_sizes_prefix_sum - slot_sizes_prefix_sum_local; - HCTR_LIB_THROW(cudaMemcpy(&((mapping_offsets_per_gpu_tensors_[id].get_ptr())[slot_num]), - &mapping_offset, sizeof(uint32_t), cudaMemcpyHostToDevice)); - slot_sizes_prefix_sum_local += slot_size; - slot_num++; - } - slot_sizes_prefix_sum += slot_size; - } - } - - // Check whether the P2P access can be enabled - if (embedding_data_.get_resource_manager().get_local_gpu_count() > 1 && - !embedding_data_.get_resource_manager().all_p2p_enabled()) { - throw std::runtime_error( - "[HCDEBUG][ERROR] Runtime error: Localized_slot_sparse_embedding_one_hot cannot be used " - "on machine without GPU peer2peer access support.\n"); - } -#ifdef ENABLE_MPI - { - const int num_processor{core23::MpiInitService::get().world_size()}; - if (num_processor > 1) { - throw std::runtime_error( - "[HCDEBUG][ERROR] Runtime error: Localized_slot_sparse_embedding_one_hot cannot " - "support multi-node currently.\n"); - } - } -#endif - - std::shared_ptr> unified_buf = - GeneralBuffer2::create(); - unified_buf->reserve({embedding_data_.get_resource_manager().get_local_gpu_count()}, - &train_embedding_features_); - unified_buf->reserve({embedding_data_.get_resource_manager().get_local_gpu_count()}, - &evaluate_embedding_features_); - unified_buf->allocate(); - - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - train_embedding_features_.get_ptr()[id] = - embedding_data_.get_output_tensors(true)[id].get_ptr(); - evaluate_embedding_features_.get_ptr()[id] = - embedding_data_.get_output_tensors(false)[id].get_ptr(); - } - - } catch (const std::runtime_error &rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } - - return; -} - -template -void LocalizedSlotSparseEmbeddingOneHot::load_parameters( - std::string sparse_model) { - const std::string key_file(sparse_model + "/key"); - const std::string slot_file(sparse_model + "/slot_id"); - const std::string vec_file(sparse_model + "/emb_vector"); - - auto fs = FileSystemBuilder::build_unique_by_path(sparse_model); - - size_t key_file_size_in_byte = fs->get_file_size(key_file); - size_t slot_file_size_in_byte = fs->get_file_size(slot_file); - size_t vec_file_size_in_byte = fs->get_file_size(vec_file); - - size_t key_size = sizeof(long long); - size_t slot_size = sizeof(size_t); - size_t vec_size = sizeof(float) * embedding_data_.embedding_params_.embedding_vec_size; - size_t key_num = key_file_size_in_byte / key_size; - size_t slot_num = slot_file_size_in_byte / slot_size; - size_t vec_num = vec_file_size_in_byte / vec_size; - - if (key_num != vec_num || key_file_size_in_byte 
% key_size != 0 || - vec_file_size_in_byte % vec_size != 0 || key_num != slot_num || - slot_file_size_in_byte % slot_size != 0) { - HCTR_OWN_THROW(Error_t::WrongInput, "Error: file size is not correct"); - } - - auto blobs_buff = GeneralBuffer2::create(); - - Tensor2 keys; - blobs_buff->reserve({key_num}, &keys); - - Tensor2 slot_id; - blobs_buff->reserve({slot_num}, &slot_id); - - Tensor2 embeddings; - blobs_buff->reserve({vec_num, embedding_data_.embedding_params_.embedding_vec_size}, &embeddings); - - blobs_buff->allocate(); - - TypeHashKey *key_ptr = keys.get_ptr(); - size_t *slot_id_ptr = slot_id.get_ptr(); - float *embedding_ptr = embeddings.get_ptr(); - - if (std::is_same::value) { - fs->read(key_file, reinterpret_cast(key_ptr), key_file_size_in_byte, 0); - } else { - std::vector i64_key_vec(key_num, 0); - fs->read(key_file, reinterpret_cast(i64_key_vec.data()), key_file_size_in_byte, 0); - std::transform(i64_key_vec.begin(), i64_key_vec.end(), key_ptr, - [](long long key) { return static_cast(key); }); - } - fs->read(slot_file, reinterpret_cast(slot_id_ptr), slot_file_size_in_byte, 0); - fs->read(vec_file, reinterpret_cast(embedding_ptr), vec_file_size_in_byte, 0); - - load_parameters(keys, slot_id, embeddings, key_num, - embedding_data_.embedding_params_.embedding_vec_size, hash_table_value_tensors_, - slot_size_array_, mapping_offsets_per_gpu_tensors_); -} - -template -void LocalizedSlotSparseEmbeddingOneHot::load_parameters( - BufferBag &buf_bag, size_t num) { - const TensorBag2 &keys_bag = buf_bag.keys; - const TensorBag2 &slot_id_bag = buf_bag.slot_id; - const Tensor2 &embeddings = buf_bag.embedding; - Tensor2 keys = Tensor2::stretch_from(keys_bag); - Tensor2 slot_id = Tensor2::stretch_from(slot_id_bag); - - load_parameters(keys, slot_id, embeddings, num, - embedding_data_.embedding_params_.embedding_vec_size, hash_table_value_tensors_, - slot_size_array_, mapping_offsets_per_gpu_tensors_); -} - -template -void LocalizedSlotSparseEmbeddingOneHot::load_parameters( - const Tensor2 &keys, const Tensor2 &slot_id, - const Tensor2 &embeddings, size_t num, size_t embedding_vec_size, - Tensors2 &hash_table_value_tensors, const std::vector &slot_sizes, - const Tensors2 &mapping_offsets_per_gpu_tensors) { - if (num == 0) return; - - CudaDeviceContext context; - if (keys.get_dimensions()[0] < num || embeddings.get_dimensions()[0] < num) { - HCTR_OWN_THROW(Error_t::WrongInput, "The rows of keys and embeddings are not consistent."); - } - - const TypeHashKey *key_ptr = keys.get_ptr(); - const size_t *slot_id_ptr = slot_id.get_ptr(); - const float *embedding_ptr = embeddings.get_ptr(); - - // define size - size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); - size_t chunk_size = 1000; - size_t tile_size = 1; // must be 1, because we need to cal (key&local_gpu_count) to decide - // gpu_id for each - size_t hash_table_value_tile_size = tile_size * embedding_vec_size; - size_t hash_table_value_tile_size_in_B = hash_table_value_tile_size * sizeof(float); - size_t hash_table_value_chunk_size = hash_table_value_tile_size * chunk_size; - size_t hash_table_value_chunk_size_in_B = hash_table_value_chunk_size * sizeof(float); - size_t total_gpu_count = embedding_data_.get_resource_manager().get_global_gpu_count(); - - // CAUTION: can not decide how many values for each GPU, so need to allocate enough memory for - // each GPU allocate CPU/GPU memory for value/index chunk - std::unique_ptr h_hash_table_value_chunk_per_gpu(new float *[local_gpu_count]); - for (size_t 
id = 0; id < local_gpu_count; id++) { - HCTR_LIB_THROW( - cudaMallocHost(&h_hash_table_value_chunk_per_gpu[id], hash_table_value_chunk_size_in_B)); - } - std::unique_ptr d_hash_table_value_chunk_per_gpu(new float *[local_gpu_count]); - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - HCTR_LIB_THROW( - cudaMalloc(&d_hash_table_value_chunk_per_gpu[id], hash_table_value_chunk_size_in_B)); - } - std::unique_ptr h_hash_table_index_chunk_per_gpu(new size_t *[local_gpu_count]); - for (size_t id = 0; id < local_gpu_count; id++) { - HCTR_LIB_THROW( - cudaMallocHost(&h_hash_table_index_chunk_per_gpu[id], chunk_size * sizeof(size_t))); - } - std::unique_ptr d_hash_table_index_chunk_per_gpu(new size_t *[local_gpu_count]); - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - HCTR_LIB_THROW(cudaMalloc(&d_hash_table_index_chunk_per_gpu[id], chunk_size * sizeof(size_t))); - } - - std::unique_ptr tile_counter_in_chunk_per_gpu(new size_t[local_gpu_count]); - memset(tile_counter_in_chunk_per_gpu.get(), 0, sizeof(size_t) * local_gpu_count); - - // The vector that store the relationship between slot_id and slot order on the specific GPU - std::vector local_slot_id(slot_sizes.size()); - std::vector local_slot_num(local_gpu_count, 0); - for (size_t i = 0; i < slot_sizes.size(); i++) { - size_t gid = i % total_gpu_count; // global GPU ID - size_t id = embedding_data_.get_resource_manager().get_gpu_local_id_from_global_id( - gid); // local GPU ID (not gpudevice id) - int dst_rank = - embedding_data_.get_resource_manager().get_process_id_from_gpu_global_id(gid); // node id - if (embedding_data_.get_resource_manager().get_process_id() == dst_rank) { - local_slot_id[i] = local_slot_num[id]; - local_slot_num[id]++; - } - } - - // Host buffer to keep mapping_offset - std::vector h_mapping_offsets_per_gpu_tensors(local_gpu_count); - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - HCTR_LIB_THROW(cudaMallocHost(&h_mapping_offsets_per_gpu_tensors[id], - local_slot_num[id] * sizeof(uint32_t))); - // Copy the mapping offset from GPU to Host - HCTR_LIB_THROW(cudaMemcpyAsync(h_mapping_offsets_per_gpu_tensors[id], - mapping_offsets_per_gpu_tensors[id].get_ptr(), - local_slot_num[id] * sizeof(uint32_t), cudaMemcpyDeviceToHost, - embedding_data_.get_local_gpu(id).get_stream())); - } - - // sync wait - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - // do upload - const size_t loop_num = num / chunk_size; - HCTR_LOG_S(INFO, ROOT) << "Start to upload embedding table file to GPUs, total loop_num: " - << loop_num << std::endl; - for (size_t i = 0; i < loop_num; i++) { - float *value_dst_buf; - size_t *tensor_index_dst_buf; - for (size_t k = 0; k < chunk_size; k++) { // process a tile in each loop - size_t slot_id = slot_id_ptr[i * chunk_size + k]; - size_t gid = slot_id % total_gpu_count; // global GPU ID - size_t id = embedding_data_.get_resource_manager().get_gpu_local_id_from_global_id( - gid); // local GPU ID (not gpudevice id) - int dst_rank = - embedding_data_.get_resource_manager().get_process_id_from_gpu_global_id(gid); // node id - - if (embedding_data_.get_resource_manager().get_process_id() == dst_rank) { - TypeHashKey tile_key = key_ptr[i * chunk_size + k]; - size_t tensor_index = - tile_key - (h_mapping_offsets_per_gpu_tensors[id][local_slot_id[slot_id]]); - - // memcpy 
hash_table_value to corresponding GPU - value_dst_buf = h_hash_table_value_chunk_per_gpu[id] + - tile_counter_in_chunk_per_gpu[id] * hash_table_value_tile_size; - memcpy(value_dst_buf, embedding_ptr + (i * chunk_size + k) * embedding_vec_size, - hash_table_value_tile_size_in_B); - - tensor_index_dst_buf = - h_hash_table_index_chunk_per_gpu[id] + tile_counter_in_chunk_per_gpu[id]; - *tensor_index_dst_buf = tensor_index; - tile_counter_in_chunk_per_gpu[id] += 1; - } else { - continue; - } - } // end of for(int k = 0; k < (chunk_size * local_gpu_count); k++) - - // memcpy hash_table_slot_id and hash_table_value from CPU to GPU - for (size_t id = 0; id < local_gpu_count; id++) { - if (tile_counter_in_chunk_per_gpu[id] == 0) { - continue; - } - - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - // Copy value buffer and tensor_index buffer to GPU - size_t value_chunk_size = tile_counter_in_chunk_per_gpu[id] * hash_table_value_tile_size; - float *src_buf_value = h_hash_table_value_chunk_per_gpu[id]; - float *dst_buf_value = d_hash_table_value_chunk_per_gpu[id]; - HCTR_LIB_THROW(cudaMemcpyAsync(dst_buf_value, src_buf_value, value_chunk_size * sizeof(float), - cudaMemcpyHostToDevice, - embedding_data_.get_local_gpu(id).get_stream())); - size_t *src_buf_index = h_hash_table_index_chunk_per_gpu[id]; - size_t *dst_buf_index = d_hash_table_index_chunk_per_gpu[id]; - value_chunk_size = tile_counter_in_chunk_per_gpu[id]; - HCTR_LIB_THROW(cudaMemcpyAsync(dst_buf_index, src_buf_index, - value_chunk_size * sizeof(size_t), cudaMemcpyHostToDevice, - embedding_data_.get_local_gpu(id).get_stream())); - - // Call kernel to insert the value into embedding value tensor - const size_t grid_size = (tile_counter_in_chunk_per_gpu[id] - 1) / 256 + 1; - upload_value_tensor_kernel<<>>( - d_hash_table_value_chunk_per_gpu[id], d_hash_table_index_chunk_per_gpu[id], - hash_table_value_tensors[id].get_ptr(), hash_table_value_tile_size, - tile_counter_in_chunk_per_gpu[id]); - } - - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - // set counter value - for (size_t id = 0; id < local_gpu_count; id++) { - tile_counter_in_chunk_per_gpu[id] = 0; // reset chunk counter to zero - } - } // end of for(int i = 0; i < loop_num; i++) - - // process the remaining data(less than a chunk) - const size_t remain_loop_num = num - loop_num * chunk_size; - float *value_dst_buf; - size_t *tensor_index_dst_buf; - for (size_t i = 0; i < remain_loop_num; i++) { // process one tile in each loop - - size_t slot_id = slot_id_ptr[loop_num * chunk_size + i]; - size_t gid = slot_id % total_gpu_count; // global GPU ID - size_t id = embedding_data_.get_resource_manager().get_gpu_local_id_from_global_id( - gid); // local GPU ID (not gpudevice id) - int dst_rank = - embedding_data_.get_resource_manager().get_process_id_from_gpu_global_id(gid); // node id - - if (embedding_data_.get_resource_manager().get_process_id() == dst_rank) { - TypeHashKey tile_key = key_ptr[loop_num * chunk_size + i]; - size_t tensor_index = - tile_key - (h_mapping_offsets_per_gpu_tensors[id][local_slot_id[slot_id]]); - - // memcpy hash_table_value to corresponding GPU - value_dst_buf = h_hash_table_value_chunk_per_gpu[id] + - tile_counter_in_chunk_per_gpu[id] * hash_table_value_tile_size; - memcpy(value_dst_buf, embedding_ptr + (loop_num * chunk_size + i) * embedding_vec_size, - hash_table_value_tile_size_in_B); - - tensor_index_dst_buf = - h_hash_table_index_chunk_per_gpu[id] + tile_counter_in_chunk_per_gpu[id]; - *tensor_index_dst_buf = 
tensor_index; - tile_counter_in_chunk_per_gpu[id] += 1; - - } else { - continue; - } - } - - // memcpy hash_table_slot_id and hash_table_value from CPU to GPU and insert into embedding - // table - for (size_t id = 0; id < local_gpu_count; id++) { - if (tile_counter_in_chunk_per_gpu[id] == 0) { - continue; - } - - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - // Copy value buffer and tensor_index buffer to GPU - size_t value_chunk_size = tile_counter_in_chunk_per_gpu[id] * hash_table_value_tile_size; - float *src_buf_value = h_hash_table_value_chunk_per_gpu[id]; - float *dst_buf_value = d_hash_table_value_chunk_per_gpu[id]; - HCTR_LIB_THROW(cudaMemcpyAsync(dst_buf_value, src_buf_value, value_chunk_size * sizeof(float), - cudaMemcpyHostToDevice, - embedding_data_.get_local_gpu(id).get_stream())); - size_t *src_buf_index = h_hash_table_index_chunk_per_gpu[id]; - size_t *dst_buf_index = d_hash_table_index_chunk_per_gpu[id]; - value_chunk_size = tile_counter_in_chunk_per_gpu[id]; - HCTR_LIB_THROW(cudaMemcpyAsync(dst_buf_index, src_buf_index, value_chunk_size * sizeof(size_t), - cudaMemcpyHostToDevice, - embedding_data_.get_local_gpu(id).get_stream())); - - // Call kernel to insert the value into embedding value tensor - const size_t grid_size = (tile_counter_in_chunk_per_gpu[id] - 1) / 256 + 1; - upload_value_tensor_kernel<<>>( - d_hash_table_value_chunk_per_gpu[id], d_hash_table_index_chunk_per_gpu[id], - hash_table_value_tensors[id].get_ptr(), hash_table_value_tile_size, - tile_counter_in_chunk_per_gpu[id]); - } - - // sync wait - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - HCTR_LOG(INFO, ROOT, "Done\n"); - - // release resources - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - HCTR_LIB_THROW(cudaFree(d_hash_table_value_chunk_per_gpu[id])); - HCTR_LIB_THROW(cudaFree(d_hash_table_index_chunk_per_gpu[id])); - } - for (size_t id = 0; id < local_gpu_count; id++) { - HCTR_LIB_THROW(cudaFreeHost(h_hash_table_value_chunk_per_gpu[id])); - HCTR_LIB_THROW(cudaFreeHost(h_hash_table_index_chunk_per_gpu[id])); - HCTR_LIB_THROW(cudaFreeHost(h_mapping_offsets_per_gpu_tensors[id])); - } -} - -template -void LocalizedSlotSparseEmbeddingOneHot::dump_parameters( - std::string sparse_model) const { - dump_parameters(sparse_model, embedding_data_.embedding_params_.embedding_vec_size, - hash_table_value_tensors_, slot_size_array_); -} - -template -void LocalizedSlotSparseEmbeddingOneHot::dump_parameters( - BufferBag &buf_bag, size_t *num) const { - TensorBag2 keys_bag = buf_bag.keys; - TensorBag2 slot_id_bag = buf_bag.slot_id; - Tensor2 &embeddings = buf_bag.embedding; - Tensor2 keys = Tensor2::stretch_from(keys_bag); - Tensor2 slot_id = Tensor2::stretch_from(slot_id_bag); - - dump_parameters(keys, slot_id, embeddings, num, - embedding_data_.embedding_params_.embedding_vec_size, hash_table_value_tensors_, - slot_size_array_); -} - -template -void LocalizedSlotSparseEmbeddingOneHot::dump_parameters( - const std::string &sparse_model, size_t embedding_vec_size, - const Tensors2 &hash_table_value_tensors, const std::vector &slot_sizes) const { - CudaDeviceContext context; - size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); - - auto fs = FileSystemBuilder::build_unique_by_path(sparse_model); - bool is_local_path = IOUtils::is_local_path(sparse_model); - - const std::string key_file(sparse_model + "/key"); - const std::string slot_file(sparse_model + 
"/slot_id"); - const std::string vec_file(sparse_model + "/emb_vector"); - -#ifdef ENABLE_MPI - HCTR_CHECK_HINT(is_local_path, "Dumping to remote file system in MPI mode is not supported."); - fs->create_dir(sparse_model); - MPI_File key_fh, slot_fh, vec_fh; - HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, key_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, - MPI_INFO_NULL, &key_fh)); - HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, slot_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, - MPI_INFO_NULL, &slot_fh)); - HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, vec_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, - MPI_INFO_NULL, &vec_fh)); -#endif - - // memory allocation - std::unique_ptr count(new size_t[local_gpu_count]); - size_t total_count = 0; - - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - count[id] = 0; - for (size_t i = 0; i < slot_sizes.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == global_id) { - count[id] += slot_sizes[i]; - } - } - total_count += count[id]; - } - - std::vector offset_host(local_gpu_count, 0); - std::exclusive_scan(count.get(), count.get() + local_gpu_count, offset_host.begin(), 0); - - TypeHashKey *h_hash_table_key; - size_t *h_hash_table_slot_id; - float *h_hash_table_value; - HCTR_LIB_THROW(cudaMallocHost(&h_hash_table_key, total_count * sizeof(TypeHashKey))); - HCTR_LIB_THROW(cudaMallocHost(&h_hash_table_slot_id, total_count * sizeof(size_t))); - HCTR_LIB_THROW( - cudaMallocHost(&h_hash_table_value, total_count * embedding_vec_size * sizeof(float))); - - std::unique_ptr d_hash_table_key(new TypeHashKey *[local_gpu_count]); - std::unique_ptr d_hash_table_slot_id(new size_t *[local_gpu_count]); - - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LIB_THROW(cudaMalloc(&d_hash_table_key[id], count[id] * sizeof(TypeHashKey))); - HCTR_LIB_THROW(cudaMalloc(&d_hash_table_slot_id[id], count[id] * sizeof(size_t))); - } - - // Generate key and slot_id tensor, dump value tensor on GPU - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LOG_S(INFO, WORLD) << "Rank" << embedding_data_.get_resource_manager().get_process_id() - << ": Dump embedding table from GPU" << id << std::endl; - - // Loop for each slot - size_t buffer_offset = 0; - for (size_t i = 0; i < slot_sizes.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == global_id) { - // Generate key buffer - size_t key_offset = 0; - for (size_t j = 0; j < i; j++) { - key_offset += slot_sizes[j]; - } - functors_.memset_liner(d_hash_table_key[id] + buffer_offset, (TypeHashKey)key_offset, - (TypeHashKey)1, slot_sizes[i], - embedding_data_.get_local_gpu(id).get_stream()); - - // Generate slot_id - functors_.memset_const(d_hash_table_slot_id[id] + buffer_offset, i, slot_sizes[i], - embedding_data_.get_local_gpu(id).get_stream()); - - buffer_offset += slot_sizes[i]; - } - } - // Copy key buffer to host - HCTR_LIB_THROW(cudaMemcpyAsync(h_hash_table_key + offset_host[id], d_hash_table_key[id], - count[id] * sizeof(TypeHashKey), cudaMemcpyDeviceToHost, - 
embedding_data_.get_local_gpu(id).get_stream())); - // Copy value buffer to host - HCTR_LIB_THROW(cudaMemcpyAsync( - h_hash_table_value + offset_host[id] * embedding_vec_size, - hash_table_value_tensors[id].get_ptr(), count[id] * embedding_vec_size * sizeof(float), - cudaMemcpyDeviceToHost, embedding_data_.get_local_gpu(id).get_stream())); - // Copy slot_id to host - HCTR_LIB_THROW(cudaMemcpyAsync(h_hash_table_slot_id + offset_host[id], d_hash_table_slot_id[id], - count[id] * sizeof(size_t), cudaMemcpyDeviceToHost, - embedding_data_.get_local_gpu(id).get_stream())); - } - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - long long *h_key_ptr; - std::vector i64_key_vec; - if (std::is_same::value) { - h_key_ptr = reinterpret_cast(h_hash_table_key); - } else { - i64_key_vec.resize(total_count); - std::transform(h_hash_table_key, h_hash_table_key + total_count, i64_key_vec.begin(), - [](unsigned key) { return static_cast(key); }); - h_key_ptr = i64_key_vec.data(); - } - - const size_t key_size = sizeof(long long); - const size_t slot_size = sizeof(size_t); - const size_t vec_size = sizeof(float) * embedding_vec_size; - - // write sparse model to file - HCTR_LOG_S(INFO, WORLD) << "Rank" << embedding_data_.get_resource_manager().get_process_id() - << ": Write hash table pairs to file" << std::endl; -#ifdef ENABLE_MPI - MPI_Datatype TYPE_EMB_VECTOR; - HCTR_MPI_THROW(MPI_Type_contiguous(embedding_vec_size, MPI_FLOAT, &TYPE_EMB_VECTOR)); - HCTR_MPI_THROW(MPI_Type_commit(&TYPE_EMB_VECTOR)); - - int my_rank = embedding_data_.get_resource_manager().get_process_id(); - int n_ranks = embedding_data_.get_resource_manager().get_num_process(); - - std::vector offset_per_rank(n_ranks, 0); - HCTR_MPI_THROW(MPI_Allgather(&total_count, sizeof(size_t), MPI_CHAR, offset_per_rank.data(), - sizeof(size_t), MPI_CHAR, MPI_COMM_WORLD)); - std::exclusive_scan(offset_per_rank.begin(), offset_per_rank.end(), offset_per_rank.begin(), 0); - - size_t key_offset = offset_per_rank[my_rank] * key_size; - size_t slot_offset = offset_per_rank[my_rank] * slot_size; - size_t vec_offset = offset_per_rank[my_rank] * vec_size; - - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); - MPI_Status status; - HCTR_MPI_THROW( - MPI_File_write_at(key_fh, key_offset, h_key_ptr, total_count, MPI_LONG_LONG_INT, &status)); - HCTR_MPI_THROW(MPI_File_write_at(slot_fh, slot_offset, h_hash_table_slot_id, total_count, - MPI_SIZE_T, &status)); - HCTR_MPI_THROW(MPI_File_write_at(vec_fh, vec_offset, h_hash_table_value, total_count, - TYPE_EMB_VECTOR, &status)); - - HCTR_MPI_THROW(MPI_File_close(&key_fh)); - HCTR_MPI_THROW(MPI_File_close(&slot_fh)); - HCTR_MPI_THROW(MPI_File_close(&vec_fh)); - HCTR_MPI_THROW(MPI_Type_free(&TYPE_EMB_VECTOR)); -#else - fs->write(key_file, reinterpret_cast(h_key_ptr), total_count * key_size, true); - fs->write(slot_file, reinterpret_cast(h_hash_table_slot_id), total_count * slot_size, - true); - fs->write(vec_file, reinterpret_cast(h_hash_table_value), total_count * vec_size, true); -#endif - HCTR_LOG(INFO, ROOT, "Done\n"); - - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LIB_THROW(cudaFree(d_hash_table_key[id])); - HCTR_LIB_THROW(cudaFree(d_hash_table_slot_id[id])); - } - HCTR_LIB_THROW(cudaFreeHost(h_hash_table_key)); - HCTR_LIB_THROW(cudaFreeHost(h_hash_table_slot_id)); - HCTR_LIB_THROW(cudaFreeHost(h_hash_table_value)); -} - -template -void 
LocalizedSlotSparseEmbeddingOneHot::dump_parameters( - Tensor2 &keys, Tensor2 &slot_id, Tensor2 &embeddings, size_t *num, - size_t embedding_vec_size, const Tensors2 &hash_table_value_tensors, - const std::vector &slot_sizes) const { - TypeHashKey *key_ptr = keys.get_ptr(); - size_t *slot_id_ptr = slot_id.get_ptr(); - float *embedding_ptr = embeddings.get_ptr(); - - size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); - - // memory allocation - std::unique_ptr count(new size_t[local_gpu_count]); - size_t total_count = 0; - - CudaDeviceContext context; - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - count[id] = 0; - for (size_t i = 0; i < slot_sizes.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == global_id) { - count[id] += slot_sizes[i]; - } - } - total_count += count[id]; - } - - std::vector offset_host(local_gpu_count, 0); - std::exclusive_scan(count.get(), count.get() + local_gpu_count, offset_host.begin(), 0); - *num = total_count; - - std::unique_ptr d_hash_table_key(new TypeHashKey *[local_gpu_count]); - std::unique_ptr d_hash_table_slot_id(new size_t *[local_gpu_count]); - - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LIB_THROW(cudaMalloc(&d_hash_table_key[id], count[id] * sizeof(TypeHashKey))); - HCTR_LIB_THROW(cudaMalloc(&d_hash_table_slot_id[id], count[id] * sizeof(size_t))); - } - - // Generate key and slot_id tensor, dump value tensor on GPU - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - // Loop for each slot - size_t buffer_offset = 0; - for (size_t i = 0; i < slot_sizes.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == global_id) { - // Generate key buffer - size_t key_offset = 0; - for (size_t j = 0; j < i; j++) { - key_offset += slot_sizes[j]; - } - functors_.memset_liner(d_hash_table_key[id] + buffer_offset, - static_cast(key_offset), static_cast(1), - slot_sizes[i], embedding_data_.get_local_gpu(id).get_stream()); - - // Generate slot_id - functors_.memset_const(d_hash_table_slot_id[id] + buffer_offset, i, slot_sizes[i], - embedding_data_.get_local_gpu(id).get_stream()); - - buffer_offset += slot_sizes[i]; - } - } - // Copy key buffer to host - HCTR_LIB_THROW(cudaMemcpyAsync(key_ptr + offset_host[id], d_hash_table_key[id], - count[id] * sizeof(TypeHashKey), cudaMemcpyDeviceToHost, - embedding_data_.get_local_gpu(id).get_stream())); - // Copy value buffer to host - HCTR_LIB_THROW(cudaMemcpyAsync( - embedding_ptr + offset_host[id] * embedding_vec_size, - hash_table_value_tensors[id].get_ptr(), count[id] * embedding_vec_size * sizeof(float), - cudaMemcpyDeviceToHost, embedding_data_.get_local_gpu(id).get_stream())); - // Copy slot_id to host - HCTR_LIB_THROW(cudaMemcpyAsync(slot_id_ptr + offset_host[id], d_hash_table_slot_id[id], - count[id] * sizeof(size_t), cudaMemcpyDeviceToHost, - embedding_data_.get_local_gpu(id).get_stream())); - } - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - 
context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LIB_THROW(cudaFree(d_hash_table_key[id])); - HCTR_LIB_THROW(cudaFree(d_hash_table_slot_id[id])); - } -} - -template -void LocalizedSlotSparseEmbeddingOneHot::init_embedding( - const std::vector slot_sizes, size_t embedding_vec_size, - std::vector> &hash_table_value_tensors, - Tensors2 &hash_table_slot_id_tensors) { - size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); - size_t total_gpu_count = embedding_data_.get_resource_manager().get_global_gpu_count(); - -#ifndef NDEBUG - HCTR_LOG_S(DEBUG, ROOT) << "local_gpu_count=" << local_gpu_count - << ", total_gpu_count=" << total_gpu_count << std::endl; -#endif - -#pragma omp parallel num_threads(embedding_data_.get_resource_manager().get_local_gpu_count()) - { - size_t id = omp_get_thread_num(); - size_t device_id = embedding_data_.get_local_gpu(id).get_device_id(); - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - -#ifndef NDEBUG - HCTR_LOG_S(DEBUG, ROOT) << "id=" << id << ", device_id=" << device_id - << ", global_id=" << global_id << std::endl; -#endif - - functors_.init_embedding_per_gpu(global_id, total_gpu_count, slot_sizes, embedding_vec_size, - hash_table_value_tensors[id], hash_table_slot_id_tensors[id], - embedding_data_.get_local_gpu(id)); - - HCTR_LIB_THROW(cudaStreamSynchronize(embedding_data_.get_local_gpu(id).get_stream())); - HCTR_LOG_S(INFO, ROOT) << "gpu" << id << " init embedding done" << std::endl; - } - - return; -} - -template -void LocalizedSlotSparseEmbeddingOneHot::reset() { - CudaDeviceContext context; - for (size_t i = 0; i < embedding_data_.get_resource_manager().get_local_gpu_count(); i++) { - functors_.init_embedding_per_gpu( - embedding_data_.get_local_gpu(i).get_global_id(), - embedding_data_.get_resource_manager().get_global_gpu_count(), slot_size_array_, - embedding_data_.embedding_params_.embedding_vec_size, value_table_tensors_[i], - hash_table_slot_id_tensors_[i], embedding_data_.get_local_gpu(i)); - } - - for (size_t i = 0; i < embedding_data_.get_resource_manager().get_local_gpu_count(); i++) { - HCTR_LIB_THROW(cudaStreamSynchronize(embedding_data_.get_local_gpu(i).get_stream())); - } -} - -template class LocalizedSlotSparseEmbeddingOneHot; -template class LocalizedSlotSparseEmbeddingOneHot; -template class LocalizedSlotSparseEmbeddingOneHot; -template class LocalizedSlotSparseEmbeddingOneHot; - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/update_params_functor.cu b/HugeCTR/src/embeddings/update_params_functor.cu deleted file mode 100644 index b11aecd5f3..0000000000 --- a/HugeCTR/src/embeddings/update_params_functor.cu +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include // for implicitly including cub headers - -#include -#include -#include - -#define max_size_top_categories 16 -#define num_samples_per_block 128 -#define embedding_block_size 128 - -namespace HugeCTR { - -size_t get_max_size_top_categories() { return max_size_top_categories; } -size_t get_num_samples_per_block() { return num_samples_per_block; } -size_t get_embedding_block_size() { return embedding_block_size; } - -namespace { - -// TODO: it must be moved to SparseOptimizer -// The local memory version of the atomic update kernel - opt_sgd_atomic_kernel for one hot -// embedding. -// -// This function updates the embedding vectors of the top-n features in shared memory -// before writing the accumulated result to global memory. This reduces the number of -// global memory accesses, locks and collisions. -// -// num_samples_per_block number of samples are updated per block and they are iterated over, -// such that all threads update the embedding vector of a single feature simultaneously. -// -// shared ds_top_features_index : the row indices of the top-n - top_features_size - features -// shared ds_embedding : the embedding vector corresponding to the top features (rows) -template -__global__ void opt_sgd_cached_kernel(int nnz, int embedding_vec_size, float lr_scale, - const size_t *top_categories, - const size_t top_categories_size, - const size_t *hash_value_index, - const TypeEmbeddingComp *wgrad, float *hash_table_value) { - int bid = blockIdx.x; - int tid = threadIdx.x; - - // read a number of top_categories_size top categories indices from global memory - // note: max_size_top_n (16) less than warp size - __shared__ size_t ds_top_categories[max_size_top_categories]; - if (tid < top_categories_size) { - ds_top_categories[tid] = top_categories[tid]; - } - //__syncthreads(); - - // reads num_samples_per_block values indices from hash_value_index into shared memory - __shared__ size_t ds_category[num_samples_per_block]; // embedding indices for current block - for (int ds_offset = 0; ds_offset < num_samples_per_block; ds_offset += blockDim.x) { - int ds_index = ds_offset + tid; - int key_id = bid * num_samples_per_block + ds_index; - if (ds_index < num_samples_per_block && key_id < nnz) { - ds_category[ds_index] = hash_value_index[key_id]; - } - } - __syncthreads(); - - // map sample category indices to top_category indices - __shared__ int - ds_index_top_categories[num_samples_per_block]; // index to top category index array, - // max_size_top_categories if not present - { - for (int ci_offset = 0; ci_offset < num_samples_per_block; ci_offset += blockDim.x) { - int index_ds_category = ci_offset + tid; - if (index_ds_category < num_samples_per_block) { - // loop over top features - int i_top = max_size_top_categories; // one past end - if (index_ds_category + bid * num_samples_per_block < nnz) { - int category_embedding_index = ds_category[index_ds_category]; - for (int k = 0; k < top_categories_size; ++k) { - if (category_embedding_index == ds_top_categories[k]) i_top = k; - } - } - ds_index_top_categories[index_ds_category] = i_top; - } - } - } - __syncthreads(); - - // store the sum of deltaw in ds_embedding - // TODO: make this work for embedding size > 128 - __shared__ float ds_embedding[max_size_top_categories][embedding_block_size]; - // initialize the local embedding vectors - for (int i = 0; i < top_categories_size; ++i) { - if (tid < embedding_block_size) { - ds_embedding[i][tid] = 0.f; - } - } - __syncthreads(); - - unsigned int update_top_category = 0; // bit 
indicator sequence - - size_t key_id_local = 0; - for (size_t key_id = bid * num_samples_per_block; - key_id < nnz && key_id < (bid + 1) * num_samples_per_block; ++key_id) { - if (tid < embedding_vec_size) { - int index_top_category = ds_index_top_categories[key_id_local]; - size_t category_embedding_index = ds_category[key_id_local]; - if (index_top_category < max_size_top_categories) { - // write to shared memory - update_top_category = (update_top_category | (1 << index_top_category)); - // write results to embedding vector in shared memory - float deltaw = -lr_scale * TypeConvertFunc::convert( - wgrad[key_id * embedding_vec_size + tid]); - ds_embedding[index_top_category][tid] += deltaw; - } else { - // write to global memory using atomic - float deltaw = -lr_scale * TypeConvertFunc::convert( - wgrad[key_id * embedding_vec_size + tid]); - - // atomic update - size_t feature_index = category_embedding_index * embedding_vec_size + tid; - atomicAdd(&hash_table_value[feature_index], deltaw); - } - } - - key_id_local++; - } - __syncthreads(); - - // write the embedding vectors for top features which are in shared memory to global memory - // for (int i=0; i < max_size_top_categories; ++i) { // maybe this is actually more optimized - if (tid < embedding_vec_size) { - for (int i = 0; i < top_categories_size; ++i) { - // only those that were updated - if ((update_top_category & (1 << i)) > 0) { - size_t category_embedding_index = ds_top_categories[i]; - size_t embedding_element_index = category_embedding_index * embedding_vec_size + tid; - atomicAdd(&hash_table_value[embedding_element_index], ds_embedding[i][tid]); - } - } - } -} - -// only support LocalizedSlotSparseEmbeddingOneHot -template -__global__ void opt_sgd_atomic_kernel(int nnz, int embedding_vec_size, float lr_scale, - const size_t *hash_value_index, - const TypeEmbeddingComp *wgrad, float *hash_table_value) { - int bid = blockIdx.x; - int tid = threadIdx.x; - - if (tid < embedding_vec_size && bid < nnz) { - for (int key_id = bid; key_id < nnz; key_id += gridDim.x) { - // for one-hot, the max_feature_per_slot is 1, so sample_id is equal to key_id - float deltaw = -lr_scale * TypeConvertFunc::convert( - wgrad[key_id * embedding_vec_size + tid]); - - // atomic update - size_t value_index = hash_value_index[key_id]; - size_t feature_index = value_index * embedding_vec_size + tid; - atomicAdd(&hash_table_value[feature_index], deltaw); - } - } -} - -} // namespace - -template -void SparseEmbeddingFunctors::opt_sgd_atomic_cached( - size_t num_samples, size_t embedding_vec_size, const size_t *hash_value_index, float lr, - float scaler, const TypeEmbeddingComp *wgrad, float *hash_table_value, size_t *top_categories, - size_t &size_top_categories, cudaStream_t stream, bool force_stats) { - static bool perform_stats = true; - if (perform_stats || force_stats) { - uint32_t num_unique_categories; - /// TODO: refactor instead of using placeholder values for the other params - hybrid_embedding::Statistics statistics(num_samples, 1, 1, 1); - - statistics.sort_categories_by_count(hash_value_index, (uint32_t)num_samples, top_categories, - statistics.counts_sorted.get_ptr(), num_unique_categories, - stream); - size_top_categories = std::min((size_t)num_unique_categories, (size_t)max_size_top_categories); - - perform_stats = false; - } - - float lr_scale = lr / scaler; - // treats num_samples_per_block samples - size_t grid_size = max(1ul, (num_samples - 1) / num_samples_per_block + 1); - // each thread sets one embedding vector element - size_t 
block_size = embedding_vec_size; - HCTR_LIB_THROW(cudaPeekAtLastError()); - opt_sgd_cached_kernel<<>>( - num_samples, embedding_vec_size, lr_scale, top_categories, size_top_categories, - hash_value_index, wgrad, hash_table_value); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void SparseEmbeddingFunctors::update_params( - size_t embedding_vec_size, const OptParams &opt_params, size_t nnz, - const Tensor2 &hash_value_index, const Tensor2 &wgrad, - Tensor2 &hash_table_value, Tensor2 &top_categories, size_t &size_top_categories, - size_t sm_count, cudaStream_t stream, bool force_stats) { - try { - if (opt_params.optimizer == Optimizer_t::SGD && opt_params.hyperparams.sgd.atomic_update) { - float lr_scale = opt_params.lr / opt_params.scaler; - - opt_sgd_atomic_cached(nnz, embedding_vec_size, hash_value_index.get_ptr(), - opt_params.lr, opt_params.scaler, wgrad.get_ptr(), - hash_table_value.get_ptr(), top_categories.get_ptr(), - size_top_categories, stream, force_stats); - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "Error: Invalid opitimizer type"); - } - - } catch (const std::runtime_error &rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } - - return; -} - -template void SparseEmbeddingFunctors::opt_sgd_atomic_cached( - size_t num_samples, size_t embedding_vec_size, const size_t *hash_value_index, float lr, - float scaler, const float *wgrad, float *hash_table_value, size_t *top_categories, - size_t &size_top_categories, cudaStream_t stream, bool force_stats); - -template void SparseEmbeddingFunctors::opt_sgd_atomic_cached<__half>( - size_t num_samples, size_t embedding_vec_size, const size_t *hash_value_index, float lr, - float scaler, const __half *wgrad, float *hash_table_value, size_t *top_categories, - size_t &size_top_categories, cudaStream_t stream, bool force_stats); - -template void SparseEmbeddingFunctors::update_params( - size_t embedding_vec_size, const OptParams &opt_params, size_t nnz, - const Tensor2 &hash_value_index, const Tensor2 &wgrad, - Tensor2 &hash_table_value, Tensor2 &top_categories, size_t &size_top_categories, - size_t sm_count, cudaStream_t stream, bool force_stats); - -template void SparseEmbeddingFunctors::update_params<__half>( - size_t embedding_vec_size, const OptParams &opt_params, size_t nnz, - const Tensor2 &hash_value_index, const Tensor2<__half> &wgrad, - Tensor2 &hash_table_value, Tensor2 &top_categories, size_t &size_top_categories, - size_t sm_count, cudaStream_t stream, bool force_stats); - -} // namespace HugeCTR diff --git a/HugeCTR/src/exchange_wgrad.cpp b/HugeCTR/src/exchange_wgrad.cpp index 627fa27023..d1232652a2 100644 --- a/HugeCTR/src/exchange_wgrad.cpp +++ b/HugeCTR/src/exchange_wgrad.cpp @@ -21,17 +21,18 @@ namespace HugeCTR { template NetworkExchangeWgrad::NetworkExchangeWgrad( - const std::shared_ptr& resource_manager) - : resource_manager_(resource_manager), num_gpus_(resource_manager->get_local_gpu_count()) { + const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager) + : collective_manager_(collective_manager), num_gpus_(resource_manager->get_local_gpu_count()) { // TODO remove it after Hybrid embedding is deprecated null_wgrad_buffs_.resize(num_gpus_, nullptr); - auto ar_comm = resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); ar_handle_ = ar_comm->register_coll(); } template void NetworkExchangeWgrad::init_ar_comm(const std::vector& ptr, size_t sizes) { network_wgrad_size_ = sizes; - auto ar_comm = 
resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); for (size_t g = 0; g < num_gpus_; g++) { HCTR_CHECK_HINT(ptr[g], "buffer does not exist"); ar_comm->set_coll_buf(ar_handle_, ptr[g], network_wgrad_size_, g); @@ -46,24 +47,25 @@ void NetworkExchangeWgrad::update_embed_wgrad_size(size_t size) { template void NetworkExchangeWgrad::allreduce(size_t device_id, cudaStream_t stream) { - auto ar_comm = resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); ar_comm->all_reduce(ar_handle_, stream, device_id); } template GroupedExchangeWgrad::GroupedExchangeWgrad( - const std::shared_ptr& resource_manager) - : resource_manager_(resource_manager), num_gpus_(resource_manager->get_local_gpu_count()) { + const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager) + : collective_manager_(collective_manager), num_gpus_(resource_manager->get_local_gpu_count()) { // TODO remove it after Hybrid embedding is deprecated embed_wgrad_buffs_.resize(num_gpus_, nullptr); - auto ar_comm = resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); ar_handle_ = ar_comm->register_coll(); } template void GroupedExchangeWgrad::init_ar_comm(const std::vector& ptr, size_t sizes) { network_wgrad_size_ = sizes; - auto ar_comm = resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); for (size_t g = 0; g < num_gpus_; g++) { HCTR_CHECK_HINT(ptr[g], "buffer does not exist"); ar_comm->set_coll_buf(ar_handle_, ptr[g], network_wgrad_size_, g); @@ -79,7 +81,7 @@ void GroupedExchangeWgrad::update_embed_wgrad_size(size_t size) { template void GroupedExchangeWgrad::allreduce(size_t device_id, cudaStream_t stream) { - auto ar_comm = resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); ar_comm->all_reduce(ar_handle_, stream, device_id); } diff --git a/HugeCTR/src/pybind/add_dense_layer.cpp b/HugeCTR/src/pybind/add_dense_layer.cpp index c590ba57ab..0cfae5649f 100644 --- a/HugeCTR/src/pybind/add_dense_layer.cpp +++ b/HugeCTR/src/pybind/add_dense_layer.cpp @@ -97,8 +97,6 @@ void save_graph_to_json(nlohmann::json& layer_config_array, for (size_t i = 0; i < input_param.data_reader_sparse_param_array.size(); ++i) { nlohmann::json input_sparse_config; input_sparse_config["top"] = input_param.data_reader_sparse_param_array[i].top_name; - input_sparse_config["type"] = - READER_SPARSE_TYPE_TO_STRING[input_param.data_reader_sparse_param_array[i].type]; input_sparse_config["nnz_per_slot"] = input_param.data_reader_sparse_param_array[i].nnz_per_slot; input_sparse_config["is_fixed_length"] = @@ -131,26 +129,6 @@ void save_graph_to_json(nlohmann::json& layer_config_array, if (sparse_embedding_params[i].slot_size_array.size() > 0) { sparse_hparam_config["slot_size_array"] = sparse_embedding_params[i].slot_size_array; } - if (sparse_embedding_params[i].embedding_type == Embedding_t::HybridSparseEmbedding) { - sparse_hparam_config["max_num_frequent_categories"] = - sparse_embedding_params[i].hybrid_embedding_param.max_num_frequent_categories; - sparse_hparam_config["max_num_infrequent_samples"] = - sparse_embedding_params[i].hybrid_embedding_param.max_num_infrequent_samples; - sparse_hparam_config["p_dup_max"] = - sparse_embedding_params[i].hybrid_embedding_param.p_dup_max; - sparse_hparam_config["max_all_reduce_bandwidth"] = - sparse_embedding_params[i].hybrid_embedding_param.max_all_reduce_bandwidth; - sparse_hparam_config["max_all_to_all_bandwidth"] = - 
sparse_embedding_params[i].hybrid_embedding_param.max_all_to_all_bandwidth; - sparse_hparam_config["efficiency_bandwidth_ratio"] = - sparse_embedding_params[i].hybrid_embedding_param.efficiency_bandwidth_ratio; - sparse_hparam_config["communication_type"] = - HE_COMM_TYPE_TO_STRING[sparse_embedding_params[i] - .hybrid_embedding_param.communication_type]; - sparse_hparam_config["hybrid_embedding_type"] = - HE_TYPE_TO_STRING[sparse_embedding_params[i] - .hybrid_embedding_param.hybrid_embedding_type]; - } sparse_config["sparse_embedding_hparam"] = sparse_hparam_config; nlohmann::json optimizer_config; nlohmann::json optimizer_hparam_config; diff --git a/HugeCTR/src/pybind/add_input.cpp b/HugeCTR/src/pybind/add_input.cpp index 55f05cf0dc..facd22a60d 100644 --- a/HugeCTR/src/pybind/add_input.cpp +++ b/HugeCTR/src/pybind/add_input.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -107,8 +106,7 @@ void add_input(Input& input, DataReaderParams& reader_params, std::vector>& train_tensor_entries_list, std::vector>& evaluate_tensor_entries_list, std::shared_ptr& train_data_reader, - std::shared_ptr& evaluate_data_reader, - std::shared_ptr& init_data_reader, size_t batch_size, + std::shared_ptr& evaluate_data_reader, size_t batch_size, size_t batch_size_eval, bool use_mixed_precision, bool repeat_dataset, bool train_intra_iteration_overlap, size_t num_iterations_statistics, const std::shared_ptr resource_manager) { @@ -187,138 +185,8 @@ void add_input(Input& input, DataReaderParams& reader_params, eval_num_batches_per_thread, input.data_reader_sparse_param_array, total_label_dim, dense_dim, use_mixed_precision, false, schedule_h2d, is_float_dense)); - } else { // use original one-hot async reader - bool is_float_dense = reader_params.async_param.is_dense_float; - HCTR_CHECK_HINT(!is_float_dense, "One-hot RawAsync Reader only supports int32 dense type\n"); - if (!repeat_dataset) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "Epoch mode cannot be used with RawAsync reader, please set repeat_dataset as true"); - } - std::string proc_file("/proc/sys/fs/aio-max-nr"), max_nr_str; - std::ifstream tmp_fs(proc_file, std::ifstream::in); - if (!tmp_fs.good()) { - HCTR_OWN_THROW(Error_t::InvalidEnv, "Can't read /proc/sys/fs/aio-max-nr"); - } - int max_nr_requests_allowed_system = -1; - int actual_nr_requests = 2; - std::getline(tmp_fs, max_nr_str); - max_nr_requests_allowed_system = std::stoi(max_nr_str); - tmp_fs.close(); - // TODO currently label+dense have to be int - size_t bytes_per_batch = - ((total_label_dim + dense_dim) * sizeof(int) + total_max_sparse_dim * sizeof(TypeKey)) * - batch_size; - Alignment_t aligned_type = reader_params.async_param.aligned_type; - int num_threads = reader_params.async_param.num_threads; - int num_batches_per_thread = reader_params.async_param.num_batches_per_thread; - int max_num_requests_per_thread = reader_params.async_param.max_num_requests_per_thread; - int io_depth = reader_params.async_param.io_depth; - int io_alignment = reader_params.async_param.io_alignment; - bool shuffle = reader_params.async_param.shuffle; - - // Could be different if eval and train datasets are on different storage systems - int max_logical_sector_size = - std::max(get_logical_sector_size(source_data), get_logical_sector_size(eval_source)); - - if (max_logical_sector_size > io_alignment) { - HCTR_LOG_C(WARNING, WORLD, "Invalid io_alignment of ", io_alignment, ", using ", - max_logical_sector_size, '\n'); - io_alignment = max_logical_sector_size; - } - - int 
io_block_size = io_alignment; - // TODO train_reader + evaluate_reader + init_reader? - int max_nr_requests_user = max_num_requests_per_thread * num_threads; - int max_num_batches = num_batches_per_thread * num_threads; - - // note that nr_requests = max_num_batches * (bytes_per_batch / io_block_size + 2). Each - // batch has at least 2 io requests - if (max_nr_requests_user > max_nr_requests_allowed_system) { - HCTR_LOG( - WARNING, WORLD, - "Too many concurrent io requests, will automatically compute (overall #io requests " - "= num_batches_per_thread * num_threads * (bytes_per_batch / io_block_size+2).\n"); - max_nr_requests_user = - std::max(2, (max_nr_requests_allowed_system - 1) / max_num_batches) * max_num_batches; - } - if (max_nr_requests_user > max_nr_requests_allowed_system || - max_num_batches * 2 >= max_nr_requests_user) { - HCTR_DIE("Too many batches for each thread!\n"); - } - HCTR_LOG_S(INFO, ROOT) << "total_max_sparse_dim = " << total_max_sparse_dim << std::endl; - HCTR_LOG_S(INFO, ROOT) << "max_nr_requests_user = " << max_nr_requests_user << std::endl; - HCTR_LOG_S(INFO, ROOT) << "bytes_per_batch = " << bytes_per_batch << std::endl; - HCTR_LOG_S(INFO, ROOT) << "max_num_batches = " << max_num_batches << std::endl; - int next_nr_requests = 0; - for (int io_blk = io_alignment;; io_blk += io_alignment) { - actual_nr_requests = max_num_batches * (bytes_per_batch / io_blk + 2); - next_nr_requests = max_num_batches * (bytes_per_batch / (io_blk + 1) + 2); - // upper_bound - if ((actual_nr_requests <= max_nr_requests_user && actual_nr_requests > next_nr_requests) || - bytes_per_batch < io_blk) { - io_block_size = io_blk; - break; - } - } - // int num_blocks_per_batch = max_nr_requests_user / max_num_batches - 2; - - HCTR_CHECK_HINT(io_block_size % io_alignment == 0, - " params_.io_block_size \% params_.io_alignment != 0"); - - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: num_threads = " << num_threads << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: num_batches_per_thread = " << num_batches_per_thread - << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: total_io_nr_requests = " << actual_nr_requests - << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_block_size = " << io_block_size << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_depth = " << io_depth << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_alignment = " << io_alignment << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: shuffle = " << (shuffle ? 
"ON" : "OFF") << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: num_iterations_statistics = " - << num_iterations_statistics << std::endl; - - const bool wait_for_gpu_idle = train_intra_iteration_overlap; // scheduling H2D - train_data_reader.reset(new AsyncReader( - source_data, batch_size, total_label_dim, dense_dim, input.data_reader_sparse_param_array, - use_mixed_precision, resource_manager, num_threads, num_batches_per_thread, io_block_size, - io_depth, io_alignment, shuffle, wait_for_gpu_idle, aligned_type)); - - // If we want to cache eval, make sure we have enough buffers - auto eval_num_batches_per_thread = num_batches_per_thread; - int cache_eval_data = reader_params.cache_eval_data; - if (cache_eval_data > num_threads * num_batches_per_thread) { - eval_num_batches_per_thread = (cache_eval_data + num_threads - 1) / num_threads; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: eval reader increased batches per thread to " - << eval_num_batches_per_thread << " to accommodate for the caching" - << std::endl; - } - - // Small IO block may lead to too many AIO requests which hang, - // so use a larger one for eval and init which are typically larger than train - evaluate_data_reader.reset(new AsyncReader( - eval_source, batch_size_eval, total_label_dim, dense_dim, - input.data_reader_sparse_param_array, use_mixed_precision, resource_manager, num_threads, - eval_num_batches_per_thread, io_block_size * 8, io_depth, io_alignment, false, false, - aligned_type)); - - init_data_reader.reset(new AsyncReader( - source_data, num_iterations_statistics * batch_size, total_label_dim, dense_dim, - input.data_reader_sparse_param_array, use_mixed_precision, resource_manager, 1, 1, - io_block_size * 8, 4, io_alignment, false, false, aligned_type)); - - auto train_data_reader_as = - std::dynamic_pointer_cast>(train_data_reader); - auto evaluate_data_reader_as = - std::dynamic_pointer_cast>(evaluate_data_reader); - - if (input.data_reader_sparse_param_array.size() > 1) { - HCTR_OWN_THROW(Error_t::WrongInput, "Only one sparse input is supported."); - } - const auto& sparse_input = - sparse_input_map.find(input.data_reader_sparse_param_array[0].top_name); - sparse_input->second.train_sparse_tensors = train_data_reader_as->get_value_tensor23s(); - sparse_input->second.evaluate_sparse_tensors = evaluate_data_reader_as->get_value_tensor23s(); + } else { + HCTR_OWN_THROW(Error_t::WrongInput, "Only multi-hot async datareader is supported."); } auto schedulable_train_reader = @@ -506,13 +374,13 @@ template void add_input(Input&, DataReaderParams&, std::vector>&, std::vector>&, std::shared_ptr&, std::shared_ptr&, - std::shared_ptr&, size_t, size_t, bool, bool, bool, - size_t, const std::shared_ptr); + size_t, size_t, bool, bool, bool, size_t, + const std::shared_ptr); template void add_input(Input&, DataReaderParams&, std::map>&, std::vector>&, std::vector>&, std::shared_ptr&, std::shared_ptr&, - std::shared_ptr&, size_t, size_t, bool, bool, - bool, size_t, const std::shared_ptr); + size_t, size_t, bool, bool, bool, size_t, + const std::shared_ptr); } // namespace HugeCTR diff --git a/HugeCTR/src/pybind/add_sparse_embedding.cpp b/HugeCTR/src/pybind/add_sparse_embedding.cpp index 165aa227d2..4d9078ab41 100644 --- a/HugeCTR/src/pybind/add_sparse_embedding.cpp +++ b/HugeCTR/src/pybind/add_sparse_embedding.cpp @@ -16,9 +16,7 @@ #include #include -#include #include -#include #include #include #include @@ -142,35 +140,9 @@ SparseEmbedding get_sparse_embedding_from_json(const nlohmann::json& j_sparse_em } } } - 
HybridEmbeddingParam hybrid_embedding_param; - hybrid_embedding_param.max_num_frequent_categories = - get_value_from_json_soft(j_hparam, "max_num_frequent_categories", 1); - hybrid_embedding_param.max_num_infrequent_samples = - get_value_from_json_soft(j_hparam, "max_num_infrequent_samples", -1); - hybrid_embedding_param.p_dup_max = - get_value_from_json_soft(j_hparam, "p_dup_max", 1. / 100); - hybrid_embedding_param.max_all_reduce_bandwidth = - get_value_from_json_soft(j_hparam, "max_all_reduce_bandwidth", 1.3e11); - hybrid_embedding_param.max_all_to_all_bandwidth = - get_value_from_json_soft(j_hparam, "max_all_to_all_bandwidth", 1.9e11); - hybrid_embedding_param.efficiency_bandwidth_ratio = - get_value_from_json_soft(j_hparam, "efficiency_bandwidth_ratio", 1.0); - std::string communication_type_string = - get_value_from_json_soft(j_hparam, "communication_type", "IB_NVLink"); - std::string hybrid_embedding_type_string = - get_value_from_json_soft(j_hparam, "hybrid_embedding_type", "Distributed"); - if (!find_item_in_map(hybrid_embedding_param.communication_type, communication_type_string, - COMMUNICATION_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such communication type: " + communication_type_string); - } - if (!find_item_in_map(hybrid_embedding_param.hybrid_embedding_type, hybrid_embedding_type_string, - HYBRID_EMBEDDING_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, - "No such hybrid embedding type: " + hybrid_embedding_type_string); - } - SparseEmbedding sparse_embedding = SparseEmbedding( - embedding_type, workspace_size_per_gpu_in_mb, embedding_vec_size, combiner_str, top_name, - bottom_name, slot_size_array, embedding_opt_params, hybrid_embedding_param); + SparseEmbedding sparse_embedding = + SparseEmbedding(embedding_type, workspace_size_per_gpu_in_mb, embedding_vec_size, + combiner_str, top_name, bottom_name, slot_size_array, embedding_opt_params); return sparse_embedding; } @@ -181,6 +153,7 @@ void add_sparse_embedding(SparseEmbedding& sparse_embedding, std::vector>& evaluate_tensor_entries_list, std::vector>& embeddings, const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager, size_t batch_size, size_t batch_size_eval, OptParams& embedding_opt_params, std::shared_ptr& exchange_wgrad, bool use_cuda_graph, @@ -235,57 +208,6 @@ void add_sparse_embedding(SparseEmbedding& sparse_embedding, embedding_params, resource_manager)); break; } - case Embedding_t::LocalizedSlotSparseEmbeddingOneHot: { - const SparseEmbeddingHashParams embedding_params = {batch_size, - batch_size_eval, - 0, - sparse_embedding.slot_size_array, - embedding_vec_size, - sparse_input.max_feature_num_per_sample, - sparse_input.slot_num, - combiner, // combiner: 0-sum, 1-mean - embedding_opt_params}; - embeddings.emplace_back(new LocalizedSlotSparseEmbeddingOneHot( - core_helper::convert_sparse_tensors23_to_sparse_tensors( - sparse_input.train_sparse_tensors), - core_helper::convert_sparse_tensors23_to_sparse_tensors( - sparse_input.evaluate_sparse_tensors), - embedding_params, resource_manager)); - break; - } - case Embedding_t::HybridSparseEmbedding: { - auto& embed_wgrad_buff = - (grouped_all_reduce) - ? 
std::dynamic_pointer_cast>(exchange_wgrad) - ->get_embed_wgrad_buffs() - : std::dynamic_pointer_cast>(exchange_wgrad) - ->get_embed_wgrad_buffs(); - - const HybridSparseEmbeddingParams embedding_params = { - batch_size, - batch_size_eval, - num_iterations_statistics, // TBD - sparse_embedding.hybrid_embedding_param.max_num_frequent_categories * - std::max(batch_size, batch_size_eval), // TBD - sparse_embedding.hybrid_embedding_param.max_num_infrequent_samples, // TBD - sparse_embedding.hybrid_embedding_param.p_dup_max, - embedding_vec_size, - sparse_input.slot_num, - sparse_embedding.slot_size_array, - sparse_embedding.hybrid_embedding_param.communication_type, - sparse_embedding.hybrid_embedding_param.max_all_reduce_bandwidth, - sparse_embedding.hybrid_embedding_param.max_all_to_all_bandwidth, // TBD - sparse_embedding.hybrid_embedding_param.efficiency_bandwidth_ratio, - sparse_embedding.hybrid_embedding_param.hybrid_embedding_type, - embedding_opt_params}; - embeddings.emplace_back(new HybridSparseEmbedding( - core_helper::convert_sparse_tensors23_to_sparse_tensors( - sparse_input.train_sparse_tensors), - core_helper::convert_sparse_tensors23_to_sparse_tensors( - sparse_input.evaluate_sparse_tensors), - embedding_params, embed_wgrad_buff, gpu_lr_sches, use_cuda_graph, resource_manager)); - break; - } default: HCTR_OWN_THROW(Error_t::UnspecificError, "add_sparse_embedding with no specified embedding type."); @@ -306,25 +228,25 @@ void add_sparse_embedding(SparseEmbedding& sparse_embedding, template void add_sparse_embedding( SparseEmbedding&, std::map>&, std::vector>&, std::vector>&, - std::vector>&, const std::shared_ptr&, size_t, - size_t, OptParams&, std::shared_ptr&, bool, bool, size_t, - GpuLearningRateSchedulers&); + std::vector>&, const std::shared_ptr&, + const std::shared_ptr&, size_t, size_t, OptParams&, + std::shared_ptr&, bool, bool, size_t, GpuLearningRateSchedulers&); template void add_sparse_embedding( SparseEmbedding&, std::map>&, std::vector>&, std::vector>&, - std::vector>&, const std::shared_ptr&, size_t, - size_t, OptParams&, std::shared_ptr&, bool, bool, size_t, - GpuLearningRateSchedulers&); + std::vector>&, const std::shared_ptr&, + const std::shared_ptr&, size_t, size_t, OptParams&, + std::shared_ptr&, bool, bool, size_t, GpuLearningRateSchedulers&); template void add_sparse_embedding( SparseEmbedding&, std::map>&, std::vector>&, std::vector>&, - std::vector>&, const std::shared_ptr&, size_t, - size_t, OptParams&, std::shared_ptr&, bool, bool, size_t, - GpuLearningRateSchedulers&); + std::vector>&, const std::shared_ptr&, + const std::shared_ptr&, size_t, size_t, OptParams&, + std::shared_ptr&, bool, bool, size_t, GpuLearningRateSchedulers&); template void add_sparse_embedding( SparseEmbedding&, std::map>&, std::vector>&, std::vector>&, - std::vector>&, const std::shared_ptr&, size_t, - size_t, OptParams&, std::shared_ptr&, bool, bool, size_t, - GpuLearningRateSchedulers&); + std::vector>&, const std::shared_ptr&, + const std::shared_ptr&, size_t, size_t, OptParams&, + std::shared_ptr&, bool, bool, size_t, GpuLearningRateSchedulers&); } // namespace HugeCTR diff --git a/HugeCTR/src/pybind/model.cpp b/HugeCTR/src/pybind/model.cpp index 8a419ea282..ed0b08a30e 100644 --- a/HugeCTR/src/pybind/model.cpp +++ b/HugeCTR/src/pybind/model.cpp @@ -22,15 +22,13 @@ #include #include #include -#include #include -#include #include #include #include #include #include -#include +#include #include using namespace HugeCTR::MultiHot; @@ -38,58 +36,6 @@ namespace HugeCTR { namespace { 
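Across these hunks the all-reduce communicator moves off the resource manager and onto the new collective manager (collectives/collective.hpp and collective.cpp in the file list): add_sparse_embedding and the ExchangeWgrad classes gain a collective-manager argument, and the model now builds it next to ResourceManagerCore. A minimal sketch of the new wiring; the type names CollectiveManager and AllReduceAlgo and the __half/float template arguments are assumptions filled in where this diff's text lost its angle brackets:

#include <memory>

#include <collectives/collective.hpp>
#include <exchange_wgrad.hpp>
#include <resource_manager.hpp>

namespace HugeCTR {

// Sketch: build the wgrad exchange roughly as init_exchange_wgrad does after this patch.
std::shared_ptr<ExchangeWgrad> make_exchange_wgrad_sketch(
    AllReduceAlgo all_reduce_algo, bool use_mixed_precision,
    const std::shared_ptr<ResourceManager>& resource_manager) {
  // The AR communicator now lives on the collective manager, not the resource manager.
  auto collective_manager = std::make_shared<CollectiveManager>(resource_manager);
  collective_manager->set_ar_comm(all_reduce_algo, use_mixed_precision);

  // Both ExchangeWgrad flavors take the two managers and fetch ar_comm from the
  // collective manager in init_ar_comm()/allreduce(); the grouped_all_reduce path
  // would construct GroupedExchangeWgrad<T> the same way.
  if (use_mixed_precision) {
    return std::make_shared<NetworkExchangeWgrad<__half>>(resource_manager, collective_manager);
  }
  return std::make_shared<NetworkExchangeWgrad<float>>(resource_manager, collective_manager);
}

}  // namespace HugeCTR

Callers such as add_sparse_embedding then receive the same collective_manager pointer alongside the resource manager, matching the updated signatures in the hunks above.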
-/** - * check if device is available. - * lowest available CC is min_major.min_minor - * @param device_id gpu id - * @param min_major minimum compute compatibility required - * @param min_minor minimum compute compatibility required - */ -// #define DATA_READING_TEST 1 -static std::vector& split(const std::string& s, char delim, - std::vector& elems) { - std::istringstream is(s); - std::string item; - while (std::getline(is, item, delim)) { - elems.push_back(item); - } - return elems; -} - -static std::string join(std::vector& strs, std::string delim) { - std::string str; - const std::vector::iterator itlast = strs.end() - 1; - for (auto it = strs.begin(); it != strs.end(); it++) { - str += *it; - if (it != itlast) { - str += delim; - } - } - return str; -} - -static std::string get_tensor_shape(std::string tensor_name, - std::map> tensor_shape_info) { - std::string shape = ""; - if (tensor_shape_info.find(tensor_name) != tensor_shape_info.end()) { - shape += "("; - for (unsigned int i = 0; i < tensor_shape_info[tensor_name].size(); i++) { - shape += std::to_string(tensor_shape_info[tensor_name][i]); - shape += ","; - } - shape.back() = ')'; - } - return shape; -} -static std::string get_tensor_shape(std::string tensor_name, - std::map tensor_shape_info) { - std::stringstream ss; - if (tensor_shape_info.find(tensor_name) != tensor_shape_info.end()) { - ss << tensor_shape_info[tensor_name]; - } - return ss.str(); -} - static void check_device(int device_id, int min_major, int min_minor) { int device_count = 0; HCTR_LIB_THROW(cudaGetDeviceCount(&device_count)); @@ -115,25 +61,6 @@ static void check_device(int device_id, int min_major, int min_minor) { return; } -template -auto load_key_files(std::vector const& key_files) { - std::vector keys_vec; - for (auto const& key_file : key_files) { - auto key_file_size = std::filesystem::file_size(key_file); - auto num_new_keys = key_file_size / sizeof(TypeKey); - std::ifstream key_fs(key_file, std::ifstream::binary); - if (!key_fs.is_open()) { - HCTR_OWN_THROW(Error_t::WrongInput, "Cannot open the file: " + key_file); - } - auto num_exist_keys = keys_vec.size(); - keys_vec.resize(num_exist_keys + num_new_keys); - key_fs.read(reinterpret_cast(&keys_vec[num_exist_keys]), key_file_size); - } - std::sort(keys_vec.begin(), keys_vec.end()); - keys_vec.erase(std::unique(keys_vec.begin(), keys_vec.end()), keys_vec.end()); - return keys_vec; -} - } // end namespace DenseLayerComputeConfig::DenseLayerComputeConfig() : async_wgrad(false), fuse_wb(false){}; @@ -235,16 +162,14 @@ SparseEmbedding::SparseEmbedding(Embedding_t embedding_type, size_t workspace_si size_t embedding_vec_size, const std::string& combiner_str, std::string sparse_embedding_name, std::string bottom_name, std::vector& slot_size_array, - std::shared_ptr& embedding_opt_params, - const HybridEmbeddingParam& hybrid_embedding_param) + std::shared_ptr& embedding_opt_params) : embedding_type(embedding_type), workspace_size_per_gpu_in_mb(workspace_size_per_gpu_in_mb), embedding_vec_size(embedding_vec_size), sparse_embedding_name(sparse_embedding_name), bottom_name(bottom_name), slot_size_array(slot_size_array), - embedding_opt_params(embedding_opt_params), - hybrid_embedding_param(hybrid_embedding_param) { + embedding_opt_params(embedding_opt_params) { if (combiner_str == "sum") { combiner = 0; } else if (combiner_str == "mean") { @@ -364,21 +289,26 @@ void init_learning_rate_scheduler(std::shared_ptr& lr_sch } void init_exchange_wgrad(const std::shared_ptr& resource_manager, + const 
std::shared_ptr& collective_manager, std::shared_ptr& exchange_wgrad, const Solver& solver) { HCTR_LOG(INFO, ROOT, "Using All-reduce algorithm: %s\n", ALLREDUCE_ALGO_TO_STRING[solver.all_reduce_algo].c_str()); - resource_manager->set_ar_comm(solver.all_reduce_algo, solver.use_mixed_precision); + collective_manager->set_ar_comm(solver.all_reduce_algo, solver.use_mixed_precision); if (solver.grouped_all_reduce) { if (solver.use_mixed_precision) { - exchange_wgrad = std::make_shared>(resource_manager); + exchange_wgrad = + std::make_shared>(resource_manager, collective_manager); } else { - exchange_wgrad = std::make_shared>(resource_manager); + exchange_wgrad = + std::make_shared>(resource_manager, collective_manager); } } else { if (solver.use_mixed_precision) { - exchange_wgrad = std::make_shared>(resource_manager); + exchange_wgrad = + std::make_shared>(resource_manager, collective_manager); } else { - exchange_wgrad = std::make_shared>(resource_manager); + exchange_wgrad = + std::make_shared>(resource_manager, collective_manager); } } } @@ -408,13 +338,12 @@ Model::Model(const Solver& solver, const DataReaderParams& reader_params, } else { HCTR_LOG(INFO, ROOT, "Initialize model: %s\n", solver_.model_name.c_str()); } - resource_manager_ = ResourceManagerExt::create(solver.vvgpu, solver.seed, solver.device_layout); - + resource_manager_ = ResourceManagerCore::create(solver.vvgpu, solver.seed, solver.device_layout); + collective_manager_ = std::make_shared(resource_manager_); embedding_para_io_ = std::shared_ptr( new embedding::EmbeddingParameterIO(resource_manager_)); - init_exchange_wgrad(resource_manager_, exchange_wgrad_, solver_); + init_exchange_wgrad(resource_manager_, collective_manager_, exchange_wgrad_, solver_); - graph_scheduler_ = std::make_unique(resource_manager_); for (auto dev : resource_manager_->get_local_gpu_device_id_list()) { if (solver_.use_mixed_precision) { check_device(dev, 7, @@ -507,719 +436,6 @@ void Model::construct_from_json(const std::string& graph_config_file, bool inclu HCTR_LOG(INFO, ROOT, "Load the model graph from %s successfully\n", graph_config_file.c_str()); } -// deep copy -void Model::create_copy_ops_for_network_input(const std::string& dense_name, - const std::string& label_name, bool is_train) { - auto& copy_ops = is_train ? graph_.train_copy_ops_ : graph_.evaluate_copy_ops_; - auto& tensor_entries_list = - is_train ? 
train_tensor_entities_list_ : evaluate_tensor_entities_list_; - - int num_local_gpus = resource_manager_->get_local_gpu_count(); - // copy ops for dense & label - copy_ops.resize(2 * num_local_gpus); - - for (int id = 0; id < num_local_gpus; ++id) { - core23::Device device(core23::DeviceType::GPU, - resource_manager_->get_local_gpu(id)->get_device_id()); - for (auto& tensor_entry : tensor_entries_list[id]) { - if (tensor_entry.name == dense_name) { - copy_ops[id].reset( - new CopyOpImpl(resource_manager_->get_local_gpu(id), tensor_entry.tensor)); - tensor_entry.tensor = copy_ops[id]->get_tensorbag(); - } else if (tensor_entry.name == label_name) { - copy_ops[id + num_local_gpus].reset( - new CopyOpImpl(resource_manager_->get_local_gpu(id), tensor_entry.tensor)); - tensor_entry.tensor = copy_ops[id + num_local_gpus]->get_tensorbag(); - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "wrong tensor entry name when creating copy_op."); - } - } - } -} - -void Model::add(Input& input) { - std::string label_name = input.labels_.begin()->first; - int label_dim = input.labels_.begin()->second; - - // If multiple labels, treat them as 1 big label and add a split layer (below) - if (input.labels_.size() > 1) { - label_name = "combined_multi_label"; - label_dim = std::accumulate(std::begin(input.labels_), std::end(input.labels_), 0, - [](const int previous, const std::pair& p) { - return previous + p.second; - }); - } - - input_params_.push_back(input); - activate_tensor(tensor_active_, label_name); - activate_tensor(tensor_active_, input.dense_name); - data_input_info_.push_back(label_name); - data_input_info_.push_back(input.dense_name); - tensor_shape_info_raw_.insert( - std::make_pair(label_name, std::vector{solver_.batchsize, label_dim})); - tensor_shape_info_raw_.insert( - std::make_pair(input.dense_name, std::vector{solver_.batchsize, input.dense_dim})); - if (solver_.use_embedding_collection) { - std::vector top_name_list; - std::vector nnz_per_slot; - bool is_fixed_length = true; - int num_slot = 0; - for (size_t i = 0; i < input.data_reader_sparse_param_array.size(); ++i) { - auto& p = input.data_reader_sparse_param_array[i]; - top_name_list.push_back(p.top_name); - if (p.slot_num != 1) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "To use embedding collection, slots_num should be set to 1 in each sparse_param. 
" - "Please refer to notebooks/embedding_collection.ipynb and separate your multi-slot " - "output into multiple single-slot output"); - } - nnz_per_slot.push_back(p.nnz_per_slot[0]); - if (!p.is_fixed_length) is_fixed_length = false; - num_slot += 1; - hotness_map_.insert({p.top_name, p.max_feature_num}); - } - std::string concat_top_name = join(top_name_list, ","); - DataReaderSparseParam concat_data_reader_sparse_param{concat_top_name, nnz_per_slot, - is_fixed_length, num_slot}; - input.data_reader_sparse_param_array = {concat_data_reader_sparse_param}; - } - std::vector sparse_names; - for (size_t i = 0; i < input.data_reader_sparse_param_array.size(); ++i) { - sparse_names.push_back(input.data_reader_sparse_param_array[i].top_name); - tensor_shape_info_raw_.insert(std::make_pair( - input.data_reader_sparse_param_array[i].top_name, - std::vector{solver_.batchsize, input.data_reader_sparse_param_array[i].slot_num})); - } - data_input_info_.push_back(join(sparse_names, ",")); - for (unsigned int i = 0; i < input.data_reader_sparse_param_array.size(); i++) { - activate_tensor(tensor_active_, input.data_reader_sparse_param_array[i].top_name); - } - if (solver_.i64_input_key) { - add_input(input, reader_params_, sparse_input_map_64_, train_tensor_entities_list_, - evaluate_tensor_entities_list_, train_data_reader_, evaluate_data_reader_, - init_data_reader_, solver_.batchsize, solver_.batchsize_eval, - solver_.use_mixed_precision, solver_.repeat_dataset, - solver_.train_intra_iteration_overlap, solver_.num_iterations_statistics, - resource_manager_); - } else { - add_input(input, reader_params_, sparse_input_map_32_, - train_tensor_entities_list_, evaluate_tensor_entities_list_, - train_data_reader_, evaluate_data_reader_, init_data_reader_, - solver_.batchsize, solver_.batchsize_eval, solver_.use_mixed_precision, - solver_.repeat_dataset, solver_.train_intra_iteration_overlap, - solver_.num_iterations_statistics, resource_manager_); - } - - if (solver_.use_embedding_collection and solver_.train_inter_iteration_overlap) { - create_copy_ops_for_network_input(input.dense_name, label_name, true); - } - if (solver_.use_embedding_collection and solver_.eval_inter_iteration_overlap) { - create_copy_ops_for_network_input(input.dense_name, label_name, false); - } - - // Add label weights to model - for (std::map::iterator iter = input.label_weights_.begin(); - iter != input.label_weights_.end(); ++iter) { - label_weights_.insert(std::make_pair(iter->first, iter->second)); - } - - // If multiple labels provided, add a Slice layer to handle breaking up the label - if (input.labels_.size() > 1) { - std::vector label_names; - std::vector> ranges; - int idx = 0; - - for (std::map::iterator iter = input.labels_.begin(); - iter != input.labels_.end(); ++iter) { - label_names.push_back(iter->first); - if (iter->second < 1) { - HCTR_OWN_THROW(Error_t::WrongInput, "Each label dimension must be at lesat 1."); - } - ranges.push_back(std::make_pair(idx, idx + iter->second)); - idx += iter->second; - } - std::vector bottom_name{"combined_multi_label"}; - DenseLayer label_slice_layer = DenseLayer(Layer_t::Slice, bottom_name, label_names); - label_slice_layer.ranges = ranges; - - add(label_slice_layer); - } -} - -void Model::add(SparseEmbedding& sparse_embedding) { - if (resource_manager_->get_num_process() == 1 && solver_.grouped_all_reduce && - sparse_embedding.embedding_type == Embedding_t::HybridSparseEmbedding) { - HCTR_DIE("Grouped all reduce for HybridEmbedding is not supported on single node\n"); - } - if 
((reader_params_.data_reader_type == DataReaderType_t::RawAsync && - sparse_embedding.embedding_type != Embedding_t::HybridSparseEmbedding) || - (reader_params_.data_reader_type != DataReaderType_t::RawAsync && - sparse_embedding.embedding_type == Embedding_t::HybridSparseEmbedding)) { - HCTR_OWN_THROW(Error_t::WrongInput, "Raw async reader and hybrid embedding must come together"); - } - OptParams embedding_opt_params; - if (!(sparse_embedding.embedding_opt_params)->initialized) { - sparse_embedding.embedding_opt_params = opt_params_py_; - sparse_embedding.initialize_max_vocabulary_size_per_gpu(); - } - sparse_embedding.max_vocabulary_size_global = - sparse_embedding.max_vocabulary_size_per_gpu * resource_manager_->get_global_gpu_count(); - sparse_embedding_params_.push_back(sparse_embedding); - deactivate_tensor(tensor_active_, sparse_embedding.bottom_name); - activate_tensor(tensor_active_, sparse_embedding.sparse_embedding_name); - int slot_num = tensor_shape_info_raw_[sparse_embedding.bottom_name][1]; - tensor_shape_info_raw_.insert( - std::make_pair(sparse_embedding.sparse_embedding_name, - std::vector{solver_.batchsize, slot_num, - static_cast(sparse_embedding.embedding_vec_size)})); - input_output_info_.push_back( - std::make_pair(sparse_embedding.bottom_name, sparse_embedding.sparse_embedding_name)); - layer_info_.push_back(EMBEDDING_TYPE_TO_STRING[sparse_embedding.embedding_type]); - - embedding_opt_params_list_.push_back(sparse_embedding.embedding_opt_params); - init_optimizer_params(embedding_opt_params, solver_, sparse_embedding.embedding_opt_params); - if (solver_.i64_input_key && !solver_.use_mixed_precision) { - add_sparse_embedding( - sparse_embedding, sparse_input_map_64_, train_tensor_entities_list_, - evaluate_tensor_entities_list_, embeddings_, resource_manager_, solver_.batchsize, - solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, solver_.use_cuda_graph, - solver_.grouped_all_reduce, solver_.num_iterations_statistics, gpu_lr_sches_); - } else if (solver_.i64_input_key && solver_.use_mixed_precision) { - add_sparse_embedding( - sparse_embedding, sparse_input_map_64_, train_tensor_entities_list_, - evaluate_tensor_entities_list_, embeddings_, resource_manager_, solver_.batchsize, - solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, solver_.use_cuda_graph, - solver_.grouped_all_reduce, solver_.num_iterations_statistics, gpu_lr_sches_); - } else if (!solver_.i64_input_key && !solver_.use_mixed_precision) { - add_sparse_embedding( - sparse_embedding, sparse_input_map_32_, train_tensor_entities_list_, - evaluate_tensor_entities_list_, embeddings_, resource_manager_, solver_.batchsize, - solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, solver_.use_cuda_graph, - solver_.grouped_all_reduce, solver_.num_iterations_statistics, gpu_lr_sches_); - } else { - add_sparse_embedding( - sparse_embedding, sparse_input_map_32_, train_tensor_entities_list_, - evaluate_tensor_entities_list_, embeddings_, resource_manager_, solver_.batchsize, - solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, solver_.use_cuda_graph, - solver_.grouped_all_reduce, solver_.num_iterations_statistics, gpu_lr_sches_); - } - embeddings_map_.insert( - std::make_pair(sparse_embedding.sparse_embedding_name, embeddings_.back())); - embedding_dependent_tensors_.insert(sparse_embedding.sparse_embedding_name); -} - -void Model::add(DenseLayer& dense_layer) { - for (auto& top_name : dense_layer.top_names) { - if (tensor_shape_info_raw_.find(top_name) != 
tensor_shape_info_raw_.end()) { - HCTR_OWN_THROW(Error_t::WrongInput, top_name + ", top tensor name already exists"); - } - } - for (auto& bottom_name : dense_layer.bottom_names) { - if (tensor_shape_info_raw_.find(bottom_name) == tensor_shape_info_raw_.end()) { - HCTR_OWN_THROW(Error_t::WrongInput, bottom_name + ", bottom tensor name does not exists"); - } - } - calculate_tensor_dimensions(tensor_shape_info_raw_, dense_layer); - dense_layer_params_raw_.push_back(dense_layer); -} - -template -void allocate_ebc_output_helper_for_feature_major( - std::shared_ptr resource_manager_, size_t batch_size_per_gpu, - const EmbeddingCollectionConfig& ebc_config, - const embedding::EmbeddingCollectionParam& ebc_param, - std::vector>& tensor_entries_list_, - std::vector& ebc_output) { - HCTR_CHECK(ebc_config.output_layout_ == embedding::EmbeddingLayout::FeatureMajor); - int num_local_gpus = resource_manager_->get_local_gpu_count(); - for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { - CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); - core23::Device device(core23::DeviceType::GPU, - resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); - auto buffer_channel = core23::GetRandomBufferChannel(); - core23::Tensor head_tensor; - core23::BufferParams buffer_param{.channel = buffer_channel}; - core23::TensorParams tensor_param = core23::TensorParams().buffer_params(buffer_param); - int64_t concat_dims = 0; - for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { - const embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; - std::string top_name = ebc_config.top_names_[lookup_id]; - int64_t emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) - ? lookup_param.max_hotness * lookup_param.ev_size - : lookup_param.ev_size; - - core23::Tensor tmp_tensor(tensor_param.shape({(int64_t)batch_size_per_gpu, 1ll, emb_out_dims}) - .device(device) - .data_type(core23::ToScalarType::value)); - concat_dims += emb_out_dims; - tensor_entries_list_[local_gpu_id].push_back({top_name, tmp_tensor}); - if (!lookup_id) { - head_tensor = tmp_tensor; - } - } - // allocate - void* starting_address = head_tensor.data(); - core23::Tensor continous_emb_output = core23::Tensor::bind( - starting_address, core23::Shape({static_cast(batch_size_per_gpu), concat_dims}), - core23::ToScalarType::value, device); - ebc_output.push_back(continous_emb_output); - } -} - -template -void allocate_ebc_output_helper_for_batch_major( - std::shared_ptr resource_manager_, size_t batch_size_per_gpu, - const EmbeddingCollectionConfig& ebc_config, - const embedding::EmbeddingCollectionParam& ebc_param, - std::vector>& tensor_entries_list_, - std::vector& ebc_output) { - int num_local_gpus = resource_manager_->get_local_gpu_count(); - for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { - CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); - - core23::Device device(core23::DeviceType::GPU, - resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); - core23::TensorParams tensor_param; - int64_t emb_out_dims = 0; - for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { - const embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; - - emb_out_dims += (lookup_param.combiner == embedding::Combiner::Concat) - ? 
lookup_param.max_hotness * lookup_param.ev_size - : lookup_param.ev_size; - } - - core23::Tensor continous_emb_output( - tensor_param.shape({(int64_t)batch_size_per_gpu, emb_out_dims}) - .device(device) - .data_type(core23::ToScalarType::value)); - continous_emb_output.data(); - ebc_output.push_back(continous_emb_output); - - tensor_entries_list_[local_gpu_id].push_back( - {ebc_config.batch_major_output_name_, continous_emb_output}); - } -} - -std::vector get_table_id_to_vocabulary_size( - const std::vector& table_params, bool need_vocabulary_size) { - // indices only need to initialize table offset - if (!need_vocabulary_size) { - return {}; - } - - // 2. init table_id_to_vocabulary_size and check if there is dynamic table - std::vector table_id_to_vocabulary_size; - std::transform(table_params.begin(), table_params.end(), - std::back_inserter(table_id_to_vocabulary_size), - [](const embedding::EmbeddingTableParam& table_param) { - return table_param.max_vocabulary_size; - }); - - std::for_each(table_id_to_vocabulary_size.begin(), table_id_to_vocabulary_size.end(), - [](int vocabulary_size) { - HCTR_CHECK_HINT(vocabulary_size > 0, "vocabuary_size should > 0."); - }); - return table_id_to_vocabulary_size; -} - -void Model::add(const EmbeddingCollectionConfig& user_ebc_config) { - auto ebc_config = split_column_wise_sharding_config(user_ebc_config); - TableNameToIDDict table_name_to_id_dict = - create_table_name_to_id_dict_from_ebc_config(ebc_config); - int global_ebc_id = static_cast(ebc_list_.size()); - for (auto& [name, id] : table_name_to_id_dict) { - HCTR_CHECK_HINT(ebc_name_to_global_id_dict_.find(name) == ebc_name_to_global_id_dict_.end(), - "Duplicate table name: ", name, "\n"); - ebc_name_to_global_id_dict_[name] = {global_ebc_id, id}; - } - int num_total_gpus = resource_manager_->get_global_gpu_count(); - int num_local_gpus = resource_manager_->get_local_gpu_count(); - - int num_lookup = ebc_config.lookup_configs_.size(); - core23::DataType key_type = - solver_.i64_input_key ? core23::ScalarType::Int64 : core23::ScalarType::UInt32; - core23::DataType index_type = - solver_.i64_input_key ? core23::ScalarType::UInt64 : core23::ScalarType::UInt32; - core23::DataType offset_type = - solver_.i64_input_key ? core23::ScalarType::Int64 : core23::ScalarType::UInt32; - core23::DataType emb_type = - solver_.use_mixed_precision ? core23::ScalarType::Half : core23::ScalarType::Float; - core23::DataType wgrad_type = - solver_.use_mixed_precision ? core23::ScalarType::Half : core23::ScalarType::Float; - embedding::EmbeddingLayout input_layout_ = - reader_params_.data_reader_type == DataReaderType_t::RawAsync - ? 
embedding::EmbeddingLayout::FeatureMajor - : embedding::EmbeddingLayout::BatchMajor; - - std::vector bottom_name_list; - for (auto& bottom_name : ebc_config.bottom_names_) { - bottom_name_list.push_back(bottom_name); - } - - std::string bottom_name = join(bottom_name_list, ","); - deactivate_tensor(tensor_active_, bottom_name); - - layer_info_.push_back("EmbeddingCollection" + std::to_string(ebc_list_.size())); - - auto lookup_params = create_lookup_params_from_ebc_config(table_name_to_id_dict, ebc_config); - for (int lookup_id = 0; lookup_id < num_lookup; ++lookup_id) { - auto b_name = ebc_config.bottom_names_[ebc_config.dr_lookup_ids_[lookup_id]]; - lookup_params[lookup_id].max_hotness = hotness_map_[b_name]; - } - - auto shard_matrix = create_shard_matrix_from_ebc_config(table_name_to_id_dict, ebc_config); - - auto grouped_emb_params = - create_grouped_embedding_param_from_ebc_config(table_name_to_id_dict, ebc_config); - - int num_table = ebc_config.emb_table_config_list_.size(); - auto emb_table_list = create_table_params_from_ebc_config(table_name_to_id_dict, ebc_config); - for (auto& p : emb_table_list) { - if (p.opt_param.optimizer == Optimizer_t::NOT_INITIALIZED) { - p.opt_param = opt_params_; - } - } - - embedding::AllreduceStrategy allreduce_strategy = ebc_config.allreduce_strategy_; - if (solver_.grouped_all_reduce) { - allreduce_strategy = embedding::AllreduceStrategy::GroupDense; - } - - auto compression_param = - create_compression_param_from_ebc_config(table_name_to_id_dict, ebc_config); - embedding::EmbeddingCollectionParam ebc_param{num_table, - num_lookup, - lookup_params, - shard_matrix, - grouped_emb_params, - solver_.batchsize, - key_type, - index_type, - offset_type, - emb_type, - wgrad_type, - input_layout_, - ebc_config.output_layout_, - ebc_config.sort_strategy_, - ebc_config.keys_preprocess_strategy_, - allreduce_strategy, - ebc_config.comm_strategy_, - compression_param}; - - embedding::EmbeddingCollectionParam eval_ebc_param{num_table, - num_lookup, - lookup_params, - shard_matrix, - grouped_emb_params, - solver_.batchsize_eval, - key_type, - index_type, - offset_type, - emb_type, - wgrad_type, - input_layout_, - ebc_config.output_layout_, - ebc_config.sort_strategy_, - ebc_config.keys_preprocess_strategy_, - ebc_config.allreduce_strategy_, - ebc_config.comm_strategy_, - compression_param}; - - std::vector> core_list; - - for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { - auto core_resource_manager = - std::make_shared(resource_manager_, local_gpu_id); - core_list.push_back(core_resource_manager); - } - ebc_list_.push_back(std::make_unique( - resource_manager_, core_list, ebc_param, eval_ebc_param, emb_table_list, exchange_wgrad_)); - embedding_para_io_->add_embedding_collection((ebc_list_[ebc_list_.size() - 1]).get()); - - auto prepare_ebc_input = [&](auto& sparse_input_map, bool is_longlong) { - core23::DataType SparseType = is_longlong ? 
core23::DataType(core23::ScalarType::Int64) - : core23::DataType(core23::ScalarType::UInt32); - auto tensor_as_type = [&](core23::Tensor input, core23::DataType expected_type) { - auto origin_type = input.data_type(); - HCTR_CHECK_HINT(origin_type.size() == expected_type.size(), - "Size not equal, cannot reinterpret type"); - return core23::Tensor::bind(input.data(), input.shape(), expected_type, input.device()); - }; - auto train_sparse_tensors = sparse_input_map[bottom_name].train_sparse_tensors; - auto evaluate_sparse_tensors = sparse_input_map[bottom_name].evaluate_sparse_tensors; - - for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { - CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); - core23::Device device{core23::DeviceType::GPU, - static_cast( - resource_manager_->get_local_gpu(local_gpu_id)->get_device_id())}; - auto train_key_tensor = - tensor_as_type(train_sparse_tensors[local_gpu_id].get_value_tensor(), SparseType); - train_ebc_key_list_.push_back(train_key_tensor); - - auto train_bucket_range_tensor = - tensor_as_type(train_sparse_tensors[local_gpu_id].get_rowoffset_tensor(), SparseType); - train_ebc_bucket_range_list_.push_back(train_bucket_range_tensor); - - train_ebc_num_keys_list_.push_back(train_sparse_tensors[local_gpu_id].get_nnz_ptr().get()); - - auto evaluate_key_tensor = - tensor_as_type(evaluate_sparse_tensors[local_gpu_id].get_value_tensor(), SparseType); - evaluate_ebc_key_list_.push_back(evaluate_key_tensor); - - auto evaluate_bucket_range_tensor = - tensor_as_type(evaluate_sparse_tensors[local_gpu_id].get_rowoffset_tensor(), SparseType); - evaluate_ebc_bucket_range_list_.push_back(evaluate_bucket_range_tensor); - - evaluate_ebc_num_keys_list_.push_back( - evaluate_sparse_tensors[local_gpu_id].get_nnz_ptr().get()); - } - }; - - if (reader_params_.data_reader_type != DataReaderType_t::RawAsync) { - if (solver_.i64_input_key) { - prepare_ebc_input(sparse_input_map_64_, true); - } else { - prepare_ebc_input(sparse_input_map_32_, false); - } - } - - // activate_ebc_output_tensor - size_t batch_size_per_gpu = solver_.batchsize / num_total_gpus; - size_t eval_batch_size_per_gpu = solver_.batchsize_eval / num_total_gpus; - if (ebc_param.output_layout_ == embedding::EmbeddingLayout::FeatureMajor) { - std::vector top_name_list; - for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { - embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; - int emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) - ? 
lookup_param.max_hotness * lookup_param.ev_size - : lookup_param.ev_size; - - std::string top_name = ebc_config.top_names_[lookup_id]; - top_name_list.push_back(top_name); - - activate_tensor(tensor_active_, top_name); - tensor_shape_info_raw_.insert({top_name, {solver_.batchsize, 1, emb_out_dims}}); - embedding_dependent_tensors_.insert(top_name); - } - input_output_info_.push_back(std::make_pair(bottom_name, join(top_name_list, ","))); - if (solver_.use_mixed_precision) { - allocate_ebc_output_helper_for_feature_major<__half>( - resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, - train_ebc_outptut_); - allocate_ebc_output_helper_for_feature_major<__half>( - resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, - evaluate_tensor_entities_list_, evaluate_ebc_outptut_); - } else { - allocate_ebc_output_helper_for_feature_major( - resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, - train_ebc_outptut_); - allocate_ebc_output_helper_for_feature_major( - resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, - evaluate_tensor_entities_list_, evaluate_ebc_outptut_); - } - } else { - int concate_out_dims = 0; - for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { - embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; - - int emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) - ? lookup_param.max_hotness * lookup_param.ev_size - : lookup_param.ev_size; - concate_out_dims += emb_out_dims; - } - - activate_tensor(tensor_active_, ebc_config.batch_major_output_name_); - tensor_shape_info_raw_.insert( - {ebc_config.batch_major_output_name_, {solver_.batchsize, concate_out_dims}}); - input_output_info_.push_back(std::make_pair(bottom_name, ebc_config.batch_major_output_name_)); - embedding_dependent_tensors_.insert(ebc_config.batch_major_output_name_); - - // allocate output buffer - if (solver_.use_mixed_precision) { - allocate_ebc_output_helper_for_batch_major<__half>( - resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, - train_ebc_outptut_); - allocate_ebc_output_helper_for_batch_major<__half>( - resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, - evaluate_tensor_entities_list_, evaluate_ebc_outptut_); - } else { - allocate_ebc_output_helper_for_batch_major( - resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, - train_ebc_outptut_); - allocate_ebc_output_helper_for_batch_major( - resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, - evaluate_tensor_entities_list_, evaluate_ebc_outptut_); - } - } - - train_ddl_output_.clear(); - cache_train_ddl_output_.clear(); - evaluate_ddl_output_.clear(); - cache_evaluate_ddl_output_.clear(); - for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { - train_ddl_output_.push_back( - allocate_output_for_data_distributor(core_list[local_gpu_id], ebc_param)); - if (solver_.train_inter_iteration_overlap) { - cache_train_ddl_output_.push_back( - allocate_output_for_data_distributor(core_list[local_gpu_id], ebc_param)); - } - evaluate_ddl_output_.push_back( - allocate_output_for_data_distributor(core_list[local_gpu_id], eval_ebc_param)); - if (solver_.eval_inter_iteration_overlap) { - cache_evaluate_ddl_output_.push_back( - allocate_output_for_data_distributor(core_list[local_gpu_id], eval_ebc_param)); - } - } - - // create data distributors - train_data_distributor_ = 
std::make_shared(core_list, ebc_param, emb_table_list, - ebc_config.dr_lookup_ids_); - eval_data_distributor_ = std::make_shared( - core_list, eval_ebc_param, emb_table_list, ebc_config.dr_lookup_ids_); -} - -void Model::pre_add_dense_layer(DenseLayer& dense_layer) { - embedding_dependent_ = false; - for (auto& bottom_name : dense_layer.bottom_names) { - deactivate_tensor(tensor_active_, bottom_name); - if (embedding_dependent_tensors_.find(bottom_name) != embedding_dependent_tensors_.end()) { - embedding_dependent_ = true; - } - } - for (auto& top_name : dense_layer.top_names) { - activate_tensor(tensor_active_, top_name); - if (embedding_dependent_) { - embedding_dependent_tensors_.insert(top_name); - } - } - std::string input_names = join(dense_layer.bottom_names, ","); - std::string output_names = join(dense_layer.top_names, ","); - input_output_info_.push_back(std::make_pair(input_names, output_names)); - if (solver_.use_mixed_precision) { - layer_info_.push_back(LAYER_TYPE_TO_STRING_MP[dense_layer.layer_type]); - } else { - layer_info_.push_back(LAYER_TYPE_TO_STRING[dense_layer.layer_type]); - } -} - -void Model::graph_analysis() { - HCTR_LOG(INFO, ROOT, "Graph analysis to resolve tensor dependency\n"); - std::map tensor_usage; - std::map tensor_slice_layer; - std::map tensor_slice_index; - for (auto& dense_layer : dense_layer_params_raw_) { - for (auto& bottom_name : dense_layer.bottom_names) { - analyze_tensor(tensor_usage, bottom_name); - } - } - for (auto iter = tensor_usage.begin(); iter != tensor_usage.end(); iter++) { - if (iter->second > 5) { - HCTR_OWN_THROW(Error_t::WrongInput, "The graph should not include more than 5-way branches"); - } - if (iter->second > 1) { - std::vector bottom_names{iter->first}; - std::vector top_names; - std::vector> ranges; - for (unsigned int i = 0; i < iter->second; i++) { - top_names.push_back(iter->first + "_slice" + std::to_string(i)); - auto dims = tensor_shape_info_raw_[iter->first].size(); - ranges.emplace_back(std::make_pair(0, tensor_shape_info_raw_[iter->first][dims - 1])); - } - DenseLayer slice_layer(Layer_t::Slice, bottom_names, top_names); - slice_layer.ranges = ranges; - tensor_slice_layer.insert(std::pair(iter->first, slice_layer)); - tensor_slice_index.insert(std::pair(iter->first, 0)); - HCTR_LOG(INFO, ROOT, "Add Slice layer for tensor: %s, creating %d copies\n", - iter->first.c_str(), iter->second); - } - } - for (auto& dense_layer : dense_layer_params_raw_) { - bool flag = true; - for (auto& bottom_name : dense_layer.bottom_names) { - if (tensor_usage[bottom_name] > 1) { - flag = false; - break; - } - } - if (flag) { - dense_layer_params_.push_back(dense_layer); - } else { - DenseLayer new_dense_layer = dense_layer; - for (unsigned int i = 0; i < new_dense_layer.bottom_names.size(); i++) { - std::string old_bottom_name = new_dense_layer.bottom_names[i]; - if (tensor_slice_index.find(old_bottom_name) != tensor_slice_index.end()) { - auto iter = tensor_slice_layer.find(old_bottom_name); - if (tensor_slice_index[old_bottom_name] == 0) { - dense_layer_params_.push_back(iter->second); - } - std::string new_bottom_name = iter->second.top_names[tensor_slice_index[old_bottom_name]]; - tensor_slice_index[old_bottom_name] += 1; - new_dense_layer.bottom_names[i] = new_bottom_name; - } - } - dense_layer_params_.push_back(new_dense_layer); - } - } - add_dense_layers(dense_layer_params_); -} - -void Model::compile() { - if (!graph_finalized_) { - graph_analysis(); - graph_finalized_ = true; - } - if (data_input_info_.size() < 3 || 
layer_info_.size() < 2) { - HCTR_OWN_THROW(Error_t::IllegalCall, "The model should include input and at least two layers"); - } - HCTR_PRINT(INFO, - "===================================================Model " - "Compile===================================================\n"); - build_networks(); - - // TODO: this is a WAR; need to find a way to remove the preallocation - for (int local_gpu_id = 0; local_gpu_id < resource_manager_->get_local_gpu_count(); - ++local_gpu_id) { - auto device_id = resource_manager_->get_local_gpu(local_gpu_id)->get_device_id(); - core23::Device device(core23::DeviceType::GPU, device_id); - bool success = core23::AllocateBuffers(device); - if (!success) { - HCTR_LOG_S(DEBUG, ROOT) << "Nothing to preallocate" << std::endl; - } - } - core23::Device device_h(core23::DeviceType::CPU); - bool success = core23::AllocateBuffers(device_h); - if (!success) { - HCTR_LOG_S(DEBUG, ROOT) << "Nothing to preallocate" << std::endl; - } - initialize(); - create_metrics(); - create_pipelines(); -} - -void Model::compile(std::vector& label_names, std::vector& label_weights) { - update_label_weights(label_names, label_weights); - compile(); -} - -void Model::update_label_weights(std::vector& label_names, - std::vector& label_weights) { - // Add implementation and support in next merge request - if (label_names.size() != label_weights.size()) { - HCTR_OWN_THROW(Error_t::WrongInput, "Must have the same number of label names and weights"); - } - std::map::iterator loss_lookup; - for (size_t i = 0; i < label_names.size(); ++i) { - loss_lookup = label_weights_.find(label_names[i]); - if (loss_lookup == label_weights_.end()) { - HCTR_OWN_THROW(Error_t::WrongInput, "Label name not found: " + label_names[i]); - } - loss_lookup->second = label_weights[i]; - } -} - void Model::load_dense_optimizer_states(const std::string& dense_opt_states_file) { if (!buff_allocated_) { HCTR_OWN_THROW(Error_t::IllegalCall, @@ -1448,76 +664,6 @@ void Model::embedding_dump(const std::string& path, const std::vector layer_type{layer_info_[i]}; - std::vector input_names; - std::vector output_names; - split(input_output_info_[i].first, ',', input_names); - split(input_output_info_[i].second, ',', output_names); - size_t lines = - input_names.size() > output_names.size() ? 
input_names.size() : output_names.size(); - layer_type.insert(layer_type.end(), lines - 1, ""); - if (lines > input_names.size()) { - input_names.insert(input_names.end(), lines - input_names.size(), ""); - } - if (lines > output_names.size()) { - output_names.insert(output_names.end(), lines - output_names.size(), ""); - } - for (size_t j = 0; j < lines; j++) { - log << std::left << std::setw(40) << std::setfill(' ') << layer_type[j] << std::left - << std::setw(30) << std::setfill(' ') << input_names[j] << std::left << std::setw(30) - << std::setfill(' ') << output_names[j] << std::left << std::setw(30) << std::setfill(' ') - << get_tensor_shape(output_names[j], tensor_shape_info_) << std::endl; - } - log << "----------------------------------------------" - "-----------------------------------" - "---------------------------------" - << std::endl; - } -} - void Model::set_source(std::string source, std::string eval_source) { if (solver_.repeat_dataset) { HCTR_OWN_THROW(Error_t::IllegalCall, @@ -1863,13 +1009,6 @@ void Model::fit(int num_epochs, int max_iter, int display, int eval_interval, in } // end if else high_level_eval_ = false; } -void Model::exchange_wgrad(size_t device_id) { - auto& gpu_resource = resource_manager_->get_local_gpu(device_id); - CudaCPUDeviceContext context(gpu_resource->get_device_id()); - if (resource_manager_->get_global_gpu_count() > 1) { - exchange_wgrad_->allreduce(device_id, gpu_resource->get_stream()); - } -} bool Model::skip_prefetch_in_last_batch(bool is_train) { bool inter_overlap = @@ -1924,9 +1063,6 @@ bool Model::train() { // a file list source, set "num_workers" to a dvisior // of the number of data files in the file list. We // will look into some alternatives in the long term. - if (is_scheduled_datareader() and is_scheduled_embedding()) { - graph_scheduler_->trickling(); - } const char* const skip_h2d_env = std::getenv("SKIP_H2D"); bool skip_h2d = (skip_h2d_env != nullptr && 1 == std::atoi(skip_h2d_env)); @@ -1956,11 +1092,6 @@ bool Model::train() { return true; } - if (is_scheduled_datareader() && is_scheduled_embedding()) { - train_pipeline(current_batchsize); - return true; - } - auto network_update = [&](int id) { networks_[id]->update_params(); }; for (auto& one_embedding : embeddings_) { @@ -2036,26 +1167,22 @@ bool Model::eval() { return true; } - if (is_scheduled_datareader() && is_scheduled_embedding()) { - evaluate_pipeline(current_batchsize); - } else { - for (size_t i = 0; i < embeddings_.size(); ++i) { - auto& one_embedding = embeddings_.at(i); - one_embedding->forward(false); - } + for (size_t i = 0; i < embeddings_.size(); ++i) { + auto& one_embedding = embeddings_.at(i); + one_embedding->forward(false); + } #pragma omp parallel num_threads(number_of_networks()) - { - size_t id = omp_get_thread_num(); - auto gpu = resource_manager_->get_local_gpu(id); + { + size_t id = omp_get_thread_num(); + auto gpu = resource_manager_->get_local_gpu(id); - // doesn't do anything if eval_overlap disabled - graph_.evaluate_pipeline_[id].run(); - } + // doesn't do anything if eval_overlap disabled + graph_.evaluate_pipeline_[id].run(); + } - for (auto& metric : metrics_) { - metric->global_reduce(number_of_networks()); - } + for (auto& metric : metrics_) { + metric->global_reduce(number_of_networks()); } #endif @@ -2376,254 +1503,6 @@ void Model::check_out_tensor(Tensor_t tensor_type, int index, float* global_resu } } -void Model::create_networks() { - for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - 
networks_.emplace_back(new Network(resource_manager_->get_local_cpu(), - resource_manager_->get_local_gpu(i), - solver_.use_mixed_precision)); - } - train_tensor_entities_list_.resize(resource_manager_->get_local_gpu_count()); - evaluate_tensor_entities_list_.resize(resource_manager_->get_local_gpu_count()); -} - -void Model::build_networks() { - for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - networks_[i]->create_and_set_optimizer(opt_params_); - } - auto aligned_size = 16 * resource_manager_->get_local_gpu_count(); - core23::BufferParams bp{.channel = solver_.use_mixed_precision ? GetWgradHalfBufferChannel() - : GetWgradBufferChannel()}; - for (int g = 0; g < resource_manager_->get_local_gpu_count(); g++) { - auto device_id = resource_manager_->get_local_gpu(g)->get_device_id(); - core23::Device device(core23::DeviceType::GPU, device_id); - auto wgrad_buffer = core23::GetBuffer(bp, device); - auto wgrad_size = wgrad_buffer->reserved_size(); - size_t padded_bytes = wgrad_size % aligned_size; - padded_bytes += aligned_size - padded_bytes; - // alignment requirements from grouped allreduce. - wgrad_tensor_successor_.emplace_back(core23::TensorParams() - .device(device) - .shape({static_cast(padded_bytes)}) - .data_type(core23::ScalarType::Char) - .buffer_params(bp)); - } - buff_allocated_ = true; -} - -void Model::initialize() { -#ifndef DATA_READING_TEST - -#pragma omp parallel num_threads(number_of_networks()) - { - size_t id = omp_get_thread_num(); - networks_[id]->initialize(); - if (solver_.use_algorithm_search) { - networks_[id]->search_algorithm(); - } - HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(id)->get_stream())); - } - - int num_gpus = resource_manager_->get_local_gpu_count(); - std::vector wgrad_buffer_ptrs; - size_t wgrad_buffer_size{}; - core23::BufferParams bp{.channel = solver_.use_mixed_precision ? 
GetWgradHalfBufferChannel() - : GetWgradBufferChannel()}; - for (int g = 0; g < num_gpus; g++) { - auto device_id = resource_manager_->get_local_gpu(g)->get_device_id(); - core23::Device device(core23::DeviceType::GPU, device_id); - auto wgrad_buffer = core23::GetBuffer(bp, device); - auto [ptr_, size_] = wgrad_buffer->decay(); - wgrad_buffer_size = size_; - HCTR_CHECK_HINT(size_ && ptr_, "wgrad is null or it's a confederal buffer"); - wgrad_buffer_ptrs.push_back(ptr_); - } - exchange_wgrad_->init_ar_comm(wgrad_buffer_ptrs, wgrad_buffer_size); -#endif - init_params_for_dense_(); - if (solver_.perf_logging) { - for (size_t i = 0; i < dense_layer_params_.size(); i++) { - bool is_trainable = - TRAINABLE_LAYERS.find(dense_layer_params_[i].layer_type) != TRAINABLE_LAYERS.end(); - if (is_trainable) { - std::string output_names = join(dense_layer_params_[i].top_names, "-"); - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "weights_initialization", output_names); - } - } - } - init_params_for_sparse_(); -} // namespace HugeCTR -void Model::create_metrics() { - int num_total_gpus = resource_manager_->get_global_gpu_count(); - int label_dim = input_params_[0].labels_.begin()->second; - if (input_params_[0].labels_.size() > 1) { - auto labs = input_params_[0].labels_; - label_dim = std::accumulate(std::begin(labs), std::end(labs), 0, - [](const int previous, const std::pair& p) { - return previous + p.second; - }); - } - - auto num_metrics = [&]() { return networks_[0]->get_raw_metrics_all().size(); }; - for (const auto& metric : solver_.metrics_spec) { - // Only AUC is currently supported for models with more than one loss layer - if ((metric.first != metrics::Type::AUC) && num_metrics() > 1) { - HCTR_OWN_THROW(Error_t::WrongInput, - "Metrics besides AUC are not supported for multi-task models."); - } - - metrics_.emplace_back(std::move(metrics::Metric::Create( - metric.first, solver_.use_mixed_precision, solver_.batchsize_eval / num_total_gpus, - solver_.max_eval_batches, label_dim, resource_manager_))); - } -} - -void Model::create_pipelines() { - // TODO: currently it is only for HE - if (embeddings_.size() == 1) { - auto lr_scheds = embeddings_[0]->get_learning_rate_schedulers(); - for (size_t i = 0; i < lr_scheds.size(); i++) { - networks_[i]->set_learning_rate_scheduler(lr_scheds[i]); - } - } - - if (is_scheduled_datareader() && is_scheduled_embedding()) { - // will create pipeline for sparse embedding and dense network - create_train_pipeline(networks_); - create_evaluate_pipeline(networks_); - } else { - if (solver_.use_embedding_collection) { - create_train_pipeline_with_ebc(networks_); - create_evaluate_pipeline_with_ebc(networks_); - } else { - // will create pipeline for dense network. 
- create_train_network_pipeline(networks_); - create_eval_network_pipeline(networks_); - } - } - - size_t embed_wgrad_size = 0; - if (!reader_params_.async_param.multi_hot_reader) { - auto train_data_reader_ar_i64 = dynamic_cast*>(train_data_reader_.get()); - auto eval_data_reader_ar_i64 = - dynamic_cast*>(evaluate_data_reader_.get()); - auto init_data_reader_ar_i64 = dynamic_cast*>(init_data_reader_.get()); - - auto train_data_reader_ar_i32 = - dynamic_cast*>(train_data_reader_.get()); - auto eval_data_reader_ar_i32 = - dynamic_cast*>(evaluate_data_reader_.get()); - auto init_data_reader_ar_i32 = - dynamic_cast*>(init_data_reader_.get()); - - // FIXME: - // If doing async indices, the Hybrid Sparse Embedding needs access to the sparse tensor buffers - // since we need to initialize the Frequent & Infrequent indices with those exact buffers. - // Otherwise we allocate two copies (one in AsyncReader and the other in HSE) which will cause - // us to OOM. We need to refactor the Frequent/Infrequent Embedding and IndicesView classes to - // not require the sparse tensor buffers on construction. - for (size_t i = 0; i < sparse_embedding_params_.size(); i++) { - if (sparse_embedding_params_[i].embedding_type == Embedding_t::HybridSparseEmbedding) { - if (solver_.use_mixed_precision && solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - if (solver_.train_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(true, train_data_reader_ar_i64); - } - if (solver_.eval_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(false, eval_data_reader_ar_i64); - } - } else if (solver_.use_mixed_precision && !solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - if (solver_.train_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(true, train_data_reader_ar_i32); - } - if (solver_.eval_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(false, eval_data_reader_ar_i32); - } - } else if (!solver_.use_mixed_precision && solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - if (solver_.train_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(true, train_data_reader_ar_i64); - } - if (solver_.eval_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(false, eval_data_reader_ar_i64); - } - } else { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - if (solver_.train_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(true, train_data_reader_ar_i32); - } - if (solver_.eval_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(false, eval_data_reader_ar_i32); - } - } - } - } - - // start to touch dataset, so we can record run_start - if (solver_.perf_logging) { - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "init_stop"); - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "run_start"); - } - - if (init_data_reader_ar_i32) { - init_data_reader_ar_i32->start(); - init_data_reader_ar_i32->read_a_batch_to_device(); - } - if (init_data_reader_ar_i64) { - init_data_reader_ar_i64->start(); - init_data_reader_ar_i64->read_a_batch_to_device(); - } - - for (size_t i = 0; i < sparse_embedding_params_.size(); i++) { - if (sparse_embedding_params_[i].embedding_type == Embedding_t::HybridSparseEmbedding) { - if (solver_.use_mixed_precision && solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - 
hybrid_embedding->init_model(init_data_reader_ar_i64->get_value_tensors(), - embed_wgrad_size); - } else if (solver_.use_mixed_precision && !solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - hybrid_embedding->init_model(init_data_reader_ar_i32->get_value_tensors(), - embed_wgrad_size); - } else if (!solver_.use_mixed_precision && solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - hybrid_embedding->init_model(init_data_reader_ar_i64->get_value_tensors(), - embed_wgrad_size); - } else { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - hybrid_embedding->init_model(init_data_reader_ar_i32->get_value_tensors(), - embed_wgrad_size); - } - } - } - } else { - if (solver_.perf_logging) { - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "init_stop"); - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "run_start"); - } - } - - if (solver_.perf_logging) { - for (size_t i = 0; i < sparse_embedding_params_.size(); i++) { - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "weights_initialization", - sparse_embedding_params_[i].sparse_embedding_name); - } - } - -#ifdef ENABLE_MPI - if (resource_manager_->get_num_process() > 1) { - resource_manager_->set_ready_to_transfer(); - } -#endif -} - size_t Model::number_of_networks() const { return networks_.size(); } } // namespace HugeCTR diff --git a/HugeCTR/src/pybind/model_compile.cpp b/HugeCTR/src/pybind/model_compile.cpp new file mode 100644 index 0000000000..99600f3240 --- /dev/null +++ b/HugeCTR/src/pybind/model_compile.cpp @@ -0,0 +1,977 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace HugeCTR::MultiHot; + +namespace HugeCTR { +namespace { + +static std::string join(std::vector& strs, std::string delim) { + std::string str; + const std::vector::iterator itlast = strs.end() - 1; + for (auto it = strs.begin(); it != strs.end(); it++) { + str += *it; + if (it != itlast) { + str += delim; + } + } + return str; +} + +static std::vector& split(const std::string& s, char delim, + std::vector& elems) { + std::istringstream is(s); + std::string item; + while (std::getline(is, item, delim)) { + elems.push_back(item); + } + return elems; +} + +static std::string get_tensor_shape(std::string tensor_name, + std::map> tensor_shape_info) { + std::string shape = ""; + if (tensor_shape_info.find(tensor_name) != tensor_shape_info.end()) { + shape += "("; + for (unsigned int i = 0; i < tensor_shape_info[tensor_name].size(); i++) { + shape += std::to_string(tensor_shape_info[tensor_name][i]); + shape += ","; + } + shape.back() = ')'; + } + return shape; +} + +static std::string get_tensor_shape(std::string tensor_name, + std::map tensor_shape_info) { + std::stringstream ss; + if (tensor_shape_info.find(tensor_name) != tensor_shape_info.end()) { + ss << tensor_shape_info[tensor_name]; + } + return ss.str(); +} + +} // namespace + +void Model::add(Input& input) { + std::string label_name = input.labels_.begin()->first; + int label_dim = input.labels_.begin()->second; + + // If multiple labels, treat them as 1 big label and add a split layer (below) + if (input.labels_.size() > 1) { + label_name = "combined_multi_label"; + label_dim = std::accumulate(std::begin(input.labels_), std::end(input.labels_), 0, + [](const int previous, const std::pair& p) { + return previous + p.second; + }); + } + + input_params_.push_back(input); + activate_tensor(tensor_active_, label_name); + activate_tensor(tensor_active_, input.dense_name); + data_input_info_.push_back(label_name); + data_input_info_.push_back(input.dense_name); + tensor_shape_info_raw_.insert( + std::make_pair(label_name, std::vector{solver_.batchsize, label_dim})); + tensor_shape_info_raw_.insert( + std::make_pair(input.dense_name, std::vector{solver_.batchsize, input.dense_dim})); + if (solver_.use_embedding_collection) { + std::vector top_name_list; + std::vector nnz_per_slot; + bool is_fixed_length = true; + int num_slot = 0; + for (size_t i = 0; i < input.data_reader_sparse_param_array.size(); ++i) { + auto& p = input.data_reader_sparse_param_array[i]; + top_name_list.push_back(p.top_name); + if (p.slot_num != 1) { + HCTR_OWN_THROW( + Error_t::WrongInput, + "To use embedding collection, slots_num should be set to 1 in each sparse_param. 
" + "Please refer to notebooks/embedding_collection.ipynb and separate your multi-slot " + "output into multiple single-slot output"); + } + nnz_per_slot.push_back(p.nnz_per_slot[0]); + if (!p.is_fixed_length) is_fixed_length = false; + num_slot += 1; + hotness_map_.insert({p.top_name, p.max_feature_num}); + } + std::string concat_top_name = join(top_name_list, ","); + DataReaderSparseParam concat_data_reader_sparse_param{concat_top_name, nnz_per_slot, + is_fixed_length, num_slot}; + input.data_reader_sparse_param_array = {concat_data_reader_sparse_param}; + } + std::vector sparse_names; + for (size_t i = 0; i < input.data_reader_sparse_param_array.size(); ++i) { + sparse_names.push_back(input.data_reader_sparse_param_array[i].top_name); + tensor_shape_info_raw_.insert(std::make_pair( + input.data_reader_sparse_param_array[i].top_name, + std::vector{solver_.batchsize, input.data_reader_sparse_param_array[i].slot_num})); + } + data_input_info_.push_back(join(sparse_names, ",")); + for (unsigned int i = 0; i < input.data_reader_sparse_param_array.size(); i++) { + activate_tensor(tensor_active_, input.data_reader_sparse_param_array[i].top_name); + } + if (solver_.i64_input_key) { + add_input(input, reader_params_, sparse_input_map_64_, train_tensor_entities_list_, + evaluate_tensor_entities_list_, train_data_reader_, evaluate_data_reader_, + solver_.batchsize, solver_.batchsize_eval, solver_.use_mixed_precision, + solver_.repeat_dataset, solver_.train_intra_iteration_overlap, + solver_.num_iterations_statistics, resource_manager_); + } else { + add_input(input, reader_params_, sparse_input_map_32_, + train_tensor_entities_list_, evaluate_tensor_entities_list_, + train_data_reader_, evaluate_data_reader_, solver_.batchsize, + solver_.batchsize_eval, solver_.use_mixed_precision, + solver_.repeat_dataset, solver_.train_intra_iteration_overlap, + solver_.num_iterations_statistics, resource_manager_); + } + + if (solver_.use_embedding_collection and solver_.train_inter_iteration_overlap) { + create_copy_ops_for_network_input(input.dense_name, label_name, true); + } + if (solver_.use_embedding_collection and solver_.eval_inter_iteration_overlap) { + create_copy_ops_for_network_input(input.dense_name, label_name, false); + } + + // Add label weights to model + for (std::map::iterator iter = input.label_weights_.begin(); + iter != input.label_weights_.end(); ++iter) { + label_weights_.insert(std::make_pair(iter->first, iter->second)); + } + + // If multiple labels provided, add a Slice layer to handle breaking up the label + if (input.labels_.size() > 1) { + std::vector label_names; + std::vector> ranges; + int idx = 0; + + for (std::map::iterator iter = input.labels_.begin(); + iter != input.labels_.end(); ++iter) { + label_names.push_back(iter->first); + if (iter->second < 1) { + HCTR_OWN_THROW(Error_t::WrongInput, "Each label dimension must be at lesat 1."); + } + ranges.push_back(std::make_pair(idx, idx + iter->second)); + idx += iter->second; + } + std::vector bottom_name{"combined_multi_label"}; + DenseLayer label_slice_layer = DenseLayer(Layer_t::Slice, bottom_name, label_names); + label_slice_layer.ranges = ranges; + + add(label_slice_layer); + } +} + +void Model::add(SparseEmbedding& sparse_embedding) { + OptParams embedding_opt_params; + if (!(sparse_embedding.embedding_opt_params)->initialized) { + sparse_embedding.embedding_opt_params = opt_params_py_; + sparse_embedding.initialize_max_vocabulary_size_per_gpu(); + } + sparse_embedding.max_vocabulary_size_global = + 
sparse_embedding.max_vocabulary_size_per_gpu * resource_manager_->get_global_gpu_count(); + sparse_embedding_params_.push_back(sparse_embedding); + deactivate_tensor(tensor_active_, sparse_embedding.bottom_name); + activate_tensor(tensor_active_, sparse_embedding.sparse_embedding_name); + int slot_num = tensor_shape_info_raw_[sparse_embedding.bottom_name][1]; + tensor_shape_info_raw_.insert( + std::make_pair(sparse_embedding.sparse_embedding_name, + std::vector{solver_.batchsize, slot_num, + static_cast(sparse_embedding.embedding_vec_size)})); + input_output_info_.push_back( + std::make_pair(sparse_embedding.bottom_name, sparse_embedding.sparse_embedding_name)); + layer_info_.push_back(EMBEDDING_TYPE_TO_STRING[sparse_embedding.embedding_type]); + + embedding_opt_params_list_.push_back(sparse_embedding.embedding_opt_params); + init_optimizer_params(embedding_opt_params, solver_, sparse_embedding.embedding_opt_params); + if (solver_.i64_input_key && !solver_.use_mixed_precision) { + add_sparse_embedding( + sparse_embedding, sparse_input_map_64_, train_tensor_entities_list_, + evaluate_tensor_entities_list_, embeddings_, resource_manager_, collective_manager_, + solver_.batchsize, solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, + solver_.use_cuda_graph, solver_.grouped_all_reduce, solver_.num_iterations_statistics, + gpu_lr_sches_); + } else if (solver_.i64_input_key && solver_.use_mixed_precision) { + add_sparse_embedding( + sparse_embedding, sparse_input_map_64_, train_tensor_entities_list_, + evaluate_tensor_entities_list_, embeddings_, resource_manager_, collective_manager_, + solver_.batchsize, solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, + solver_.use_cuda_graph, solver_.grouped_all_reduce, solver_.num_iterations_statistics, + gpu_lr_sches_); + } else if (!solver_.i64_input_key && !solver_.use_mixed_precision) { + add_sparse_embedding( + sparse_embedding, sparse_input_map_32_, train_tensor_entities_list_, + evaluate_tensor_entities_list_, embeddings_, resource_manager_, collective_manager_, + solver_.batchsize, solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, + solver_.use_cuda_graph, solver_.grouped_all_reduce, solver_.num_iterations_statistics, + gpu_lr_sches_); + } else { + add_sparse_embedding( + sparse_embedding, sparse_input_map_32_, train_tensor_entities_list_, + evaluate_tensor_entities_list_, embeddings_, resource_manager_, collective_manager_, + solver_.batchsize, solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, + solver_.use_cuda_graph, solver_.grouped_all_reduce, solver_.num_iterations_statistics, + gpu_lr_sches_); + } + embeddings_map_.insert( + std::make_pair(sparse_embedding.sparse_embedding_name, embeddings_.back())); + embedding_dependent_tensors_.insert(sparse_embedding.sparse_embedding_name); +} + +void Model::add(DenseLayer& dense_layer) { + for (auto& top_name : dense_layer.top_names) { + if (tensor_shape_info_raw_.find(top_name) != tensor_shape_info_raw_.end()) { + HCTR_OWN_THROW(Error_t::WrongInput, top_name + ", top tensor name already exists"); + } + } + for (auto& bottom_name : dense_layer.bottom_names) { + if (tensor_shape_info_raw_.find(bottom_name) == tensor_shape_info_raw_.end()) { + HCTR_OWN_THROW(Error_t::WrongInput, bottom_name + ", bottom tensor name does not exists"); + } + } + calculate_tensor_dimensions(tensor_shape_info_raw_, dense_layer); + dense_layer_params_raw_.push_back(dense_layer); +} + +template +void allocate_ebc_output_helper_for_feature_major( + std::shared_ptr 
resource_manager_, size_t batch_size_per_gpu, + const EmbeddingCollectionConfig& ebc_config, + const embedding::EmbeddingCollectionParam& ebc_param, + std::vector>& tensor_entries_list_, + std::vector& ebc_output) { + HCTR_CHECK(ebc_config.output_layout_ == embedding::EmbeddingLayout::FeatureMajor); + int num_local_gpus = resource_manager_->get_local_gpu_count(); + for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { + CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); + core23::Device device(core23::DeviceType::GPU, + resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); + auto buffer_channel = core23::GetRandomBufferChannel(); + core23::Tensor head_tensor; + core23::BufferParams buffer_param{.channel = buffer_channel}; + core23::TensorParams tensor_param = core23::TensorParams().buffer_params(buffer_param); + int64_t concat_dims = 0; + for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { + const embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; + std::string top_name = ebc_config.top_names_[lookup_id]; + int64_t emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) + ? lookup_param.max_hotness * lookup_param.ev_size + : lookup_param.ev_size; + + core23::Tensor tmp_tensor(tensor_param.shape({(int64_t)batch_size_per_gpu, 1ll, emb_out_dims}) + .device(device) + .data_type(core23::ToScalarType::value)); + concat_dims += emb_out_dims; + tensor_entries_list_[local_gpu_id].push_back({top_name, tmp_tensor}); + if (!lookup_id) { + head_tensor = tmp_tensor; + } + } + // allocate + void* starting_address = head_tensor.data(); + core23::Tensor continous_emb_output = core23::Tensor::bind( + starting_address, core23::Shape({static_cast(batch_size_per_gpu), concat_dims}), + core23::ToScalarType::value, device); + ebc_output.push_back(continous_emb_output); + } +} + +template +void allocate_ebc_output_helper_for_batch_major( + std::shared_ptr resource_manager_, size_t batch_size_per_gpu, + const EmbeddingCollectionConfig& ebc_config, + const embedding::EmbeddingCollectionParam& ebc_param, + std::vector>& tensor_entries_list_, + std::vector& ebc_output) { + int num_local_gpus = resource_manager_->get_local_gpu_count(); + for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { + CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); + + core23::Device device(core23::DeviceType::GPU, + resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); + core23::TensorParams tensor_param; + int64_t emb_out_dims = 0; + for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { + const embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; + + emb_out_dims += (lookup_param.combiner == embedding::Combiner::Concat) + ? 
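+          // Editor note (descriptive comment): for Combiner::Concat every one of the max_hotness
+          // slots keeps its own ev_size-wide embedding vector, so the lookup contributes
+          // max_hotness * ev_size output columns; all other combiners contribute a single
+          // ev_size-wide vector.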
lookup_param.max_hotness * lookup_param.ev_size + : lookup_param.ev_size; + } + + core23::Tensor continous_emb_output( + tensor_param.shape({(int64_t)batch_size_per_gpu, emb_out_dims}) + .device(device) + .data_type(core23::ToScalarType::value)); + continous_emb_output.data(); + ebc_output.push_back(continous_emb_output); + + tensor_entries_list_[local_gpu_id].push_back( + {ebc_config.batch_major_output_name_, continous_emb_output}); + } +} + +void Model::add(const EmbeddingCollectionConfig& user_ebc_config) { + auto ebc_config = split_column_wise_sharding_config(user_ebc_config); + TableNameToIDDict table_name_to_id_dict = + create_table_name_to_id_dict_from_ebc_config(ebc_config); + int global_ebc_id = static_cast(ebc_list_.size()); + for (auto& [name, id] : table_name_to_id_dict) { + HCTR_CHECK_HINT(ebc_name_to_global_id_dict_.find(name) == ebc_name_to_global_id_dict_.end(), + "Duplicate table name: ", name, "\n"); + ebc_name_to_global_id_dict_[name] = {global_ebc_id, id}; + } + int num_total_gpus = resource_manager_->get_global_gpu_count(); + int num_local_gpus = resource_manager_->get_local_gpu_count(); + + int num_lookup = ebc_config.lookup_configs_.size(); + core23::DataType key_type = + solver_.i64_input_key ? core23::ScalarType::Int64 : core23::ScalarType::UInt32; + core23::DataType index_type = + solver_.i64_input_key ? core23::ScalarType::UInt64 : core23::ScalarType::UInt32; + core23::DataType offset_type = + solver_.i64_input_key ? core23::ScalarType::Int64 : core23::ScalarType::UInt32; + core23::DataType emb_type = + solver_.use_mixed_precision ? core23::ScalarType::Half : core23::ScalarType::Float; + core23::DataType wgrad_type = + solver_.use_mixed_precision ? core23::ScalarType::Half : core23::ScalarType::Float; + embedding::EmbeddingLayout input_layout_ = + reader_params_.data_reader_type == DataReaderType_t::RawAsync + ? 
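+          // Editor note (descriptive comment): the input layout follows the data reader type,
+          // RawAsync is treated as feature-major, every other reader type as batch-major.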
embedding::EmbeddingLayout::FeatureMajor + : embedding::EmbeddingLayout::BatchMajor; + + std::vector bottom_name_list; + for (auto& bottom_name : ebc_config.bottom_names_) { + bottom_name_list.push_back(bottom_name); + } + + std::string bottom_name = join(bottom_name_list, ","); + deactivate_tensor(tensor_active_, bottom_name); + + layer_info_.push_back("EmbeddingCollection" + std::to_string(ebc_list_.size())); + + auto lookup_params = create_lookup_params_from_ebc_config(table_name_to_id_dict, ebc_config); + for (int lookup_id = 0; lookup_id < num_lookup; ++lookup_id) { + auto b_name = ebc_config.bottom_names_[ebc_config.dr_lookup_ids_[lookup_id]]; + lookup_params[lookup_id].max_hotness = hotness_map_[b_name]; + } + + auto shard_matrix = create_shard_matrix_from_ebc_config(table_name_to_id_dict, ebc_config); + + auto grouped_emb_params = + create_grouped_embedding_param_from_ebc_config(table_name_to_id_dict, ebc_config); + + int num_table = ebc_config.emb_table_config_list_.size(); + auto emb_table_list = create_table_params_from_ebc_config(table_name_to_id_dict, ebc_config); + for (auto& p : emb_table_list) { + if (p.opt_param.optimizer == Optimizer_t::NOT_INITIALIZED) { + p.opt_param = opt_params_; + } + } + + embedding::AllreduceStrategy allreduce_strategy = ebc_config.allreduce_strategy_; + if (solver_.grouped_all_reduce) { + allreduce_strategy = embedding::AllreduceStrategy::GroupDense; + } + + auto compression_param = + create_compression_param_from_ebc_config(table_name_to_id_dict, ebc_config); + embedding::EmbeddingCollectionParam ebc_param{num_table, + num_lookup, + lookup_params, + shard_matrix, + grouped_emb_params, + solver_.batchsize, + key_type, + index_type, + offset_type, + emb_type, + wgrad_type, + input_layout_, + ebc_config.output_layout_, + ebc_config.sort_strategy_, + ebc_config.keys_preprocess_strategy_, + allreduce_strategy, + ebc_config.comm_strategy_, + compression_param}; + + embedding::EmbeddingCollectionParam eval_ebc_param{num_table, + num_lookup, + lookup_params, + shard_matrix, + grouped_emb_params, + solver_.batchsize_eval, + key_type, + index_type, + offset_type, + emb_type, + wgrad_type, + input_layout_, + ebc_config.output_layout_, + ebc_config.sort_strategy_, + ebc_config.keys_preprocess_strategy_, + ebc_config.allreduce_strategy_, + ebc_config.comm_strategy_, + compression_param}; + + std::vector> core_list; + + for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { + auto core_resource_manager = + std::make_shared(resource_manager_, local_gpu_id); + core_list.push_back(core_resource_manager); + } + ebc_list_.push_back(std::make_unique( + resource_manager_, core_list, ebc_param, eval_ebc_param, emb_table_list, exchange_wgrad_)); + embedding_para_io_->add_embedding_collection((ebc_list_[ebc_list_.size() - 1]).get()); + + auto prepare_ebc_input = [&](auto& sparse_input_map, bool is_longlong) { + core23::DataType SparseType = is_longlong ? 
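+        // Editor note (descriptive comment): the embedding collection expects Int64 or UInt32
+        // keys; the reader tensors below are re-bound to that type in place (no copy), which the
+        // helper only allows when the byte widths of the two types match.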
core23::DataType(core23::ScalarType::Int64) + : core23::DataType(core23::ScalarType::UInt32); + auto tensor_as_type = [&](core23::Tensor input, core23::DataType expected_type) { + auto origin_type = input.data_type(); + HCTR_CHECK_HINT(origin_type.size() == expected_type.size(), + "Size not equal, cannot reinterpret type"); + return core23::Tensor::bind(input.data(), input.shape(), expected_type, input.device()); + }; + auto train_sparse_tensors = sparse_input_map[bottom_name].train_sparse_tensors; + auto evaluate_sparse_tensors = sparse_input_map[bottom_name].evaluate_sparse_tensors; + + for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { + CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); + core23::Device device{core23::DeviceType::GPU, + static_cast( + resource_manager_->get_local_gpu(local_gpu_id)->get_device_id())}; + auto train_key_tensor = + tensor_as_type(train_sparse_tensors[local_gpu_id].get_value_tensor(), SparseType); + train_ebc_key_list_.push_back(train_key_tensor); + + auto train_bucket_range_tensor = + tensor_as_type(train_sparse_tensors[local_gpu_id].get_rowoffset_tensor(), SparseType); + train_ebc_bucket_range_list_.push_back(train_bucket_range_tensor); + + train_ebc_num_keys_list_.push_back(train_sparse_tensors[local_gpu_id].get_nnz_ptr().get()); + + auto evaluate_key_tensor = + tensor_as_type(evaluate_sparse_tensors[local_gpu_id].get_value_tensor(), SparseType); + evaluate_ebc_key_list_.push_back(evaluate_key_tensor); + + auto evaluate_bucket_range_tensor = + tensor_as_type(evaluate_sparse_tensors[local_gpu_id].get_rowoffset_tensor(), SparseType); + evaluate_ebc_bucket_range_list_.push_back(evaluate_bucket_range_tensor); + + evaluate_ebc_num_keys_list_.push_back( + evaluate_sparse_tensors[local_gpu_id].get_nnz_ptr().get()); + } + }; + + if (reader_params_.data_reader_type != DataReaderType_t::RawAsync) { + if (solver_.i64_input_key) { + prepare_ebc_input(sparse_input_map_64_, true); + } else { + prepare_ebc_input(sparse_input_map_32_, false); + } + } + + // activate_ebc_output_tensor + size_t batch_size_per_gpu = solver_.batchsize / num_total_gpus; + size_t eval_batch_size_per_gpu = solver_.batchsize_eval / num_total_gpus; + if (ebc_param.output_layout_ == embedding::EmbeddingLayout::FeatureMajor) { + std::vector top_name_list; + for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { + embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; + int emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) + ? 
lookup_param.max_hotness * lookup_param.ev_size + : lookup_param.ev_size; + + std::string top_name = ebc_config.top_names_[lookup_id]; + top_name_list.push_back(top_name); + + activate_tensor(tensor_active_, top_name); + tensor_shape_info_raw_.insert({top_name, {solver_.batchsize, 1, emb_out_dims}}); + embedding_dependent_tensors_.insert(top_name); + } + input_output_info_.push_back(std::make_pair(bottom_name, join(top_name_list, ","))); + if (solver_.use_mixed_precision) { + allocate_ebc_output_helper_for_feature_major<__half>( + resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, + train_ebc_outptut_); + allocate_ebc_output_helper_for_feature_major<__half>( + resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, + evaluate_tensor_entities_list_, evaluate_ebc_outptut_); + } else { + allocate_ebc_output_helper_for_feature_major( + resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, + train_ebc_outptut_); + allocate_ebc_output_helper_for_feature_major( + resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, + evaluate_tensor_entities_list_, evaluate_ebc_outptut_); + } + } else { + int concate_out_dims = 0; + for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { + embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; + + int emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) + ? lookup_param.max_hotness * lookup_param.ev_size + : lookup_param.ev_size; + concate_out_dims += emb_out_dims; + } + + activate_tensor(tensor_active_, ebc_config.batch_major_output_name_); + tensor_shape_info_raw_.insert( + {ebc_config.batch_major_output_name_, {solver_.batchsize, concate_out_dims}}); + input_output_info_.push_back(std::make_pair(bottom_name, ebc_config.batch_major_output_name_)); + embedding_dependent_tensors_.insert(ebc_config.batch_major_output_name_); + + // allocate output buffer + if (solver_.use_mixed_precision) { + allocate_ebc_output_helper_for_batch_major<__half>( + resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, + train_ebc_outptut_); + allocate_ebc_output_helper_for_batch_major<__half>( + resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, + evaluate_tensor_entities_list_, evaluate_ebc_outptut_); + } else { + allocate_ebc_output_helper_for_batch_major( + resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, + train_ebc_outptut_); + allocate_ebc_output_helper_for_batch_major( + resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, + evaluate_tensor_entities_list_, evaluate_ebc_outptut_); + } + } + + train_ddl_output_.clear(); + cache_train_ddl_output_.clear(); + evaluate_ddl_output_.clear(); + cache_evaluate_ddl_output_.clear(); + for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { + train_ddl_output_.push_back( + allocate_output_for_data_distributor(core_list[local_gpu_id], ebc_param)); + if (solver_.train_inter_iteration_overlap) { + cache_train_ddl_output_.push_back( + allocate_output_for_data_distributor(core_list[local_gpu_id], ebc_param)); + } + evaluate_ddl_output_.push_back( + allocate_output_for_data_distributor(core_list[local_gpu_id], eval_ebc_param)); + if (solver_.eval_inter_iteration_overlap) { + cache_evaluate_ddl_output_.push_back( + allocate_output_for_data_distributor(core_list[local_gpu_id], eval_ebc_param)); + } + } + + // create data distributors + train_data_distributor_ = 
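+  // Editor note (descriptive comment): one data distributor per batch size, the train instance
+  // is parameterized by ebc_param and the eval instance by eval_ebc_param.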
std::make_shared(core_list, ebc_param, emb_table_list, + ebc_config.dr_lookup_ids_); + eval_data_distributor_ = std::make_shared( + core_list, eval_ebc_param, emb_table_list, ebc_config.dr_lookup_ids_); +} + +void Model::pre_add_dense_layer(DenseLayer& dense_layer) { + embedding_dependent_ = false; + for (auto& bottom_name : dense_layer.bottom_names) { + deactivate_tensor(tensor_active_, bottom_name); + if (embedding_dependent_tensors_.find(bottom_name) != embedding_dependent_tensors_.end()) { + embedding_dependent_ = true; + } + } + for (auto& top_name : dense_layer.top_names) { + activate_tensor(tensor_active_, top_name); + if (embedding_dependent_) { + embedding_dependent_tensors_.insert(top_name); + } + } + std::string input_names = join(dense_layer.bottom_names, ","); + std::string output_names = join(dense_layer.top_names, ","); + input_output_info_.push_back(std::make_pair(input_names, output_names)); + if (solver_.use_mixed_precision) { + layer_info_.push_back(LAYER_TYPE_TO_STRING_MP[dense_layer.layer_type]); + } else { + layer_info_.push_back(LAYER_TYPE_TO_STRING[dense_layer.layer_type]); + } +} + +void Model::graph_analysis() { + HCTR_LOG(INFO, ROOT, "Graph analysis to resolve tensor dependency\n"); + std::map tensor_usage; + std::map tensor_slice_layer; + std::map tensor_slice_index; + for (auto& dense_layer : dense_layer_params_raw_) { + for (auto& bottom_name : dense_layer.bottom_names) { + analyze_tensor(tensor_usage, bottom_name); + } + } + for (auto iter = tensor_usage.begin(); iter != tensor_usage.end(); iter++) { + if (iter->second > 5) { + HCTR_OWN_THROW(Error_t::WrongInput, "The graph should not include more than 5-way branches"); + } + if (iter->second > 1) { + std::vector bottom_names{iter->first}; + std::vector top_names; + std::vector> ranges; + for (unsigned int i = 0; i < iter->second; i++) { + top_names.push_back(iter->first + "_slice" + std::to_string(i)); + auto dims = tensor_shape_info_raw_[iter->first].size(); + ranges.emplace_back(std::make_pair(0, tensor_shape_info_raw_[iter->first][dims - 1])); + } + DenseLayer slice_layer(Layer_t::Slice, bottom_names, top_names); + slice_layer.ranges = ranges; + tensor_slice_layer.insert(std::pair(iter->first, slice_layer)); + tensor_slice_index.insert(std::pair(iter->first, 0)); + HCTR_LOG(INFO, ROOT, "Add Slice layer for tensor: %s, creating %d copies\n", + iter->first.c_str(), iter->second); + } + } + for (auto& dense_layer : dense_layer_params_raw_) { + bool flag = true; + for (auto& bottom_name : dense_layer.bottom_names) { + if (tensor_usage[bottom_name] > 1) { + flag = false; + break; + } + } + if (flag) { + dense_layer_params_.push_back(dense_layer); + } else { + DenseLayer new_dense_layer = dense_layer; + for (unsigned int i = 0; i < new_dense_layer.bottom_names.size(); i++) { + std::string old_bottom_name = new_dense_layer.bottom_names[i]; + if (tensor_slice_index.find(old_bottom_name) != tensor_slice_index.end()) { + auto iter = tensor_slice_layer.find(old_bottom_name); + if (tensor_slice_index[old_bottom_name] == 0) { + dense_layer_params_.push_back(iter->second); + } + std::string new_bottom_name = iter->second.top_names[tensor_slice_index[old_bottom_name]]; + tensor_slice_index[old_bottom_name] += 1; + new_dense_layer.bottom_names[i] = new_bottom_name; + } + } + dense_layer_params_.push_back(new_dense_layer); + } + } + add_dense_layers(dense_layer_params_); +} + +// deep copy +void Model::create_copy_ops_for_network_input(const std::string& dense_name, + const std::string& label_name, bool is_train) { + 
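+  // Editor note (descriptive comment): wraps the per-GPU dense and label input tensors in copy
+  // ops and swaps the original tensor entries for the copy-op outputs, so the network graph
+  // consumes the staged copies. Only requested from Model::add(Input&) when the embedding
+  // collection is used with inter-iteration overlap.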
auto& copy_ops = is_train ? graph_.train_copy_ops_ : graph_.evaluate_copy_ops_; + auto& tensor_entries_list = + is_train ? train_tensor_entities_list_ : evaluate_tensor_entities_list_; + + int num_local_gpus = resource_manager_->get_local_gpu_count(); + // copy ops for dense & label + copy_ops.resize(2 * num_local_gpus); + + for (int id = 0; id < num_local_gpus; ++id) { + core23::Device device(core23::DeviceType::GPU, + resource_manager_->get_local_gpu(id)->get_device_id()); + for (auto& tensor_entry : tensor_entries_list[id]) { + if (tensor_entry.name == dense_name) { + copy_ops[id].reset( + new CopyOpImpl(resource_manager_->get_local_gpu(id), tensor_entry.tensor)); + tensor_entry.tensor = copy_ops[id]->get_tensorbag(); + } else if (tensor_entry.name == label_name) { + copy_ops[id + num_local_gpus].reset( + new CopyOpImpl(resource_manager_->get_local_gpu(id), tensor_entry.tensor)); + tensor_entry.tensor = copy_ops[id + num_local_gpus]->get_tensorbag(); + } else { + HCTR_OWN_THROW(Error_t::WrongInput, "wrong tensor entry name when creating copy_op."); + } + } + } +} + +void Model::compile() { + if (!graph_finalized_) { + graph_analysis(); + graph_finalized_ = true; + } + if (data_input_info_.size() < 3 || layer_info_.size() < 2) { + HCTR_OWN_THROW(Error_t::IllegalCall, "The model should include input and at least two layers"); + } + HCTR_PRINT(INFO, + "===================================================Model " + "Compile===================================================\n"); + build_networks(); + + // TODO: this is a WAR; need to find a way to remove the preallocation + for (int local_gpu_id = 0; local_gpu_id < resource_manager_->get_local_gpu_count(); + ++local_gpu_id) { + auto device_id = resource_manager_->get_local_gpu(local_gpu_id)->get_device_id(); + core23::Device device(core23::DeviceType::GPU, device_id); + bool success = core23::AllocateBuffers(device); + if (!success) { + HCTR_LOG_S(DEBUG, ROOT) << "Nothing to preallocate" << std::endl; + } + } + core23::Device device_h(core23::DeviceType::CPU); + bool success = core23::AllocateBuffers(device_h); + if (!success) { + HCTR_LOG_S(DEBUG, ROOT) << "Nothing to preallocate" << std::endl; + } + initialize(); + create_metrics(); + create_pipelines(); +} + +void Model::update_label_weights(std::vector& label_names, + std::vector& label_weights) { + // Add implementation and support in next merge request + if (label_names.size() != label_weights.size()) { + HCTR_OWN_THROW(Error_t::WrongInput, "Must have the same number of label names and weights"); + } + std::map::iterator loss_lookup; + for (size_t i = 0; i < label_names.size(); ++i) { + loss_lookup = label_weights_.find(label_names[i]); + if (loss_lookup == label_weights_.end()) { + HCTR_OWN_THROW(Error_t::WrongInput, "Label name not found: " + label_names[i]); + } + loss_lookup->second = label_weights[i]; + } +} + +void Model::compile(std::vector& label_names, std::vector& label_weights) { + update_label_weights(label_names, label_weights); + compile(); +} + +void Model::summary() { + if (!graph_finalized_) { + graph_analysis(); + graph_finalized_ = true; + } + if (data_input_info_.size() < 3 || layer_info_.size() < 2) { + HCTR_OWN_THROW(Error_t::IllegalCall, + "The model should include input and at " + "least two layers"); + } + for (auto tensor_entry : train_tensor_entities_list_[0]) { + tensor_shape_info_.insert(std::make_pair(tensor_entry.name, tensor_entry.tensor.shape())); + } + HCTR_PRINT(INFO, + "============================================" + "=======Model " + 
"Summary=====================================" + "==============\n"); + auto log = HCTR_LOG_S(INFO, ROOT); + log << "Model structure on each GPU" << std::endl; + log << std::left << std::setw(40) << std::setfill(' ') << "Label" << std::left << std::setw(30) + << std::setfill(' ') << "Dense" << std::left << std::setw(30) << std::setfill(' ') << "Sparse" + << std::endl; + log << std::left << std::setw(40) << std::setfill(' ') << data_input_info_[0] << std::left + << std::setw(30) << std::setfill(' ') << data_input_info_[1] << " " << std::left + << std::setw(30) << std::setfill(' ') << data_input_info_[2] << std::endl; + log << std::left << std::setw(40) << std::setfill(' ') + << get_tensor_shape(data_input_info_[0], tensor_shape_info_) << std::left << std::setw(40) + << std::setfill(' ') << get_tensor_shape(data_input_info_[1], tensor_shape_info_) + << std::endl; + log << "————————————————————————————————————————————————" + "—————————————————————————————————" + "—————————————————————————————————" + << std::endl; + log << std::left << std::setw(40) << std::setfill(' ') << "Layer Type" << std::left + << std::setw(30) << std::setfill(' ') << "Input Name" << std::left << std::setw(30) + << std::setfill(' ') << "Output Name" << std::left << std::setw(30) << std::setfill(' ') + << "Output Shape" << std::endl; + log << "————————————————————————————————————————————————" + "—————————————————————————————————" + "—————————————————————————————————" + << std::endl; + for (size_t i = 0; i < layer_info_.size(); ++i) { + std::vector layer_type{layer_info_[i]}; + std::vector input_names; + std::vector output_names; + split(input_output_info_[i].first, ',', input_names); + split(input_output_info_[i].second, ',', output_names); + size_t lines = + input_names.size() > output_names.size() ? input_names.size() : output_names.size(); + layer_type.insert(layer_type.end(), lines - 1, ""); + if (lines > input_names.size()) { + input_names.insert(input_names.end(), lines - input_names.size(), ""); + } + if (lines > output_names.size()) { + output_names.insert(output_names.end(), lines - output_names.size(), ""); + } + for (size_t j = 0; j < lines; j++) { + log << std::left << std::setw(40) << std::setfill(' ') << layer_type[j] << std::left + << std::setw(30) << std::setfill(' ') << input_names[j] << std::left << std::setw(30) + << std::setfill(' ') << output_names[j] << std::left << std::setw(30) << std::setfill(' ') + << get_tensor_shape(output_names[j], tensor_shape_info_) << std::endl; + } + log << "----------------------------------------------" + "-----------------------------------" + "---------------------------------" + << std::endl; + } +} + +void Model::create_networks() { + for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { + networks_.emplace_back(new Network(resource_manager_->get_local_cpu(), + resource_manager_->get_local_gpu(i), + solver_.use_mixed_precision)); + } + train_tensor_entities_list_.resize(resource_manager_->get_local_gpu_count()); + evaluate_tensor_entities_list_.resize(resource_manager_->get_local_gpu_count()); +} + +void Model::build_networks() { + for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { + networks_[i]->create_and_set_optimizer(opt_params_); + } + auto aligned_size = 16 * resource_manager_->get_local_gpu_count(); + core23::BufferParams bp{.channel = solver_.use_mixed_precision ? 
GetWgradHalfBufferChannel() + : GetWgradBufferChannel()}; + for (int g = 0; g < resource_manager_->get_local_gpu_count(); g++) { + auto device_id = resource_manager_->get_local_gpu(g)->get_device_id(); + core23::Device device(core23::DeviceType::GPU, device_id); + auto wgrad_buffer = core23::GetBuffer(bp, device); + auto wgrad_size = wgrad_buffer->reserved_size(); + size_t padded_bytes = wgrad_size % aligned_size; + padded_bytes += aligned_size - padded_bytes; + // alignment requirements from grouped allreduce. + wgrad_tensor_successor_.emplace_back(core23::TensorParams() + .device(device) + .shape({static_cast(padded_bytes)}) + .data_type(core23::ScalarType::Char) + .buffer_params(bp)); + } + buff_allocated_ = true; +} + +void Model::initialize() { +#ifndef DATA_READING_TEST + +#pragma omp parallel num_threads(number_of_networks()) + { + size_t id = omp_get_thread_num(); + networks_[id]->initialize(); + if (solver_.use_algorithm_search) { + networks_[id]->search_algorithm(); + } + HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(id)->get_stream())); + } + + int num_gpus = resource_manager_->get_local_gpu_count(); + std::vector wgrad_buffer_ptrs; + size_t wgrad_buffer_size{}; + core23::BufferParams bp{.channel = solver_.use_mixed_precision ? GetWgradHalfBufferChannel() + : GetWgradBufferChannel()}; + for (int g = 0; g < num_gpus; g++) { + auto device_id = resource_manager_->get_local_gpu(g)->get_device_id(); + core23::Device device(core23::DeviceType::GPU, device_id); + auto wgrad_buffer = core23::GetBuffer(bp, device); + auto [ptr_, size_] = wgrad_buffer->decay(); + wgrad_buffer_size = size_; + HCTR_CHECK_HINT(size_ && ptr_, "wgrad is null or it's a confederal buffer"); + wgrad_buffer_ptrs.push_back(ptr_); + } + exchange_wgrad_->init_ar_comm(wgrad_buffer_ptrs, wgrad_buffer_size); +#endif + init_params_for_dense_(); + if (solver_.perf_logging) { + for (size_t i = 0; i < dense_layer_params_.size(); i++) { + bool is_trainable = + TRAINABLE_LAYERS.find(dense_layer_params_[i].layer_type) != TRAINABLE_LAYERS.end(); + if (is_trainable) { + std::string output_names = join(dense_layer_params_[i].top_names, "-"); + HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "weights_initialization", output_names); + } + } + } + init_params_for_sparse_(); +} +void Model::create_metrics() { + int num_total_gpus = resource_manager_->get_global_gpu_count(); + int label_dim = input_params_[0].labels_.begin()->second; + if (input_params_[0].labels_.size() > 1) { + auto labs = input_params_[0].labels_; + label_dim = std::accumulate(std::begin(labs), std::end(labs), 0, + [](const int previous, const std::pair& p) { + return previous + p.second; + }); + } + + auto num_metrics = [&]() { return networks_[0]->get_raw_metrics_all().size(); }; + for (const auto& metric : solver_.metrics_spec) { + // Only AUC is currently supported for models with more than one loss layer + if ((metric.first != metrics::Type::AUC) && num_metrics() > 1) { + HCTR_OWN_THROW(Error_t::WrongInput, + "Metrics besides AUC are not supported for multi-task models."); + } + + metrics_.emplace_back(std::move(metrics::Metric::Create( + metric.first, solver_.use_mixed_precision, solver_.batchsize_eval / num_total_gpus, + solver_.max_eval_batches, label_dim, resource_manager_))); + } +} + +void Model::create_pipelines() { + // TODO: currently it is only for HE + if (embeddings_.size() == 1) { + auto lr_scheds = embeddings_[0]->get_learning_rate_schedulers(); + for (size_t i = 0; i < lr_scheds.size(); i++) { + 
networks_[i]->set_learning_rate_scheduler(lr_scheds[i]); + } + } + + if (solver_.use_embedding_collection) { + create_train_pipeline_with_ebc(networks_); + create_evaluate_pipeline_with_ebc(networks_); + } else { + // will create pipeline for dense network. + create_train_network_pipeline(networks_); + create_eval_network_pipeline(networks_); + } + + if (solver_.perf_logging) { + HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "init_stop"); + HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "run_start"); + } + + if (solver_.perf_logging) { + for (size_t i = 0; i < sparse_embedding_params_.size(); i++) { + HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "weights_initialization", + sparse_embedding_params_[i].sparse_embedding_name); + } + } + +#ifdef ENABLE_MPI + if (resource_manager_->get_num_process() > 1) { + collective_manager_->set_ready_to_transfer(); + } +#endif +} + +} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/src/pybind/model_pipeline.cpp b/HugeCTR/src/pybind/model_pipeline.cpp index a8f9b53fd0..10406befc5 100644 --- a/HugeCTR/src/pybind/model_pipeline.cpp +++ b/HugeCTR/src/pybind/model_pipeline.cpp @@ -18,19 +18,24 @@ #include #include #include -#include -#include #include #include #include #include -#include +#include #include namespace HugeCTR { -template -void Model::create_train_network_pipeline(std::vector>& networks) { +void Model::exchange_wgrad(size_t device_id) { + auto& gpu_resource = resource_manager_->get_local_gpu(device_id); + CudaCPUDeviceContext context(gpu_resource->get_device_id()); + if (resource_manager_->get_global_gpu_count() > 1) { + exchange_wgrad_->allreduce(device_id, gpu_resource->get_stream()); + } +} + +void Model::create_train_network_pipeline(std::vector>& networks) { graph_.train_pipeline_.resize(resource_manager_->get_local_gpu_count()); auto scheduled_reader = dynamic_cast(train_data_reader_.get()); @@ -64,8 +69,7 @@ void Model::create_train_network_pipeline(std::vector -void Model::create_eval_network_pipeline(std::vector>& networks) { +void Model::create_eval_network_pipeline(std::vector>& networks) { graph_.evaluate_pipeline_.resize(resource_manager_->get_local_gpu_count()); for (int local_id = 0; local_id < static_cast(resource_manager_->get_local_gpu_count()); @@ -93,466 +97,9 @@ void Model::create_eval_network_pipeline(std::vector -void Model::create_train_pipeline(std::vector>& networks) { - auto scheduled_reader = dynamic_cast(train_data_reader_.get()); - auto scheduled_embedding = dynamic_cast(embeddings_[0].get()); - bool is_train = true; - bool use_graph = solver_.use_cuda_graph; - - if (solver_.train_inter_iteration_overlap) { - graph_.train_pipeline_.resize(2 * resource_manager_->get_local_gpu_count()); - } else { - graph_.train_pipeline_.resize(resource_manager_->get_local_gpu_count()); - } - -#pragma omp parallel for num_threads(resource_manager_->get_local_gpu_count()) - for (int local_id = 0; local_id < static_cast(resource_manager_->get_local_gpu_count()); - local_id++) { - auto gpu_resource = resource_manager_->get_local_gpu(local_id); - CudaCPUDeviceContext context(gpu_resource->get_device_id()); - - // create scheduleable - auto iteration_start = std::make_shared([=] {}); - - auto schedule_reader = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - if (use_graph && !scheduled_reader->current_batch_incomplete()) { - scheduled_reader->schedule_here_graph(stream, local_id); - } else { - scheduled_reader->schedule_here(stream, local_id); - } - graph_scheduler_->record_execution(local_id, 
stream); - }); - - auto EMB_input_ready_wait = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->stream_wait_sparse_tensors( - stream, local_id, use_graph && !scheduled_reader->current_batch_incomplete()); - }); - - auto BNET_input_ready_wait = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->stream_wait_dense_tensors( - stream, local_id, use_graph && !scheduled_reader->current_batch_incomplete()); - }); - - auto schedule_split_3way = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->schedule_split_3_way_here( - stream, local_id, use_graph && !scheduled_reader->current_batch_incomplete()); - }); - - auto schedule_d2d = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->schedule_d2d_here( - stream, local_id, use_graph && !scheduled_reader->current_batch_incomplete()); - }); - - auto embedding_index_calculation = std::make_shared( - [=] { scheduled_embedding->index_calculation(is_train, local_id); }); - - auto cross_iteration_sync = std::make_shared([] {}); - - auto embedding_freq_forward = std::make_shared( - [=] { scheduled_embedding->freq_forward(is_train, local_id); }); - - auto embedding_freq_backward = std::make_shared( - [=] { scheduled_embedding->freq_backward(local_id); }); - - auto embedding_freq_update_params = std::make_shared( - [=] { scheduled_embedding->freq_update_params(local_id); }); - - auto embedding_infreq_model_forward = std::make_shared( - [=] { scheduled_embedding->infreq_model_forward(local_id); }); - - auto embedding_infreq_network_forward = std::make_shared( - [=] { scheduled_embedding->infreq_network_forward(is_train, local_id); }); - - auto embedding_infreq_network_backward = std::make_shared( - [=] { scheduled_embedding->infreq_network_backward(local_id); }); - - auto embedding_infreq_model_backward = std::make_shared( - [=] { scheduled_embedding->infreq_model_backward(local_id); }); - - auto network_init = std::make_shared([=] { - if (networks[local_id]->use_mixed_precision_ && - networks[local_id]->optimizer_->get_optimizer_type() != Optimizer_t::SGD) { - networks[local_id]->conv_weight_(networks[local_id]->train_weight_tensor_half_, - networks[local_id]->train_weight_tensor_); - } - }); - - auto bottom_network_fprop = std::make_shared([=] { - networks[local_id]->prop_layers(networks[local_id]->bottom_layers_, true, is_train); - }); - - auto top_network_fprop = std::make_shared( - [=] { networks[local_id]->prop_layers(networks[local_id]->top_layers_, true, is_train); }); - - auto init_wgrad = std::make_shared([=] { - networks[local_id]->train_losses_.begin()->second->regularizer_initialize_wgrad(is_train); - }); - - auto lr_sched_update = std::make_shared( - [=]() { networks[local_id]->lr_sched_->update(); }); - - auto cal_loss = std::make_shared([=] { - float rterm = networks[local_id]->train_losses_.begin()->second->regularizer_compute_rterm(); - long long current_batchsize_per_device = - scheduled_reader->get_current_batchsize_per_device(local_id); - - networks[local_id]->train_losses_.begin()->second->compute( - is_train, current_batchsize_per_device, rterm); - }); - - auto top_network_bprop = std::make_shared( - [=] { networks[local_id]->prop_layers(networks[local_id]->top_layers_, false, is_train); }); - - auto bottom_network_bprop = std::make_shared([=] { - networks[local_id]->prop_layers(networks[local_id]->bottom_layers_, false, is_train); - }); - - auto network_exchange_wgrad = - std::make_shared([=] { 
this->exchange_wgrad(local_id); }); - - auto update_params = - std::make_shared([=] { networks[local_id]->update_params(); }); - - auto iteration_end = std::make_shared([] {}); - - std::vector> scheduleable_list = { - iteration_start, - EMB_input_ready_wait, - embedding_index_calculation, - BNET_input_ready_wait, - cross_iteration_sync, - embedding_infreq_model_forward, - embedding_infreq_network_forward, - embedding_freq_forward, - network_init, - bottom_network_fprop, - init_wgrad, - schedule_reader, - top_network_fprop, - lr_sched_update, - cal_loss, - top_network_bprop, - embedding_freq_backward, - bottom_network_bprop, - embedding_infreq_network_backward, - embedding_infreq_model_backward, - schedule_split_3way, - network_exchange_wgrad, - schedule_d2d, - embedding_freq_update_params, - update_params, - iteration_end, - }; - - if (solver_.train_intra_iteration_overlap) { - std::string infreq_stream = "side_stream"; - std::string freq_stream = "freq_stream"; - std::string network_side_stream = "network_side_stream"; - - auto done_iteration_start = iteration_start->record_done(); - auto done_cross_iteration_sync = cross_iteration_sync->record_done(); - auto done_embedding_infreq_model_forward = embedding_infreq_model_forward->record_done(); - auto done_embedding_infreq_network_forward = embedding_infreq_network_forward->record_done(); - auto done_embedding_freq_forward = embedding_freq_forward->record_done(); - auto done_bottom_network_fprop = bottom_network_fprop->record_done(); - auto done_top_network_fprop = top_network_fprop->record_done(); - auto done_init_wgrad = init_wgrad->record_done(); - auto done_lr_sched_update = lr_sched_update->record_done(); - auto done_top_network_bprop = top_network_bprop->record_done(); - auto done_embedding_freq_backward = embedding_freq_backward->record_done(); - auto done_bottom_network_bprop = bottom_network_bprop->record_done(); - auto done_network_exchange_wgrad = network_exchange_wgrad->record_done(); - auto done_embedding_infreq_network_backward = - embedding_infreq_network_backward->record_done(); - auto done_freq_update_params = embedding_freq_update_params->record_done(); - - EMB_input_ready_wait->set_stream(infreq_stream); - EMB_input_ready_wait->wait_event({done_iteration_start}); - embedding_index_calculation->set_stream(infreq_stream); - cross_iteration_sync->set_stream(infreq_stream); - - embedding_infreq_model_forward->set_stream(infreq_stream); - embedding_infreq_network_forward->set_stream(infreq_stream); - - const bool overlap_infreq_freq = - (sparse_embedding_params_[0].hybrid_embedding_param.communication_type != - hybrid_embedding::CommunicationType::NVLink_SingleNode); - - if (overlap_infreq_freq) { - embedding_freq_forward->set_stream(freq_stream); - embedding_freq_forward->wait_event( - {done_cross_iteration_sync, done_embedding_infreq_model_forward}); - } else { - embedding_freq_forward->set_stream(infreq_stream); - } - - bottom_network_fprop->wait_event({done_embedding_infreq_model_forward}); - schedule_reader->wait_event({ - done_embedding_infreq_network_forward, - done_embedding_freq_forward, - }); - - init_wgrad->set_stream(network_side_stream); - init_wgrad->wait_event({done_bottom_network_fprop}); - - lr_sched_update->set_stream(network_side_stream); - lr_sched_update->wait_event({done_top_network_fprop}); - top_network_bprop->wait_event({ - done_init_wgrad, - done_lr_sched_update, - }); - - embedding_freq_backward->set_stream(infreq_stream); - embedding_freq_backward->wait_event({done_top_network_bprop}); - - 
network_exchange_wgrad->wait_event({ - done_embedding_freq_backward, - done_bottom_network_bprop, - }); - - embedding_infreq_network_backward->set_stream(infreq_stream); - embedding_infreq_network_backward->wait_event({done_top_network_bprop}); - embedding_infreq_model_backward->set_stream(infreq_stream); - - embedding_freq_update_params->set_stream(infreq_stream); - embedding_freq_update_params->wait_event({done_network_exchange_wgrad}); - iteration_end->wait_event({ - done_embedding_infreq_network_backward, - done_freq_update_params, - }); - } - - auto graph = std::make_shared(scheduleable_list); - graph_.train_pipeline_[local_id] = Pipeline{"train", gpu_resource, {graph}}; - if (solver_.train_inter_iteration_overlap) { - cudaStream_t s3w_stream = gpu_resource->get_stream("s3w"); - cudaStream_t d2d_stream = gpu_resource->get_stream("s3w"); - scheduled_reader->set_schedule_streams(s3w_stream, d2d_stream, local_id); - - auto done_iteration_end = iteration_end->record_done(use_graph); - cross_iteration_sync->wait_event({done_iteration_end}, use_graph); - - auto graph2 = std::make_shared(scheduleable_list); - graph_.train_pipeline_[local_id + resource_manager_->get_local_gpu_count()] = - Pipeline{"train2", gpu_resource, {graph2}}; - } else { - cudaStream_t s3w_stream = gpu_resource->get_stream("train"); - cudaStream_t d2d_stream = gpu_resource->get_stream("train"); - scheduled_reader->set_schedule_streams(s3w_stream, d2d_stream, local_id); - } - } -} - -void Model::train_pipeline(size_t current_batch_size) { - auto scheduled_reader = dynamic_cast(train_data_reader_.get()); - auto scheduled_embedding = dynamic_cast(embeddings_[0].get()); - - const auto inflight_id = scheduled_reader->get_current_inflight_id(); - const bool cached = scheduled_reader->is_batch_cached(); - - const bool use_graph = solver_.use_cuda_graph && !scheduled_reader->current_batch_incomplete(); - - scheduled_embedding->assign_input_tensors(true, current_batch_size, inflight_id, cached); - -#pragma omp parallel num_threads(resource_manager_->get_local_gpu_count()) - { - int id = omp_get_thread_num(); - auto device_id = resource_manager_->get_local_gpu(id)->get_device_id(); - CudaCPUDeviceContext context(device_id); - - const auto graph_id = solver_.train_inter_iteration_overlap - ? 
(inflight_id * resource_manager_->get_local_gpu_count() + id) - : id; - HCTR_CHECK_HINT(graph_id < graph_.train_pipeline_.size(), "graph_id out of range"); - - if (use_graph) { - graph_.train_pipeline_[graph_id].run_graph(); - if (scheduled_reader) { - scheduled_reader->update_schedule_graph(id); - } - } else { - graph_.train_pipeline_[graph_id].run(); - } - cudaStream_t graph_stream = resource_manager_->get_local_gpu(id)->get_stream( - graph_.train_pipeline_[graph_id].get_stream_name()); - - auto train_sync_back_event = - resource_manager_->get_local_gpu(id)->get_event("train_sync_back_event"); - HCTR_LIB_THROW(cudaEventRecord(train_sync_back_event, graph_stream)); - HCTR_LIB_THROW(cudaStreamWaitEvent(resource_manager_->get_local_gpu(id)->get_stream(), - train_sync_back_event)); - } -} - -template -void Model::create_evaluate_pipeline(std::vector>& networks) { - auto scheduled_reader = dynamic_cast(evaluate_data_reader_.get()); - auto scheduled_embedding = dynamic_cast(embeddings_[0].get()); - bool is_train = false; - - graph_.evaluate_pipeline_.resize(resource_manager_->get_local_gpu_count()); - - for (int local_id = 0; local_id < resource_manager_->get_local_gpu_count(); local_id++) { - auto gpu_resource = resource_manager_->get_local_gpu(local_id); - CudaCPUDeviceContext ctx(gpu_resource->get_device_id()); - - // create scheduleable - auto iteration_strat = std::make_shared([] {}); - - auto EMB_input_ready_wait = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->stream_wait_sparse_tensors(stream, local_id, false); - }); - - auto BNET_input_ready_wait = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->stream_wait_dense_tensors(stream, local_id, false); - }); - - auto embedding_index_calculation = std::make_shared( - [=] { scheduled_embedding->index_calculation(is_train, local_id); }); - - auto embedding_freq_forward = std::make_shared([=] { - scheduled_embedding->freq_forward(is_train, local_id, this->graph_.is_first_eval_batch_); - }); - - auto embedding_infreq_model_forward = std::make_shared( - [=] { scheduled_embedding->infreq_model_forward(local_id); }); - - auto embedding_infreq_network_forward = std::make_shared( - [=] { scheduled_embedding->infreq_network_forward(is_train, local_id); }); - - auto embedding_global_barrier = std::make_shared( - [=] { scheduled_embedding->global_barrier(is_train, local_id); }); - - auto network_init = std::make_shared([=] { - if (networks[local_id]->use_mixed_precision_ && - networks[local_id]->optimizer_->get_optimizer_type() != Optimizer_t::SGD) { - networks[local_id]->conv_weight_(networks[local_id]->train_weight_tensor_half_, - networks[local_id]->train_weight_tensor_); - } - }); - - auto network_eval = std::make_shared([=] { - long long current_batchsize_per_device = - scheduled_reader->get_current_batchsize_per_device(local_id); - - networks[local_id]->eval(current_batchsize_per_device); - }); - - auto cal_metrics = std::make_shared([=] { - for (auto& metric : metrics_) { - auto metric_map = networks[local_id]->get_raw_metrics_all().begin()->second; - metric->local_reduce(local_id, metric_map); - } - }); - - std::vector> scheduleable_list = { - iteration_strat, - BNET_input_ready_wait, - EMB_input_ready_wait, - embedding_index_calculation, - embedding_infreq_model_forward, - embedding_infreq_network_forward, - embedding_freq_forward, - embedding_global_barrier, - network_init, - network_eval, - cal_metrics, - }; - - const bool overlap_infreq_freq = - 
(sparse_embedding_params_[0].hybrid_embedding_param.communication_type != - hybrid_embedding::CommunicationType::NVLink_SingleNode) && - solver_.eval_intra_iteration_overlap; - std::string eval_embedding = "eval_embedding"; - std::string eval_freq = "eval_freq"; - - if (solver_.eval_inter_iteration_overlap) { - // s3w_stream should be the same with embedding stream - cudaStream_t s3w_stream = gpu_resource->get_stream(eval_embedding); - cudaStream_t d2d_stream = gpu_resource->get_stream("default"); - scheduled_reader->set_schedule_streams(s3w_stream, d2d_stream, local_id); - - auto done_embedding_infreq_model_forward = embedding_infreq_model_forward->record_done(); - auto done_embedding_infreq_network_forward = embedding_infreq_network_forward->record_done(); - auto done_embedding_freq_forward = embedding_freq_forward->record_done(); - auto done_network_eval = network_eval->record_done(); - - EMB_input_ready_wait->set_absolute_stream(eval_embedding); - embedding_index_calculation->set_absolute_stream(eval_embedding); - embedding_infreq_model_forward->set_absolute_stream(eval_embedding); - embedding_infreq_network_forward->set_absolute_stream(eval_embedding); - embedding_infreq_network_forward->wait_event({done_network_eval}); - - if (overlap_infreq_freq) { - embedding_freq_forward->set_stream(eval_freq); - embedding_freq_forward->wait_event( - {done_embedding_infreq_model_forward, done_network_eval}); - } else { - embedding_freq_forward->set_absolute_stream(eval_embedding); - } - embedding_global_barrier->set_absolute_stream(eval_embedding); - - network_init->wait_event( - {done_embedding_infreq_network_forward, done_embedding_freq_forward}); - } else if (overlap_infreq_freq) { - auto done_embedding_infreq_model_forward = embedding_infreq_model_forward->record_done(); - auto done_embedding_freq_forward = embedding_freq_forward->record_done(); - - embedding_freq_forward->set_stream(eval_freq); - embedding_freq_forward->wait_event({done_embedding_infreq_model_forward}); - network_init->wait_event({done_embedding_freq_forward}); - } - - auto graph = std::make_shared(scheduleable_list); - graph_.evaluate_pipeline_[local_id] = Pipeline{"default", gpu_resource, {graph}}; - } -} - -void Model::evaluate_pipeline(size_t current_batch_size) { - auto scheduled_reader = dynamic_cast(evaluate_data_reader_.get()); - auto scheduled_embedding = dynamic_cast(embeddings_[0].get()); - - const auto inflight_id = scheduled_reader->get_current_inflight_id(); - const bool cached = scheduled_reader->is_batch_cached(); - - scheduled_embedding->assign_input_tensors(false, current_batch_size, inflight_id, cached); - -#pragma omp parallel num_threads(number_of_networks()) - { - size_t id = omp_get_thread_num(); - auto gpu = resource_manager_->get_local_gpu(id); - CudaCPUDeviceContext ctx(gpu->get_device_id()); - - if (graph_.is_first_eval_batch_) { - auto eval_start_event = gpu->get_event("eval_start_event"); - HCTR_LIB_THROW(cudaEventRecord(eval_start_event, gpu->get_stream())); - - cudaStream_t evaluate_stream = - gpu->get_stream(graph_.evaluate_pipeline_[id].get_stream_name()); - HCTR_LIB_THROW(cudaStreamWaitEvent(evaluate_stream, eval_start_event)); - cudaStream_t eval_embedding_stream = gpu->get_stream("eval_embedding"); - HCTR_LIB_THROW(cudaStreamWaitEvent(eval_embedding_stream, eval_start_event)); - } - - graph_.evaluate_pipeline_[id].run(); - } - - for (auto& metric : metrics_) { - metric->global_reduce(number_of_networks()); - } -} bool is_first_data_distributor = true; -template -void 
Model::create_train_pipeline_with_ebc(std::vector>& networks) { +void Model::create_train_pipeline_with_ebc(std::vector>& networks) { bool is_train = true; bool use_graph = solver_.use_cuda_graph; @@ -925,8 +472,7 @@ void Model::train_pipeline_with_ebc() { } } -template -void Model::create_evaluate_pipeline_with_ebc(std::vector>& networks) { +void Model::create_evaluate_pipeline_with_ebc(std::vector>& networks) { bool is_train = false; // bool use_graph = solver_.use_cuda_graph; graph_.evaluate_pipeline_.resize(resource_manager_->get_local_gpu_count()); @@ -1112,13 +658,4 @@ void Model::evaluate_pipeline_with_ebc() { metric->global_reduce(number_of_networks()); } } - -template void Model::create_train_pipeline(std::vector>&); -template void Model::create_evaluate_pipeline(std::vector>&); -template void Model::create_train_network_pipeline(std::vector>&); -template void Model::create_eval_network_pipeline(std::vector>&); -template void Model::create_train_pipeline_with_ebc( - std::vector>& networks); -template void Model::create_evaluate_pipeline_with_ebc(std::vector>&); - } // namespace HugeCTR diff --git a/HugeCTR/src/resource_manager.cpp b/HugeCTR/src/resource_manager.cpp index 3fd624fc0c..9a63cca1f0 100644 --- a/HugeCTR/src/resource_manager.cpp +++ b/HugeCTR/src/resource_manager.cpp @@ -23,7 +23,7 @@ namespace HugeCTR { -std::shared_ptr ResourceManager::create( +std::shared_ptr ResourceManagerCore::create( const std::vector>& visible_devices, unsigned long long seed, DeviceMap::Layout layout) { const int size{core23::MpiInitService::get().world_size()}; diff --git a/HugeCTR/src/resource_managers/resource_manager_ext.cpp b/HugeCTR/src/resource_managers/resource_manager_ext.cpp deleted file mode 100644 index fe0e4fa59a..0000000000 --- a/HugeCTR/src/resource_managers/resource_manager_ext.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -std::shared_ptr ResourceManagerExt::create( - const std::vector>& visible_devices, unsigned long long seed, - DeviceMap::Layout layout) { - const int size{core23::MpiInitService::get().world_size()}; - const int rank{core23::MpiInitService::get().world_rank()}; - - DeviceMap device_map(visible_devices, rank, layout); - - std::random_device rd; - if (seed == 0) { - seed = rd(); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Bcast(&seed, 1, MPI_UNSIGNED_LONG_LONG, 0, MPI_COMM_WORLD)); -#endif - - HCTR_LOG(INFO, ROOT, "Global seed is %llu\n", seed); - - std::shared_ptr core( - new ResourceManagerCore(size, rank, std::move(device_map), seed)); - - return std::shared_ptr(new ResourceManagerExt(core)); -} - -#ifdef ENABLE_MPI -void ResourceManagerExt::init_ib_comm() { - int num_process = get_num_process(); - if (num_process > 1) { - int process_id = get_process_id(); - ib_comm_ = std::make_unique(); - ib_comm_->init(num_process, get_local_gpu_count(), process_id, get_local_gpu_device_id_list()); - } -} -#endif - -void ResourceManagerExt::set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision) { - int num_process = get_num_process(); -#ifdef ENABLE_MPI - IbComm* ib_comm_ptr = nullptr; - if (algo == AllReduceAlgo::ONESHOT) { - init_ib_comm(); - ib_comm_ptr = ib_comm_.get(); - } - ar_comm_ = AllReduceInPlaceComm::create(num_process, algo, use_mixed_precision, get_local_gpus(), - ib_comm_ptr); -#else - ar_comm_ = AllReduceInPlaceComm::create(num_process, algo, use_mixed_precision, get_local_gpus()); -#endif -} - -} // namespace HugeCTR diff --git a/ci/integration_test/dlrm/benchmark_14node.sub b/ci/integration_test/dlrm/benchmark_14node.sub deleted file mode 100644 index 6a65b12391..0000000000 --- a/ci/integration_test/dlrm/benchmark_14node.sub +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \ - cd /raid/datasets/criteo/mlperf/40m.limit_preshuffled/ && \ - exec numactl --membind=1,3,5,7 python3 /workdir/samples/dlrm/dgx_a100_14x8x640.py" diff --git a/ci/integration_test/dlrm/benchmark_1node.sub b/ci/integration_test/dlrm/benchmark_1node.sub deleted file mode 100644 index 2f23af3667..0000000000 --- a/ci/integration_test/dlrm/benchmark_1node.sub +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \ - cd /raid/datasets/criteo/mlperf/40m.limit_preshuffled/ && \ - numactl --membind=1,3,5,7 python3 /workdir/samples/dlrm/dgx_a100.py" diff --git a/ci/integration_test/dlrm/dlrm.sub b/ci/integration_test/dlrm/dlrm.sub deleted file mode 100644 index 649bad7f87..0000000000 --- a/ci/integration_test/dlrm/dlrm.sub +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /dataset/criteo_kaggle/dcn_parquet && \ - python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dlrm_1gpu.json && \ - python3 
/workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dlrm_8gpu.json && \ - python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dlrm_fp16_1gpu.json && \ - python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dlrm_fp16_8gpu.json" \ No newline at end of file diff --git a/ci/integration_test/dlrm/ib_nvlink_1node.sub b/ci/integration_test/dlrm/ib_nvlink_1node.sub deleted file mode 100644 index 1d689b7119..0000000000 --- a/ci/integration_test/dlrm/ib_nvlink_1node.sub +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - numactl --membind=1,3,5,7 python3 /workdir/samples/dlrm/dgx_a100_ib_nvlink.py" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - python3 /workdir/test/embedding_collection_test/dgx_a100_one_hot.py --batchsize 55296 --batchsize_eval=276480 --use_mixed_precision" diff --git a/ci/integration_test/dlrm/ib_nvlink_8node.sub b/ci/integration_test/dlrm/ib_nvlink_8node.sub deleted file mode 100644 index 4f43d21816..0000000000 --- a/ci/integration_test/dlrm/ib_nvlink_8node.sub +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - exec numactl --membind=1,3,5,7 python3 /workdir/samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - python3 /workdir/test/embedding_collection_test/dgx_a100_one_hot.py --batchsize 71680 --batchsize_eval=1792000 --use_mixed_precision" diff --git a/ci/integration_test/mlperf_generalization/overlapped_pipeline.sub b/ci/integration_test/mlperf_generalization/overlapped_pipeline.sub deleted file mode 100644 index 6c1d114b24..0000000000 --- a/ci/integration_test/mlperf_generalization/overlapped_pipeline.sub +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /etc/workspace/new_criteo_kaggle && - python3 /workdir/test/pybind_test/dgx_a100_48slots.py " \ No newline at end of file diff --git a/ci/selene/ci.yml b/ci/selene/ci.yml index 0351ef3b60..0811166322 100644 --- a/ci/selene/ci.yml +++ b/ci/selene/ci.yml @@ -133,18 +133,6 @@ dcn_8gpu: WALLTIME: "01:00:00" TEST_CMD: ./ci/integration_test/dcn/dcn_8gpu.sub -dlrm_benchmark_1node: - extends: .selene_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_multi_node - variables: - GPFSFOLDER: $LOGDIR/dlrm_benchmark_1node - CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED - MOUNTS: /raid:/raid - WALLTIME: "00:15:00" - TEST_CMD: ./ci/integration_test/dlrm/benchmark_1node.sub - dlrm_dcnv2_benchmark_1node: extends: .selene_test_job needs: @@ -180,18 +168,6 @@ deepfm: WALLTIME: "00:15:00" TEST_CMD: ./ci/integration_test/deepfm/deepfm.sub -dlrm: - extends: .selene_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: 
build_train_single_node - variables: - GPFSFOLDER: $LOGDIR/dlrm - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: ${DATASET}:${DATASET_MOUNT} - WALLTIME: "00:45:00" - TEST_CMD: ./ci/integration_test/dlrm/dlrm.sub - mmoe: extends: .selene_test_job needs: @@ -204,18 +180,6 @@ mmoe: WALLTIME: "00:15:00" TEST_CMD: ./ci/integration_test/mmoe/mmoe.sub -mlperf_generalization: - extends: .selene_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - GPFSFOLDER: $LOGDIR/mlperf_generalization - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT} - WALLTIME: "00:15:00" - TEST_CMD: ./ci/integration_test/mlperf_generalization/overlapped_pipeline.sub - inference_hps: extends: .selene_test_job needs: @@ -462,18 +426,6 @@ hps_plugin_benchmark_check: WALLTIME: "00:15:00" TEST_CMD: ./ci/post_test/check_hps_plugin_benchmark.sub -dlrm_1node_check: - # Push logs to gitlab - extends: .selene_post_test_job - needs: - - dlrm_benchmark_1node - variables: - GPFSFOLDER: $LOGDIR/dlrm_1node_check - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: $LOGDIR/dlrm_benchmark_1node:/logs - WALLTIME: "00:15:00" - TEST_CMD: ./ci/post_test/check_dlrm_1node.sub - dlrm_dcnv2_1node_check: # Push logs to gitlab extends: .selene_post_test_job diff --git a/ci/template.yml b/ci/template.yml index f7fff05ddc..1e4c71b2d6 100644 --- a/ci/template.yml +++ b/ci/template.yml @@ -461,7 +461,7 @@ stages: variables: GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} CONT: ${UNIFIED_CTR_LATEST} - MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT},${DATASET_CRITEO_SELENE}:${OLD_CRITEO_MOUNT},/raid:/raid,${CI_PROJECT_DIR}:/hugectr + MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT},${DATASET_CRITEO_SELENE}:${CRITEO_MOUNT},/raid:/raid,${CI_PROJECT_DIR}:/hugectr SLURM_ACCOUNT: coreai_devtech_all OLD_SLURM_ACCOUNT: "devtech" GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT" diff --git a/ci/utest/utest.sub b/ci/utest/utest.sub index 420c82479d..d49ce258dd 100644 --- a/ci/utest/utest.sub +++ b/ci/utest/utest.sub @@ -2,7 +2,6 @@ srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "\ cd /workdir/build/bin && \ - ./async_reader && \ ./checker_test && \ ./data_reader_test && \ ./device_map_test && \ diff --git a/samples/dlrm/README.md b/samples/dlrm/README.md index f6f55e9318..deba5b181e 100644 --- a/samples/dlrm/README.md +++ b/samples/dlrm/README.md @@ -1,13 +1,12 @@ # DLRM CTR SAMPLE # -> **Deprecation Warning**: DLRM samples are based on the [one-hot RawAsync DataReader](https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#raw) and HybridEmbedding, both of which will be deprecated in a future release. Please check out the [multi-hot RawAsync DataReader]((https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#raw)) and [embedding collection](https://nvidia-merlin.github.io/HugeCTR/main/api/hugectr_layer_book.html#embedding-collection) for alternatives. +> **Deprecation Warning**: DLRM samples are based on the [one-hot RawAsync DataReader](https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#raw) and HybridEmbedding, both of which were deprecated. Please check out the [multi-hot RawAsync DataReader]((https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#raw)) and [embedding collection](https://nvidia-merlin.github.io/HugeCTR/main/api/hugectr_layer_book.html#embedding-collection) for alternatives. 
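+
+For readers migrating off the deprecated one-hot path, the fragment below is a minimal, hedged sketch of the multi-hot alternative: the positional `AsyncParam` arguments are copied from the removed one-hot sample, and only the last two keyword flags are changed. The exact values your HugeCTR version and dataset require may differ; see `train.py` in this directory for a complete configuration.
+
+```python
+import hugectr
+
+# Hedged sketch: positional values copied from the removed one-hot dgx_a100.py
+# sample; only the two keyword flags differ, selecting the multi-hot reader.
+async_param = hugectr.AsyncParam(
+    32,                      # num_reading_threads
+    4,                       # num_batches_per_threads
+    72,                      # max_nr_per_threads
+    2,                       # io_depth
+    512,                     # io_alignment
+    True,
+    hugectr.Alignment_t.Auto,
+    multi_hot_reader=True,   # use the multi-hot RawAsync reader
+    is_dense_float=True,     # assumes dense features are stored as float
+)
+```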
-The purpose of this sample is to demonstrate how to build and train a [DLRM model](https://ai.facebook.com/blog/dlrm-an-advanced-open-source-deep-learning-recommendation-model/) with HugeCTR. +The purpose of this sample is to demonstrate how to build and train a [DLRM DCNv2 model](https://arxiv.org/abs/2008.13535) with HugeCTR. ## Table of Contents -* [Set Up the HugeCTR Docker Environmen](#set-up-the-hugectr-docker-environment) +* [Set Up the HugeCTR Docker Environment](#set-up-the-hugectr-docker-environment) * [MLPerf DLRM](#mlperf-dlrm) -* [Kaggle DLRM](#kaggle-dlrm) ## Set Up the HugeCTR Docker Environment ## You can set up the HugeCTR Docker environment by doing one of the following: @@ -34,38 +33,121 @@ $ export PYTHONPATH=/usr/local/hugectr/lib:$PYTHONPATH ## MLPerf DLRM Ensure that you've met the following requirements: -- MLPerf v1.0: DGX A100 14 nodes +- MLPerf v3.1: DGX H100 1 node, 8 nodes or 16 nodes +- Install requirements: pip install -r requirements.txt -### Preprocess the Terabyte Click Logs ## -The [Terabyte Click Logs](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) provided by CriteoLabs is used in this sample. The row count of each embedding table is limited to 40 million. The data is processed the same way as dlrm. For more information, see [Benchmarking](https://github.com/facebookresearch/dlrm#benchmarking). Each sample has 40 32-bit integers. The first integer is a label, the next 13 integers are dense features, and the last 26 integers are category features. +### Dataset downloading and preprocessing ## +Input preprocessing steps below are based on the instructions from the official reference implementation repository, see [Running the MLPerf DLRM v2 benchmark](https://github.com/mlcommons/training/tree/master/recommendation_v2/torchrec_dlrm#running-the-mlperf-dlrm-v2-benchmark). Besides, there is a final step to convert the reference implementation dataset to the raw format in order to make it consumable by HugeCTR training script. For completeness, all the steps are detailed below. -1. Download the terabyte datasets from the [Terabyte Click Logs](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) into the `"${project_home}/samples/dlrm/"` folder. +This process can take up to several days and needs 7 TB of fast storage space. The preprocessing steps do not require a GPU machine. -2. Unzip the datasets and name them in the following manner: `day_0`, `day_1`, ..., `day_23`. +**1.1** Download the dataset from https://ailab.criteo.com/ressources/criteo-1tb-click-logs-dataset-for-mlperf/. -3. Preprocess the datasets using the following command: - ```bash - # Usage: dlrm_raw input_dir output_dir --train {days for training} --test {days for testing} - $ dlrm_raw ./ ./ \ - --train 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 \ - --test 23 - ``` - This operation will generate `train.bin(671.2GB)` and `test.bin(14.3GB)`. +**1.2** Clone the reference implementation repository. +``` +git clone https://github.com/mlcommons/training.git +cd training/recommendation_v2/torchrec_dlrm +``` -### Run the Terabyte Click Logs with MLPerf v1.0 ## +**1.3** Build and run the reference docker image. +``` +docker build -t dlrmv2_reference . +docker run -it --rm --network=host --ipc=host -v /data:/data dlrmv2_reference +``` -Run the single node DGX-100 Python script using the following command: - ```shell - $ python3 dgx_a100.py - ``` +**1.4** Run preprocessing steps to get data in NumPy format. 
-Run the 14-node DGX-100 Python script using the following command: - ```shell - $ numactl --interleave=all python3 dgx_a100_14x8x640.py - ``` +``` +./scripts/process_Criteo_1TB_Click_Logs_dataset.sh \ + /data/criteo_1tb/raw_input_dataset_dir \ + /data/criteo_1tb/temp_intermediate_files_dir \ + /data/criteo_1tb/numpy_contiguous_shuffled_output_dataset_dir +``` +As a result, files named: `day_*_labels.npy`, `day_*_dense.npy` and `day_0_sparse.npy` will be created (3 per each of 24 days in the original input dataset, 72 files in total). Once completed, the output data can be verified with md5sums provided in [md5sums_preprocessed_criteo_click_logs_dataset.txt](https://github.com/mlcommons/training/blob/master/recommendation_v2/torchrec_dlrm/md5sums_preprocessed_criteo_click_logs_dataset.txt) file. + +**1.5** Create a synthetic multi-hot Criteo dataset. + +This step produces multi-hot dataset from the original (one-hot) dataset. + +``` +python scripts/materialize_synthetic_multihot_dataset.py \ + --in_memory_binary_criteo_path /data/criteo_1tb/numpy_contiguous_shuffled_output_dataset_dir \ + --output_path /data/criteo_1tb_sparse_multi_hot \ + --num_embeddings_per_feature 40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36 \ + --multi_hot_sizes 3,2,1,2,6,1,1,1,1,7,3,8,1,6,9,5,1,1,1,12,100,27,10,3,1,1 \ + --multi_hot_distribution_type uniform +``` + +As a result, `day_*_sparse_multi_hot.npz` files will be created (24 files in total). Once done, the output data can be validated with md5sums provided in [md5sums_MLPerf_v2_synthetic_multi_hot_sparse_dataset.txt](https://github.com/mlcommons/training/blob/master/recommendation_v2/torchrec_dlrm/md5sums_MLPerf_v2_synthetic_multi_hot_sparse_dataset.txt) file. + +**1.6** Convert NumPy dataset to raw format. + +Because HugeCTR uses, among others, [raw format](https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#raw) for input data, we need to convert NumPy files created in the preceding steps to this format. To this end, use `preprocessing/convert_to_raw.py` script that comes with the container created in section [Build the container and push to a docker registry](#build-the-container-and-push-to-a-docker-registry) below. + +``` +docker run -it --rm --network=host --ipc=host -v /data:/data nvcr.io/nvidia/merlin/merlin-hugectr:23.12 +``` +In that container, run: +``` +python preprocessing/convert_to_raw.py \ + --input_dir_labels_and_dense /data/criteo_1tb/numpy_contiguous_shuffled_output_dataset_dir \ + --input_dir_sparse_multihot /data/criteo_1tb_sparse_multi_hot \ + --output_dir /data/criteo_1tb_multihot_raw \ + --stages train val +``` +As a result, `train_data.bin` and `val_data.bin` will be created. Once done, the output files can be verified with the md5sums provided in `preprocessing/md5sums_raw_dataset.txt` file. + +### Specify the preprocessed data paths in the training script. + +You may need to manually change the location of the datasets in the `train.py` file. +The `source` parameter should specify the absolute path to the `train_data.bin` file and the `eval_source` +parameter should point to the `val_data.bin` file from `/data/criteo_1tb_multihot_raw` folder obtained in the previous step. + +However, for launching with nvidia-docker, you just need to make sure to set `DATADIR` as the path to the directory containing those two files. 
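+
+To make this concrete, the fragment below is a hedged sketch (not an excerpt from `train.py`) of how the converted files are typically handed to the RawAsync reader. Only `source` and `eval_source` matter for this section; the sample counts are taken from the removed one-hot samples, the embedding cardinalities from step 1.5, and everything else is a placeholder.
+
+```python
+import hugectr
+
+# Hedged sketch: only the two paths are the point here; the remaining values
+# are placeholders or copied from elsewhere in this README.
+reader = hugectr.DataReaderParams(
+    data_reader_type=hugectr.DataReaderType_t.RawAsync,
+    source=["/data/criteo_1tb_multihot_raw/train_data.bin"],
+    eval_source="/data/criteo_1tb_multihot_raw/val_data.bin",
+    check_type=hugectr.Check_t.Non,
+    num_samples=4195197692,     # Criteo 1TB days 0-22, as in the removed samples
+    eval_num_samples=89137319,  # as in the removed samples
+    slot_size_array=[
+        40000000, 39060, 17295, 7424, 20265, 3, 7122, 1543, 63,
+        40000000, 3067956, 405282, 10, 2209, 11938, 155, 4, 976, 14,
+        40000000, 40000000, 40000000, 590152, 12973, 108, 36,
+    ],                          # embedding cardinalities from step 1.5
+    # An AsyncParam configured for the multi-hot reader (multi_hot_reader=True,
+    # see the sketch near the top of this README) must also be supplied.
+)
+```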
+ +### Steps to launch training on a single node + +#### NVIDIA DGX H100 (single-node) + +Launch configuration and system-specific hyperparameters for the NVIDIA DGX H100 +single-node submission are in the `config_DGXH100_1x8x6912.sh` script and in the `train.py` config file. + +To launch the training on a single node with a Slurm cluster run: +``` +source config_DGXH100_1x8x6912.sh +CONT=/mlperf-nvidia:recommendation_hugectr LOGDIR= sbatch -N 1 run.sub +``` + +Note that this benchmark has high I/O bandwidth requirements. To achieve optimal performance in the case of single-node training job at least 13.4 GB/s and 41.4 GB/s read bandwidth is required during training and evaluation stage, respectively. -**IMPORTANT NOTES**: -- To run the 14-node DGX-100 training script on Selene, you need to submit the job on the Selene login node properly. -- In v2.2.1, there is a CUDA Graph error that occurs when running this sample on DGX2. To run it on DGX2, specify `"use_cuda_graph = False` within `CreateSolver` in the Python script. For detailed information about this error, see [Known Issues](https://github.com/NVIDIA-Merlin/HugeCTR/blob/master/release_notes.md#known-issues). -- `cache_eval_data` is only supported on DGX A100. If you're running DGX2, disable it. +#### Alternative launch with docker + +When generating results for the official v3.0 submission with one node, the +benchmark was launched onto a cluster managed by a Slurm scheduler. The +instructions in [NVIDIA DGX H100 (single node)](#nvidia-dgx-h100-single-node) explain +how that is done. + +However, to make it easier to run this benchmark on a wider set of machine +environments, we are providing here an alternate set of launch instructions +that can be run using `nvidia-docker`. Note that performance or functionality may +vary from the tested Slurm instructions. + +``` +source config_DGXH100_1x8x6912.sh +CONT=mlperf-nvidia:recommendation_hugectr DATADIR= LOGDIR= ./run_with_docker.sh +``` + +### Steps to launch training on multiple nodes + +#### NVIDIA DGX H100 (multi-node) + +Launch configuration and system-specific hyperparameters for the NVIDIA DGX H100 +multi-node submission are in the `config_DGXH100_8x8x2112.sh` or `config_DGXH100_16x8x1056.sh` scripts +and in the `train.py` config file. 
+ +To launch the training for a selected config with a Slurm cluster run: +``` +source config_DGXH100_8x8x2112.sh +CONT=/mlperf-nvidia:recommendation_hugectr LOGDIR= sbatch -N $DGXNNODES run.sub +``` diff --git a/samples/dlrm/config_DGXH100_16x8x1056.sh b/samples/dlrm/config_DGXH100_16x8x1056.sh new file mode 100644 index 0000000000..ad0b29b05d --- /dev/null +++ b/samples/dlrm/config_DGXH100_16x8x1056.sh @@ -0,0 +1,32 @@ +## DL params +export RUN_SCRIPT="train.py" +export BATCHSIZE=135168 +export BATCHSIZE_EVAL=2097152 +export LEARNING_RATE=0.0034 +export USE_MIXED_PRECISION=true +export SCALER=20480 +export SHARDING_PLAN=hier_auto +export MEM_COMM_BW_RATIO=67 +export GEN_LOSS_SUMMARY=true +export MINIMUM_TRAINING_TIME=10 +export DP_SHARDING_THRESHOLD=0.0125 + +## System run params +export DGXNNODES=16 +export DGXNGPU=8 +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) +WALLTIME_MINUTES=15 + +## Set clocks and walltime for maxQ and minEDP runs +if [[ "${SET_MAXQ_CLK:-0}" == "1" ]]; then + export MAXQ_CLK=1350 + WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 2) # 50% longer walltime +elif [[ "${SET_MINEDP_CLK:-0}" == "1" ]]; then + export MINEDP_CLK=1410 + WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 3) # 33% longer walltime +fi +export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) + +## network flags +export SBATCH_NETWORK=sharp +export NCCL_COLLNET_ENABLE=1 diff --git a/samples/dlrm/config_DGXH100_1x8x6912.sh b/samples/dlrm/config_DGXH100_1x8x6912.sh new file mode 100644 index 0000000000..0d72dfcc76 --- /dev/null +++ b/samples/dlrm/config_DGXH100_1x8x6912.sh @@ -0,0 +1,28 @@ +## DL params +export RUN_SCRIPT="train.py" +export BATCHSIZE=55296 +export BATCHSIZE_EVAL=262144 +export LEARNING_RATE=0.004 +export USE_MIXED_PRECISION=true +export SCALER=16348 +export SHARDING_PLAN=auto +export MEM_COMM_BW_RATIO=7 +export GEN_LOSS_SUMMARY=true +export MINIMUM_TRAINING_TIME=10 +export DP_SHARDING_THRESHOLD=0.008 + +## System run params +export DGXNNODES=1 +export DGXNGPU=8 +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) +WALLTIME_MINUTES=15 + +## Set clocks and walltime for maxQ and minEDP runs +if [[ "${SET_MAXQ_CLK:-0}" == "1" ]]; then + export MAXQ_CLK=1320 + WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 2) # 50% longer walltime +elif [[ "${SET_MINEDP_CLK:-0}" == "1" ]]; then + export MINEDP_CLK=1665 + WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 3) # 33% longer walltime +fi +export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) diff --git a/samples/dlrm/config_DGXH100_8x8x2112.sh b/samples/dlrm/config_DGXH100_8x8x2112.sh new file mode 100644 index 0000000000..8db218a4c4 --- /dev/null +++ b/samples/dlrm/config_DGXH100_8x8x2112.sh @@ -0,0 +1,32 @@ +## DL params +export RUN_SCRIPT="train.py" +export BATCHSIZE=135168 +export BATCHSIZE_EVAL=1048576 +export LEARNING_RATE=0.0034 +export USE_MIXED_PRECISION=true +export SCALER=20480 +export SHARDING_PLAN=hier_auto +export MEM_COMM_BW_RATIO=67 +export GEN_LOSS_SUMMARY=true +export MINIMUM_TRAINING_TIME=10 +export DP_SHARDING_THRESHOLD=0.0125 + +## System run params +export DGXNNODES=8 +export DGXNGPU=8 +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) +WALLTIME_MINUTES=15 + +## Set clocks and walltime for maxQ and minEDP runs +if [[ "${SET_MAXQ_CLK:-0}" == "1" ]]; then + export MAXQ_CLK=1275 + 
WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 2) # 50% longer walltime +elif [[ "${SET_MINEDP_CLK:-0}" == "1" ]]; then + export MINEDP_CLK=1530 + WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 3) # 33% longer walltime +fi +export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) + +## network flags +export SBATCH_NETWORK=sharp +export NCCL_COLLNET_ENABLE=1 diff --git a/samples/dlrm/dgx_a100.py b/samples/dlrm/dgx_a100.py deleted file mode 100644 index 28e37eb8af..0000000000 --- a/samples/dlrm/dgx_a100.py +++ /dev/null @@ -1,229 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -# 1. Create Solver, DataReaderParams and Optimizer -solver = hugectr.CreateSolver( - max_eval_batches=51, - batchsize_eval=1769472, - batchsize=55296, - vvgpu=[[0, 1, 2, 3, 4, 5, 6, 7]], - repeat_dataset=True, - lr=24.0, - warmup_steps=2750, - decay_start=49315, - decay_steps=27772, - decay_power=2.0, - end_lr=0.0, - use_mixed_precision=True, - scaler=1024, - use_cuda_graph=True, - gen_loss_summary=False, - train_intra_iteration_overlap=True, - train_inter_iteration_overlap=True, - eval_intra_iteration_overlap=True, # doesn't do anything - eval_inter_iteration_overlap=True, - all_reduce_algo=hugectr.AllReduceAlgo.OneShot, - grouped_all_reduce=False, - num_iterations_statistics=20, - metrics_spec={hugectr.MetricsType.AUC: 0.8025}, - perf_logging=True, - drop_incomplete_batch=False, -) - -batchsize = 55296 -num_reading_threads = 32 -num_batches_per_threads = 4 -expected_io_block_size = batchsize * 10 -io_depth = 2 -io_alignment = 512 -bytes_size_per_batches = (26 + 1 + 13) * 4 * batchsize -max_nr_per_threads = num_batches_per_threads * ( - bytes_size_per_batches // expected_io_block_size + 2 -) - -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.RawAsync, - source=["/raid/datasets/criteo/mlperf/40m.limit_preshuffled/train_data.bin"], - eval_source="/raid/datasets/criteo/mlperf/40m.limit_preshuffled/test_data.bin", - check_type=hugectr.Check_t.Non, - num_samples=4195197692, - eval_num_samples=89137319, - cache_eval_data=51, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - # max_nr_per_threads = num_batches_per_threads * (bytes_size_per_batches / io_block_size + 2) - # max_nr_per_threads = 4 * (55296 * 160 / 552960 + 2 ) = 4 * 18 = 72 - async_param=hugectr.AsyncParam( - num_reading_threads, - num_batches_per_threads, - max_nr_per_threads, - io_depth, - io_alignment, - True, - hugectr.Alignment_t.Auto, - multi_hot_reader=False, - is_dense_float=False, - ), -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, update_type=hugectr.Update_t.Local, atomic_update=True -) -# 2. 
Initialize the Model instance -model = hugectr.Model(solver, reader, optimizer) -# 3. Construct the Model graph -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 1, True, 26)], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.HybridSparseEmbedding, - workspace_size_per_gpu_in_mb=15000, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - embedding_vec_size=128, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - hybrid_embedding_param=hugectr.HybridEmbeddingParam( - 2, - -1, - 0.03, - 1.3e11, - 2.6e11, - 1.0, - hugectr.CommunicationType.NVLink_SingleNode, - hugectr.HybridEmbeddingType.Distributed, - ), - ) -) - -compute_config = hugectr.DenseLayerComputeConfig( - async_wgrad=True, -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["dense"], - top_names=["mlp1"], - num_outputs=[512, 256, 128], - compute_config=compute_config, - act_type=hugectr.Activation_t.Relu, - use_bias=True, - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["mlp1", "sparse_embedding1"], - top_names=["interaction1", "interaction_grad"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["interaction1", "interaction_grad"], - top_names=["mlp2"], - num_outputs=[1024, 1024, 512, 256, 1], - compute_config=compute_config, - use_bias=True, - activations=[ - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Non, - ], - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["mlp2", "label"], - top_names=["loss"], - ) -) -# 4. Dump the Model graph to JSON -model.graph_to_json(graph_config_file="dlrm.json") -# 5. Compile & Fit -model.compile() -model.summary() -model.fit( - max_iter=75868, display=1000, eval_interval=3793, snapshot=2000000, snapshot_prefix="dlrm" -) diff --git a/samples/dlrm/dgx_a100_14x8x640.py b/samples/dlrm/dgx_a100_14x8x640.py deleted file mode 100644 index 0e207dab6d..0000000000 --- a/samples/dlrm/dgx_a100_14x8x640.py +++ /dev/null @@ -1,248 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -# 1. 
Create Solver, DataReaderParams and Optimizer -solver = hugectr.CreateSolver( - max_eval_batches=50, - batchsize_eval=1792000, - batchsize=71680, - vvgpu=[ - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - ], - repeat_dataset=True, - lr=26.0, - warmup_steps=2500, - decay_start=46821, - decay_steps=15406, - decay_power=2.0, - end_lr=0.0, - use_mixed_precision=True, - scaler=1024, - use_cuda_graph=True, - gen_loss_summary=False, - train_intra_iteration_overlap=True, - train_inter_iteration_overlap=True, - eval_intra_iteration_overlap=True, - eval_inter_iteration_overlap=True, - all_reduce_algo=hugectr.AllReduceAlgo.OneShot, - grouped_all_reduce=True, - num_iterations_statistics=20, - metrics_spec={hugectr.MetricsType.AUC: 0.8025}, - perf_logging=True, - drop_incomplete_batch=False, -) - -batchsize = 71680 -num_reading_threads = 32 -num_batches_per_threads = 4 -expected_io_block_size = batchsize * 10 -io_depth = 2 -io_alignment = 512 -bytes_size_per_batches = (26 + 1 + 13) * 4 * batchsize -max_nr_per_threads = num_batches_per_threads * ( - bytes_size_per_batches // expected_io_block_size + 2 -) - -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.RawAsync, - source=["/raid/datasets/criteo/mlperf/40m.limit_preshuffled/train_data.bin"], - eval_source="/raid/datasets/criteo/mlperf/40m.limit_preshuffled/test_data.bin", - check_type=hugectr.Check_t.Non, - num_samples=4195197692, - eval_num_samples=89137319, - cache_eval_data=50, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - async_param=hugectr.AsyncParam( - num_reading_threads, - num_batches_per_threads, - max_nr_per_threads, - io_depth, - io_alignment, - True, - hugectr.Alignment_t.Auto, - multi_hot_reader=False, - is_dense_float=False, - ), -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, update_type=hugectr.Update_t.Local, atomic_update=True -) -# 2. Initialize the Model instance -model = hugectr.Model(solver, reader, optimizer) -# 3. 
Construct the Model graph -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 1, True, 26)], - ) -) - -# Use mean num of infrequent plus 10-sigma guardband -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.HybridSparseEmbedding, - workspace_size_per_gpu_in_mb=1500, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - embedding_vec_size=128, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - hybrid_embedding_param=hugectr.HybridEmbeddingParam( - 2, - 16640 + 1290, - 0.01, - 1.3e11, - 23.75e9, - 0.5, - hugectr.CommunicationType.IB_NVLink_Hier, - hugectr.HybridEmbeddingType.Distributed, - ), - ) -) -compute_config_bottom = hugectr.DenseLayerComputeConfig( - async_wgrad=True, - fuse_wb=False, -) - -compute_config_top = hugectr.DenseLayerComputeConfig( - async_wgrad=True, - fuse_wb=True, -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["dense"], - top_names=["mlp1"], - num_outputs=[512, 256, 128], - compute_config=compute_config_bottom, - act_type=hugectr.Activation_t.Relu, - use_bias=True, - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["mlp1", "sparse_embedding1"], - top_names=["interaction1", "interaction_grad"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["interaction1", "interaction_grad"], - top_names=["mlp2"], - num_outputs=[1024, 1024, 512, 256, 1], - compute_config=compute_config_top, - use_bias=True, - activations=[ - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Non, - ], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["mlp2", "label"], - top_names=["loss"], - ) -) -# 4. Dump the Model graph to JSON -model.graph_to_json(graph_config_file="dlrm.json") -# 5. Compile & Fit -model.compile() -model.summary() -model.fit( - max_iter=58527, display=1000, eval_interval=2926, snapshot=2000000, snapshot_prefix="dlrm" -) diff --git a/samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py b/samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py deleted file mode 100755 index da09611afb..0000000000 --- a/samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py +++ /dev/null @@ -1,244 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -# 1. 
Create Solver, DataReaderParams and Optimizer -solver = hugectr.CreateSolver( - max_eval_batches=125, - batchsize_eval=716800, - batchsize=71680, - vvgpu=[ - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - ], - repeat_dataset=True, - lr=26.0, - warmup_steps=2500, - decay_start=46821, - decay_steps=15406, - decay_power=2.0, - end_lr=0.0, - use_mixed_precision=True, - scaler=1024, - use_cuda_graph=False, - gen_loss_summary=False, - train_intra_iteration_overlap=True, - train_inter_iteration_overlap=True, - eval_intra_iteration_overlap=False, - eval_inter_iteration_overlap=False, - all_reduce_algo=hugectr.AllReduceAlgo.NCCL, - grouped_all_reduce=True, - num_iterations_statistics=20, - metrics_spec={hugectr.MetricsType.AUC: 0.8025}, - perf_logging=True, - drop_incomplete_batch=False, -) - -batchsize = 71680 -num_reading_threads = 32 -num_batches_per_threads = 4 -expected_io_block_size = batchsize * 10 -io_depth = 2 -io_alignment = 512 -bytes_size_per_batches = (26 + 1 + 13) * 4 * batchsize -max_nr_per_threads = num_batches_per_threads * ( - bytes_size_per_batches // expected_io_block_size + 2 -) - -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.RawAsync, - source=["/data/train_data.bin"], - eval_source="/data/test_data.bin", - check_type=hugectr.Check_t.Non, - num_samples=4195197692, - eval_num_samples=89137319, - cache_eval_data=125, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - async_param=hugectr.AsyncParam( - num_reading_threads, - num_batches_per_threads, - max_nr_per_threads, - io_depth, - io_alignment, - True, - hugectr.Alignment_t.Auto, - multi_hot_reader=False, - is_dense_float=False, - ), -) - -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, update_type=hugectr.Update_t.Local, atomic_update=True -) -# 2. Initialize the Model instance -model = hugectr.Model(solver, reader, optimizer) -# 3. 
Construct the Model graph -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 1, True, 26)], - ) -) - -# Use mean num of infrequent plus 10-sigma guardband -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.HybridSparseEmbedding, - workspace_size_per_gpu_in_mb=1500, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - embedding_vec_size=128, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - hybrid_embedding_param=hugectr.HybridEmbeddingParam( - 2, - 16640 + 1290, - 0.01, - 130e9, - 25e9, - 1, - hugectr.CommunicationType.IB_NVLink, - hugectr.HybridEmbeddingType.Distributed, - ), - ) -) - -compute_config_bottom = hugectr.DenseLayerComputeConfig( - async_wgrad=True, - fuse_wb=False, -) - -compute_config_top = hugectr.DenseLayerComputeConfig( - async_wgrad=True, - fuse_wb=True, -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["dense"], - top_names=["mlp1"], - num_outputs=[512, 256, 128], - compute_config=compute_config_bottom, - act_type=hugectr.Activation_t.Relu, - use_bias=True, - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["mlp1", "sparse_embedding1"], - top_names=["interaction1", "interaction_grad"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["interaction1", "interaction_grad"], - top_names=["mlp2"], - num_outputs=[1024, 1024, 512, 256, 1], - compute_config=compute_config_top, - use_bias=True, - activations=[ - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Non, - ], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["mlp2", "label"], - top_names=["loss"], - ) -) -# 4. Dump the Model graph to JSON -model.graph_to_json(graph_config_file="dlrm.json") -# 5. Compile & Fit -model.compile() -model.summary() -model.fit( - max_iter=58527, display=1000, eval_interval=2926, snapshot=2000000, snapshot_prefix="dlrm" -) diff --git a/samples/dlrm/dgx_a100_ib_nvlink.py b/samples/dlrm/dgx_a100_ib_nvlink.py deleted file mode 100755 index 3e34d649c6..0000000000 --- a/samples/dlrm/dgx_a100_ib_nvlink.py +++ /dev/null @@ -1,228 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -# 1. 
Create Solver, DataReaderParams and Optimizer -solver = hugectr.CreateSolver( - max_eval_batches=323, - batchsize_eval=276480, - batchsize=55296, - vvgpu=[[0, 1, 2, 3, 4, 5, 6, 7]], - repeat_dataset=True, - lr=24.0, - warmup_steps=2750, - decay_start=49315, - decay_steps=27772, - decay_power=2.0, - end_lr=0.0, - use_mixed_precision=True, - scaler=1024, - use_cuda_graph=False, - gen_loss_summary=False, - train_intra_iteration_overlap=True, - train_inter_iteration_overlap=True, - eval_intra_iteration_overlap=False, - eval_inter_iteration_overlap=False, - all_reduce_algo=hugectr.AllReduceAlgo.NCCL, - grouped_all_reduce=False, - num_iterations_statistics=20, - metrics_spec={hugectr.MetricsType.AUC: 0.8025}, - perf_logging=True, - drop_incomplete_batch=False, -) - -batchsize = 55296 -num_reading_threads = 32 -num_batches_per_threads = 4 -expected_io_block_size = batchsize * 10 -io_depth = 2 -io_alignment = 512 -bytes_size_per_batches = (26 + 1 + 13) * 4 * batchsize -max_nr_per_threads = num_batches_per_threads * ( - bytes_size_per_batches // expected_io_block_size + 2 -) - -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.RawAsync, - source=["/data/train_data.bin"], - eval_source="/data/test_data.bin", - check_type=hugectr.Check_t.Non, - num_samples=4195197692, - eval_num_samples=89137319, - cache_eval_data=323, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - async_param=hugectr.AsyncParam( - num_reading_threads, - num_batches_per_threads, - max_nr_per_threads, - io_depth, - io_alignment, - True, - hugectr.Alignment_t.Auto, - multi_hot_reader=False, - is_dense_float=False, - ), -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, update_type=hugectr.Update_t.Local, atomic_update=True -) -# 2. Initialize the Model instance -model = hugectr.Model(solver, reader, optimizer) -# 3. 
Construct the Model graph -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 1, True, 26)], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.HybridSparseEmbedding, - workspace_size_per_gpu_in_mb=15000, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - embedding_vec_size=128, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - hybrid_embedding_param=hugectr.HybridEmbeddingParam( - 2, - -1, - 0.03, - 130e9, - 260e9, - 0.25, - hugectr.CommunicationType.IB_NVLink, - hugectr.HybridEmbeddingType.Distributed, - ), - ) -) - -compute_config = hugectr.DenseLayerComputeConfig( - async_wgrad=True, -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["dense"], - top_names=["mlp1"], - num_outputs=[512, 256, 128], - compute_config=compute_config, - act_type=hugectr.Activation_t.Relu, - use_bias=True, - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["mlp1", "sparse_embedding1"], - top_names=["interaction1", "interaction_grad"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["interaction1", "interaction_grad"], - top_names=["mlp2"], - num_outputs=[1024, 1024, 512, 256, 1], - compute_config=compute_config, - use_bias=True, - activations=[ - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Non, - ], - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["mlp2", "label"], - top_names=["loss"], - ) -) -# 4. Dump the Model graph to JSON -model.graph_to_json(graph_config_file="dlrm.json") -# 5. Compile & Fit -model.compile() -model.summary() -model.fit( - max_iter=75868, display=1000, eval_interval=3793, snapshot=2000000, snapshot_prefix="dlrm" -) diff --git a/samples/dlrm/mlperf_logger/__init__.py b/samples/dlrm/mlperf_logger/__init__.py new file mode 100644 index 0000000000..54e9dec7d6 --- /dev/null +++ b/samples/dlrm/mlperf_logger/__init__.py @@ -0,0 +1,3 @@ +from .callbacks import LoggingCallback +from .param_info import param_info +from .utils import * diff --git a/samples/dlrm/mlperf_logger/callbacks.py b/samples/dlrm/mlperf_logger/callbacks.py new file mode 100644 index 0000000000..1359b44f80 --- /dev/null +++ b/samples/dlrm/mlperf_logger/callbacks.py @@ -0,0 +1,118 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
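# Editor's note: illustrative sketch, not part of the original patch. It shows how the
# LoggingCallback defined below is typically constructed (train.py in this sample does
# essentially the same); the helper name and the literal values are assumptions, and the
# step that registers the callback with the HugeCTR Model is not shown in this file.
def _example_logging_callback():
    from mlperf_common.frameworks.hugectr import HCTRCommunicationHandler
    from mlperf_common.logging import MLLoggerWrapper

    mllogger = MLLoggerWrapper(HCTRCommunicationHandler(), value=None)
    return LoggingCallback(
        mllogger,               # MLPerf logging wrapper shared with param_info()
        auc_threshold=0.80275,  # stop criterion checked in on_eval_end()
        max_iter=58527,         # iteration count treated as one training epoch
        batch_size=71680,       # global training batch size
    )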
+ +from time import perf_counter +from typing import Dict + +import mlperf_logging.mllog.constants as mlperf_constants +from mlperf_common.logging import MLLoggerWrapper + +import hugectr + + +class LoggingCallback(hugectr.TrainingCallback): + def __init__( + self, + mllogger: MLLoggerWrapper, + auc_threshold: float, + max_iter: int, + batch_size: int, + ): + self.mllogger = mllogger + self.auc_threshold = auc_threshold + self.iter_per_epoch = max_iter + self.batch_size = batch_size + self._success = False + self._start_time = -1.0 + self._total_time = -1.0 + self._throughput = -1.0 + self._hit_auc_iter = max_iter + self.minimum_training_time = 0 + super().__init__() + + def _compute_stats(self, current_iter: int): + self._total_time = perf_counter() - self._start_time + self._throughput = (current_iter + 1) * self.batch_size / self._total_time + + def on_training_start(self): + self._start_time = perf_counter() + self.mllogger.log_init_stop_run_start() + self.mllogger.start( + key=mlperf_constants.EPOCH_START, + metadata={mlperf_constants.EPOCH_NUM: 0}, + ) + + def on_training_end(self, current_iter: int): + epoch_num = current_iter / self.iter_per_epoch + self.mllogger.end( + key=mlperf_constants.EPOCH_STOP, + metadata={mlperf_constants.EPOCH_NUM: epoch_num}, + ) + if not self._success: + self.mllogger.log_run_stop(status=mlperf_constants.ABORTED, epoch_num=epoch_num) + self._compute_stats(current_iter) + if self.minimum_training_time > 0: + output_max_iter = current_iter + 1 + else: + output_max_iter = self.iter_per_epoch + if self.mllogger.comm_handler.global_rank() == 0: + if self._success: + print( + f"Hit target accuracy AUC {self.auc_threshold:.5f} at " + f"{self._hit_auc_iter} / {output_max_iter} iterations with batchsize {self.batch_size} " + f"in {self._total_time:.2f}s. Average speed is {self._throughput:.2f} records/s." + ) + else: + print( + f"Finish {current_iter + 1} iterations with " + f"batchsize: {self.batch_size} in {self._total_time:.2f}s." + ) + self.mllogger.event( + key="tracked_stats", + metadata={"step": current_iter / self.iter_per_epoch}, + value={"throughput": self._throughput}, + ) + + def on_eval_start(self, current_iter: int) -> bool: + self.mllogger.start( + key=mlperf_constants.EVAL_START, + metadata={mlperf_constants.EPOCH_NUM: current_iter / self.iter_per_epoch}, + ) + return False + + def on_eval_end(self, current_iter: int, eval_results: Dict[str, float]) -> bool: + epoch_num = current_iter / self.iter_per_epoch + auc = eval_results["AUC"] + self.mllogger.event( + key=mlperf_constants.EVAL_ACCURACY, + value=auc, + metadata={mlperf_constants.EPOCH_NUM: epoch_num}, + ) + self.mllogger.end( + key=mlperf_constants.EVAL_STOP, + metadata={mlperf_constants.EPOCH_NUM: epoch_num}, + ) + if not self._success: + self._success = auc >= self.auc_threshold + if self._success: + self.mllogger.log_run_stop(status=mlperf_constants.SUCCESS, epoch_num=epoch_num) + self._hit_auc_iter = current_iter + self._total_time = perf_counter() - self._start_time + if self.minimum_training_time > 0: + if self._total_time < self.minimum_training_time * 60: + return False + else: + return True + else: + return self._success diff --git a/samples/dlrm/mlperf_logger/param_info.py b/samples/dlrm/mlperf_logger/param_info.py new file mode 100644 index 0000000000..bcb7bd6dd0 --- /dev/null +++ b/samples/dlrm/mlperf_logger/param_info.py @@ -0,0 +1,70 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import Namespace + +import mlperf_logging.mllog.constants as mllog_constants +from mlperf_common.logging import MLLoggerWrapper + +# Parameters not supported in HugeCTR: +ADAGRAD_LR_DECAY = 0 +WEIGHT_DECAY = 0 +GRADIENT_ACC_STEPS = 1 + + +def param_info(mllogger: MLLoggerWrapper, args: Namespace): + mllogger.event( + key=mllog_constants.GLOBAL_BATCH_SIZE, + value=args.batchsize, + ) + mllogger.event( + key=mllog_constants.OPT_NAME, + value=args.optimizer, + ) + mllogger.event( + key=mllog_constants.OPT_BASE_LR, + value=args.lr, + ) + mllogger.event( + key=mllog_constants.OPT_ADAGRAD_LR_DECAY, + value=ADAGRAD_LR_DECAY, + ) + mllogger.event( + key=mllog_constants.OPT_WEIGHT_DECAY, + value=WEIGHT_DECAY, + ) + mllogger.event( + key=mllog_constants.OPT_ADAGRAD_INITIAL_ACCUMULATOR_VALUE, + value=args.init_accu, + ) + mllogger.event( + key=mllog_constants.OPT_ADAGRAD_EPSILON, + value=args.eps, + ) + mllogger.event( + key=mllog_constants.OPT_LR_WARMUP_STEPS, + value=args.warmup_steps, + ) + mllogger.event( + key=mllog_constants.OPT_LR_DECAY_START_STEP, + value=args.decay_start, + ) + mllogger.event( + key=mllog_constants.OPT_LR_DECAY_STEPS, + value=args.decay_steps, + ) + mllogger.event( + key=mllog_constants.GRADIENT_ACCUMULATION_STEPS, + value=GRADIENT_ACC_STEPS, + ) diff --git a/tools/io_benchmark/CMakeLists.txt b/samples/dlrm/mlperf_logger/utils.py similarity index 56% rename from tools/io_benchmark/CMakeLists.txt rename to samples/dlrm/mlperf_logger/utils.py index 85f979e4fd..a81e599dd0 100644 --- a/tools/io_benchmark/CMakeLists.txt +++ b/samples/dlrm/mlperf_logger/utils.py @@ -1,25 +1,20 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # -# Copyright (c) 2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# +# +# http://www.apache.org/licenses/LICENSE-2.0 +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# -cmake_minimum_required(VERSION 3.20) -find_package(CUDAToolkit) +import os -file(GLOB data_reader_bench_src - main.cpp -) -add_executable(io_bench ${data_reader_bench_src}) -target_link_libraries(io_bench PUBLIC CUDA::nvml huge_ctr_shared) -target_compile_features(io_bench PUBLIC cxx_std_17 cuda_std_17) +def get_row_count(data_path: str, num_columns: int, bytes_per_value: int) -> int: + """Get number of rows for a dataset in raw format.""" + return os.path.getsize(data_path) // (num_columns * bytes_per_value) diff --git a/samples/dlrm/preprocessing/convert_to_raw.py b/samples/dlrm/preprocessing/convert_to_raw.py new file mode 100644 index 0000000000..c49fb0cdeb --- /dev/null +++ b/samples/dlrm/preprocessing/convert_to_raw.py @@ -0,0 +1,251 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import os +import time + +import numpy as np + +""" +Script to convert the reference TorchRec NumPy dataset to a binary raw format for HugeCTR training. + +The script requires a machine with about 200GB RAM as it reads all three +day_*_labels.npy, day_*_dense.npy and day_*_sparse_multi_hot.npz files into memory. +It should complete in about 5h hours (depending on I/O bandwidth). 
+ +For the MLPerf Training v3.0 the expected md5sum of the output files are: + +| file | md5sum | +|:---------------|:---------------------------------| +| test_data.bin | cf636876d8baf0776287be23b31c2f14 | +| train_data.bin | 4d48daf07cc244f6fa933b832d7fe5a3 | +| val_data.bin | c7ca591ad3fd2b09b75d99fa4fc210e2 | +""" + +INPUT_LABELS_FILE = "day_{day}_labels.npy" +INPUT_DENSE_FILE = "day_{day}_dense.npy" +INPUT_SPARSE_FILE = "day_{day}_sparse_multi_hot.npz" +OUTPUT_FILE = "{stage}_data.bin" +NUM_DAYS = 24 +NUM_SPARSE = 26 +TRAIN, VAL, TEST = "train", "val", "test" +STAGES = (TRAIN, VAL, TEST) +LAST_DAY_TEST_VAL_SPLIT_POINT = 89_137_319 + + +class DataConverter: + def __init__( + self, + input_dir_labels_and_dense: str, + input_dir_sparse_multihot: str, + output_dir: str, + stage: str, + buffer_size: int, + chunk_size: int, + logger: logging.Logger, + logging_interval: int, + ): + self.input_dir_labels_and_dense = input_dir_labels_and_dense + self.input_dir_sparse_multihot = input_dir_sparse_multihot + self.output_file = os.path.join(output_dir, OUTPUT_FILE.format(stage=stage)) + self.logger = logger + self.logging_interval = logging_interval + self.stage = stage + self.buffer_size = buffer_size + self.chunk_size = chunk_size + self.days = self._get_days_for_stage() + self.slice_ = self._get_slice_for_stage() + + def _get_days_for_stage(self): + if self.stage == TRAIN: + return list(range(NUM_DAYS - 1)) + else: + return [NUM_DAYS - 1] + + def _get_slice_for_stage(self): + slice_ = None + if self.stage == VAL: + slice_ = slice(None, LAST_DAY_TEST_VAL_SPLIT_POINT) + elif self.stage == TEST: + slice_ = slice(LAST_DAY_TEST_VAL_SPLIT_POINT, None) + self.logger.debug(f"stage = {self.stage}, slice_ = {slice_}") + return slice_ + + def _read_metadata(self, f): + np.lib.format.read_magic(f) + shape, fortran_order, dtype = np.lib.format.read_array_header_1_0(f) + assert not fortran_order, "C-like index order expected" + self.logger.debug(f"Data shape = {shape}") + return shape, dtype + + def _load_data_for_day(self, day): + labels_file = INPUT_LABELS_FILE.format(day=day) + dense_file = INPUT_DENSE_FILE.format(day=day) + sparse_file = INPUT_SPARSE_FILE.format(day=day) + + self.logger.debug(f"Loading {labels_file}...") + with open(os.path.join(self.input_dir_labels_and_dense, labels_file), "rb") as f: + _, dtype = self._read_metadata(f) + label = np.fromfile(f, dtype=dtype) + self.logger.debug("Loading done") + + self.logger.debug(f"Loading {dense_file}...") + with open(os.path.join(self.input_dir_labels_and_dense, dense_file), "rb") as f: + shape, dtype = self._read_metadata(f) + dense = np.fromfile(f, dtype=dtype).reshape(shape, order="C") + self.logger.debug("Loading done") + + self.logger.debug(f"Loading {sparse_file}...") + sparse_dict = np.load(os.path.join(self.input_dir_sparse_multihot, sparse_file)) + sparse_list = [sparse_dict[str(i)] for i in range(NUM_SPARSE)] + self.logger.debug("Loading done") + + if self.slice_ is not None: + self.logger.debug("Slicing data...") + label = label[self.slice_] + dense = dense[self.slice_] + sparse_list = [sparse[self.slice_] for sparse in sparse_list] + self.logger.debug("Slicing done") + + return label, dense, sparse_list + + def save(self): + self.logger.info(f"Writing data to {self.output_file}...") + samples_total = 0 + start_time = time.perf_counter() + with open(self.output_file, "wb", buffering=self.buffer_size) as out: + write = out.write + for day in self.days: + self.logger.info(f"Processing data for day = {day}...") + label, dense, sparse_list = 
self._load_data_for_day(day) + # We concatenate sparse features as it saves time on writing + # data below. It is done in chunks to save memory. + start = 0 + end = self.chunk_size + while start < len(label): + self.logger.debug("Concatenating sparse features...") + sparse = np.concatenate( + [sparse_feat[start:end] for sparse_feat in sparse_list], axis=1 + ) + self.logger.debug("Concatenating done") + for samples_total, (label_row, dense_row, sparse_row) in enumerate( + zip(label[start:end], dense[start:end], sparse), samples_total + 1 + ): + write(label_row.tobytes()) + write(dense_row.tobytes()) + write(sparse_row.tobytes()) + if samples_total % self.logging_interval == 0: + self.logger.info(f"Number of samples done: {samples_total:,}") + start = end + end += self.chunk_size + end_time = time.perf_counter() + self.logger.info(f"Creating {self.output_file} done.") + self.logger.info( + f"Total number of samples done for stage = {self.stage}: {samples_total:,}" + ) + self.logger.info(f"Throughput: {samples_total / (end_time - start_time):.2f} [samples/sec]") + + +def get_logger(level): + logger = logging.getLogger(__name__) + logger.setLevel(level) + s_handler = logging.StreamHandler() + log_format = logging.Formatter("[%(asctime)s][%(levelname)s]: %(message)s") + s_handler.setFormatter(log_format) + logger.addHandler(s_handler) + return logger + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="NumPy to Raw format conversion script.") + parser.add_argument( + "--input_dir_labels_and_dense", + type=str, + required=True, + help="Input directory with labels and dense data", + ) + parser.add_argument( + "--input_dir_sparse_multihot", + type=str, + required=True, + help="Input directory with sparse multi-hot data", + ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Output directory for the raw binary dataset", + ) + parser.add_argument( + "--stages", + type=str, + choices=STAGES, + default=STAGES, + nargs="+", + help="Stages to process", + ) + parser.add_argument( + "--buffer_size", + type=int, + default=2_147_483_647, + help="Buffer size for writing data", + ) + parser.add_argument( + "--chunk_size", + type=int, + default=2_000_000, + help="Chunk size for concatenating sparse features before saving", + ) + parser.add_argument( + "--logging_level", + type=int, + default=logging.INFO, + help="Logging level", + ) + parser.add_argument( + "--logging_interval", + type=int, + default=10_000_000, + help="Logging interval for the number of samples done", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + logger = get_logger(level=args.logging_level) + logger.info("NumPy to Raw format conversion script") + logger.info(f"args are = {vars(args)}") + + os.makedirs(args.output_dir, exist_ok=True) + for stage in args.stages: + converter = DataConverter( + input_dir_labels_and_dense=args.input_dir_labels_and_dense, + input_dir_sparse_multihot=args.input_dir_sparse_multihot, + output_dir=args.output_dir, + stage=stage, + buffer_size=args.buffer_size, + chunk_size=args.chunk_size, + logger=logger, + logging_interval=args.logging_interval, + ) + converter.save() + + logger.info("Done.") + + +if __name__ == "__main__": + main() diff --git a/samples/dlrm/preprocessing/md5sums_raw_dataset.txt b/samples/dlrm/preprocessing/md5sums_raw_dataset.txt new file mode 100644 index 0000000000..9317cae385 --- /dev/null +++ b/samples/dlrm/preprocessing/md5sums_raw_dataset.txt @@ -0,0 +1,3 @@ +cf636876d8baf0776287be23b31c2f14 
test_data.bin +4d48daf07cc244f6fa933b832d7fe5a3 train_data.bin +c7ca591ad3fd2b09b75d99fa4fc210e2 val_data.bin diff --git a/samples/dlrm/requirements.txt b/samples/dlrm/requirements.txt new file mode 100644 index 0000000000..370ae43460 --- /dev/null +++ b/samples/dlrm/requirements.txt @@ -0,0 +1,3 @@ +git+https://github.com/mlcommons/logging.git@3.1.0-rc1 +git+https://github.com/NVIDIA/mlperf-common.git +mpi4py==3.1.5 diff --git a/samples/dlrm/run.sub b/samples/dlrm/run.sub new file mode 100755 index 0000000000..c13fe88b99 --- /dev/null +++ b/samples/dlrm/run.sub @@ -0,0 +1,140 @@ +#!/bin/bash + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#SBATCH --job-name mlperf-dlrm:hugectr +#SBATCH -t 00:30:00 + +set -euxo pipefail + +# Vars without defaults +: "${DGXSYSTEM:?DGXSYSTEM not set}" +: "${CONT:?CONT not set}" +: "${DATADIR:?DATADIR not set}" + +# Vars with defaults +: "${MLPERF_RULESET:=3.1.0}" +: "${MLPERF_CLUSTER_NAME:='unknown'}" +: "${NEXP:=10}" +: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}" +: "${CLEAR_CACHES:=1}" +: "${CHECK_COMPLIANCE:=1}" +: "${API_LOG_DIR:=./api_logs}" # apiLog.sh output dir +: "${ABSLOGDIR:=${PWD}/results}" +: "${POWERCMDDIR:=' '}" +: "${DATADIR_VAL:=${DATADIR}}" +: "${MOUNTS:=${DATADIR}:/data,${DATADIR_VAL}:/data_val}" +: "${LOGDIR:=./results}" + +export MODEL_NAME="recommendation" +export MODEL_FRAMEWORK="pytorch" +LOG_BASE="${DATESTAMP}" +SPREFIX="${MODEL_NAME}_${MODEL_FRAMEWORK}_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}" + + +readonly _logfile_base="${LOGDIR}/${DATESTAMP}" +readonly _cont_name="${MODEL_NAME}_${SLURM_JOB_ID}" +_cont_mounts=${MOUNTS} + +if [ "${API_LOGGING:-}" -eq 1 ]; then + API_LOG_DIR=${API_LOG_DIR}/${MODEL_FRAMEWORK}/${MODEL_NAME}/${DGXSYSTEM} + mkdir -p ${API_LOG_DIR} + _cont_mounts="${_cont_mounts},${API_LOG_DIR}:/logs" + + # Create JSON file for cuDNN + JSON_MODEL_NAME="MLPERF_${MODEL_NAME}_${MODEL_FRAMEWORK}_train" + JSON_README_LINK="${README_PREFIX}/${MODEL_NAME}/${MODEL_FRAMEWORK}/README.md" + JSON_FMT='{model_name: $mn, readme_link: $rl, configs: {($dt): [$bs]}, sweep: {($dt): [$bs]}}' + JSON_OUTPUT="${JSON_MODEL_NAME}.cudnn.json" + jq -n --indent 4 --arg mn $JSON_MODEL_NAME --arg rl $JSON_README_LINK --arg dt $APILOG_PRECISION --arg bs $BATCHSIZE "$JSON_FMT" > ${API_LOG_DIR}/$JSON_OUTPUT +fi +if [ "${JET:-0}" -eq 1 ]; then + _cont_mounts="${_cont_mounts},${JET_DIR}:/root/.jet,${LOGDIR}:/results" +fi + +# make sure the results directory exists on the host +( umask 0002; mkdir -p "${LOGDIR}" ) + +# Setup container +echo MELLANOX_VISIBLE_DEVICES="${MELLANOX_VISIBLE_DEVICES:-}" +srun --mpi="${SLURM_MPI_TYPE:-pmix}" --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-name="${_cont_name}" true +srun -N1 -n1 --container-name="${_cont_name}" ibv_devinfo --list +srun -N1 -n1 --container-name="${_cont_name}" nvidia-smi topo -m + +#ssh to nodes for power measurements +NODELIST=$(scontrol show hostnames ${SLURM_JOB_NODELIST}) +NODELIST=(${NODELIST[*]}) 
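# Editor's note: illustrative sketch, not part of the original patch. For reference, a
# typical way to submit this script; the placeholder values are assumptions, and the
# script itself only requires DGXSYSTEM, CONT and DATADIR plus whatever the sourced
# config file exports (e.g. DGXNNODES, DGXNGPU, BATCHSIZE):
#
#   source "config_${DGXSYSTEM}.sh"
#   export CONT=<hugectr-training-image> DATADIR=</path/to/raw/criteo/dataset>
#   sbatch -N "${DGXNNODES}" run.sub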
+if [ -f "$POWERCMDDIR/power_monitor.sh" ]; then + ( umask 0002; mkdir -p "${ABSLOGDIR}" ) + for i in "${NODELIST[@]}" + do + ssh $i 'export NODENAME='"'$i'"';export ABSLOGDIR='"'$ABSLOGDIR'"';export SLURM_JOB_NODELIST='"'$SLURM_JOB_NODELIST'"';export SLURM_JOB_ID='"'$SLURM_JOB_ID'"';POWERCMDDIR='"'$POWERCMDDIR'"';bash ${POWERCMDDIR}/power_monitor.sh' & +# break + done +fi + +if [[ "${SET_MAXQ_CLK:-}" == "1" ]] || [[ "${SET_MINEDP_CLK:-}" == "1" ]]; then + if [[ "${SET_MAXQ_CLK:-}" == "1" ]]; then + GPCCLK=${MAXQ_CLK} + fi + if [[ "${SET_MINEDP_CLK:-}" == "1" ]]; then + GPCCLK=${MINEDP_CLK} + fi + for i in "${NODELIST[@]}" + do + ssh $i 'export GPCCLK='"'$GPCCLK'"';sudo nvidia-smi -lgc ${GPCCLK}' + done +fi + +# Run experiments +for _experiment_index in $(seq -w 1 "${NEXP}"); do + ( + echo ":::DLPAL ${CONT} ${SLURM_JOB_ID} ${SLURM_JOB_NUM_NODES} ${SLURM_JOB_NODELIST} ${MLPERF_CLUSTER_NAME} ${DGXSYSTEM}" + + # Print system info + echo ":::SYSJSON $(srun --ntasks=1 --container-name="${_cont_name}" mlperf-sysjson.sh)" + + if [[ $CLEAR_CACHES == 1 ]]; then + srun --mpi="${SLURM_MPI_TYPE:-pmix}" --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" + srun --mpi="${SLURM_MPI_TYPE:-pmix}" --ntasks="${SLURM_JOB_NUM_NODES}" --container-name="${_cont_name}" python3 -c " +import mlperf_logging.mllog as mllog +mllogger = mllog.get_mllogger() +mllogger.event(key=mllog.constants.CACHE_CLEAR, value=True)" + fi + echo "Beginning trial ${_experiment_index} of ${NEXP}" + srun --mpi="${SLURM_MPI_TYPE:-pmix}" --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 \ + --container-name="${_cont_name}" --container-mounts="${_cont_mounts}" \ + ./run_and_time.sh + ) |& tee "${_logfile_base}_raw_${_experiment_index}.log" + + # Sorting the MLPerf compliance logs by timestamps + grep ":::.L..." "${_logfile_base}_raw_${_experiment_index}.log" | sort -k5 -n -s | tee "${_logfile_base}_${_experiment_index}.log" + if [ "${CHECK_COMPLIANCE}" -eq 1 ]; then + srun --ntasks=1 --nodes=1 --container-name="${_cont_name}" \ + --container-mounts="$(realpath ${LOGDIR}):/results" \ + --container-workdir="/results" \ + python3 -m mlperf_logging.compliance_checker --usage training \ + --ruleset "${MLPERF_RULESET}" \ + --log_output "/results/compliance_${DATESTAMP}.out" \ + "/results/${DATESTAMP}_${_experiment_index}.log" \ + || true + fi + + if [ "${JET:-0}" -eq 1 ]; then + JET_CREATE=${JET_CREATE:-}" --data workload.spec.nodes=${DGXNNODES} --data workload.spec.name=${MODEL_NAME}_${MODEL_FRAMEWORK}_${DGXSYSTEM} --data workload.key=${MODEL_NAME}_${MODEL_FRAMEWORK}_${DGXSYSTEM} --mllogger " + srun -N1 -n1 --container-name="${_cont_name}" --container-mounts="${_cont_mounts}" bash -c "${JET_CREATE} /results/${DATESTAMP}_${_experiment_index}.log --asset /results/slurm-${SLURM_JOB_ID}.out --data source_image.name=${CONT} --data slurm.job=${SLURM_JOB_ID} && ${JET_UPLOAD}" + fi + +done diff --git a/samples/dlrm/run_and_time.sh b/samples/dlrm/run_and_time.sh new file mode 100755 index 0000000000..8ce27f9703 --- /dev/null +++ b/samples/dlrm/run_and_time.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# runs benchmark and reports time to convergence + +# default value for DLRM_BIND only if it is not already defined +#: ${DLRM_BIND:="numactl --membind=1,3,5,7"} +: ${DLRM_BIND:=} + +set -ex + +ARGS="" +[ -n "${OPTIMIZER:-}" ] && ARGS+=" --optimizer ${OPTIMIZER}" +[ -n "${BATCHSIZE:-}" ] && ARGS+=" --batchsize ${BATCHSIZE}" +[ -n "${BATCHSIZE_EVAL:-}" ] && ARGS+=" --batchsize_eval ${BATCHSIZE_EVAL}" +[ -n "${LEARNING_RATE:-}" ] && ARGS+=" --lr ${LEARNING_RATE}" +[ -n "${WARMUP_STEPS:-}" ] && ARGS+=" --warmup_steps ${WARMUP_STEPS}" +[ -n "${DECAY_START:-}" ] && ARGS+=" --decay_start ${DECAY_START}" +[ -n "${DECAY_STEPS:-}" ] && ARGS+=" --decay_steps ${DECAY_STEPS}" +[ "$ENABLE_TF32_COMPUTE" = true ] && ARGS+=" --enable_tf32_compute" +[ "$USE_MIXED_PRECISION" = true ] && ARGS+=" --use_mixed_precision" +[ -n "${SCALER:-}" ] && ARGS+=" --scaler ${SCALER}" +[ "$GEN_LOSS_SUMMARY" = true ] && ARGS+=" --gen_loss_summary" +[ "$USE_ALGORITHM_SEARCH" = false ] && ARGS+=" --disable_algorithm_search" +[ -n "${SHARDING_PLAN:-}" ] && ARGS+=" --sharding_plan ${SHARDING_PLAN}" +[ -n "${DP_SHARDING_THRESHOLD:-}" ] && ARGS+=" --dp_sharding_threshold ${DP_SHARDING_THRESHOLD}" +[ -n "${MAX_ITER:-}" ] && ARGS+=" --max_iter ${MAX_ITER}" +[ -n "${DISPLAY_INTERVAL:-}" ] && ARGS+=" --display_interval ${DISPLAY_INTERVAL}" +[ -n "${EVAL_INTERVAL:-}" ] && ARGS+=" --eval_interval ${EVAL_INTERVAL}" +[ -n "${MAX_EVAL_BATCHES:-}" ] && ARGS+=" --max_eval_batches ${MAX_EVAL_BATCHES}" +[ -n "${AUC_THRESHOLD:-}" ] && ARGS+=" --auc_threshold ${AUC_THRESHOLD}" +[ -n "${DGXNGPU:-}" ] && ARGS+=" --num_gpus_per_node ${DGXNGPU}" +[ -n "${MEM_COMM_BW_RATIO:-}" ] && ARGS+=" --mem_comm_bw_ratio ${MEM_COMM_BW_RATIO}" +[ -n "${SEED:-}" ] && ARGS+=" --seed ${SEED}" +[ -n "${MLPERF_POWER_TRAIN_AFTER_RUN_STOP:-}" ] && ARGS+=" --minimum_training_time ${MINIMUM_TRAINING_TIME:-0}" + +readonly node_rank="${SLURM_NODEID:-0}" +readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}" + +if [ "$LOGGER" = "apiLog.sh" ]; +then + LOGGER="${LOGGER} -p MLPerf/${MODEL_NAME} -v ${FRAMEWORK}/train/${DGXSYSTEM}" + if [ "$node_rank" -eq 0 ] && [ "$local_rank" -eq 0 ]; + then + LOGGER=$LOGGER + else + LOGGER="" + fi +fi + +echo "DLRM_BIND is set to \"${DLRM_BIND}\"" +${LOGGER} ${DLRM_BIND} python3 ${RUN_SCRIPT} ${ARGS} | tee /tmp/dlrm_hugectr.log + + ret_code=${PIPESTATUS[0]} + if [[ $ret_code != 0 ]]; then exit $ret_code; fi diff --git a/samples/dlrm/run_with_docker.sh b/samples/dlrm/run_with_docker.sh new file mode 100755 index 0000000000..a80eab6225 --- /dev/null +++ b/samples/dlrm/run_with_docker.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#SBATCH --job-name dlrm.hugectr +#SBATCH -t 00:30:00 + +set -euxo pipefail + +# Vars without defaults +: "${DGXSYSTEM:?DGXSYSTEM not set}" +: "${CONT:?CONT not set}" +: "${DATADIR:?DATADIR not set}" + +# Vars with defaults +: "${NEXP:=1}" +: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}" +: "${CLEAR_CACHES:=1}" +: "${CHECK_COMPLIANCE:=1}" +: "${MLPERF_RULESET:=3.1.0}" +: "${MOUNTS:=${DATADIR}:/data}" +: "${LOGDIR:=./results}" +# default DLRM_BIND to null because we don't know what user's system actually is +: "${DLRM_BIND:=}" + +# Other vars +readonly _config_file="./config_${DGXSYSTEM}.sh" +readonly _logfile_base="${LOGDIR}/${DATESTAMP}" +readonly _cont_name=dlrm_hugectr +_cont_mounts=("--volume=${DATADIR}:/data" "--volume=${DATADIR}:/data_val" "--volume=${LOGDIR}:${LOGDIR}") + + +# Setup directories +mkdir -p "${LOGDIR}" + +# Get list of envvars to pass to docker +mapfile -t _config_env < <(env -i bash -c ". ${_config_file} && compgen -e" | grep -E -v '^(PWD|SHLVL)') +_config_env+=(DATADIR) +_config_env+=(DATASET_TYPE) +_config_env+=(DGXSYSTEM) +mapfile -t _config_env < <(for v in "${_config_env[@]}"; do echo "--env=$v"; done) + +# Cleanup container +cleanup_docker() { + docker container rm -f "${_cont_name}" || true +} +cleanup_docker +trap 'set -eux; cleanup_docker' EXIT + +# Setup container +nvidia-docker run --rm --init --detach \ + --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \ + --name="${_cont_name}" "${_cont_mounts[@]}" \ + "${CONT}" sleep infinity +# Make sure container has time to finish initialization +sleep 30 +docker exec -it "${_cont_name}" true + + +# Run experiments +for _experiment_index in $(seq 1 "${NEXP}"); do + ( + echo "Beginning trial ${_experiment_index} of ${NEXP}" + if [[ $CLEAR_CACHES == 1 ]]; then + bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" + docker exec -it "${_cont_name}" python3 -c " +import mlperf_logging.mllog as mllog +mllogger = mllog.get_mllogger() +mllogger.event(key=mllog.constants.CACHE_CLEAR, value=True)" + fi + + docker exec -it ${_config_env[@]} ${_cont_name} bash ./run_and_time.sh + ) |& tee "${_logfile_base}_${_experiment_index}.log" + + if [ "${CHECK_COMPLIANCE}" -eq 1 ]; then + docker exec -it "${_config_env[@]}" "${_cont_name}" \ + python3 -m mlperf_logging.compliance_checker --usage training \ + --ruleset "${MLPERF_RULESET}" \ + --log_output "/results/compliance_${DATESTAMP}.out" \ + "/results/${DATESTAMP}_${_experiment_index}.log" \ + || true + fi +done diff --git a/samples/dlrm/sharding/__init__.py b/samples/dlrm/sharding/__init__.py new file mode 100644 index 0000000000..2357451f46 --- /dev/null +++ b/samples/dlrm/sharding/__init__.py @@ -0,0 +1,2 @@ +from .generate_plan import generate_plan +from .planner import Cost, CostModel, Planner, ShardingState diff --git a/samples/dlrm/sharding/generate_plan.py b/samples/dlrm/sharding/generate_plan.py new file mode 100644 index 0000000000..dac8f2d3e3 --- /dev/null +++ b/samples/dlrm/sharding/generate_plan.py @@ -0,0 +1,131 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from argparse import Namespace +from itertools import chain +from typing import List + +from .planner import CostModel, Planner + + +def generate_plan( + slot_size_array: List[int], + multi_hot_sizes: List[int], + num_nodes: int, + num_gpus: int, + args: Namespace, + log_result: bool, +): + def sanity_check(shard_matrix, shard_strategy): + # mainly to make sure all the tables are sharded + msg = "Not all tables covered in the sharding plan" + assert set(chain(*shard_matrix)) == set(range(len(slot_size_array))), msg + shard_strategy_list = [x for strategy_pair in shard_strategy for x in strategy_pair[1]] + assert set(shard_strategy_list) == set(range(len(slot_size_array))), msg + + for table_list in shard_matrix: + if len(table_list) == 0: + raise Exception("Currently no empty shard list is allowed") + + def int_to_string(shard_matrix_int, shard_strategy_int): + shard_strategy, shard_matrix = [], [] + for pair in shard_strategy_int: + if len(pair[1]) != 0: + shard_strategy.append((pair[0], [str(x) for x in pair[1]])) + for sub_matrix_ in shard_matrix_int: + shard_matrix.append([str(x) for x in sub_matrix_]) + return shard_matrix, shard_strategy + + if args.sharding_plan in ["round_robin", "uniform"]: + # sharding strategies that don't exploit system configs + if args.sharding_plan == "round_robin": + mp_table = [i for i in range(len(slot_size_array))] + shard_matrix_ = [[] for _ in range(num_gpus)] + shard_strategy_ = [("mp", [i for i in mp_table])] + + for i, table_id in enumerate(mp_table): + target_gpu = i % num_gpus + shard_matrix_[target_gpu].append(table_id) + + elif args.sharding_plan == "uniform": + shard_matrix_ = [[x for x in range(len(slot_size_array))] for _ in range(num_gpus)] + shard_strategy_ = [("mp", [i for i in range(len(slot_size_array))])] + + elif args.sharding_plan in ["auto", "hier_auto"]: + # sharding strategies that exploit system configs + dram_cap = args.memory_cap_for_embedding + if args.optimizer == "adagrad": + byte_per_elem = 8 + elif args.optimizer == "sgd": + byte_per_elem = 4 + + if args.sharding_plan == "auto": + cost_model = CostModel( + 1, + args.mem_comm_bw_ratio / args.mem_comm_work_ratio, + args.ev_size * byte_per_elem * 1e-9, + dram_cap, + slot_size_array, + ) + planner = Planner( + multi_hot_sizes, + num_gpus, + cost_model, + log_result=log_result, + dp_threshold=args.dp_sharding_threshold, + ) + shard_strategy_, shard_matrix_ = planner.plan() + + elif args.sharding_plan == "hier_auto": + if num_nodes <= 1: + raise Exception( + "hier_auto plan is only applicable to configs with more than one node" + ) + cost_model = CostModel( + 1, + args.mem_comm_bw_ratio / args.mem_comm_work_ratio, + args.ev_size * byte_per_elem * 1e-9, + dram_cap * args.num_gpus_per_node, + slot_size_array, + ) + planner = Planner( + multi_hot_sizes, + num_nodes, + cost_model, + log_result=log_result, + dp_threshold=args.dp_sharding_threshold, + ) + shard_strategy_, shard_matrix_node_ = planner.plan() + shard_matrix_ 
= [] + for node_shard_matrix in shard_matrix_node_: + for i in range(args.num_gpus_per_node): + shard_matrix_.append(node_shard_matrix) + else: + raise Exception("unknown sharding plan") + + sanity_check(shard_matrix_, shard_strategy_) + shard_matrix, shard_strategy = int_to_string(shard_matrix_, shard_strategy_) + + if log_result: + logging.info("Provided system info: ") + logging.info("num_gpu_per_nodes: %d", args.num_gpus_per_node) + logging.info("Memory to communication BW ratio: %f", args.mem_comm_bw_ratio) + logging.info("Memory to communication work ratio: %f", args.mem_comm_work_ratio) + logging.info("DRAM capacity: %f GB", args.memory_cap_for_embedding) + logging.info("shard_matrix:") + logging.info(shard_matrix) + logging.info("\n") + + return shard_matrix, shard_strategy diff --git a/samples/dlrm/sharding/planner.py b/samples/dlrm/sharding/planner.py new file mode 100644 index 0000000000..c68a5d7b23 --- /dev/null +++ b/samples/dlrm/sharding/planner.py @@ -0,0 +1,327 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import time +from typing import List, Tuple + +import numpy as np + + +class ShardingState: + """ + Containing the state of a sharding process. + The plan iteratively update the sharding state based on a given heuristic and obtain + solutions. + """ + + def __init__( + self, + array_hotness: np.array, + num_bucket: int, + dp_table_id: np.array(int) = np.array([]), + ) -> None: + mp_table_id = np.setdiff1d(np.arange(array_hotness.size), dp_table_id) + array_hotness_mp = array_hotness[mp_table_id] + sorted_idx = np.argsort(array_hotness_mp)[::-1] + self.array_unshard_hotness = array_hotness + self.array_hotness = array_hotness_mp[sorted_idx] + self.num_bucket = num_bucket + self.array_table_id = mp_table_id[sorted_idx] + self.array_num_split = np.zeros(self.array_unshard_hotness.size, dtype=int) + self.array_num_split[mp_table_id] = 1 + self.shard_ll = [[] for i in range(self.num_bucket)] + + def split_hot_shard(self): + """ + split the shard with the largest hotness + """ + # shards are sorted based on the hotness. 
Find the first hot shard that + # can be split further + for shard_id in range(self.array_table_id.size): + table_id = self.array_table_id[shard_id] + hotness = self.array_unshard_hotness[table_id] + if self.array_num_split[table_id] * 2 <= self.num_bucket: + # if this table can be further split and we can put it into + # more buckets + idx = np.where(self.array_table_id == table_id)[0] + self.array_hotness = np.delete(self.array_hotness, idx) + self.array_table_id = np.delete(self.array_table_id, idx) + self.array_num_split[table_id] *= 2 + self.array_hotness = np.concatenate( + ( + self.array_hotness, + np.ones(self.array_num_split[table_id]) + * (hotness / self.array_num_split[table_id]), + ) + ) + self.array_table_id = np.concatenate( + ( + self.array_table_id, + np.ones(self.array_num_split[table_id], dtype=int) * table_id, + ) + ) + break + + # sort after splitting to maintain the shard hotness in order + sorted_idx = np.argsort(self.array_hotness)[::-1] + self.array_hotness = self.array_hotness[sorted_idx] + self.array_table_id = self.array_table_id[sorted_idx] + + def split_oom_shard(self, table_id: int) -> bool: + hotness = self.array_unshard_hotness[table_id] + if self.array_num_split[table_id] * 2 <= self.num_bucket: + idx = np.where(self.array_table_id == table_id)[0] + self.array_hotness = np.delete(self.array_hotness, idx) + self.array_table_id = np.delete(self.array_table_id, idx) + self.array_num_split[table_id] *= 2 + self.array_hotness = np.concatenate( + ( + self.array_hotness, + np.ones(self.array_num_split[table_id]) + * (hotness / self.array_num_split[table_id]), + ) + ) + self.array_table_id = np.concatenate( + (self.array_table_id, np.ones(self.array_num_split[table_id], dtype=int) * table_id) + ) + sorted_idx = np.argsort(self.array_hotness)[::-1] + self.array_hotness = self.array_hotness[sorted_idx] + self.array_table_id = self.array_table_id[sorted_idx] + return True + else: + return False + + def update_split_num(self): + self.array_num_split = np.zeros_like(self.array_unshard_hotness) + for shard_list in self.shard_ll: + for table_id in shard_list: + self.array_num_split[table_id] += 1 + + def reset_shard_ll(self): + self.shard_ll = [[] for i in range(self.num_bucket)] + + def push_bucket( + self, + bucket_id: int, + table_id: int, + ) -> None: + self.shard_ll[bucket_id].append(table_id) + + def pop_bucket( + self, + bucket_id: int, + ) -> None: + self.shard_ll[bucket_id].pop() + + +class Cost: + def __init__( + self, + cost: np.array(float), + hotness_cost: np.array(float), + table_cost: np.array(float), + mem_cost: np.array(float), + ) -> None: + self.cost = cost + self.hotness_cost = hotness_cost + self.table_cost = table_cost + self.mem_cost = mem_cost + + +class CostModel: + def __init__( + self, + hotness_cost: float, + table_cost: float, + mem_cost: float, + mem_capacity: float, + table_size: List[int], + ) -> None: + self.unit_hotness_cost = hotness_cost + self.unit_table_cost = table_cost + self.unit_mem_cost = mem_cost + self.mem_capacity = mem_capacity + self.array_table_size = np.array(table_size) + + def get_cost( + self, + ss: ShardingState, + ) -> Tuple[Cost, bool]: + list_cost = [] + list_hotness_cost = [] + list_table_cost = [] + list_mem_cost = [] + + for shard_list in ss.shard_ll: + hotness_cost = ( + self.unit_hotness_cost + * ( + ss.array_unshard_hotness[shard_list] / np.array(ss.array_num_split)[shard_list] + ).sum() + ) + table_cost = self.unit_table_cost * len(shard_list) + mem_cost = ( + self.unit_mem_cost + * ( + 
self.array_table_size[shard_list] / np.array(ss.array_num_split)[shard_list] + ).sum() + ) + list_cost.append(hotness_cost + table_cost) + list_hotness_cost.append(hotness_cost) + list_table_cost.append(table_cost) + list_mem_cost.append(mem_cost) + + return ( + Cost( + np.array(list_cost), + np.array(list_hotness_cost), + np.array(list_table_cost), + np.array(list_mem_cost), + ), + max(list_mem_cost) > self.mem_capacity, + ) + + def deduct_mem_cap_for_dp( + self, + dp_table_id: list, + ) -> None: + self.mem_capacity -= self.array_table_size[dp_table_id].sum() * self.unit_mem_cost + if self.mem_capacity < 0: + raise Exception("OOM due to DP. Please considering increase the DP threshold") + + +class Planner: + """ + The planner work out a series of plans iteratively. + In each iteration, the planner tries to split the hottest shard and place the shards into + a bucket based on a give heuristic. When the shard is too large to fit into the best bucket + suggested by the heuristic, it finds the next best bucket until it iterates through all the + buckets. In that case, it tries to split the shard further. If the shard cannot be split + further, the planner aborts and returns the default sharding plan. + """ + + def __init__( + self, + list_hotness: list, + num_bucket: int, + cost_model: CostModel, + dp_threshold: int = 0, + max_search_iter: int = 20, + log_result: bool = False, + ) -> None: + self.array_hotness = np.array(list_hotness) + self.num_bucket = num_bucket + self.cost_model = cost_model + self.list_candidate = [] + self.max_search_iter = max_search_iter + self.log_result = log_result + + # Create the default sharding plan. Throw if even this default sharding plan cannot fit, as + # it should be the most memory-efficient + sharding_state_default = ShardingState(self.array_hotness, self.num_bucket) + for b in range(self.num_bucket): + for t in range(self.array_hotness.size): + sharding_state_default.push_bucket(b, t) + sharding_state_default.update_split_num() + cost, oom = self.cost_model.get_cost(sharding_state_default) + if oom: + raise Exception("OOM even with the most memory-efficient sharding plan") + self.list_candidate.append( + ( + cost.cost.max(), + cost.hotness_cost, + cost.table_cost, + cost.mem_cost, + sharding_state_default.shard_ll, + ) + ) + + # Create DP sharding plan based on the DP threshold + self.dp_table_id = np.where( + cost_model.array_table_size < dp_threshold / cost_model.unit_mem_cost + )[0] + self.mp_table_id = np.setdiff1d(np.arange(self.array_hotness.size), self.dp_table_id) + self.sharding_state = ShardingState(self.array_hotness, self.num_bucket, self.dp_table_id) + self.cost_model.deduct_mem_cap_for_dp(self.dp_table_id) + + logging.basicConfig(level=logging.INFO, format="%(message)s") + + def greedy_plan(self, ss): + """ + This is a heuristic based on greedy policy. The shard is placed to the bucket with the + lowest hotness cost + """ + array_cost = np.zeros(ss.num_bucket) + ss.reset_shard_ll() + for i in range(ss.array_hotness.size): + sorted_idx = np.argsort(array_cost) + sharded = False + for bucket_id in sorted_idx: + if ss.array_table_id[i] not in ss.shard_ll[bucket_id]: + # for now, only uniform sharding is supported. Hence cannot put two shards + # from the same table into the same bucket + ss.push_bucket(bucket_id, ss.array_table_id[i]) + cost, oom = self.cost_model.get_cost(ss) + if not oom: + sharded = True + array_cost = cost.cost + break + else: + # Current bucket cannot fit. 
Iterate to the next best bucket + ss.pop_bucket(bucket_id) + if not sharded: + # This means the shard is too large to fit within any bucket + return ss.array_table_id[i], ss, cost + return None, ss, cost + + def plan(self): + t0 = time.time() + for i in range(self.max_search_iter): + oom_table_id, self.sharding_state, cost = self.greedy_plan(self.sharding_state) + if oom_table_id is None: + self.list_candidate.append( + ( + cost.cost.max(), + cost.hotness_cost, + cost.table_cost, + cost.mem_cost, + self.sharding_state.shard_ll, + ) + ) + self.sharding_state.split_hot_shard() + else: + oom_table_can_split = self.sharding_state.split_oom_shard(oom_table_id) + if not oom_table_can_split: + break + + self.list_candidate.sort(key=lambda x: x[0]) + + shard_strategy = [("mp", self.mp_table_id.tolist())] + shard_strategy.append(("dp", self.dp_table_id.tolist())) + shard_matrix = self.list_candidate[0][-1] + for table_id in self.dp_table_id: + for shard_list in shard_matrix: + shard_list.append(table_id) + if self.log_result: + logging.info("Planner took %f sec" % (time.time() - t0)) + logging.info(shard_strategy) + logging.info(shard_matrix) + logging.info("hotness cost is:") + logging.info(self.list_candidate[0][1]) + logging.info("table cost is:") + logging.info(self.list_candidate[0][2]) + logging.info("mem cost is:") + logging.info(self.list_candidate[0][3]) + return shard_strategy, shard_matrix diff --git a/samples/dlrm/train.py b/samples/dlrm/train.py new file mode 100644 index 0000000000..c7289b8577 --- /dev/null +++ b/samples/dlrm/train.py @@ -0,0 +1,485 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
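# Editor's note: illustrative sketch, not part of the original patch. It drives the
# Planner/CostModel pair from sharding/planner.py (added above) roughly the way the
# "auto" branch of sharding.generate_plan() does; the helper name, the toy table sizes
# and the cost constants are assumptions, and it is meant to be run from samples/dlrm
# just like train.py.
def _toy_sharding_plan():
    from sharding.planner import CostModel, Planner

    multi_hot_sizes = [3, 2, 1, 100]                # per-table lookup hotness
    table_sizes = [40_000_000, 39_060, 63, 12_973]  # rows per embedding table
    cost_model = CostModel(
        hotness_cost=1.0,                  # relative cost per looked-up row
        table_cost=(3.35e12 / 450e9) / 4,  # memory/comm bandwidth ratio over work ratio
        mem_cost=128 * 4 * 1e-9,           # GB per row: ev_size * bytes per element (SGD)
        mem_capacity=60.0,                 # usable GB of embedding memory per bucket (GPU)
        table_size=table_sizes,
    )
    planner = Planner(multi_hot_sizes, num_bucket=2, cost_model=cost_model)
    # plan() returns (shard_strategy, shard_matrix): e.g. [("mp", [0, 1, 2, 3]), ("dp", [])]
    # plus one list of table ids per GPU bucket.
    return planner.plan()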
+ +import argparse +import logging +import math + +import mlperf_logging.mllog.constants as mllog_constants +from mlperf_common.frameworks.hugectr import HCTRCommunicationHandler +from mlperf_common.logging import MLLoggerWrapper +from mpi4py import MPI + +import hugectr +import mlperf_logger +import sharding + +TRAIN_NUM_SAMPLES = 4195197692 +EVAL_NUM_SAMPLES = 89137319 +TABLE_SIZE_ARRAY = [ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 36, +] +MULTI_HOT_SIZES = [ + 3, + 2, + 1, + 2, + 6, + 1, + 1, + 1, + 1, + 7, + 3, + 8, + 1, + 6, + 9, + 5, + 1, + 1, + 1, + 12, + 100, + 27, + 10, + 3, + 1, + 1, +] +NUM_TABLE = len(TABLE_SIZE_ARRAY) +NUM_DENSE = 13 + +mllogger = MLLoggerWrapper(HCTRCommunicationHandler(), value=None) +mllogger.start(key=mllog_constants.INIT_START) + +parser = argparse.ArgumentParser(description="HugeCTR DCN V2 model training script.") +parser.add_argument( + "--optimizer", + help="Optimizer to use", + type=str, + choices=["adagrad", "sgd"], + default="adagrad", +) +parser.add_argument( + "--batchsize", + help="Batch size for training", + type=int, + default=8192, +) +parser.add_argument( + "--batchsize_eval", + help="Batch size for evaluation", + type=int, + default=16384, +) +parser.add_argument( + "--max_eval_batches", + help="The number of evaluation batches to use", + type=int, + default=None, +) +parser.add_argument( + "--lr", + help="Learning rate", + type=float, + default=0.005, +) +parser.add_argument( + "--eps", + help="Epsilon value for Adagrad", + type=float, + default=1e-8, +) +parser.add_argument( + "--init_accu", + help="Initial accumulator value for Adagrad", + type=float, + default=0.0, +) +parser.add_argument( + "--warmup_steps", + help="Warmup steps", + type=int, + default=0, +) +parser.add_argument( + "--decay_start", + help="Decay start", + type=int, + default=0, +) +parser.add_argument( + "--decay_steps", + help="Decay steps", + type=int, + default=0, +) +parser.add_argument( + "--use_mixed_precision", + action="store_true", +) +parser.add_argument( + "--scaler", + help="Loss scaling constant", + type=float, + default=1.0, +) +parser.add_argument( + "--enable_tf32_compute", + action="store_true", +) +parser.add_argument( + "--disable_algorithm_search", + help="Disables GEMM algorithm search for fully connected layers", + dest="use_algorithm_search", + action="store_false", +) +parser.add_argument( + "--gen_loss_summary", + help="Compute loss summary during training (loss = 0 if not set)", + action="store_true", +) +parser.add_argument( + "--max_iter", + help="Number of training iterations to run", + type=int, + default=None, +) +parser.add_argument( + "--display_interval", + help="Display throughput stats every number of iterations", + type=int, + default=100, +) +parser.add_argument( + "--eval_interval", + help="Evaluate every number of iterations given", + type=int, + default=None, +) +parser.add_argument( + "--auc_threshold", + help="AUC threshold to reach to stop training", + type=float, + default=0.80275, +) +parser.add_argument( + "--sharding_plan", + help="Sharding plan to use", + type=str, + choices=["round_robin", "uniform", "auto", "hier_auto"], + default="round_robin", +) + +parser.add_argument( + "--dp_sharding_threshold", + help="threshold for DP sharding in GiB.", + type=float, + default=0, +) + +parser.add_argument( + "--num_gpus_per_node", + help="The number of 
GPUs per node",
+    type=int,
+    default=8,
+)
+parser.add_argument(
+    "--mem_comm_bw_ratio",
+    help="The ratio between the memory bandwidth and the communication bandwidth of the system",
+    type=float,
+    default=3.35e12 / 450e9,
+)
+parser.add_argument(
+    "--mem_comm_work_ratio",
+    help="The ratio between the memory work and the communication work of the network",
+    type=float,
+    default=8 / 2,
+)
+parser.add_argument(
+    "--memory_cap_for_embedding",
+    help="The amount of memory that can be used for storing embeddings, in GB",
+    type=float,
+    default=60,
+)
+parser.add_argument(
+    "--ev_size",
+    help="The width of the embedding vector",
+    type=int,
+    default=128,
+)
+parser.add_argument(
+    "--seed",
+    help="The global seed for training.",
+    type=int,
+    default=0,
+)
+parser.add_argument(
+    "--minimum_training_time",
+    help="If set to a value greater than 0, training continues until minimum_training_time (in minutes) is reached, even if the target AUC has already been hit",
+    type=int,
+    default=0,
+)
+
+args = parser.parse_args()
+comm = MPI.COMM_WORLD
+num_nodes = comm.Get_size()
+rank = comm.Get_rank()
+num_gpus = num_nodes * args.num_gpus_per_node
+is_rank_zero = rank == 0
+# If args.minimum_training_time is specified, set max_iter to a larger value.
+if args.minimum_training_time > 0:
+    args.max_iter = 1000000
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+# Dependent parameters (if not set)
+iter_per_epoch = TRAIN_NUM_SAMPLES / args.batchsize
+if args.max_iter is None:
+    args.max_iter = math.ceil(iter_per_epoch)
+if args.eval_interval is None:
+    args.eval_interval = math.floor(0.05 * iter_per_epoch)
+if args.max_eval_batches is None:
+    args.max_eval_batches = math.ceil(EVAL_NUM_SAMPLES / args.batchsize_eval)
+iter_per_epoch = math.ceil(iter_per_epoch)
+
+# Log submission metadata and relevant hyperparameters
+mllogger.mlperf_submission_log(mllog_constants.DLRM_DCNv2, num_nodes, "NVIDIA")
+mlperf_logger.param_info(mllogger, args)
+
+shard_matrix, shard_strategy = sharding.generate_plan(
+    TABLE_SIZE_ARRAY, MULTI_HOT_SIZES, num_nodes, num_gpus, args, is_rank_zero
+)
+
+# 0. Callback for logging evaluation AUC
+logging_callback = mlperf_logger.LoggingCallback(
+    mllogger,
+    args.auc_threshold,
+    iter_per_epoch,
+    args.batchsize,
+)
+logging_callback.minimum_training_time = args.minimum_training_time
+
+# 1.
Create Solver, DataReaderParams and Optimizer +solver = hugectr.CreateSolver( + model_name=mllog_constants.DLRM_DCNv2, + seed=args.seed, + max_eval_batches=args.max_eval_batches, + batchsize_eval=args.batchsize_eval, + batchsize=args.batchsize, + vvgpu=[[x for x in range(args.num_gpus_per_node)] for _ in range(num_nodes)], + repeat_dataset=True, + lr=args.lr, + warmup_steps=args.warmup_steps, + decay_start=args.decay_start, + decay_steps=args.decay_steps, + decay_power=2.0, + end_lr=0.0, + use_mixed_precision=args.use_mixed_precision, + enable_tf32_compute=args.enable_tf32_compute, + scaler=args.scaler, + use_cuda_graph=True, + gen_loss_summary=args.gen_loss_summary, + train_intra_iteration_overlap=True, + train_inter_iteration_overlap=True, + eval_intra_iteration_overlap=False, + eval_inter_iteration_overlap=True, + all_reduce_algo=hugectr.AllReduceAlgo.NCCL, + grouped_all_reduce=True, + num_iterations_statistics=20, + perf_logging=False, + drop_incomplete_batch=True, + use_embedding_collection=True, + use_algorithm_search=args.use_algorithm_search, + training_callbacks=[logging_callback], +) + +optimizer = None +if args.optimizer == "adagrad": + optimizer = hugectr.CreateOptimizer( + optimizer_type=hugectr.Optimizer_t.AdaGrad, + update_type=hugectr.Update_t.Global, + initial_accu_value=args.init_accu, + epsilon=args.eps, + ) +elif args.optimizer == "sgd": + optimizer = hugectr.CreateOptimizer( + optimizer_type=hugectr.Optimizer_t.SGD, + update_type=hugectr.Update_t.Local, + atomic_update=True, + ) + +reader = hugectr.DataReaderParams( + data_reader_type=hugectr.DataReaderType_t.RawAsync, + source=["/data/train_data.bin"], + eval_source="/data_val/val_data.bin", + check_type=hugectr.Check_t.Non, + num_samples=TRAIN_NUM_SAMPLES, + eval_num_samples=EVAL_NUM_SAMPLES, + cache_eval_data=1, + slot_size_array=TABLE_SIZE_ARRAY, + async_param=hugectr.AsyncParam( + num_threads=1, + num_batches_per_thread=16, + shuffle=False, + multi_hot_reader=True, + is_dense_float=True, + ), +) + +# 2. Initialize the Model instance +model = hugectr.Model(solver, reader, optimizer) +# 3. 
Construct the Model graph +model.add( + hugectr.Input( + label_dim=1, + label_name="label", + dense_dim=NUM_DENSE, + dense_name="dense", + data_reader_sparse_param_array=[ + hugectr.DataReaderSparseParam("data{}".format(i), MULTI_HOT_SIZES[i], True, 1) + for i in range(NUM_TABLE) + ], + ) +) + +# create embedding table +embedding_table_list = [] +for i in range(NUM_TABLE): + embedding_table_list.append( + hugectr.EmbeddingTableConfig( + name=str(i), max_vocabulary_size=TABLE_SIZE_ARRAY[i], ev_size=args.ev_size + ) + ) +# create embedding planner and embedding collection +comm_strategy = ( + hugectr.CommunicationStrategy.Hierarchical + if num_nodes > 1 + else hugectr.CommunicationStrategy.Uniform +) +ebc_config = hugectr.EmbeddingCollectionConfig(use_exclusive_keys=True, comm_strategy=comm_strategy) +ebc_config.embedding_lookup( + table_config=[embedding_table_list[i] for i in range(NUM_TABLE)], + bottom_name=["data{}".format(i) for i in range(NUM_TABLE)], + top_name="sparse_embedding", + combiner=["sum" for _ in range(NUM_TABLE)], +) + +ebc_config.shard(shard_matrix=shard_matrix, shard_strategy=shard_strategy) + +model.add(ebc_config) + +# configure compute knobs for bottom & top MLP layers +compute_config = hugectr.DenseLayerComputeConfig( + async_wgrad=True, + fuse_wb=False, +) + +model.add( + hugectr.DenseLayer( + layer_type=hugectr.Layer_t.MLP, + bottom_names=["dense"], + top_names=["mlp1"], + num_outputs=[512, 256, 128], + act_type=hugectr.Activation_t.Relu, + compute_config=compute_config, + ) +) +model.add( + hugectr.DenseLayer( + layer_type=hugectr.Layer_t.Concat, + bottom_names=["sparse_embedding", "mlp1"], + top_names=["concat1"], + ) +) +model.add( + hugectr.DenseLayer( + layer_type=hugectr.Layer_t.MultiCross, + bottom_names=["concat1"], + top_names=["interaction1"], + projection_dim=512, + num_layers=3, + compute_config=compute_config, + ) +) +model.add( + hugectr.DenseLayer( + layer_type=hugectr.Layer_t.MLP, + bottom_names=["interaction1"], + top_names=["mlp2"], + num_outputs=[1024, 1024, 512, 256, 1], + activations=[ + hugectr.Activation_t.Relu, + hugectr.Activation_t.Relu, + hugectr.Activation_t.Relu, + hugectr.Activation_t.Relu, + hugectr.Activation_t.Non, + ], + compute_config=compute_config, + ) +) +model.add( + hugectr.DenseLayer( + layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, + bottom_names=["mlp2", "label"], + top_names=["loss"], + ) +) +# 4. 
Compile & Fit +model.compile() +model.summary() +num_columns = 1 + NUM_DENSE + sum(MULTI_HOT_SIZES) # +1 for the label +mllogger.event( + key=mllog_constants.TRAIN_SAMPLES, + value=mlperf_logger.get_row_count("/data/train_data.bin", num_columns, 4), + metadata={mllog_constants.EPOCH_NUM: 0.0}, +) +mllogger.event( + key=mllog_constants.EVAL_SAMPLES, + value=mlperf_logger.get_row_count("/data_val/val_data.bin", num_columns, 4), + metadata={mllog_constants.EPOCH_NUM: 0.0}, +) +model.fit( + max_iter=args.max_iter, + display=args.display_interval, + eval_interval=args.eval_interval, + snapshot=2000000, + snapshot_prefix="dlrm", +) diff --git a/sparse_operation_kit/sparse_operation_kit/dynamic_variable.py b/sparse_operation_kit/sparse_operation_kit/dynamic_variable.py index a52af1932a..c274023d80 100644 --- a/sparse_operation_kit/sparse_operation_kit/dynamic_variable.py +++ b/sparse_operation_kit/sparse_operation_kit/dynamic_variable.py @@ -486,7 +486,6 @@ def export(var): ) # sort_indice_tensor = tf.argsort(indices) with tf.device("CPU"): - indices = tf.identity(indices) values = tf.identity(values) return indices, values diff --git a/test/utest/communication/ar_oneshot_test.cu b/test/utest/communication/ar_oneshot_test.cu index 802e5aa165..2e4d2563df 100644 --- a/test/utest/communication/ar_oneshot_test.cu +++ b/test/utest/communication/ar_oneshot_test.cu @@ -17,10 +17,11 @@ #ifndef ENABLE_MPI #include +#include #include #include #include -#include +#include #include #include #include @@ -116,9 +117,10 @@ struct arTest { use_mixed_precision_ = true; } - resource_manager_ = ResourceManagerExt::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); - resource_manager_->set_ar_comm(AllReduceAlgo::ONESHOT, use_mixed_precision_); - ar_comm_ = resource_manager_->get_ar_comm(); + resource_manager_ = ResourceManagerCore::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); + collective_manager_ = std::make_shared(resource_manager_); + collective_manager_->set_ar_comm(AllReduceAlgo::ONESHOT, use_mixed_precision_); + ar_comm_ = collective_manager_->get_ar_comm(); init_buffers(); } @@ -129,6 +131,7 @@ struct arTest { bool use_mixed_precision_; AllReduceInPlaceComm* ar_comm_; std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; std::vector> h_ar_buff_; std::vector> d_ar_buff_; diff --git a/test/utest/communication/ib_comms_a2a_v_integ_test.cu b/test/utest/communication/ib_comms_a2a_v_integ_test.cu index c454adade0..8edb23166c 100644 --- a/test/utest/communication/ib_comms_a2a_v_integ_test.cu +++ b/test/utest/communication/ib_comms_a2a_v_integ_test.cu @@ -19,12 +19,13 @@ #include #include +#include #include #include #include #include #include -#include +#include #include #include #include @@ -119,9 +120,10 @@ struct IbCommsTest { for (int i = 0; i < num_procs_; i++) { vvgpu.push_back(device_list); } - resource_manager_ = ResourceManagerExt::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); - resource_manager_->init_ib_comm(); - ib_comm_ = resource_manager_->get_ib_comm(); + resource_manager_ = ResourceManagerCore::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); + collective_manager_ = std::make_shared(resource_manager_); + collective_manager_->init_ib_comm(); + ib_comm_ = collective_manager_->get_ib_comm(); init_buffers(); gen_uniform_size(max_size_); @@ -488,6 +490,7 @@ struct IbCommsTest { bool inter_graph_captured_ = false; std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; IbComm* ib_comm_; // TODO: Make it shared so we have only one instance of ibcomm HierA2AvCollHandle coll_handle_; diff --git 
a/test/utest/communication/ib_comms_a2a_v_test.cu b/test/utest/communication/ib_comms_a2a_v_test.cu index b1ebd933d8..95a6511dcc 100644 --- a/test/utest/communication/ib_comms_a2a_v_test.cu +++ b/test/utest/communication/ib_comms_a2a_v_test.cu @@ -18,12 +18,13 @@ #include +#include #include #include #include #include #include -#include +#include #include #include #include @@ -91,9 +92,11 @@ struct IbCommsTest { for (int i = 0; i < num_procs_; i++) { vvgpu.push_back(device_list); } - resource_manager_ = ResourceManagerExt::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); - resource_manager_->init_ib_comm(); - ib_comm_ = resource_manager_->get_ib_comm(); + resource_manager_ = ResourceManagerCore::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); + collective_manager_ = std::make_shared(resource_manager_); + + collective_manager_->init_ib_comm(); + ib_comm_ = collective_manager_->get_ib_comm(); comm_stream_.resize(num_gpus_); comm_events_.resize(num_gpus_); @@ -408,6 +411,8 @@ struct IbCommsTest { int num_procs_ = 1; std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; + IbComm* ib_comm_; // TODO: Make it shared so we have only one instance of ibcomm HierA2AvCollHandle coll_handle_; diff --git a/test/utest/communication/ib_comms_ar_test.cu b/test/utest/communication/ib_comms_ar_test.cu index 2e8e89bdd0..357a63949a 100644 --- a/test/utest/communication/ib_comms_ar_test.cu +++ b/test/utest/communication/ib_comms_ar_test.cu @@ -18,12 +18,13 @@ #include +#include #include #include #include #include #include -#include +#include #include #include #include @@ -115,9 +116,10 @@ struct IbCommsTest { for (int i = 0; i < num_procs_; i++) { vvgpu.push_back(device_list); } - resource_manager_ = ResourceManagerExt::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); - resource_manager_->init_ib_comm(); - ib_comm_ = resource_manager_->get_ib_comm(); + resource_manager_ = ResourceManagerCore::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); + collective_manager_ = std::make_shared(resource_manager_); + collective_manager_->init_ib_comm(); + ib_comm_ = collective_manager_->get_ib_comm(); init_buffers(); } @@ -129,6 +131,7 @@ struct IbCommsTest { int num_procs_; std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; IbComm* ib_comm_; // TODO: Make it shared so we have only one instance of ibcomm std::vector> h_ar_buff_; diff --git a/test/utest/data_distributor/data_distributor_tests.cpp b/test/utest/data_distributor/data_distributor_tests.cpp index 588245074e..8793ef58eb 100644 --- a/test/utest/data_distributor/data_distributor_tests.cpp +++ b/test/utest/data_distributor/data_distributor_tests.cpp @@ -19,7 +19,7 @@ #include #include #include -#include +#include using namespace HugeCTR; using namespace embedding; @@ -87,7 +87,7 @@ void test_data_distributor(const std::vector& device_list, auto emb_type = core23::ToScalarType::value; // doesn't matter auto wgrad_type = HugeCTR::core23::ToScalarType::value; // doesn't matter - auto resource_manager = ResourceManagerExt::create({device_list}, 424242); + auto resource_manager = ResourceManagerCore::create({device_list}, 424242); auto core_list = get_core_resource_managers(resource_manager); int num_gpus = device_list.size(); int num_lookup = lookup_params.size(); diff --git a/test/utest/data_reader/CMakeLists.txt b/test/utest/data_reader/CMakeLists.txt index 5bd9ab037f..63b8755c8e 100644 --- a/test/utest/data_reader/CMakeLists.txt +++ b/test/utest/data_reader/CMakeLists.txt @@ -16,10 +16,6 @@ cmake_minimum_required(VERSION 3.20) find_package(CUDAToolkit) 
-file(GLOB async_reader_src - data_reader_async_adapter_test.cpp - data_reader_async_test.cpp -) if (NOT DISABLE_CUDF) file(GLOB data_reader_test_src data_reader_parquet_test.cpp @@ -30,13 +26,10 @@ if (NOT DISABLE_CUDF) target_link_libraries(data_reader_test PUBLIC /usr/local/cuda/lib64/stubs/libcuda.so) endif() -add_executable(async_reader ${async_reader_src}) add_executable(multi_hot_async_data_reader_test multi_hot_async_data_reader_test.cpp) add_executable(batch_locations_test batch_locations_test.cpp) add_executable(v2_async_reader_test data_reader_v2_async_test.cpp) add_executable(benchmark_async_reader data_reader_benchmark.cu) -target_link_libraries(async_reader PUBLIC CUDA::nvml huge_ctr_shared gtest gtest_main) -target_link_libraries(async_reader PUBLIC /usr/local/cuda/lib64/stubs/libcuda.so) target_link_libraries(v2_async_reader_test PUBLIC CUDA::nvml huge_ctr_shared gtest gtest_main) target_link_libraries(v2_async_reader_test PUBLIC /usr/local/cuda/lib64/stubs/libcuda.so) target_link_libraries(multi_hot_async_data_reader_test PUBLIC CUDA::nvml huge_ctr_shared gtest gtest_main /usr/local/cuda/lib64/stubs/libcuda.so) diff --git a/test/utest/data_reader/data_reader_async_adapter_test.cpp b/test/utest/data_reader/data_reader_async_adapter_test.cpp deleted file mode 100644 index 328b4dd55a..0000000000 --- a/test/utest/data_reader/data_reader_async_adapter_test.cpp +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace HugeCTR::hybrid_embedding; - -size_t global_seed = 321654; -size_t io_alignment = 4096; -// threads = 32. 
-const size_t num_batches = 10; -template -void reader_adapter_test(std::vector device_list, size_t batch_size, int num_threads, - int batches_per_thread, int label_dim, int dense_dim, int sparse_dim, - int num_passes, int seed, bool wait_for_gpu_idle = false, - bool shuffle = false) { - using DataReaderType = AsyncReader; - - const std::string fname = "__tmp_test.dat"; - size_t io_block_size = io_alignment * 8; - int bytes_per_batch = sizeof(int) * (label_dim + dense_dim + sparse_dim) * batch_size; - int actual_nr_requests = 2; - for (int io_blk = io_alignment;; io_blk += io_alignment) { - actual_nr_requests = batches_per_thread * num_threads * (bytes_per_batch / io_blk + 2); - if (actual_nr_requests <= 1023) { - io_block_size = io_blk; - break; - } - } - - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: num_threads = " << num_threads << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: num_batches_per_thread = " << batches_per_thread - << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_block_size = " << io_block_size << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_nr_requests = " << actual_nr_requests << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_depth = " << 2 << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_alignment = " << io_alignment << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: shuffle = " << (shuffle ? "ON" : "OFF") << std::endl; - - const bool mixed_precision = true; - const float epsilon = mixed_precision ? 1e0f : 1e-3f; - - HCTR_LIB_THROW(nvmlInit_v2()); - - std::vector> vvgpu; - vvgpu.push_back(device_list); - const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242); - - size_t local_gpu_count = resource_manager->get_local_gpu_count(); - const int sample_dim = label_dim + dense_dim + sparse_dim; - const size_t file_size = num_batches * batch_size * sample_dim; - - std::vector ref_data(file_size); - -#pragma omp parallel - { - std::mt19937 gen(seed + omp_get_thread_num()); - std::uniform_int_distribution dis(10000, 99999); - std::uniform_real_distribution disf(0.1, 1.1); - -#pragma omp for - for (size_t i = 0; i < num_batches * batch_size; i++) { - for (int j = 0; j < label_dim; j++) { - ref_data[i * sample_dim + j] = dis(gen); - } - - for (int j = 0; j < dense_dim; j++) { - ref_data[i * sample_dim + label_dim + j] = dis(gen); - } - - for (int j = 0; j < sparse_dim; j++) { - auto dtype_ref = - reinterpret_cast(ref_data.data() + i * sample_dim + label_dim + dense_dim); - dtype_ref[j] = dis(gen); - } - } - } - - { - std::ofstream fout(fname); - fout.write((char*)ref_data.data(), file_size * sizeof(int)); - } - - std::vector params{ - DataReaderSparseParam("dummy", std::vector(sparse_dim, 1), true, sparse_dim)}; - - DataReaderType data_reader(fname, batch_size, label_dim, dense_dim, params, true, - resource_manager, num_threads, batches_per_thread, io_block_size, 2, - io_alignment, shuffle, wait_for_gpu_idle); - - auto label_tensors = data_reader.get_label_tensor23s(); - auto dense_tensors = data_reader.get_dense_tensor23s(); - auto sparse_tensors = data_reader.get_value_tensor23s(); - - data_reader.start(); - - for (int pass = 0; pass < num_passes; pass++) { - size_t total_read = 0; - for (size_t batch = 0; batch < num_batches; batch++) { - size_t sz = data_reader.read_a_batch_to_device(); - HCTR_LOG_S(INFO, ROOT) << "iter " << batch << " batchsize " << sz << std::endl; - - std::vector device_batch_offsets(local_gpu_count + 1); - size_t total_offset = 0; - for (size_t id = 0; id < local_gpu_count + 1; id++) { - 
device_batch_offsets[id] = total_offset; - if (id < local_gpu_count) { - total_offset += data_reader.get_current_batchsize_per_device(id); - } - } - - //#pragma omp parallel for num_threads(local_gpu_count) - for (size_t id = 0; id < local_gpu_count; id++) { - auto device = resource_manager->get_local_gpu(id); - CudaDeviceContext context(device->get_device_id()); - - std::vector labels(label_tensors[id].num_elements()); - std::vector<__half> denses(dense_tensors[id].num_elements()); - std::vector sparses(sparse_tensors[id].get_value_tensor().num_elements()); - - core23::copy_sync(labels, label_tensors[id]); - core23::copy_sync(denses, dense_tensors[id]); - core23::copy_sync(sparses, sparse_tensors[id].get_value_tensor()); - - auto cur_ref = ref_data.data() + total_read * sample_dim; - - for (size_t sample = device_batch_offsets[id]; sample < device_batch_offsets[id + 1]; - sample++) { - for (int j = 0; j < label_dim; j++) { - ASSERT_EQ((float)cur_ref[sample * sample_dim + j], - labels[(sample - device_batch_offsets[id]) * label_dim + j]); - } - - for (int j = 0; j < dense_dim; j++) { - ASSERT_NEAR(std::log((double)cur_ref[sample * sample_dim + label_dim + j] + 1.0), - (double)denses[(sample - device_batch_offsets[id]) * dense_dim + j], - epsilon); - } - } - - for (size_t sample = 0; sample < sz; sample++) { - for (int j = 0; j < sparse_dim; j++) { - auto dtype_ref = cur_ref + sample * sample_dim + label_dim + dense_dim; - ASSERT_EQ(static_cast(dtype_ref[j]), sparses[sample * sparse_dim + j]); - } - } - } - - total_read += sz; - } - } -} - -class MPIEnvironment : public ::testing::Environment { - protected: - virtual void SetUp() { test::mpi_init(); } - virtual void TearDown() { test::mpi_finalize(); } - virtual ~MPIEnvironment(){}; -}; - -::testing::Environment* const mpi_env = ::testing::AddGlobalTestEnvironment(new MPIEnvironment); - -// device_list batch threads batch_per_thread label dense sparse num_passes seed -// -TEST(reader_adapter_test, dgxa100_longlong) { - reader_adapter_test({0, 1, 2, 3, 4, 5, 6, 7}, 256, 32, 4, 1, 1, 26, 1, - global_seed += 128); -} - -TEST(reader_adapter_test, test1) { - reader_adapter_test({0}, 100, 1, 1, 2, 1, 1, 1, global_seed += 128); -} -TEST(reader_adapter_test, test2) { - reader_adapter_test({0}, 100, 1, 1, 2, 1, 1, 2, global_seed += 128); -} -TEST(reader_adapter_test, test3) { - reader_adapter_test({0}, 100, 1, 1, 2, 3, 1, 3, global_seed += 128); -} -TEST(reader_adapter_test, test4) { - reader_adapter_test({0}, 100, 1, 1, 2, 3, 6, 7, global_seed += 128); -} -TEST(reader_adapter_test, test5) { - reader_adapter_test({0}, 1012, 2, 1, 2, 3, 7, 18, global_seed += 128); -} -TEST(reader_adapter_test, test6) { - reader_adapter_test({0}, 101256, 2, 1, 2, 3, 7, 8, global_seed += 128); -} -TEST(reader_adapter_test, test7) { - reader_adapter_test({0}, 101256, 2, 4, 2, 3, 7, 5, global_seed += 128); -} -TEST(reader_adapter_test, test8) { - reader_adapter_test({0}, 101256, 2, 3, 3, 3, 9, 2, global_seed += 128); -} -TEST(reader_adapter_test, test9) { - reader_adapter_test({0}, 101256, 4, 4, 1, 8, 6, 4, global_seed += 128); -} -TEST(reader_adapter_test, test10) { - reader_adapter_test({0, 1}, 10, 2, 2, 7, 2, 1, 21, global_seed += 128); -} -TEST(reader_adapter_test, test11) { - reader_adapter_test({1, 4}, 6000, 3, 2, 7, 13, 26, 1, global_seed += 128); -} -TEST(reader_adapter_test, dgxa100_48slots) { - reader_adapter_test({0, 1, 2, 3, 4, 5, 6, 7}, 256, 32, 4, 1, 1, 48, 1, - global_seed += 128); -} -TEST(reader_adapter_test, dgxa100_48slots_wait_for_idle) { - 
reader_adapter_test({0, 1, 2, 3, 4, 5, 6, 7}, 256, 32, 4, 1, 1, 48, 1, - global_seed += 128, true); -} -TEST(reader_adapter_test, dgxa100_26slots) { - reader_adapter_test({0, 1, 2, 3, 4, 5, 6, 7}, 256, 32, 4, 1, 1, 800, 1, - global_seed += 128); -} diff --git a/test/utest/data_reader/data_reader_async_test.cpp b/test/utest/data_reader/data_reader_async_test.cpp deleted file mode 100644 index e2a4e3cdde..0000000000 --- a/test/utest/data_reader/data_reader_async_test.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; - -void reader_test(std::vector device_list, size_t file_size, size_t batch_size, int num_threads, - int batches_per_thread, int io_block_size, int io_depth, int wait_time_us) { - const std::string fname = "__tmp_test.dat"; - char* ref_data; - char* read_data; - - HCTR_LIB_THROW(nvmlInit_v2()); - - std::vector> vvgpu; - vvgpu.push_back(device_list); - const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242); - - HCTR_LIB_THROW(cudaMallocManaged(&ref_data, file_size)); - HCTR_LIB_THROW(cudaMallocManaged(&read_data, file_size)); - -#pragma omp parallel - { - std::mt19937 gen(424242 + omp_get_thread_num()); - // std::uniform_int_distribution dis(0, 255); - std::uniform_int_distribution dis('a', 'z'); - -#pragma omp for - for (size_t i = 0; i < file_size; i++) { - ref_data[i] = dis(gen); - } - } - - { - std::ofstream fout(fname); - fout.write(ref_data, file_size); - } - - AsyncReaderImpl reader_impl(fname, batch_size, resource_manager.get(), num_threads, - batches_per_thread, io_block_size, io_depth, 4096); - - reader_impl.load_async(); - - size_t total_sz = 0; - while (true) { - BatchDesc desc = reader_impl.get_batch(); - size_t sz = desc.size_bytes; - - if (sz > 0) { - HCTR_LIB_THROW( - cudaMemcpy(read_data + total_sz, desc.dev_data[0], sz, cudaMemcpyDeviceToDevice)); - total_sz += sz; - usleep(wait_time_us); - reader_impl.finalize_batch(); - } else { - break; - } - if (total_sz >= file_size) { - break; - } - } - - ASSERT_EQ(total_sz, file_size); - for (size_t i = 0; i < std::min(file_size, total_sz); i++) { - // HCTR_LOG_S(DEBUG, WORLD) << "Symbols differ at index " << i << " : expected " - // << ref_data[i] << " got " << read_data[i] << std::endl; - ASSERT_EQ(ref_data[i], read_data[i]) << "Symbols differ at index " << i << " : expected " - << ref_data[i] << " got " << read_data[i]; - } - - cudaFree(ref_data); - cudaFree(read_data); -} - -// device_list file_size batch threads batch_per_thread io_block io_depth wait_time -// -TEST(reader_test, test1) { reader_test({0}, 100, 20, 1, 1, 4096 * 2, 1, 0); } -TEST(reader_test, test2) { reader_test({0}, 100, 20, 2, 1, 4096 * 2, 1, 0); } -TEST(reader_test, test3) { reader_test({0}, 1012, 20, 2, 1, 4096 * 2, 1, 0); } -TEST(reader_test, test4) { reader_test({0}, 1012, 32, 
2, 2, 4096 * 2, 1, 0); } -TEST(reader_test, test5) { reader_test({0}, 10120, 32, 2, 2, 4096 * 2, 2, 0); } -TEST(reader_test, test6) { reader_test({0}, 101256, 1000, 2, 4, 4096 * 2, 2, 0); } -TEST(reader_test, test7) { reader_test({0}, 101256, 1000, 2, 4, 4096 * 2, 2, 100); } -TEST(reader_test, test8) { reader_test({0}, 101256, 1000, 2, 4, 4096 * 2, 2, 1000); } -TEST(reader_test, test9) { reader_test({0, 1}, 100, 20, 2, 1, 4096 * 2, 1, 0); } -TEST(reader_test, test10) { reader_test({0, 1}, 101256, 1000, 2, 4, 4096 * 2, 2, 0); } -TEST(reader_test, test11) { reader_test({0, 1}, 101256, 1000, 2, 4, 4096 * 2, 2, 100); } -TEST(reader_test, test12) { reader_test({0, 1}, 101256, 1000, 2, 4, 4096 * 2, 2, 1000); } -TEST(reader_test, test13) { reader_test({0, 1}, 1014252, 14352, 6, 4, 4096 * 2, 2, 0); } -TEST(reader_test, test14) { reader_test({0, 1, 2, 3}, 100980, 1980, 4, 4, 4096 * 2, 2, 1000); } -TEST(reader_test, test15) { reader_test({0, 1, 2, 3, 4}, 101256, 7616, 8, 4, 4096 * 2, 2, 0); } -TEST(reader_test, test16) { - reader_test({0, 1, 2, 3, 4, 5, 6, 7}, 8012516, 38720, 8, 4, 4096 * 2, 2, 0); -} -TEST(reader_test, test17) { - reader_test({0, 1, 2, 3, 4, 5, 6, 7}, 8012516, 38720, 16, 4, 4096 * 2, 2, 0); -} -TEST(reader_test, test18) { - reader_test({0, 1, 2, 3, 4, 5, 6, 7}, 18012516, 38720, 8, 4, 4096 * 2, 2, 2000); -} diff --git a/test/utest/data_reader/data_reader_benchmark.cu b/test/utest/data_reader/data_reader_benchmark.cu index 8d12668dbc..8c76838fdd 100644 --- a/test/utest/data_reader/data_reader_benchmark.cu +++ b/test/utest/data_reader/data_reader_benchmark.cu @@ -19,14 +19,13 @@ #include #include -#include #include #include #include #include #include #include -#include +#include #include #include #include @@ -69,7 +68,7 @@ void reader_test(std::vector device_list, size_t batch_size_bytes, int num_ std::vector> vvgpu; vvgpu.push_back(device_list); - const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242); + const auto resource_manager = ResourceManagerCore::create(vvgpu, 424242); MultiHot::FileSource source; source.name = fname; diff --git a/test/utest/data_reader/data_reader_parquet_test.cpp b/test/utest/data_reader/data_reader_parquet_test.cpp index f23d4314b0..2a8253058e 100644 --- a/test/utest/data_reader/data_reader_parquet_test.cpp +++ b/test/utest/data_reader/data_reader_parquet_test.cpp @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" @@ -334,7 +334,7 @@ void data_reader_group_iter_strided_batch_test_impl(int num_files, long long sam ASSERT_TRUE(num_files % device_list.size() == 0); int files_per_worker = num_files / device_list.size(); - const auto& resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto& resource_manager = ResourceManagerCore::create(vvgpu, 0); const DataReaderSparseParam param = {"distributed", std::vector(slot_num, max_nnz), false, slot_num}; std::vector params; @@ -506,7 +506,7 @@ void data_reader_group_iter_squential_batch_test_impl(int num_files, long long s vvgpu.push_back(device_list); } - const auto& resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto& resource_manager = ResourceManagerCore::create(vvgpu, 0); const DataReaderSparseParam param = {"distributed", std::vector(slot_num, max_nnz), false, slot_num}; std::vector params; @@ -645,7 +645,7 @@ void data_reader_group_epoch_strided_batch_test_impl(int num_files, long long sa vvgpu.push_back(device_list); } - const auto& resource_manager = 
ResourceManagerExt::create(vvgpu, 0); + const auto& resource_manager = ResourceManagerCore::create(vvgpu, 0); const DataReaderSparseParam param = {"distributed", std::vector(slot_num, max_nnz), false, slot_num}; std::vector params; @@ -862,7 +862,7 @@ void data_reader_group_epoch_squential_batch_test_impl(int num_files, long long vvgpu.push_back(device_list); } - const auto& resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto& resource_manager = ResourceManagerCore::create(vvgpu, 0); const DataReaderSparseParam param = {"distributed", std::vector(slot_num, max_nnz), false, slot_num}; std::vector params; @@ -1020,7 +1020,7 @@ void data_reader_worker_test_impl(const int num_files, const long long sample_pe for (int i = 0; i < numprocs; i++) { vvgpu.push_back(device_list); } - auto gpu_resource_group = ResourceManagerExt::create(vvgpu, 0); + auto gpu_resource_group = ResourceManagerCore::create(vvgpu, 0); // const int num_devices = 1; const DataReaderSparseParam param = {"localized", std::vector(slot_num, max_nnz), true, slot_num}; diff --git a/test/utest/data_reader/data_reader_v2_async_test.cpp b/test/utest/data_reader/data_reader_v2_async_test.cpp index e9b532c0f8..1fd0902087 100644 --- a/test/utest/data_reader/data_reader_v2_async_test.cpp +++ b/test/utest/data_reader/data_reader_v2_async_test.cpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -41,7 +41,7 @@ void reader_test(std::vector device_list, size_t file_size, size_t batch_si std::vector> vvgpu; vvgpu.push_back(device_list); - const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242); + const auto resource_manager = ResourceManagerCore::create(vvgpu, 424242); HCTR_LIB_THROW(cudaMallocHost(&ref_data, file_size)); HCTR_LIB_THROW(cudaMallocHost(&read_data, file_size)); diff --git a/test/utest/data_reader/multi_hot_async_data_reader_test.cpp b/test/utest/data_reader/multi_hot_async_data_reader_test.cpp index c0eaab6e32..d9d8fb30d1 100644 --- a/test/utest/data_reader/multi_hot_async_data_reader_test.cpp +++ b/test/utest/data_reader/multi_hot_async_data_reader_test.cpp @@ -20,13 +20,12 @@ #include #include #include -#include #include #include #include #include #include -#include +#include #include #include #include @@ -34,7 +33,6 @@ using namespace HugeCTR; using namespace HugeCTR::MultiHot; -using namespace HugeCTR::hybrid_embedding; size_t global_seed = 321654; size_t num_batches = 13; @@ -69,7 +67,7 @@ void async_data_reader_test(std::vector device_list, size_t batch_size, std::vector> vvgpu; vvgpu.push_back(device_list); - const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242); + const auto resource_manager = ResourceManagerCore::create(vvgpu, 424242); size_t local_gpu_count = resource_manager->get_local_gpu_count(); const int sample_dim = label_dim + dense_dim + (total_sparse_dim * (sizeof(dtype) / sizeof(int))); diff --git a/test/utest/embedding/distributed_slot_sparse_embedding_hash_test.cu b/test/utest/embedding/distributed_slot_sparse_embedding_hash_test.cu index 10e077a914..e9f1e58b43 100644 --- a/test/utest/embedding/distributed_slot_sparse_embedding_hash_test.cu +++ b/test/utest/embedding/distributed_slot_sparse_embedding_hash_test.cu @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -166,7 +166,7 @@ void train_and_test(const std::vector &device_list, const Optimizer_t &opti for (int i = 0; i < numprocs; i++) { vvgpu.push_back(device_list); } - const auto &resource_manager = 
ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); if (pid == 0) { // re-generate the dataset files @@ -472,7 +472,7 @@ void load_and_dump(const std::vector &device_list, const Optimizer_t &optim std::vector> vvgpu; vvgpu.push_back(device_list); - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); // re-generate the dataset files { @@ -652,7 +652,7 @@ void load_and_dump_file(const std::vector &device_list, const Optimizer_t & for (int i = 0; i < numprocs; i++) { vvgpu.push_back(device_list); } - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); if (pid == 0) { // re-generate the dataset files diff --git a/test/utest/embedding/hybrid_embedding/data_test.cpp b/test/utest/embedding/hybrid_embedding/data_test.cpp deleted file mode 100644 index 3fed01d324..0000000000 --- a/test/utest/embedding/hybrid_embedding/data_test.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace hybrid_embedding; - -namespace { -template -void data_test() { - size_t batch_size = 4; - size_t num_iterations = 2; - std::vector table_sizes{100, 10, 10, 20}; - std::vector data_in{99, 3, 7, 19, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, - 3, 3, 3, 3, 50, 2, 4, 10, 2, 2, 2, 2, 1, 1, 1, 1}; - std::vector data_to_unique_categories_ref{ - 99, 103, 117, 139, 0, 100, 110, 120, 1, 101, 111, 121, 2, 102, 112, 122, - 3, 103, 113, 123, 50, 102, 114, 130, 2, 102, 112, 122, 1, 101, 111, 121}; - - Tensor2 d_data_in; - // HCTR_LOG_S(DEBUG, WORLD) << "debug2" << std::endl; - std::shared_ptr> buff = GeneralBuffer2::create(); - buff->reserve({batch_size * num_iterations * table_sizes.size()}, &d_data_in); - buff->allocate(); - upload_tensor(data_in, d_data_in, 0); - // HCTR_LOG_S(DEBUG, WORLD) << "debug3" << std::endl; - Data data(table_sizes, batch_size, num_iterations); - // HCTR_LOG_S(DEBUG, WORLD) << "debug" << std::endl; - data.data_to_unique_categories(d_data_in, 0); - // HCTR_LOG_S(DEBUG, WORLD) << "debug1" << std::endl; - std::vector data_to_unique_categories_ret; - download_tensor(data_to_unique_categories_ret, data.samples, 0); - EXPECT_THAT(data_to_unique_categories_ret, - ::testing::ElementsAreArray(data_to_unique_categories_ref)); -}; - -} // namespace - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -void test_raw_data(dtype *d_raw_data, size_t num_samples, size_t num_tables, size_t num_iterations, - const std::vector &table_sizes) { - size_t num_elements = num_samples * num_tables * num_iterations; - std::cout << " test_raw_data:\tnum_samples " << num_samples << " num_tables " << num_tables - << std::endl; - std::vector h_raw_data(num_elements, (dtype)0); - cudaStream_t 
stream = 0; - HCTR_LIB_THROW(cudaMemcpyAsync(h_raw_data.data(), d_raw_data, num_elements * sizeof(dtype), - cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - for (size_t iteration = 0; iteration < num_iterations; ++iteration) { - for (size_t sample = 0; sample < num_samples; ++sample) { - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t category = (size_t) - h_raw_data[iteration * num_samples * num_tables + sample * num_tables + embedding]; - if (category >= table_sizes[embedding]) { - std::cout << " sample " << sample << " embedding " << embedding << " category " - << category << " table sizes " << table_sizes[embedding] << std::endl; - } - EXPECT_TRUE(category < table_sizes[embedding]); - } - } - } -} - -template -void test_samples(dtype *d_raw_data, Data &data) { - const size_t num_iterations = data.num_iterations; - const size_t num_samples = data.batch_size; - const size_t num_tables = data.table_sizes.size(); - - size_t num_elements = num_iterations * num_samples * num_tables; - - const size_t num_categories = EmbeddingTableFunctors::get_num_categories(data.table_sizes); - std::vector embedding_offsets; - EmbeddingTableFunctors::get_embedding_offsets(embedding_offsets, data.table_sizes); - - cudaStream_t stream = 0; - std::vector h_raw_data(num_elements, (dtype)0); - HCTR_LIB_THROW(cudaMemcpyAsync(h_raw_data.data(), d_raw_data, num_elements * sizeof(dtype), - cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - std::vector h_samples; - download_tensor(h_samples, data.samples, stream); - - for (size_t iteration = 0; iteration < num_iterations; ++iteration) { - for (size_t sample = 0; sample < num_samples; ++sample) { - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t indx = iteration * num_samples * num_tables + sample * num_tables + embedding; - size_t unique_category = (size_t)h_samples[indx]; - size_t category_samples = (size_t)unique_category - embedding_offsets[embedding]; - size_t category_data = (size_t)h_raw_data[indx]; - - EXPECT_TRUE(category_samples == category_data); - EXPECT_TRUE(unique_category < num_categories); - } - } - } -} - -template void test_raw_data(uint32_t *d_raw_data, size_t num_samples, size_t num_tables, - size_t num_iterations, - const std::vector &table_sizes); -template void test_raw_data(long long *d_raw_data, size_t num_samples, size_t num_tables, - size_t num_iterations, - const std::vector &table_sizes); -template void test_samples(uint32_t *d_raw_data, Data &data); -template void test_samples(long long *d_raw_data, Data &data); - -/** - * Tests we pad the incomplete batch (e.g last batch in eval) with NULL category - */ -template -void test_padding() { - const size_t batch_size = 8; - const size_t current_batch_size = 5; - - std::vector table_sizes{100, 10, 10, 20}; - std::vector data_in{99, 3, 7, 19, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; - std::vector data_to_unique_categories_ref{ - 99, 103, 117, 139, 0, 100, 110, 120, 1, 101, 111, 121, 2, 102, 112, 122, 3, 103, 113, 123}; - - Tensor2 d_data_in; - std::shared_ptr> buff = GeneralBuffer2::create(); - buff->reserve({current_batch_size * table_sizes.size()}, &d_data_in); - buff->allocate(); - upload_tensor(data_in, d_data_in, 0); - - d_data_in.reset_shape({current_batch_size, table_sizes.size()}); - - Data data(table_sizes, batch_size, 1); - data.data_to_unique_categories(d_data_in, 0); - std::vector data_to_unique_categories_ret; - 
download_tensor(data_to_unique_categories_ret, data.samples, 0); - - const auto NULL_category = EmbeddingTableFunctors::get_num_categories(table_sizes); - - // Ensure valid samples calculated correctly - size_t i; - for (i = 0; i < current_batch_size * table_sizes.size(); ++i) { - EXPECT_TRUE(data_to_unique_categories_ret[i] == data_to_unique_categories_ref[i]); - } - - // Ensure padded correctly - for (; i < batch_size * table_sizes.size(); ++i) { - EXPECT_TRUE(data_to_unique_categories_ret[i] == NULL_category); - } -} -} // namespace hybrid_embedding - -} // namespace HugeCTR - -TEST(data_test, uint32) { data_test(); }; -TEST(data_test, long_long) { data_test(); }; -TEST(data_test, incomplete_batch_uint32) { test_padding(); } -TEST(data_test, incomplete_batch_long_long) { test_padding(); } diff --git a/test/utest/embedding/hybrid_embedding/data_test.hpp b/test/utest/embedding/hybrid_embedding/data_test.hpp deleted file mode 100644 index e4cb78d38a..0000000000 --- a/test/utest/embedding/hybrid_embedding/data_test.hpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -std::shared_ptr> create_data_from_distribution( - const std::vector> &distribution, const size_t batch_size, - const size_t num_iterations) { - std::vector table_sizes(distribution.size()); - size_t num_categories = (size_t)0; - for (size_t i = 0; i < distribution.size(); ++i) { - table_sizes[i] = distribution[i].size(); - num_categories += table_sizes[i]; - } - - std::vector acc_prob(num_categories); - double sum_p = 0.; - size_t category = (size_t)0; - for (size_t embedding = 0; embedding < table_sizes.size(); ++embedding) { - for (size_t em_category = 0; em_category < table_sizes[embedding]; ++em_category) { - sum_p += distribution[embedding][em_category]; - acc_prob[category++] = sum_p; - } - } - - return std::make_shared>(table_sizes, batch_size, num_iterations); -} - -template -void test_raw_data(dtype *raw_data, size_t num_samples, size_t num_tables, size_t num_iterations, - const std::vector &table_sizes); - -template -void test_samples(dtype *raw_data, Data &data); - -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file diff --git a/test/utest/embedding/hybrid_embedding/end_to_end_test.cpp b/test/utest/embedding/hybrid_embedding/end_to_end_test.cpp deleted file mode 100644 index 7978c126f8..0000000000 --- a/test/utest/embedding/hybrid_embedding/end_to_end_test.cpp +++ /dev/null @@ -1,766 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// all your base are belong to us -#define private public -#define protected public -#include - -using namespace HugeCTR; - -constexpr bool debug_print = false; -int global_seed = 0; - -template -void end_to_end_impl(std::vector device_list, HybridEmbeddingInputGenerator *generator, - size_t batch_size, size_t embedding_vec_size, double bw_ratio_a2a_over_ar, - size_t seed, size_t num_evals) { - constexpr double epsilon = sizeof(emtype) < 4 ? 1e-2 : 1e-3; - - const int rank{core23::MpiInitService::get().world_rank()}; - const int num_procs{core23::MpiInitService::get().world_size()}; - - HCTR_LIB_THROW(nvmlInit_v2()); - - std::vector> vvgpu; - - // if there are multi-node, we assume each node has the same gpu device_list - for (int i = 0; i < num_procs; i++) { - vvgpu.push_back(device_list); - } - const auto resource_manager = ResourceManagerExt::create(vvgpu, seed); - - size_t total_gpu_count = resource_manager->get_global_gpu_count(); - size_t local_gpu_count = resource_manager->get_local_gpu_count(); - size_t local_batch_size = batch_size / total_gpu_count; - assert(batch_size % total_gpu_count == 0); - - auto table_sizes = generator->get_table_sizes(); - size_t num_tables = table_sizes.size(); - size_t total_categories = std::accumulate(table_sizes.begin(), table_sizes.end(), 0); - HCTR_LOG(INFO, WORLD, "total categories: %lu\n", total_categories); - - size_t num_init_batches = 50; - - SparseTensors inputs; - SparseTensors inits; - for (size_t i = 0; i < local_gpu_count; i++) { - CudaDeviceContext context(resource_manager->get_local_gpu(i)->get_device_id()); - auto buf = GeneralBuffer2::create(); - Tensor2 value_tensor; - buf->reserve({batch_size, num_tables}, &value_tensor); - auto dummy_row_offset_tensor = Tensor2(); - std::shared_ptr dummy_nnz(new size_t); - inputs.emplace_back(SparseTensor(value_tensor, dummy_row_offset_tensor, dummy_nnz)); - - buf->reserve({num_init_batches * batch_size, num_tables}, &value_tensor); - inits.emplace_back(SparseTensor(value_tensor, dummy_row_offset_tensor, dummy_nnz)); - buf->allocate(); - } - - const float lr = 0.42f; - - GpuLearningRateSchedulers lr_scheds; - for (size_t i = 0; i < local_gpu_count; i++) { - lr_scheds.emplace_back(new GpuLearningRateScheduler(2 * lr, 2, 0, 1, 2.f, 0.f, - resource_manager->get_local_gpu(i))); - lr_scheds.back()->update(); - } - - HybridSparseEmbeddingParams params = { - batch_size, - batch_size, - num_init_batches, - 2 * num_tables * batch_size, - -1, - 0.01, // p_max_dup ? - embedding_vec_size, - num_tables, - generator->get_table_sizes(), - num_procs == 1 ? 
hybrid_embedding::CommunicationType::NVLink_SingleNode - : hybrid_embedding::CommunicationType::IB_NVLink, - 1.0, - bw_ratio_a2a_over_ar, - 1.0, - HybridEmbeddingType::Distributed, - OptParams{Optimizer_t::SGD, lr, {}, Update_t::Global, 1.0f}}; - - std::vector>> placeholder( - resource_manager->get_local_gpu_count(), NULL); - auto embedding = std::make_unique>( - inputs, inputs, params, placeholder, lr_scheds, false, resource_manager); - - // Table offsets - std::vector table_offsets(num_tables); - size_t total = 0; - for (size_t table = 0; table < num_tables; table++) { - table_offsets[table] = total; - total += generator->get_table_sizes()[table]; - } - - auto initial_input = generator->generate_categorical_input(num_init_batches * batch_size); - - if (debug_print) { - std::map unique_cat; - HCTR_LOG(INFO, ROOT, "Generated INIT unique categories: "); - for (size_t i = 0; i < num_init_batches * batch_size; i++) { - for (size_t j = 0; j < num_tables; j++) { - unique_cat[initial_input[i * num_tables + j] + table_offsets[j]] = 1; - } - } - for (auto c : unique_cat) { - HCTR_PRINT(INFO, " %d", static_cast(c.first)); - } - HCTR_PRINT(INFO, "\n"); - } - - for (size_t lgpu = 0; lgpu < local_gpu_count; ++lgpu) { - CudaDeviceContext context(resource_manager->get_local_gpu(lgpu)->get_device_id()); - auto stream = resource_manager->get_local_gpu(lgpu)->get_stream(); - upload_tensor(initial_input, inits[lgpu].get_value_tensor(), stream); - } - size_t tmp_size = 0; - embedding->init_model(inits, tmp_size); - - size_t num_frequent = embedding->model_[0].num_frequent; - if (rank == 0) { - HCTR_LOG(INFO, WORLD, "Number of frequent categories: %ld\n", num_frequent); - } - std::vector num_infrequent(local_gpu_count); - for (size_t i = 0; i < local_gpu_count; i++) { - num_infrequent[i] = embedding->model_[i].h_infrequent_model_table_offsets[num_tables]; - // if (debug_print) { - HCTR_LOG(INFO, WORLD, "local_gpu = %ld, Number of infrequent categories: %ld\n", i, - num_infrequent[i]); - //} - } - - std::vector full_emb_table(total_categories * embedding_vec_size); - { - std::mt19937 gen(seed + 2); - std::uniform_real_distribution distr(-1, 1); - for (auto &e : full_emb_table) { - e = distr(gen); - } - } - - // Set frequent embeddings - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - - std::vector h_frequent_categories; - download_tensor(h_frequent_categories, embedding->model_[device].frequent_categories, 0); - - for (size_t i = 0; i < num_frequent; ++i) { - dtype cat = h_frequent_categories[i]; - HCTR_LIB_THROW(cudaMemcpy(embedding->frequent_embeddings_single_node_[device] - .frequent_data_.frequent_embedding_vectors_.get_ptr() + - i * embedding_vec_size, - full_emb_table.data() + cat * embedding_vec_size, - sizeof(float) * embedding_vec_size, cudaMemcpyHostToDevice)); - } - - if (debug_print && device == 0) { - HCTR_LOG(INFO, ROOT, "Frequent categories: "); - for (size_t i = 0; i < num_frequent; i++) { - HCTR_PRINT(INFO, " %d", h_frequent_categories[i]); - } - HCTR_PRINT(INFO, "\n"); - } - } - - // Set infrequent embeddings - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - int global_id = resource_manager->get_local_gpu(device)->get_global_id(); - HCTR_LIB_THROW(cudaDeviceSynchronize()); - - size_t num_infrequent = embedding->model_[device].h_infrequent_model_table_offsets[num_tables]; - - float 
*h_infrequent_embedding_vectors; - dtype *h_category_location; - HCTR_LIB_THROW(cudaMallocHost((void **)&h_infrequent_embedding_vectors, - (num_infrequent + 1) * embedding_vec_size * sizeof(float))); - HCTR_LIB_THROW( - cudaMallocHost((void **)&h_category_location, total_categories * 2 * sizeof(dtype))); - - HCTR_LIB_THROW(cudaMemcpy(h_category_location, - embedding->model_[device].category_location.get_ptr(), - total_categories * 2 * sizeof(dtype), cudaMemcpyDeviceToHost)); - - if (debug_print) { - HCTR_LOG(INFO, ROOT, "Category location array:\n"); - for (size_t i = 0; i < total_categories; i++) { - HCTR_PRINT(INFO, " (%d, %d)\n", h_category_location[2 * i], - h_category_location[2 * i + 1]); - } - } - - for (size_t i = 0; i < total_categories; ++i) { - if (static_cast(h_category_location[2 * i]) == global_id && - static_cast(h_category_location[2 * i + 1]) < total_categories) { - auto loc = h_category_location[2 * i + 1]; - memcpy(h_infrequent_embedding_vectors + loc * embedding_vec_size, - full_emb_table.data() + i * embedding_vec_size, sizeof(float) * embedding_vec_size); - /* - if(device == 0) - { - HCTR_LOG(INFO, WORLD, "i = %ld, loc = %d, embed[0] = %f\n", i, loc, - *(h_infrequent_embedding_vectors+loc*embedding_vec_size)); - } - */ - } - } - - if (embedding->embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - HCTR_LIB_THROW(cudaMemcpy(embedding->infrequent_embeddings_single_node_[device] - .infrequent_embedding_vectors_.get_ptr(), - h_infrequent_embedding_vectors, - num_infrequent * embedding_vec_size * sizeof(float), - cudaMemcpyHostToDevice)); - } - - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - HCTR_LIB_THROW(cudaMemcpy(embedding->infrequent_embeddings_ib_nvlink_[device] - .infrequent_embedding_vectors_.get_ptr(), - h_infrequent_embedding_vectors, - num_infrequent * embedding_vec_size * sizeof(float), - cudaMemcpyHostToDevice)); - } - - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - HCTR_LIB_THROW(cudaMemcpy(embedding->infrequent_embeddings_ib_nvlink_hier_[device] - .infrequent_embedding_vectors_.get_ptr(), - h_infrequent_embedding_vectors, - num_infrequent * embedding_vec_size * sizeof(float), - cudaMemcpyHostToDevice)); - } - // HCTR_LOG(INFO, WORLD, "gpu = %ld, num_infrequent = %ld, infrequent_embedding_vectors_ = - // 0x%lx\n", device, num_infrequent, - // (size_t)(embedding->infrequent_embeddings_[device].infrequent_embedding_vectors_.get_ptr())); - HCTR_LIB_THROW(cudaFreeHost(h_infrequent_embedding_vectors)); - HCTR_LIB_THROW(cudaFreeHost(h_category_location)); - } - - if (debug_print) { - HCTR_LOG(INFO, ROOT, "Generated full embedding table\n"); - for (size_t i = 0; i < full_emb_table.size(); i++) { - HCTR_PRINT(INFO, "%8.5f ", full_emb_table[i]); - if (i % embedding_vec_size == embedding_vec_size - 1) { - HCTR_PRINT(INFO, "\n"); - } - } - HCTR_PRINT(INFO, "\n"); - } - - auto outputs = embedding->get_train_output_tensors(); - //====================================================================================== - // Do the forward step - //====================================================================================== - auto input = generator->generate_categorical_input(batch_size); - for (size_t lgpu = 0; lgpu < local_gpu_count; ++lgpu) { - CudaDeviceContext context(resource_manager->get_local_gpu(lgpu)->get_device_id()); - auto stream = resource_manager->get_local_gpu(lgpu)->get_stream(); - upload_tensor(input, 
inputs[lgpu].get_value_tensor(), stream); - } - - if (debug_print) { - HCTR_LOG(INFO, ROOT, "Generated input:\n"); - HCTR_PRINT(INFO, " Table sizes: "); - for (auto sz : generator->get_table_sizes()) { - HCTR_PRINT(INFO, "%ld ", sz); - } - HCTR_PRINT(INFO, "\n"); - HCTR_PRINT(INFO, " Input:\n"); - for (size_t i = 0; i < batch_size; i++) { - HCTR_PRINT(INFO, " [ "); - for (size_t j = 0; j < num_tables; j++) { - HCTR_PRINT(INFO, "%7d ", input[i * num_tables + j]); - } - HCTR_PRINT(INFO, " ]\n"); - } - } - - embedding->forward(true); - - if (debug_print) { - const int device = 0; - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - int global_id = resource_manager->get_local_gpu(device)->get_global_id(); - HCTR_LIB_THROW(cudaDeviceSynchronize()); - - { - std::vector tmp; - if (embedding->embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - download_tensor( - tmp, embedding->infrequent_embeddings_single_node_[device].indices_->model_indices_, 0); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - download_tensor( - tmp, embedding->infrequent_embeddings_ib_nvlink_[device].indices_->model_indices_, 0); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - download_tensor( - tmp, embedding->infrequent_embeddings_ib_nvlink_hier_[device].indices_->model_indices_, - 0); - } - - // download_tensor(tmp, embedding->infrequent_embeddings_[device].indices_->model_indices_, - // 0); - - HCTR_LOG(INFO, ROOT, "Instance %d model indices: ", global_id); - for (size_t j = 0; j < tmp.size(); j++) { - HCTR_PRINT(INFO, " %d", static_cast(tmp[j])); - } - HCTR_PRINT(INFO, "\n"); - - HCTR_LOG(INFO, ROOT, "Instance %d model indices OFFSETS: ", global_id); - for (int j = 0; j < num_procs + 1; j++) { - if (embedding->embedding_params_.communication_type == - CommunicationType::NVLink_SingleNode) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_single_node_[device] - .indices_->model_indices_offsets_.get_ptr()[j]); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_ib_nvlink_[device] - .indices_->model_indices_offsets_.get_ptr()[j]); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_ib_nvlink_hier_[device] - .indices_->model_indices_offsets_.get_ptr()[j]); - } - } - HCTR_PRINT(INFO, "\n"); - - int num_batch_frequent; - HCTR_LIB_THROW(cudaMemcpy(&num_batch_frequent, - embedding->frequent_embeddings_single_node_[device] - .indices_->d_num_frequent_sample_indices_.get_ptr(), - sizeof(uint32_t), cudaMemcpyDeviceToHost)); - HCTR_LOG(INFO, ROOT, "Instance %d found %d frequent categories in positions: ", global_id, - num_batch_frequent); - download_tensor( - tmp, - embedding->frequent_embeddings_single_node_[device].indices_->frequent_sample_indices_, - 0); - for (int j = 0; j < num_batch_frequent; j++) { - HCTR_PRINT(INFO, " %d", static_cast(tmp[j])); - } - HCTR_PRINT(INFO, "\n"); - } - - { - std::vector tmp; - if (embedding->embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - download_tensor( - tmp, embedding->infrequent_embeddings_single_node_[device].indices_->network_indices_, - 0); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - download_tensor( - tmp, 
embedding->infrequent_embeddings_ib_nvlink_[device].indices_->network_indices_, 0); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - download_tensor( - tmp, - embedding->infrequent_embeddings_ib_nvlink_hier_[device].indices_->network_indices_, 0); - } - - HCTR_LOG(INFO, ROOT, "Instance %d network indices: ", global_id); - for (size_t j = 0; j < tmp.size(); j++) { - HCTR_PRINT(INFO, " %d", static_cast(tmp[j])); - } - HCTR_PRINT(INFO, "\n"); - - HCTR_LOG(INFO, ROOT, "Instance %d network indices OFFSETS: ", global_id); - for (int j = 0; j < num_procs + 1; j++) { - // HCTR_PRINT(INFO, " %d", - //(int)embedding->infrequent_embeddings_[device] - //.indices_->network_indices_offsets_.get_ptr()[j]); - - if (embedding->embedding_params_.communication_type == - CommunicationType::NVLink_SingleNode) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_single_node_[device] - .indices_->network_indices_offsets_.get_ptr()[j]); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_ib_nvlink_[device] - .indices_->network_indices_offsets_.get_ptr()[j]); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_ib_nvlink_hier_[device] - .indices_->network_indices_offsets_.get_ptr()[j]); - } - } - HCTR_PRINT(INFO, "\n"); - } - } - - // Check - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - int global_id = resource_manager->get_local_gpu(device)->get_global_id(); - HCTR_LIB_THROW(cudaDeviceSynchronize()); - - std::vector h_output; - std::vector expected(embedding_vec_size); - ASSERT_EQ(local_batch_size, embedding->get_batch_size_per_gpu(true)); - - download_tensor(h_output, Tensor2::stretch_from(outputs[device]), 0); - ASSERT_EQ(h_output.size() % embedding_vec_size, 0); - ASSERT_EQ(h_output.size(), local_batch_size * num_tables * embedding_vec_size); - - for (size_t i = 0; i < h_output.size() / embedding_vec_size; i++) { - size_t table = i % num_tables; - size_t cat_id = table_offsets[table] + input[i + global_id * local_batch_size * num_tables]; - auto expected_ptr = full_emb_table.data() + cat_id * embedding_vec_size; - auto actual_ptr = h_output.data() + i * embedding_vec_size; - - if (debug_print) { - HCTR_LOG(INFO, ROOT, " Instance %d sample %ld slot %ld comparing category %ld: ", global_id, - i, table, cat_id); - for (size_t j = 0; j < embedding_vec_size; j++) { - HCTR_PRINT(INFO, " (%8.5f : %8.5f) ", static_cast(actual_ptr[j]), - static_cast(expected_ptr[j])); - } - HCTR_PRINT(INFO, "\n"); - } - - for (size_t j = 0; j < embedding_vec_size; j++) { - expected[j] = (emtype)expected_ptr[j]; - } - - ASSERT_EQ(memcmp(expected.data(), actual_ptr, embedding_vec_size * sizeof(emtype)), 0) - << "Data mismatch on instance " << global_id << " in sample " << i / num_tables - << " feature " << table << std::endl; - } - } - - //====================================================================================== - // Do the backward step and update - //====================================================================================== - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - - std::vector h_output(local_batch_size * num_tables * 
embedding_vec_size); - - // Per-GPU generator - std::mt19937 gen(seed + 3 + resource_manager->get_local_gpu(device)->get_global_id()); - std::uniform_real_distribution distr(-1, 1); - for (auto &grad : h_output) { - grad = (emtype)distr(gen); - } - upload_tensor(h_output, Tensor2::stretch_from(outputs[device]), 0); - } - - // We can't allreduce __half type with MPI, so need to recreate all the output tensors locally. - std::vector gradients(total_categories * embedding_vec_size, 0); - for (size_t device = 0; device < total_gpu_count; device++) { - std::mt19937 gen(seed + 3 + device); - std::uniform_real_distribution distr(-1, 1); - - for (size_t i = 0; i < local_batch_size * num_tables; i++) { - size_t table = i % num_tables; - size_t cat_id = table_offsets[table] + input[i + device * local_batch_size * num_tables]; - auto grad_ptr = gradients.data() + cat_id * embedding_vec_size; - - for (size_t j = 0; j < embedding_vec_size; j++) { - grad_ptr[j] += distr(gen); - } - } - } - - if (debug_print) { - HCTR_LOG(INFO, ROOT, "Generated embedding gradients"); - for (size_t i = 0; i < gradients.size(); i++) { - if (i % embedding_vec_size == 0) { - HCTR_PRINT(INFO, "\nRank %d cat %ld :: ", rank, i / embedding_vec_size); - } - HCTR_PRINT(INFO, "%8.5f ", static_cast(gradients[i])); - } - HCTR_PRINT(INFO, "\n"); - } - - embedding->backward(); - embedding->update_params(); - - // Check - // Check frequent embeddings - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - int global_id = resource_manager->get_local_gpu(device)->get_global_id(); - HCTR_LIB_THROW(cudaDeviceSynchronize()); - - std::vector h_frequent_categories; - download_tensor(h_frequent_categories, embedding->model_[device].frequent_categories, 0); - - float *h_frequent_embedding_vectors; - HCTR_LIB_THROW( - cudaMallocHost((void **)&h_frequent_embedding_vectors, embedding_vec_size * sizeof(float))); - - // Only checking the categories that the instance owns - size_t chunk = num_frequent / resource_manager->get_global_gpu_count(); - ASSERT_EQ(num_frequent % resource_manager->get_global_gpu_count(), 0); - - size_t start = device * chunk; - size_t end = (device + 1) * chunk; - for (size_t i = start; i < end; ++i) { - dtype cat_id = h_frequent_categories[i]; - HCTR_LIB_THROW(cudaMemcpy(h_frequent_embedding_vectors, - embedding->frequent_embeddings_single_node_[device] - .frequent_data_.frequent_embedding_vectors_.get_ptr() + - i * embedding_vec_size, - sizeof(float) * embedding_vec_size, cudaMemcpyDeviceToHost)); - for (size_t j = 0; j < embedding_vec_size; j++) { - ASSERT_NEAR(static_cast(h_frequent_embedding_vectors[j]), - static_cast(full_emb_table.data()[cat_id * embedding_vec_size + j]) - - static_cast(gradients.data()[cat_id * embedding_vec_size + j]) * lr, - epsilon) - << "Gradient (frequent) mismatch on instance " << global_id << " in category " << cat_id - << " dimension " << j << "/" << embedding_vec_size << std::endl; - } - } - HCTR_LIB_THROW(cudaFreeHost(h_frequent_embedding_vectors)); - } - - // Check infrequent embeddings - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - int global_id = resource_manager->get_local_gpu(device)->get_global_id(); - - size_t num_infrequent = embedding->model_[device].h_infrequent_model_table_offsets[num_tables]; - - float *h_infrequent_embedding_vectors; - dtype *h_category_location; - 
HCTR_LIB_THROW(cudaMallocHost((void **)&h_infrequent_embedding_vectors, - num_infrequent * embedding_vec_size * sizeof(float))); - HCTR_LIB_THROW( - cudaMallocHost((void **)&h_category_location, total_categories * 2 * sizeof(dtype))); - - HCTR_LIB_THROW(cudaMemcpy(h_category_location, - embedding->model_[device].category_location.get_ptr(), - total_categories * 2 * sizeof(dtype), cudaMemcpyDeviceToHost)); - - // if (embedding_params_.) - // cudaMemcpy(h_infrequent_embedding_vectors, - // embedding->infrequent_embeddings_[device].infrequent_embedding_vectors_.get_ptr(), - // num_infrequent * embedding_vec_size * sizeof(float), cudaMemcpyDeviceToHost); - - if (embedding->embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - cudaMemcpy(h_infrequent_embedding_vectors, - embedding->infrequent_embeddings_single_node_[device] - .infrequent_embedding_vectors_.get_ptr(), - num_infrequent * embedding_vec_size * sizeof(float), cudaMemcpyDeviceToHost); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - cudaMemcpy(h_infrequent_embedding_vectors, - embedding->infrequent_embeddings_ib_nvlink_[device] - .infrequent_embedding_vectors_.get_ptr(), - num_infrequent * embedding_vec_size * sizeof(float), cudaMemcpyDeviceToHost); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - cudaMemcpy(h_infrequent_embedding_vectors, - embedding->infrequent_embeddings_ib_nvlink_hier_[device] - .infrequent_embedding_vectors_.get_ptr(), - num_infrequent * embedding_vec_size * sizeof(float), cudaMemcpyDeviceToHost); - } - - for (size_t cat_id = 0; cat_id < total_categories; ++cat_id) { - if (static_cast(h_category_location[2 * cat_id]) == global_id) { - auto local_cat_id = h_category_location[2 * cat_id + 1]; - - for (size_t j = 0; j < embedding_vec_size; j++) { - ASSERT_NEAR( - static_cast( - h_infrequent_embedding_vectors[local_cat_id * embedding_vec_size + j]), - static_cast(full_emb_table.data()[cat_id * embedding_vec_size + j]) - - static_cast(gradients.data()[cat_id * embedding_vec_size + j]) * lr, - epsilon) - << "Gradient (infrequent) mismatch on instance " << global_id << " in category " - << cat_id << " dimension " << j << "/" << embedding_vec_size << std::endl; - } - } - } - - HCTR_LIB_THROW(cudaFreeHost(h_infrequent_embedding_vectors)); - HCTR_LIB_THROW(cudaFreeHost(h_category_location)); - } -} - -template -void end_to_end(std::vector device_list, size_t num_tables, size_t total_categories, - size_t batch_size, size_t embedding_vec_size, double bw_ratio_a2a_over_ar, - size_t seed = 42, size_t num_evals = 1) { - const int num_procs{core23::MpiInitService::get().world_size()}; - size_t num_total_gpus = num_procs * device_list.size(); - - HybridEmbeddingConfig test_config = { - static_cast(num_procs), - num_total_gpus, - num_tables, - embedding_vec_size, - static_cast(total_categories), - {}, // irrelevant here - 1.0f, // irrelevant here - num_procs == 1 ? 
hybrid_embedding::CommunicationType::NVLink_SingleNode - : hybrid_embedding::CommunicationType::IB_NVLink, - }; - - auto generator = std::make_unique>(test_config, seed + 1); - end_to_end_impl(device_list, generator.get(), batch_size, embedding_vec_size, - bw_ratio_a2a_over_ar, seed, num_evals); -} - -template -void end_to_end(std::vector device_list, std::vector table_sizes, size_t batch_size, - size_t embedding_vec_size, double bw_ratio_a2a_over_ar, size_t seed = 42, - size_t num_evals = 1) { - const int num_procs{core23::MpiInitService::get().world_size()}; - size_t num_total_gpus = num_procs * device_list.size(); - - HybridEmbeddingConfig test_config = { - static_cast(num_procs), - num_total_gpus, - 0, // irrelevant here - embedding_vec_size, - {}, // irrelevant here - {}, // irrelevant here - 1.0f, // irrelevant here - num_procs == 1 ? hybrid_embedding::CommunicationType::NVLink_SingleNode - : hybrid_embedding::CommunicationType::IB_NVLink, - }; - - auto generator = - std::make_unique>(test_config, table_sizes, seed + 1); - end_to_end_impl(device_list, generator.get(), batch_size, embedding_vec_size, - bw_ratio_a2a_over_ar, seed, num_evals); -} - -class MPIEnvironment : public ::testing::Environment { - protected: - virtual void SetUp() { test::mpi_init(); } - virtual void TearDown() { test::mpi_finalize(); } - virtual ~MPIEnvironment(){}; -}; - -::testing::Environment *const mpi_env = ::testing::AddGlobalTestEnvironment(new MPIEnvironment); -// -TEST(hybrid_e2e, test1) { end_to_end({0}, 2, 16, 20, 2, 1.0e10, global_seed); } -TEST(hybrid_e2e, test2) { end_to_end({0}, 2, 16, 20, 2, 1.0e-10, global_seed++); } -TEST(hybrid_e2e, test3) { - end_to_end({0, 1}, 2, 128, 20, 2, 1.0e10, global_seed++); -} -TEST(hybrid_e2e, test4) { - end_to_end({0, 1}, 2, 128, 20, 2, 1.0e-10, global_seed++); -} -TEST(hybrid_e2e, test5) { end_to_end({0, 1}, 2, 128, 20, 2, 1.0, global_seed++); } -TEST(hybrid_e2e, test6) { end_to_end({0, 1}, 7, 128, 20, 2, 1.0, global_seed++); } -TEST(hybrid_e2e, test7) { - end_to_end({0, 1, 2}, 3, 192, 96, 5, 1.0, global_seed++); -} -TEST(hybrid_e2e, test8) { - end_to_end({0, 1, 2, 3}, 6, 651, 96, 128, 1.5, global_seed++); -} -TEST(hybrid_e2e, test9) { - end_to_end({0, 1, 2, 3}, 18, 6531, 256, 64, 1.7, global_seed++); -} -TEST(hybrid_e2e, test10) { - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, 18, 6531, 256, 64, 1.7, global_seed++); -} -TEST(hybrid_e2e, test11) { - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, 26, 16531, 512, 48, 1.33, global_seed++); -} -TEST(hybrid_e2e, test12) { - end_to_end({0, 1, 6, 7}, 13, 21345, 256, 32, 0.6, global_seed++); -} -TEST(hybrid_e2e, test13) { - std::vector slot_size_array{ - 39884406, 39043, 17289, 7420, 20263, 3, 7120, 1543, 63, - 38532951, 2953546, 403346, 10, 2208, 11938, 155, 4, 976, - 14, 39979771, 25641295, 39664984, 585935, 12972, 108, 36}; - // for (auto& s : slot_size_array) { - // s = s/16 + 1; - // } - - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, slot_size_array, 1024, 128, 1.9 / 1.3, - global_seed++); -} - -TEST(hybrid_e2e, test21) { end_to_end({0}, 2, 16, 20, 2, 1.0e10, global_seed++); } -TEST(hybrid_e2e, test22) { - end_to_end({0}, 2, 16, 20, 2, 1.0e-10, global_seed++); -} -TEST(hybrid_e2e, test23) { - end_to_end({0, 1}, 2, 128, 20, 2, 1.0e10, global_seed++); -} -TEST(hybrid_e2e, test24) { - end_to_end({0, 1}, 2, 128, 20, 2, 1.0e-10, global_seed++); -} -TEST(hybrid_e2e, test25) { - end_to_end({0, 1}, 2, 128, 20, 2, 1.0, global_seed++); -} -TEST(hybrid_e2e, test26) { - end_to_end({0, 1}, 7, 128, 20, 2, 1.0, global_seed++); -} -TEST(hybrid_e2e, test27) 
{ - end_to_end({0, 1, 2}, 3, 192, 96, 5, 1.0, global_seed++); -} -TEST(hybrid_e2e, test28) { - end_to_end({0, 1, 2, 3}, 6, 651, 96, 128, 1.5, global_seed++); -} -TEST(hybrid_e2e, test29) { - end_to_end({0, 1, 2, 3}, 18, 6531, 256, 64, 1.7, global_seed++); -} -TEST(hybrid_e2e, test30) { - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, 18, 6531, 256, 64, 1.7, global_seed++); -} -TEST(hybrid_e2e, test31) { - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, 26, 16531, 512, 48, 1.33, global_seed++); -} -TEST(hybrid_e2e, test32) { - end_to_end({0, 1, 6, 7}, 13, 21345, 256, 32, 0.6, global_seed++); -} -TEST(hybrid_e2e, test33) { - std::vector slot_size_array{ - 39884406, 39043, 17289, 7420, 20263, 3, 7120, 1543, 63, - 38532951, 2953546, 403346, 10, 2208, 11938, 155, 4, 976, - 14, 39979771, 25641295, 39664984, 585935, 12972, 108, 36}; - // for (auto& s : slot_size_array) { - // s = s/16 + 1; - // } - - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, slot_size_array, 1024, 128, 1.9 / 1.3, - global_seed++); -} diff --git a/test/utest/embedding/hybrid_embedding/forward_test.cpp b/test/utest/embedding/hybrid_embedding/forward_test.cpp deleted file mode 100644 index 87e3c665ee..0000000000 --- a/test/utest/embedding/hybrid_embedding/forward_test.cpp +++ /dev/null @@ -1,475 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -/****************** Frequent and infrequent forward network ******************/ - -template -class ForwardNetworkTest : public HybridEmbeddingUnitTest { - protected: - bool single_node; - - public: - ForwardNetworkTest(const HybridEmbeddingConfig config, size_t batch_size, bool single_node, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed), - single_node(single_node) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.generate_embedding_vectors(); - cpu_embedding.forward_network(); - if (!single_node) { - cpu_embedding.calculate_infrequent_model_indices(); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - cpu_embedding.forward_a2a_messages_hier(); - } else { - cpu_embedding.forward_a2a_messages(); - } - } - - /* Tensors for the interaction layer input and messages */ - std::shared_ptr> buff = GeneralBuffer2::create(); - std::vector> interaction_layer_input(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({local_batch_size * this->config.num_tables, this->config.embedding_vec_size}, - &interaction_layer_input[i]); - } - std::vector> received_messages(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({this->num_instances * local_batch_size * this->config.num_tables, - this->config.embedding_vec_size}, - &received_messages[i]); - } - buff->allocate(); - - /* In single-node case, make an array of the interaction mayer input pointers */ - std::vector interaction_layer_input_pointers_; - if (single_node) { - for (size_t i = 0; i < this->num_instances; i++) { - interaction_layer_input_pointers_.push_back(interaction_layer_input[i].get_ptr()); - } - } - - /* Frequent and infrequent forward_network */ - this->build_infrequent(); - this->build_frequent(); - for (size_t i = 0; i < this->num_instances; i++) { - upload_tensor(cpu_embedding.frequent_embedding_vectors[i], - this->get_frequent_embedding_data(i).frequent_embedding_vectors_, this->stream); - - if (single_node) { - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_single_node[i].infrequent_embedding_vectors_, - this->stream); - } - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink_hier[i].infrequent_embedding_vectors_, - this->stream); - } - if (this->config.comm_type == CommunicationType::IB_NVLink) { - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink[i].infrequent_embedding_vectors_, - this->stream); - } - } - - for (size_t i = 0; i < this->num_instances; i++) { - // this->frequent_embeddings[i].set_current_indices(&this->frequent_embedding_indices[i], - // this->stream); - this->get_frequent_embedding(i).set_current_indices(&this->frequent_embedding_indices[i]); - if (single_node) { - this->infrequent_embeddings_single_node[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - } - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - } - if 
(this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - } - - if (single_node) { - this->frequent_embeddings_single_node[i].indices_->calculate_cache_masks(this->stream); - this->frequent_embeddings_single_node[i].indices_->calculate_model_cache_indices( - 80, this->stream); - this->frequent_embeddings_single_node[i].forward_model(this->stream); - } - } - for (size_t i = 0; i < this->num_instances; i++) { - this->get_frequent_embedding(i).indices_->calculate_frequent_sample_indices(this->stream); - if (single_node) { - this->frequent_embeddings_single_node[i].forward_network( - interaction_layer_input[i].get_ptr(), this->stream); - } else { - this->frequent_embeddings_multi_node[i].forward_network( - interaction_layer_input[i].get_ptr(), this->stream); - } - if (single_node) { - this->infrequent_embeddings_single_node[i].indices_->calculate_model_indices(this->stream); - HCTR_LIB_THROW(cudaMemcpyAsync(this->infrequent_embeddings_single_node[i] - .interaction_layer_input_pointers_train_.get_ptr(), - interaction_layer_input_pointers_.data(), - this->num_instances * sizeof(emtype *), - cudaMemcpyHostToDevice, this->stream)); - this->infrequent_embeddings_single_node[i].forward_network_direct(true, this->stream); - } else { - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_network_indices( - 80, this->stream); - } else { - this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_network_indices( - 80, this->stream); - } - // this->infrequent_embeddings[i].indices_->calculate_network_indices(80, this->stream); - upload_tensor(cpu_embedding.forward_received_messages[i], received_messages[i], - this->stream); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].hier_forward_network( - received_messages[i].get_ptr(), interaction_layer_input[i].get_ptr(), this->stream); - } else { // ib_nvlink - this->infrequent_embeddings_ib_nvlink[i].forward_network( - received_messages[i].get_ptr(), interaction_layer_input[i].get_ptr(), this->stream); - } - } - } - - std::vector> h_interaction_layer_input(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - download_tensor(h_interaction_layer_input[i], interaction_layer_input[i], this->stream); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - ASSERT_TRUE(compare_array( - local_batch_size * this->config.num_tables * this->config.embedding_vec_size, - h_interaction_layer_input[i].data(), cpu_embedding.interaction_layer_input[i].data(), - 1e-2)); - } - } -}; - -/************** Frequent embedding forward model (single node) **************/ - -template -class FrequentForwardModelTest : public HybridEmbeddingUnitTest { - public: - FrequentForwardModelTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_frequent_network_cache_indices(); - cpu_embedding.generate_embedding_vectors(); - cpu_embedding.generate_gradients(); - cpu_embedding.frequent_reduce_gradients(); - - /* Tensors for the gradients */ - std::shared_ptr> buff = 
GeneralBuffer2::create(); - std::vector> gradients(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({local_batch_size * this->config.num_tables, this->config.embedding_vec_size}, - &gradients[i]); - } - buff->allocate(); - - /* Frequent update_model */ - this->build_frequent(); - std::vector frequent_partial_gradients_pointers(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - upload_tensor( - cpu_embedding.frequent_embedding_vectors[i], - this->frequent_embeddings_single_node[i].frequent_data_.frequent_embedding_vectors_, - this->stream); - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - frequent_partial_gradients_pointers[i] = - this->frequent_embeddings_single_node[i].frequent_data_.get_gradients().get_ptr(); - this->frequent_embeddings_single_node[i].set_current_indices( - &this->frequent_embedding_indices[i]); - } - for (size_t i = 0; i < this->num_instances; i++) { - this->frequent_embeddings_single_node[i].indices_->calculate_cache_masks(this->stream); - this->frequent_embeddings_single_node[i].indices_->calculate_network_cache_indices( - this->stream); - this->frequent_embeddings_single_node[i].indices_->calculate_model_cache_indices( - 80, this->stream); - this->frequent_embeddings_single_node[i].indices_->calculate_frequent_sample_indices( - this->stream); - this->frequent_embeddings_single_node[i].local_reduce(gradients[i].get_ptr(), this->stream); - } - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW(cudaMemcpyAsync( - this->frequent_embeddings_single_node[i].partial_gradients_pointers_.get_ptr(), - frequent_partial_gradients_pointers.data(), this->num_instances * sizeof(emtype *), - cudaMemcpyHostToDevice, this->stream)); - this->frequent_embeddings_single_node[i].update_model_direct(this->dev_lr, 1.f, this->stream); - } - - /* Set cache to zero for easy comparison with CPU version */ - if (sizeof(emtype) != sizeof(float)) { - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW(cudaMemsetAsync( - this->frequent_embeddings_single_node[i].get_embedding_vectors_cache().get_ptr(), 0, - this->config.num_frequent * this->config.embedding_vec_size * sizeof(emtype), - this->stream)); - } - } - - /* Frequent forward_model */ - for (size_t i = 0; i < this->num_instances; i++) { - this->frequent_embeddings_single_node[i].forward_model(this->stream); - } - - std::vector> updated_vectors_cache(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - download_tensor(updated_vectors_cache[i], - this->frequent_embeddings_single_node[i].get_embedding_vectors_cache(), - this->stream); - } - - /* Reference update_model */ - cpu_embedding.frequent_update_single_node(); - - /* Reference forward_model */ - cpu_embedding.frequent_forward_model(); - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - ASSERT_TRUE(compare_array(this->config.num_frequent * this->config.embedding_vec_size, - updated_vectors_cache[i].data(), - cpu_embedding.frequent_embedding_vectors_cache[i].data(), 5e-2)); - } - } -}; - -/**************************** Test instantiations ****************************/ - -static const HybridEmbeddingConfig config_uint32 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_int64 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_uint32_single_node = { - 1, 8, 10, 128, 1000, 128, 0.5f, 
CommunicationType::NVLink_SingleNode}; -static const HybridEmbeddingConfig config_int64_single_node = { - 1, 8, 10, 128, 1000, 128, 0.5f, CommunicationType::NVLink_SingleNode}; - -// Edge cases: no frequent, all frequent -static const HybridEmbeddingConfig config_no_freq = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_all_freq = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_no_freq_single_node = { - 1, 8, 10, 128, 1000, 0, 0.5f, CommunicationType::NVLink_SingleNode}; -static const HybridEmbeddingConfig config_all_freq_single_node = { - 1, 8, 10, 128, 1000, 1000, 0.5f, CommunicationType::NVLink_SingleNode}; - -// Hierarchical A2A -static const HybridEmbeddingConfig config_uint32_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_int64_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_no_freq_hier = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_all_freq_hier = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink_Hier}; - -/* hybrid_embedding_forward_network_test */ - -TEST(hybrid_embedding_forward_network_test, uint32_half_64) { - ForwardNetworkTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, int64_half_64) { - ForwardNetworkTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, uint32_half_2048) { - ForwardNetworkTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, int64_half_2048) { - ForwardNetworkTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, uint32_float_64) { - ForwardNetworkTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, int64_float_64) { - ForwardNetworkTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, uint32_float_2048) { - ForwardNetworkTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, int64_float_2048) { - ForwardNetworkTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, uint32_float_128_no_freq) { - ForwardNetworkTest(config_no_freq, 128, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, uint32_float_128_all_freq) { - ForwardNetworkTest(config_all_freq, 128, false).run(); -} - -/* hybrid_embedding_forward_network_single_node_test */ - -TEST(hybrid_embedding_forward_network_single_node_test, uint32_half_64) { - ForwardNetworkTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, int64_half_64) { - ForwardNetworkTest(config_int64_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, uint32_half_2048) { - ForwardNetworkTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, int64_half_2048) { - ForwardNetworkTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, uint32_float_64) { - ForwardNetworkTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, int64_float_64) { - ForwardNetworkTest(config_int64_single_node, 64, true).run(); -} - 
-TEST(hybrid_embedding_forward_network_single_node_test, uint32_float_2048) { - ForwardNetworkTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, int64_float_2048) { - ForwardNetworkTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, uint32_float_128_no_freq) { - ForwardNetworkTest(config_no_freq_single_node, 128, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, uint32_float_128_all_freq) { - ForwardNetworkTest(config_all_freq_single_node, 128, true).run(); -} - -/* hybrid_embedding_forward_network_hier_test */ - -TEST(hybrid_embedding_forward_network_hier_test, uint32_half_64) { - ForwardNetworkTest(config_uint32_hier, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, int64_half_64) { - ForwardNetworkTest(config_int64_hier, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, uint32_half_2048) { - ForwardNetworkTest(config_uint32_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, int64_half_2048) { - ForwardNetworkTest(config_int64_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, uint32_float_64) { - ForwardNetworkTest(config_uint32_hier, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, int64_float_64) { - ForwardNetworkTest(config_int64_hier, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, uint32_float_2048) { - ForwardNetworkTest(config_uint32_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, int64_float_2048) { - ForwardNetworkTest(config_int64_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, uint32_float_128_no_freq) { - ForwardNetworkTest(config_no_freq_hier, 128, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, uint32_float_128_all_freq) { - ForwardNetworkTest(config_all_freq_hier, 128, false).run(); -} - -/* hybrid_embedding_frequent_forward_model_test */ - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_half_64) { - FrequentForwardModelTest(config_uint32_single_node, 64).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, int64_half_64) { - FrequentForwardModelTest(config_int64_single_node, 64).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_half_2048) { - FrequentForwardModelTest(config_uint32_single_node, 2048).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, int64_half_2048) { - FrequentForwardModelTest(config_int64_single_node, 2048).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_float_64) { - FrequentForwardModelTest(config_uint32_single_node, 64).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, int64_float_64) { - FrequentForwardModelTest(config_int64_single_node, 64).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_float_2048) { - FrequentForwardModelTest(config_uint32_single_node, 2048).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, int64_float_2048) { - FrequentForwardModelTest(config_int64_single_node, 2048).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_float_128_no_freq) { - FrequentForwardModelTest(config_no_freq_single_node, 128).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_float_128_all_freq) { - FrequentForwardModelTest(config_all_freq_single_node, 128).run(); -} diff --git 
a/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.cpp b/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.cpp deleted file mode 100644 index 16e45de486..0000000000 --- a/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.cpp +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace HugeCTR::hybrid_embedding; - -namespace utils { -template -struct TypeConvertFunc; - -template <> -struct TypeConvertFunc<__half, float> { - static inline __half convert(float val) { return __float2half(val); } -}; - -template <> -struct TypeConvertFunc { - static inline float convert(__half val) { return __half2float(val); } -}; - -template <> -struct TypeConvertFunc { - static inline float convert(float val) { return val; } -}; - -template -static bool lesser_by_first(const std::pair& a, const std::pair& b) { - return (a.first < b.first); -} - -} // namespace utils - -template -void HybridEmbeddingCpu::calculate_infrequent_model_indices() { - model_indices.resize(num_instances); - model_indices_offsets.resize(num_instances); - - for (uint32_t model_id = 0; model_id < num_instances; model_id++) { - model_indices[model_id].resize(batch_size * num_tables); - model_indices_offsets[model_id].resize(num_instances + 1); - - // Prefix sum - uint32_t sum = 0; - for (uint32_t j = 0; j < batch_size; j++) { - if (j % local_batch_size == 0) { - model_indices_offsets[model_id][j / local_batch_size] = sum; - } - for (uint32_t i = 0; i < num_tables; i++) { - uint32_t idx = j * num_tables + i; - - dtype category = samples[idx]; - bool mask = category_location[2 * category] == model_id; - - sum += static_cast(mask); - - if (mask) model_indices[model_id][sum - 1] = idx; - } - } - // Total size stored at the end of the offsets vector - model_indices_offsets[model_id][num_instances] = sum; - model_indices[model_id].resize(sum); - } -} - -template -void HybridEmbeddingCpu::calculate_infrequent_network_indices() { - network_indices.resize(num_instances); - network_indices_offsets.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - network_indices[network_id].resize(local_batch_size * num_tables); - network_indices_offsets[network_id].resize(num_instances + 1); - - std::vector> network_sources_indices = - std::vector>(local_batch_size * num_tables); - - // Prefix sum only of this GPU's sub-batch - uint32_t sum = 0; - for (uint32_t j = local_batch_size * network_id; - j < std::min(batch_size, local_batch_size * (network_id + 1)); j++) { - for (uint32_t i = 0; i < num_tables; i++) { - uint32_t idx = j * num_tables + i; - dtype category = samples[idx]; - dtype model_id = category_location[2 * category]; - bool mask = model_id < num_instances; - sum += static_cast(mask); - uint32_t local_mlp_index = (j - local_batch_size * network_id) * num_tables + i; - if (mask) - network_sources_indices[sum 
- 1] = - std::make_pair(static_cast(model_id), local_mlp_index); - } - } - // Sort by source only, otherwise stable - std::stable_sort(network_sources_indices.begin(), network_sources_indices.begin() + sum, - utils::lesser_by_first); - - // Retrieve indices - for (uint32_t idx = 0; idx < sum; idx++) { - network_indices[network_id][idx] = network_sources_indices[idx].second; - } - // Compute offsets - for (uint32_t i = 0; i < num_instances; i++) { - network_indices_offsets[network_id][i] = - std::lower_bound(network_sources_indices.begin(), network_sources_indices.begin() + sum, - std::make_pair(i, (uint32_t)0), utils::lesser_by_first) - - network_sources_indices.begin(); - } - // Total size stored at the end of the offsets vector - network_indices_offsets[network_id][num_instances] = sum; - network_indices[network_id].resize(sum); - } -} - -template -void HybridEmbeddingCpu::calculate_frequent_sample_indices() { - frequent_sample_indices.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - frequent_sample_indices[network_id].resize(local_batch_size * num_tables); - - // Prefix sum only of this GPU's sub-batch - uint32_t sum = 0; - for (uint32_t j = local_batch_size * network_id; - j < std::min(batch_size, local_batch_size * (network_id + 1)); j++) { - for (uint32_t i = 0; i < num_tables; i++) { - uint32_t idx = j * num_tables + i; - - dtype category = samples[idx]; - dtype model_id = category_location[2 * category]; - bool mask = model_id == num_instances; - - sum += static_cast(mask); - - uint32_t local_mlp_index = (j - local_batch_size * network_id) * num_tables + i; - - if (mask) frequent_sample_indices[network_id][sum - 1] = local_mlp_index; - } - } - - frequent_sample_indices[network_id].resize(sum); - } -} - -template -void HybridEmbeddingCpu::calculate_frequent_model_cache_indices() { - const uint32_t num_frequent_per_model = num_frequent / num_instances; - - model_cache_indices.resize(num_instances); - model_cache_indices_offsets.resize(num_instances); - - for (uint32_t model_id = 0; model_id < num_instances; model_id++) { - model_cache_indices[model_id].resize(num_frequent); - model_cache_indices_offsets[model_id].resize(num_instances + 1); - - /* Compute the mask (for each network, frequent categories that belong to my model id) */ - std::vector network_frequent_mask = std::vector(num_frequent, false); - for (uint32_t i = 0; i < num_instances; i++) { - for (uint32_t j = 0; j < local_batch_size * num_tables; j++) { - uint32_t global_j = local_batch_size * num_tables * i + j; - - dtype category = samples[global_j]; - dtype frequent_index = category_location[2 * category + 1]; - - if (category_location[2 * category] == num_instances && - frequent_index / num_frequent_per_model == model_id) { - network_frequent_mask[i * num_frequent_per_model + - frequent_index % num_frequent_per_model] = true; - } - } - } - - /* Select categories according to the mask */ - uint32_t sum = 0; - for (uint32_t idx = 0; idx < num_frequent; idx++) { - bool mask = network_frequent_mask[idx]; - sum += static_cast(mask); - if (mask) model_cache_indices[model_id][sum - 1] = idx; - } - - /* Compute offsets */ - for (uint32_t i = 0; i < num_instances; i++) { - model_cache_indices_offsets[model_id][i] = - std::lower_bound(model_cache_indices[model_id].begin(), - model_cache_indices[model_id].begin() + sum, - i * num_frequent_per_model) - - model_cache_indices[model_id].begin(); - } - model_cache_indices_offsets[model_id][num_instances] = sum; - - /* Convert to 
buffer indices */ - for (uint32_t idx = 0; idx < sum; idx++) { - model_cache_indices[model_id][idx] = - model_cache_indices[model_id][idx] % num_frequent_per_model + - num_frequent_per_model * model_id; - } - - model_cache_indices[model_id].resize(sum); - } -} - -template -void HybridEmbeddingCpu::calculate_frequent_network_cache_indices() { - const uint32_t num_frequent_per_model = num_frequent / num_instances; - - if (network_cache_mask.size() == 0) calculate_frequent_network_cache_mask(); - - network_cache_indices.resize(num_instances); - network_cache_indices_offsets.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - network_cache_indices[network_id].resize(num_frequent); - network_cache_indices_offsets[network_id].resize(num_instances + 1); - - uint32_t sum = 0; - for (uint32_t i = 0; i < num_frequent; ++i) { - if (network_cache_mask[network_id][i]) { - network_cache_indices[network_id][sum] = i; - sum++; - } - } - - /* Compute offsets */ - for (uint32_t i = 0; i < num_instances; i++) { - network_cache_indices_offsets[network_id][i] = - std::lower_bound(network_cache_indices[network_id].begin(), - network_cache_indices[network_id].begin() + sum, - i * num_frequent_per_model) - - network_cache_indices[network_id].begin(); - } - network_cache_indices_offsets[network_id][num_instances] = sum; - - network_cache_indices[network_id].resize(sum); - } -} - -template -void HybridEmbeddingCpu::calculate_frequent_network_cache_mask() { - network_cache_mask.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - network_cache_mask[network_id].resize(num_frequent); - - for (uint32_t j = local_batch_size * network_id; - j < std::min(batch_size, local_batch_size * (network_id + 1)); j++) { - for (uint32_t i = 0; i < num_tables; i++) { - uint32_t idx = j * num_tables + i; - dtype category = samples[idx]; - if (category_location[2 * category] == num_instances) { - dtype frequent_index = category_location[2 * category + 1]; - network_cache_mask[network_id][frequent_index] = 1; - } - } - } - } -} - -template -void HybridEmbeddingCpu::generate_embedding_vectors() { - frequent_embedding_vectors.resize(num_instances); - infrequent_embedding_vectors.resize(num_instances); - - // Fixed seed for reproducibility - std::default_random_engine generator(1234UL); - std::uniform_real_distribution distribution(-10.0f, 10.0f); - - for (size_t i = 0; i < num_instances; i++) { - frequent_embedding_vectors[i].resize(num_frequent * embedding_vec_size); - infrequent_embedding_vectors[i].resize( - utils::ceildiv(num_categories - num_frequent, num_instances) * embedding_vec_size); - } - for (dtype category = 0; category < num_categories; category++) { - dtype model_id = category_location[2 * category]; - dtype location = category_location[2 * category + 1]; - if (model_id == num_instances) { - dtype freq_index = location; - HCTR_CHECK(freq_index < num_frequent); - for (uint32_t k = 0; k < embedding_vec_size; k++) { - float value = distribution(generator); - for (uint32_t i = 0; i < num_instances; i++) - frequent_embedding_vectors[i][freq_index * embedding_vec_size + k] = value; - } - } else { - for (uint32_t k = 0; k < embedding_vec_size; k++) - infrequent_embedding_vectors[model_id][location * embedding_vec_size + k] = - distribution(generator); - } - } -} - -template -void HybridEmbeddingCpu::generate_gradients() { - gradients.resize(num_instances); - - // Fixed seed for reproducibility - std::default_random_engine 
generator(1234UL); - std::uniform_real_distribution distribution(-10.0f, 10.0f); - - for (size_t i = 0; i < num_instances; i++) - gradients[i].resize(local_samples_size * embedding_vec_size); - for (size_t i = 0; i < num_instances; i++) { - for (size_t j = 0; j < local_samples_size; j++) { - for (size_t k = 0; k < embedding_vec_size; k++) { - gradients[i][j * embedding_vec_size + k] = - utils::TypeConvertFunc::convert(distribution(generator)); - } - } - } -} - -template -void HybridEmbeddingCpu::forward_a2a_messages() { - forward_sent_messages.resize(num_instances); - forward_received_messages.resize(num_instances); - - for (uint32_t i = 0; i < num_instances; i++) { - for (uint32_t j = 0; j < num_instances; j++) { - uint32_t k0 = model_indices_offsets[i][j]; - uint32_t k1 = model_indices_offsets[i][j + 1]; - for (uint32_t k = k0; k < k1; ++k) { - uint32_t model_indices_to_dst = model_indices[i][k]; - dtype category_to_dst = samples[model_indices_to_dst]; - uint32_t embedding_vec_indices = category_location[2 * category_to_dst + 1]; - for (uint32_t m = 0; m < embedding_vec_size; ++m) { - emtype value = utils::TypeConvertFunc::convert( - infrequent_embedding_vectors[i][embedding_vec_indices * embedding_vec_size + m]); - forward_received_messages[j].push_back(value); - forward_sent_messages[i].push_back(value); - } - } - } - } -} - -template -void HybridEmbeddingCpu::forward_a2a_messages_hier() { - forward_sent_messages.resize(num_instances); - forward_received_messages.resize(num_instances); - for (uint32_t i = 0; i < num_instances; i++) { - forward_received_messages[i].resize(num_instances * local_samples_size * embedding_vec_size); - forward_sent_messages[i].resize(num_instances * local_samples_size * embedding_vec_size); - } - - uint32_t instances_per_node = num_instances / num_nodes; - - for (uint32_t model_id = 0; model_id < num_instances; model_id++) { - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - uint32_t k0 = model_indices_offsets[model_id][network_id]; - uint32_t k1 = model_indices_offsets[model_id][network_id + 1]; - for (uint32_t k = k0; k < k1; ++k) { - uint32_t index = model_indices[model_id][k]; - dtype category = samples[index]; - uint32_t location = category_location[2 * category + 1]; - for (uint32_t m = 0; m < embedding_vec_size; ++m) { - emtype value = utils::TypeConvertFunc::convert( - infrequent_embedding_vectors[model_id][location * embedding_vec_size + m]); - forward_received_messages[network_id] - [(model_id * local_samples_size + k - k0) * embedding_vec_size + - m] = value; - forward_sent_messages - [model_id - model_id % instances_per_node + network_id % instances_per_node] - [((network_id - network_id % instances_per_node + model_id % instances_per_node) * - local_samples_size + - k - k0) * - embedding_vec_size + - m] = value; - } - } - } - } -} - -template -void HybridEmbeddingCpu::backward_a2a_messages() { - backward_sent_messages.resize(num_instances); - backward_received_messages.resize(num_instances); - - for (size_t i = 0; i < num_instances; i++) { - for (size_t j = 0; j < num_instances; j++) { - uint32_t k0 = model_indices_offsets[i][j]; - uint32_t k1 = model_indices_offsets[i][j + 1]; - for (size_t k = k0; k < k1; ++k) { - uint32_t index = model_indices[i][k]; - uint32_t local_index = index % local_samples_size; - for (uint32_t m = 0; m < embedding_vec_size; ++m) { - backward_sent_messages[j].push_back(gradients[j][local_index * embedding_vec_size + m]); - backward_received_messages[i].push_back( - gradients[j][local_index * 
embedding_vec_size + m]); - } - } - } - } -} - -template -void HybridEmbeddingCpu::backward_a2a_messages_hier() { - backward_sent_messages.resize(num_instances); - backward_received_messages.resize(num_instances); - for (uint32_t i = 0; i < num_instances; i++) { - backward_received_messages[i].resize(num_instances * local_samples_size * embedding_vec_size); - backward_sent_messages[i].resize(num_instances * local_samples_size * embedding_vec_size); - } - - uint32_t instances_per_node = num_instances / num_nodes; - - for (size_t model_id = 0; model_id < num_instances; model_id++) { - for (size_t network_id = 0; network_id < num_instances; network_id++) { - uint32_t k0 = model_indices_offsets[model_id][network_id]; - uint32_t k1 = model_indices_offsets[model_id][network_id + 1]; - for (size_t k = k0; k < k1; ++k) { - uint32_t index = model_indices[model_id][k]; - uint32_t local_index = index % local_samples_size; - for (uint32_t m = 0; m < embedding_vec_size; ++m) { - emtype value = gradients[network_id][local_index * embedding_vec_size + m]; - backward_received_messages[model_id][(network_id * local_samples_size + k - k0) * - embedding_vec_size + - m] = value; - backward_sent_messages - [network_id - network_id % instances_per_node + model_id % instances_per_node] - [((model_id - model_id % instances_per_node + network_id % instances_per_node) * - local_samples_size + - k - k0) * - embedding_vec_size + - m] = value; - } - } - } - } -} - -template -void HybridEmbeddingCpu::infrequent_update() { - for (size_t network_id = 0; network_id < num_instances; network_id++) { - for (size_t j = 0; j < local_samples_size; j++) { - dtype category = samples[network_id * local_samples_size + j]; - dtype model_id = category_location[2 * category]; - dtype location = category_location[2 * category + 1]; - if (model_id < num_instances) { - { - for (uint32_t k = 0; k < embedding_vec_size; k++) - infrequent_embedding_vectors[model_id][location * embedding_vec_size + k] -= - lr * utils::TypeConvertFunc::convert( - gradients[network_id][j * embedding_vec_size + k]); - } - } - } - } -} - -template -void HybridEmbeddingCpu::frequent_reduce_gradients() { - // Reduce to a float32 array - std::vector reduced_gradients_f32(num_frequent * embedding_vec_size, 0.0f); - for (size_t network_id = 0; network_id < num_instances; network_id++) { - for (size_t j = 0; j < local_samples_size; j++) { - dtype category = samples[network_id * local_samples_size + j]; - dtype model_id = category_location[2 * category]; - if (model_id == num_instances) { - dtype freq_index = category_location[2 * category + 1]; - HCTR_CHECK(freq_index < num_frequent); - for (uint32_t k = 0; k < embedding_vec_size; k++) { - reduced_gradients_f32[freq_index * embedding_vec_size + k] += - utils::TypeConvertFunc::convert( - gradients[network_id][j * embedding_vec_size + k]); - } - } - } - } - - // Copy to the emtype array - reduced_gradients.resize(num_frequent * embedding_vec_size); - for (size_t i = 0; i < num_frequent * embedding_vec_size; i++) { - reduced_gradients[i] = utils::TypeConvertFunc::convert(reduced_gradients_f32[i]); - } -} - -template -void HybridEmbeddingCpu::frequent_update() { - for (size_t model_id = 0; model_id < num_instances; model_id++) { - for (size_t i = 0; i < num_frequent * embedding_vec_size; i++) { - frequent_embedding_vectors[model_id][i] -= - lr * utils::TypeConvertFunc::convert(reduced_gradients[i]); - } - } -} - -template -void HybridEmbeddingCpu::frequent_update_single_node() { - uint32_t num_frequent_per_model = 
num_frequent / num_instances; - for (size_t network_id = 0; network_id < num_instances; network_id++) { - for (size_t j = 0; j < local_samples_size; j++) { - dtype category = samples[network_id * local_samples_size + j]; - dtype model_id = category_location[2 * category]; - if (model_id == num_instances) { - dtype freq_index = category_location[2 * category + 1]; - HCTR_CHECK(freq_index < num_frequent); - uint32_t frequent_model_id = freq_index / num_frequent_per_model; - for (uint32_t k = 0; k < embedding_vec_size; k++) - frequent_embedding_vectors[frequent_model_id][freq_index * embedding_vec_size + k] -= - lr * utils::TypeConvertFunc::convert( - gradients[network_id][j * embedding_vec_size + k]); - } - } - } -} - -template -void HybridEmbeddingCpu::forward_network() { - interaction_layer_input.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - interaction_layer_input[network_id].resize(local_samples_size * embedding_vec_size); - - for (uint32_t i = 0; i < local_samples_size; i++) { - dtype category = samples[local_samples_size * network_id + i]; - dtype model_id = category_location[2 * category]; - dtype location = category_location[2 * category + 1]; - if (model_id == num_instances) { - dtype freq_index = location; - HCTR_CHECK(freq_index < num_frequent); - for (uint32_t k = 0; k < embedding_vec_size; k++) { - interaction_layer_input[network_id][embedding_vec_size * i + k] = - frequent_embedding_vectors[network_id][embedding_vec_size * freq_index + k]; - } - } else { - for (uint32_t k = 0; k < embedding_vec_size; k++) { - interaction_layer_input[network_id][embedding_vec_size * i + k] = - infrequent_embedding_vectors[model_id][embedding_vec_size * location + k]; - } - } - } - } -} - -template -void HybridEmbeddingCpu::frequent_forward_model() { - frequent_embedding_vectors_cache.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - if (sizeof(emtype) != sizeof(float)) { - // Separate buffers, initialize with zeros - frequent_embedding_vectors_cache[network_id].resize(num_frequent * embedding_vec_size, - (emtype)0.0); - } else { - // Same buffers, copy previous values - frequent_embedding_vectors_cache[network_id].resize(num_frequent * embedding_vec_size); - for (size_t i = 0; i < num_frequent * embedding_vec_size; i++) { - frequent_embedding_vectors_cache[network_id][i] = - utils::TypeConvertFunc::convert( - frequent_embedding_vectors[network_id][i]); - } - } - } - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - for (uint32_t model_id = 0; model_id < num_instances; model_id++) { - uint32_t i0 = network_cache_indices_offsets[network_id][model_id]; - uint32_t i1 = network_cache_indices_offsets[network_id][model_id + 1]; - for (uint32_t i = i0; i < i1; i++) { - uint32_t freq_index = network_cache_indices[network_id][i]; - for (uint32_t k = 0; k < embedding_vec_size; k++) { - frequent_embedding_vectors_cache[network_id][embedding_vec_size * freq_index + k] = - utils::TypeConvertFunc::convert( - frequent_embedding_vectors[model_id][embedding_vec_size * freq_index + k]); - } - } - } - } -} - -template class HybridEmbeddingCpu; -template class HybridEmbeddingCpu; -template class HybridEmbeddingCpu; -template class HybridEmbeddingCpu; \ No newline at end of file diff --git a/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.hpp b/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.hpp deleted file mode 100644 index eaa091de82..0000000000 --- 
a/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.hpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -using namespace HugeCTR; -using namespace HugeCTR::hybrid_embedding; - -namespace utils { -template -constexpr static inline IntType ceildiv(IntType a, IntType b) { - return (a + b - 1) / b; -} -} // namespace utils - -template -class HybridEmbeddingCpu { - public: - uint32_t num_instances; - uint32_t num_nodes; - uint32_t num_tables; - float lr; - - uint32_t batch_size; - uint32_t num_categories; - uint32_t num_frequent; - uint32_t embedding_vec_size; - const std::vector& category_location; - const std::vector& samples; - - uint32_t local_batch_size; - uint32_t local_samples_size; - - std::vector> model_indices; - std::vector> model_indices_offsets; - std::vector> network_indices; - std::vector> network_indices_offsets; - std::vector> frequent_sample_indices; - std::vector> model_cache_indices; - std::vector> model_cache_indices_offsets; - std::vector> network_cache_mask; - std::vector> network_cache_indices; - std::vector> network_cache_indices_offsets; - - std::vector> frequent_embedding_vectors; - std::vector> infrequent_embedding_vectors; - std::vector> gradients; - std::vector> frequent_embedding_vectors_cache; - - std::vector> forward_sent_messages; - std::vector> forward_received_messages; - - std::vector> backward_sent_messages; - std::vector> backward_received_messages; - - std::vector reduced_gradients; - - std::vector> interaction_layer_input; - - HybridEmbeddingCpu(const HybridEmbeddingConfig& config, size_t batch_size, - const std::vector& category_location, const std::vector& samples) - : num_instances(config.num_instances), - num_nodes(config.num_nodes), - num_tables(config.num_tables), - lr(config.lr), - batch_size(batch_size), - num_categories(config.num_categories), - num_frequent(config.num_frequent), - embedding_vec_size(config.embedding_vec_size), - category_location(category_location), - samples(samples), - local_batch_size(utils::ceildiv(batch_size, num_instances)), - local_samples_size(local_batch_size * num_tables) {} - - void calculate_infrequent_model_indices(); - void calculate_infrequent_network_indices(); - void calculate_frequent_sample_indices(); - void calculate_frequent_model_cache_indices(); - void calculate_frequent_network_cache_indices(); - void calculate_frequent_network_cache_mask(); - - void generate_embedding_vectors(); - void generate_gradients(); - - void forward_a2a_messages(); - void forward_a2a_messages_hier(); - void backward_a2a_messages(); - void backward_a2a_messages_hier(); - - void infrequent_update(); - void frequent_reduce_gradients(); - void frequent_update(); - void frequent_update_single_node(); - - void forward_network(); - void frequent_forward_model(); -}; diff --git a/test/utest/embedding/hybrid_embedding/indices_test.cpp b/test/utest/embedding/hybrid_embedding/indices_test.cpp 
deleted file mode 100644 index f3320d890b..0000000000 --- a/test/utest/embedding/hybrid_embedding/indices_test.cpp +++ /dev/null @@ -1,445 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -/******************** Infrequent embedding: model indices ********************/ - -template -class CalculateModelIndicesTest : public HybridEmbeddingUnitTest { - public: - CalculateModelIndicesTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_infrequent_model_indices(); - - /* Compute indices */ - this->build_infrequent(); - std::vector> h_model_indices(this->num_instances); - std::vector> h_model_indices_offsets(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - if (this->config.comm_type == CommunicationType::NVLink_SingleNode) { - this->infrequent_embeddings_single_node[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_single_node[i].indices_->calculate_model_indices(this->stream); - download_tensor(h_model_indices[i], - this->infrequent_embeddings_single_node[i].indices_->model_indices_, - this->stream); - download_tensor(h_model_indices_offsets[i], - this->infrequent_embeddings_single_node[i].indices_->model_indices_offsets_, - this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_model_indices(this->stream); - download_tensor(h_model_indices[i], - this->infrequent_embeddings_ib_nvlink[i].indices_->model_indices_, - this->stream); - download_tensor(h_model_indices_offsets[i], - this->infrequent_embeddings_ib_nvlink[i].indices_->model_indices_offsets_, - this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_model_indices( - this->stream); - download_tensor(h_model_indices[i], - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->model_indices_, - this->stream); - download_tensor( - h_model_indices_offsets[i], - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->model_indices_offsets_, - this->stream); - } - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - h_model_indices[i].resize(h_model_indices_offsets[i][this->num_instances]); - EXPECT_THAT(h_model_indices[i], ::testing::ElementsAreArray(cpu_embedding.model_indices[i])); - 
EXPECT_THAT(h_model_indices_offsets[i], - ::testing::ElementsAreArray(cpu_embedding.model_indices_offsets[i])); - } - } -}; - -/******************* Infrequent embedding: network indices *******************/ - -template -class CalculateNetworkIndicesTest : public HybridEmbeddingUnitTest { - public: - CalculateNetworkIndicesTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_infrequent_network_indices(); - - /* Compute indices */ - this->build_infrequent(); - std::vector> h_network_indices(this->num_instances); - std::vector> h_network_indices_offsets(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - if (this->config.comm_type == CommunicationType::NVLink_SingleNode) { - this->infrequent_embeddings_single_node[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_single_node[i].indices_->calculate_network_indices( - 80, this->stream); - download_tensor(h_network_indices[i], - this->infrequent_embeddings_single_node[i].indices_->network_indices_, - this->stream); - download_tensor( - h_network_indices_offsets[i], - this->infrequent_embeddings_single_node[i].indices_->network_indices_offsets_, - this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_network_indices(80, - this->stream); - download_tensor(h_network_indices[i], - this->infrequent_embeddings_ib_nvlink[i].indices_->network_indices_, - this->stream); - download_tensor(h_network_indices_offsets[i], - this->infrequent_embeddings_ib_nvlink[i].indices_->network_indices_offsets_, - this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_network_indices( - 80, this->stream); - download_tensor(h_network_indices[i], - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->network_indices_, - this->stream); - download_tensor( - h_network_indices_offsets[i], - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->network_indices_offsets_, - this->stream); - } - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - h_network_indices[i].resize(h_network_indices_offsets[i][this->num_instances]); - EXPECT_THAT(h_network_indices[i], - ::testing::ElementsAreArray(cpu_embedding.network_indices[i])); - EXPECT_THAT(h_network_indices_offsets[i], - ::testing::ElementsAreArray(cpu_embedding.network_indices_offsets[i])); - } - } -}; - -/**************** Frequent embedding: frequent sample indices ****************/ - -template -class CalculateFrequentSampleIndicesTest : public HybridEmbeddingUnitTest { - public: - CalculateFrequentSampleIndicesTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - 
cpu_embedding.calculate_frequent_sample_indices(); - /* Compute indices */ - this->build_frequent(); - std::vector> h_frequent_sample_indices(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - this->get_frequent_embedding(i).set_current_indices(&this->frequent_embedding_indices[i]); - this->get_frequent_embedding(i).indices_->calculate_frequent_sample_indices(this->stream); - download_tensor(h_frequent_sample_indices[i], - this->get_frequent_embedding(i).indices_->frequent_sample_indices_, - this->stream); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - uint32_t num_frequent_sample_indices; - HCTR_LIB_THROW(cudaMemcpyAsync( - &num_frequent_sample_indices, - this->get_frequent_embedding(i).indices_->d_num_frequent_sample_indices_.get_ptr(), - sizeof(uint32_t), cudaMemcpyDeviceToHost, this->stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(this->stream)); - h_frequent_sample_indices[i].resize(num_frequent_sample_indices); - EXPECT_THAT(h_frequent_sample_indices[i], - ::testing::ElementsAreArray(cpu_embedding.frequent_sample_indices[i])); - } - } -}; - -/****************** Frequent embedding: model cache indices ******************/ - -template -class CalculateModelCacheIndicesTest : public HybridEmbeddingUnitTest { - public: - CalculateModelCacheIndicesTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_frequent_model_cache_indices(); - - /* Compute indices */ - this->build_frequent(); - std::vector> h_model_cache_indices(this->num_instances); - std::vector> h_model_cache_indices_offsets(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - this->get_frequent_embedding(i).set_current_indices(&this->frequent_embedding_indices[i]); - this->get_frequent_embedding(i).indices_->calculate_cache_masks(this->stream); - this->get_frequent_embedding(i).indices_->calculate_model_cache_indices(80, this->stream); - download_tensor(h_model_cache_indices[i], - this->get_frequent_embedding(i).indices_->model_cache_indices_, this->stream); - download_tensor(h_model_cache_indices_offsets[i], - this->get_frequent_embedding(i).indices_->model_cache_indices_offsets_, - this->stream); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - h_model_cache_indices[i].resize(h_model_cache_indices_offsets[i][this->num_instances]); - EXPECT_THAT(h_model_cache_indices[i], - ::testing::ElementsAreArray(cpu_embedding.model_cache_indices[i])); - EXPECT_THAT(h_model_cache_indices_offsets[i], - ::testing::ElementsAreArray(cpu_embedding.model_cache_indices_offsets[i])); - } - } -}; - -/***************** Frequent embedding: network cache indices *****************/ - -template -class CalculateNetworkCacheIndicesTest : public HybridEmbeddingUnitTest { - public: - CalculateNetworkCacheIndicesTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_frequent_network_cache_mask(); - cpu_embedding.calculate_frequent_network_cache_indices(); - - /* Compute mask and indices */ - 
this->build_frequent(); - std::vector> h_network_cache_mask(this->num_instances); - std::vector> h_network_cache_indices(this->num_instances); - std::vector> h_network_cache_indices_offsets(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - this->get_frequent_embedding(i).set_current_indices(&this->frequent_embedding_indices[i]); - this->get_frequent_embedding(i).indices_->calculate_cache_masks(this->stream); - this->get_frequent_embedding(i).indices_->calculate_network_cache_indices(this->stream); - download_tensor(h_network_cache_indices[i], - this->get_frequent_embedding(i).indices_->network_cache_indices_, - this->stream); - download_tensor(h_network_cache_indices_offsets[i], - this->get_frequent_embedding(i).indices_->network_cache_indices_offsets_, - this->stream); - h_network_cache_mask[i].resize(this->config.num_frequent); - HCTR_LIB_THROW( - cudaMemcpyAsync(h_network_cache_mask[i].data(), - reinterpret_cast( - this->get_frequent_embedding(i).indices_->cache_masks_.get_ptr()), - this->config.num_frequent, cudaMemcpyDeviceToHost, this->stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(this->stream)); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - h_network_cache_indices[i].resize( - cpu_embedding.network_cache_indices_offsets[i][this->num_instances]); - EXPECT_THAT(h_network_cache_indices[i], - ::testing::ElementsAreArray(cpu_embedding.network_cache_indices[i])); - EXPECT_THAT(h_network_cache_indices_offsets[i], - ::testing::ElementsAreArray(cpu_embedding.network_cache_indices_offsets[i])); - EXPECT_THAT(h_network_cache_mask[i], - ::testing::ElementsAreArray(cpu_embedding.network_cache_mask[i])); - } - } -}; - -/**************************** Test instantiations ****************************/ - -static const HybridEmbeddingConfig config_uint32 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_int64 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; - -// Edge cases: no frequent, all frequent -static const HybridEmbeddingConfig config_no_freq = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_all_freq = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink}; - -/* hybrid_embedding_model_indices_test */ - -TEST(hybrid_embedding_model_indices_test, uint32_float_64) { - CalculateModelIndicesTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_model_indices_test, int64_float_64) { - CalculateModelIndicesTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_model_indices_test, uint32_float_2048) { - CalculateModelIndicesTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_model_indices_test, int64_float_2048) { - CalculateModelIndicesTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_model_indices_test, uint32_float_128_no_freq) { - CalculateModelIndicesTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_model_indices_test, uint32_float_128_all_freq) { - CalculateModelIndicesTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_network_indices_test */ - -TEST(hybrid_embedding_network_indices_test, uint32_float_64) { - CalculateNetworkIndicesTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_network_indices_test, int64_float_64) { - CalculateNetworkIndicesTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_network_indices_test, uint32_float_2048) { - CalculateNetworkIndicesTest(config_uint32, 2048).run(); -} - 
-TEST(hybrid_embedding_network_indices_test, int64_float_2048) { - CalculateNetworkIndicesTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_network_indices_test, uint32_float_128_no_freq) { - CalculateNetworkIndicesTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_network_indices_test, uint32_float_128_all_freq) { - CalculateNetworkIndicesTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_frequent_sample_indices_test */ - -TEST(hybrid_embedding_frequent_sample_indices_test, uint32_float_64) { - CalculateFrequentSampleIndicesTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_frequent_sample_indices_test, int64_float_64) { - CalculateFrequentSampleIndicesTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_frequent_sample_indices_test, uint32_float_2048) { - CalculateFrequentSampleIndicesTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_frequent_sample_indices_test, int64_float_2048) { - CalculateFrequentSampleIndicesTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_frequent_sample_indices_test, uint32_float_128_no_freq) { - CalculateFrequentSampleIndicesTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_frequent_sample_indices_test, uint32_float_128_all_freq) { - CalculateFrequentSampleIndicesTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_model_cache_indices_test */ - -TEST(hybrid_embedding_model_cache_indices_test, uint32_float_64) { - CalculateModelCacheIndicesTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_model_cache_indices_test, int64_float_64) { - CalculateModelCacheIndicesTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_model_cache_indices_test, uint32_float_2048) { - CalculateModelCacheIndicesTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_model_cache_indices_test, int64_float_2048) { - CalculateModelCacheIndicesTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_model_cache_indices_test, uint32_float_128_no_freq) { - CalculateModelCacheIndicesTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_model_cache_indices_test, uint32_float_128_all_freq) { - CalculateModelCacheIndicesTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_network_cache_indices_test */ - -TEST(hybrid_embedding_network_cache_indices_test, uint32_float_64) { - CalculateNetworkCacheIndicesTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_network_cache_indices_test, int64_float_64) { - CalculateNetworkCacheIndicesTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_network_cache_indices_test, uint32_float_2048) { - CalculateNetworkCacheIndicesTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_network_cache_indices_test, int64_float_2048) { - CalculateNetworkCacheIndicesTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_network_cache_indices_test, uint32_float_128_no_freq) { - CalculateNetworkCacheIndicesTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_network_cache_indices_test, uint32_float_128_all_freq) { - CalculateNetworkCacheIndicesTest(config_all_freq, 128).run(); -} diff --git a/test/utest/embedding/hybrid_embedding/input_generator.cpp b/test/utest/embedding/hybrid_embedding/input_generator.cpp deleted file mode 100644 index 960068d2dd..0000000000 --- a/test/utest/embedding/hybrid_embedding/input_generator.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -std::vector HybridEmbeddingInputGenerator::generate_rand_table_sizes( - size_t num_tables, size_t vec_size, double max_mem) { - std::vector table_sizes(num_tables); - - // mem = sizeof(float) * vec_size * num_tables * max_table_size; - // => - const size_t max_table_size = (size_t)(max_mem / (sizeof(float) * vec_size * num_tables)); - const double max_exp = log(max_table_size) / log(10.); - - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - double r = rand() / (double)RAND_MAX; - // MATTHIAS. Remark: @alex & fan: There is a potential underflow here. - table_sizes[embedding] = std::max((size_t)2, (size_t)floor(pow(10., 1. + r * (max_exp - 1)))); - } - - return table_sizes; -} - -template -void HybridEmbeddingInputGenerator::generate_uniform_rand_table_sizes(size_t num_categories, - size_t num_tables) { - if (num_categories > 0) config_.num_categories = num_categories; - if (num_tables > 0) config_.num_tables = num_tables; - - std::set separators; - separators.insert(0); - separators.insert(config_.num_categories); - std::uniform_int_distribution dist(1, config_.num_categories - 1); - - for (size_t i = 0; i < config_.num_tables - 1; i++) { - size_t sep; - do { - sep = dist(gen_); - } while (separators.find(sep) != separators.end()); - separators.insert(sep); - } - - for (auto it = std::next(separators.begin()); it != separators.end(); it++) { - table_sizes_.push_back(*it - *(std::prev(it))); - } -} - -template -void HybridEmbeddingInputGenerator::create_probability_distribution() { - const size_t num_embeddings = table_sizes_.size(); - std::uniform_real_distribution distr(0.3, 0.8); - - embedding_prob_distribution_.resize(num_embeddings); - for (size_t embedding = 0; embedding < num_embeddings; ++embedding) { - embedding_prob_distribution_[embedding].resize(table_sizes_[embedding]); - } - - for (size_t embedding = 0; embedding < num_embeddings; ++embedding) { - const size_t embedding_size = table_sizes_[embedding]; - std::vector embedding_shuffle_arg(table_sizes_[embedding]); - std::iota(embedding_shuffle_arg.begin(), embedding_shuffle_arg.end(), (size_t)0); - std::shuffle(embedding_shuffle_arg.begin(), embedding_shuffle_arg.end(), gen_); - embedding_shuffle_args.push_back(embedding_shuffle_arg); - if (embedding_size < 30) { - // choose uniform distribution - for (size_t c_e = 0; c_e < table_sizes_[embedding]; ++c_e) - embedding_prob_distribution_[embedding][c_e] = 1. / (double)embedding_size; - } else { - // MATTHIAS. Remark: @alex & fan: There is a potential underflow here. - size_t size_first = std::max((size_t)1, size_t(4. 
* log10((double)embedding_size))); - size_first = std::min((size_t)embedding_size, (size_t)size_first); - double acc_prob_first = distr(gen_); - // a * (1 - r^n) / (1 - r) = acc_p - // Let a * r^{n} = 0.02 * acc_prob_first - // a - 0.02 * acc_prob_first = acc_prob_first * (1-r) - - // (1 + 0.02) * acc_prob_first - a = r * acc_prob_first - double r = 0.9; - double a = acc_prob_first * (1. - r) / (1. - pow(r, (double)size_first)); - for (size_t c_e = 0; c_e < size_first; ++c_e) - embedding_prob_distribution_[embedding][c_e] = a * pow(r, (double)c_e); - - // the following is approximate, will be normalized.. - // - // now apply power law to the remaining elements: - // - // p = a * n^{-2} - // => 1 - acc_prob_first = a / N - a / n - // => a ( 1/n - 1/N ) = 1 - acc_prob_first - // => a (N-n) / (nN) = 1 - acc_prob_first - // => a = n * N / (N-n) * (1 - acc_prob_first) - - a = size_first * embedding_size / (embedding_size - size_first) * (1. - acc_prob_first); - for (size_t c_e = size_first; c_e < table_sizes_[embedding]; ++c_e) - embedding_prob_distribution_[embedding][c_e] = a * pow((double)c_e, -2.); - - // normalize probability distribution - // calculate norm - double sum_p = 0.; - for (size_t c_e = 0; c_e < table_sizes_[embedding]; ++c_e) - sum_p += embedding_prob_distribution_[embedding][c_e]; - // correct - for (size_t c_e = 0; c_e < table_sizes_[embedding]; ++c_e) - embedding_prob_distribution_[embedding][c_e] /= sum_p; - } - } -} - -template -void HybridEmbeddingInputGenerator::generate_categories(dtype* data, size_t batch_size, - bool normalized) { - const size_t num_embeddings = table_sizes_.size(); - std::uniform_real_distribution distr(0, 1); - std::vector embedding_offsets; - HugeCTR::hybrid_embedding::EmbeddingTableFunctors::get_embedding_offsets(embedding_offsets, - table_sizes_); - // create samples - for (size_t embedding = 0; embedding < num_embeddings; ++embedding) { - std::vector& embedding_shuffle_arg = embedding_shuffle_args[embedding]; - std::vector& f_prob_e = embedding_prob_distribution_[embedding]; - std::vector acc_prob(f_prob_e.size() + 1, 0.0); - double acc = 0.; - for (size_t c_e = 0; c_e < f_prob_e.size(); ++c_e) { - acc_prob[c_e] = acc; - acc += f_prob_e[c_e]; - } - - acc_prob.front() = -42.0; - acc_prob.back() = 42.0; - - for (size_t sample = 0; sample < batch_size; ++sample) { - double r = distr(gen_); - size_t category = - (size_t)(std::lower_bound(acc_prob.begin(), acc_prob.end(), r) - acc_prob.begin()) - 1; - - // category index within table - size_t category_shuffled = embedding_shuffle_arg[category]; - data[sample * num_embeddings + embedding] = category_shuffled; - - if (normalized) { - data[sample * num_embeddings + embedding] += (size_t)embedding_offsets[embedding]; - } - } - } -} - -template -void HybridEmbeddingInputGenerator::generate_category_location() { - std::uniform_int_distribution distr(0, config_.num_instances - 1); - - std::vector all_probabilities; - for (auto& v : embedding_prob_distribution_) { - all_probabilities.insert(all_probabilities.end(), v.begin(), v.end()); - } - std::vector original_index(config_.num_categories); - std::iota(original_index.begin(), original_index.end(), (dtype)0); - - std::sort(original_index.begin(), original_index.end(), [&all_probabilities](dtype i1, dtype i2) { - return all_probabilities[i1] < all_probabilities[i2]; - }); - - // First num_frequent categories are frequent - category_location_.resize(2 * config_.num_categories, config_.num_instances); - for (dtype i = 0; i < config_.num_frequent; i++) { - 
dtype cat = original_index[i]; - category_location_[2 * cat + 1] = i; - } - - dtype max_size_per_instance = - (config_.num_categories - config_.num_frequent + config_.num_instances - 1) / - config_.num_instances; - std::vector sizes_per_instance(config_.num_instances, 0); - for (dtype i = config_.num_frequent; i < config_.num_categories; i++) { - dtype cat = original_index[i]; - dtype instance; - do { - instance = distr(gen_); - // If the selected instance is already full, pick another one - } while (sizes_per_instance[instance] == max_size_per_instance); - category_location_[2 * cat + 0] = instance; - category_location_[2 * cat + 1] = sizes_per_instance[instance]++; - } -} - -template -HybridEmbeddingInputGenerator::HybridEmbeddingInputGenerator( - HybridEmbeddingConfig config, size_t seed) - : config_(config), seed_(seed), gen_(seed) { - generate_uniform_rand_table_sizes(config_.num_categories, config_.num_tables); - create_probability_distribution(); -} - -template -HybridEmbeddingInputGenerator::HybridEmbeddingInputGenerator( - HybridEmbeddingConfig config, const std::vector& table_sizes, size_t seed) - : config_(config), table_sizes_(table_sizes), seed_(seed), gen_(seed) { - config_.num_tables = table_sizes.size(); - config_.num_categories = std::accumulate(table_sizes.begin(), table_sizes.end(), 0); - create_probability_distribution(); -} - -template -std::vector HybridEmbeddingInputGenerator::generate_categorical_input( - size_t batch_size, size_t num_tables) { - table_sizes_ = generate_rand_table_sizes(num_tables); - config_.num_tables = table_sizes_.size(); - config_.num_categories = std::accumulate(table_sizes_.begin(), table_sizes_.end(), 0); - create_probability_distribution(); - - std::vector data(batch_size * config_.num_tables); - generate_categories(data.data(), batch_size, false); - return data; -} - -template -std::vector HybridEmbeddingInputGenerator::generate_flattened_categorical_input( - size_t batch_size, size_t num_tables) { - table_sizes_ = generate_rand_table_sizes(num_tables); - config_.num_tables = table_sizes_.size(); - config_.num_categories = std::accumulate(table_sizes_.begin(), table_sizes_.end(), 0); - create_probability_distribution(); - - std::vector data(batch_size * config_.num_tables); - generate_categories(data.data(), batch_size, true); - return data; -} - -template -std::vector HybridEmbeddingInputGenerator::generate_categorical_input( - size_t batch_size) { - std::vector data(batch_size * config_.num_tables); - generate_categories(data.data(), batch_size, false); - return data; -} - -template -std::vector HybridEmbeddingInputGenerator::generate_flattened_categorical_input( - size_t batch_size) { - std::vector data(batch_size * config_.num_tables); - generate_categories(data.data(), batch_size, true); - return data; -} - -template -void HybridEmbeddingInputGenerator::generate_categorical_input(dtype* batch, - size_t batch_size) { - generate_categories(batch, batch_size, false); -} - -template -void HybridEmbeddingInputGenerator::generate_flattened_categorical_input(dtype* batch, - size_t batch_size) { - generate_categories(batch, batch_size, true); -} - -template -std::vector& HybridEmbeddingInputGenerator::get_category_location() { - return category_location_; -} - -template -std::vector& HybridEmbeddingInputGenerator::get_table_sizes() { - return table_sizes_; -} - -template class HybridEmbeddingInputGenerator; -template class HybridEmbeddingInputGenerator; - -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file 
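For reference, the category sampling implemented by the deleted input generator above reduces to inverse-CDF sampling over a per-table probability vector (a short skewed head followed by a power-law tail). The standalone C++ sketch below illustrates only that sampling step; sample_category is a hypothetical name, not part of HugeCTR, and the probability vector is assumed to be already built and normalized as in create_probability_distribution.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <random>
#include <vector>

// Build the cumulative distribution once, then map a uniform draw onto a
// category index with std::lower_bound, the same idea as the acc_prob /
// lower_bound logic in the deleted generate_categories above.
std::size_t sample_category(const std::vector<double>& probabilities, std::mt19937& gen) {
  std::vector<double> cdf(probabilities.size());
  std::partial_sum(probabilities.begin(), probabilities.end(), cdf.begin());
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  const double r = uniform(gen) * cdf.back();  // cdf.back() is ~1.0 for a normalized distribution
  return static_cast<std::size_t>(std::lower_bound(cdf.begin(), cdf.end(), r) - cdf.begin());
}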
diff --git a/test/utest/embedding/hybrid_embedding/input_generator.hpp b/test/utest/embedding/hybrid_embedding/input_generator.hpp deleted file mode 100644 index 394b1775d2..0000000000 --- a/test/utest/embedding/hybrid_embedding/input_generator.hpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -struct HybridEmbeddingConfig { - size_t num_nodes; - size_t num_instances; - size_t num_tables; - size_t embedding_vec_size; - dtype num_categories; - dtype num_frequent; - float lr; - CommunicationType comm_type; -}; - -template -class HybridEmbeddingInputGenerator { - public: - HybridEmbeddingInputGenerator(size_t seed) : gen_(seed) {} - HybridEmbeddingInputGenerator(HybridEmbeddingConfig config, size_t seed); - HybridEmbeddingInputGenerator(HybridEmbeddingConfig config, - const std::vector &table_sizes, size_t seed); - // Multiple calls return different data - - // By default the data is provided in the 'raw' format: each data point is - // a category which is indexed according to the table it belongs to. - // Each sample contains elements and its - // value lies within the integer range [0, number of categories in category feature) - - /// @param batch_size number of samples to return - /// @param num_categories required sum of table sizes - /// @param num_tables required number of tables - /// @param flatten_input indicator whether generated categories have an associated unique value - std::vector generate_categorical_input(size_t batch_size, size_t num_tables); - // _flattened means that the category indices are unique - // (i.e., table offsets are added to the raw data) - std::vector generate_flattened_categorical_input(size_t batch_size, size_t num_tables); - - // regenerate data with precalculated table_sizes_ - std::vector generate_categorical_input(size_t batch_size); - std::vector generate_flattened_categorical_input(size_t batch_size); - - void generate_categorical_input(dtype *batch, size_t batch_size); - void generate_flattened_categorical_input(dtype *batch, size_t batch_size); - void generate_category_location(); - - // Multiple calls return the same data - std::vector &get_category_location(); - std::vector &get_table_sizes(); - - private: - HybridEmbeddingConfig config_; - std::vector> embedding_prob_distribution_; - std::vector table_sizes_; - size_t seed_; - std::mt19937 gen_; - - std::vector category_location_; - std::vector> embedding_shuffle_args; - - void generate_uniform_rand_table_sizes(size_t num_categories = 0, size_t num_tables = 0); - static std::vector generate_rand_table_sizes(size_t num_tables, - size_t embedding_vec_size = 128, - double max_mem = 8.e9); - void create_probability_distribution(); - void generate_categories(dtype *data, size_t batch_size, bool normalized); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file 
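The deleted header above distinguishes two sample layouts: "raw" categories indexed within their own table, and "flattened" categories made globally unique by adding per-table offsets. A minimal sketch of that flattening step follows; flatten_categories is a hypothetical helper rather than a HugeCTR API, and it assumes the same sample-major layout the generator used (one entry per table, repeated per sample).

#include <cstddef>
#include <vector>

// Add the exclusive prefix sum of the table sizes to every raw category so
// that indices coming from different tables no longer collide.
void flatten_categories(std::vector<std::size_t>& samples,
                        const std::vector<std::size_t>& table_sizes) {
  std::vector<std::size_t> offsets(table_sizes.size(), 0);
  for (std::size_t t = 1; t < table_sizes.size(); ++t) {
    offsets[t] = offsets[t - 1] + table_sizes[t - 1];
  }
  const std::size_t num_tables = table_sizes.size();
  for (std::size_t i = 0; i < samples.size(); ++i) {
    samples[i] += offsets[i % num_tables];  // i % num_tables is the table (column) index
  }
}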
diff --git a/test/utest/embedding/hybrid_embedding/messages_test.cpp b/test/utest/embedding/hybrid_embedding/messages_test.cpp deleted file mode 100644 index 6f6973a585..0000000000 --- a/test/utest/embedding/hybrid_embedding/messages_test.cpp +++ /dev/null @@ -1,466 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -/**************** Infrequent embedding: forward sent message ****************/ - -template -class ForwardSentMessageTest : public HybridEmbeddingUnitTest { - public: - ForwardSentMessageTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - uint32_t instances_per_node = this->num_instances / this->config.num_nodes; - - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_infrequent_model_indices(); - cpu_embedding.generate_embedding_vectors(); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - cpu_embedding.forward_a2a_messages_hier(); - } else { - cpu_embedding.forward_a2a_messages(); - } - - /* Tensors and vectors for the generated messages */ - std::shared_ptr> buff = GeneralBuffer2::create(); - std::vector> sent_messages(this->num_instances); - std::vector> message_buffer_pointers(this->num_instances); - std::vector> h_sent_messages(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({this->num_instances * local_batch_size * this->config.num_tables, - this->config.embedding_vec_size}, - &sent_messages[i]); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - buff->reserve({instances_per_node, 1}, &message_buffer_pointers[i]); - } - } - buff->allocate(); - - this->build_infrequent(); - - std::vector> h_message_buffer_pointers(this->config.num_nodes); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - /* Construct the arrays of pointers for each node */ - for (size_t i = 0; i < this->config.num_nodes; i++) { - h_message_buffer_pointers[i].resize(instances_per_node); - } - for (size_t i = 0; i < this->num_instances; i++) { - h_message_buffer_pointers[i / instances_per_node][i % instances_per_node] = - sent_messages[i].get_ptr(); - } - - /* Copy the arrays to device */ - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW(cudaMemcpyAsync(message_buffer_pointers[i].get_ptr(), - h_message_buffer_pointers[i / instances_per_node].data(), - instances_per_node * sizeof(emtype*), cudaMemcpyHostToDevice, - this->stream)); - } - - /* Fill buffers with zeroes */ - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW( - cudaMemsetAsync(sent_messages[i].get_ptr(), 0, - this->num_instances * local_batch_size * 
this->config.num_tables * - this->config.embedding_vec_size * sizeof(emtype), - this->stream)); - } - } - - /* Infrequent forward_model */ - for (size_t i = 0; i < this->num_instances; i++) { - if (this->config.comm_type == CommunicationType::NVLink_SingleNode) { - this->infrequent_embeddings_single_node[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_single_node[i].infrequent_embedding_vectors_, - this->stream); - this->infrequent_embeddings_single_node[i].indices_->calculate_model_indices(this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink[i].infrequent_embedding_vectors_, - this->stream); - this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_model_indices(this->stream); - - this->infrequent_embeddings_ib_nvlink[i].forward_model(sent_messages[i].get_ptr(), - this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink_hier[i].infrequent_embedding_vectors_, - this->stream); - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_model_indices( - this->stream); - - this->infrequent_embeddings_ib_nvlink_hier[i].fused_intra_forward_model( - message_buffer_pointers[i].get_ptr(), this->stream); - } - } - - for (size_t i = 0; i < this->num_instances; i++) { - download_tensor(h_sent_messages[i], sent_messages[i], this->stream); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - uint32_t message_size = this->config.comm_type == CommunicationType::IB_NVLink_Hier - ? 
(this->num_instances * local_batch_size * - this->config.num_tables * this->config.embedding_vec_size) - : (this->config.embedding_vec_size * - cpu_embedding.model_indices_offsets[i][this->num_instances]); - ASSERT_TRUE(compare_array(message_size, h_sent_messages[i].data(), - cpu_embedding.forward_sent_messages[i].data(), 1e-2)); - } - } -}; - -/**************** Infrequent embedding: backward sent message ****************/ - -template -class BackwardSentMessageTest : public HybridEmbeddingUnitTest { - public: - BackwardSentMessageTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - uint32_t instances_per_node = this->num_instances / this->config.num_nodes; - - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_infrequent_model_indices(); - cpu_embedding.calculate_infrequent_network_indices(); - cpu_embedding.generate_gradients(); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - cpu_embedding.backward_a2a_messages_hier(); - } else { - cpu_embedding.backward_a2a_messages(); - } - - /* Tensors and vectors for the gradients and generated messages */ - std::shared_ptr> buff = GeneralBuffer2::create(); - std::vector> sent_messages(this->num_instances); - std::vector> message_buffer_pointers(this->num_instances); - std::vector> h_sent_messages(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({this->num_instances * local_batch_size * this->config.num_tables, - this->config.embedding_vec_size}, - &sent_messages[i]); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - buff->reserve({instances_per_node, 1}, &message_buffer_pointers[i]); - } - } - std::vector> gradients(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({local_batch_size * this->config.num_tables, this->config.embedding_vec_size}, - &gradients[i]); - } - buff->allocate(); - - this->build_infrequent(); - - std::vector> h_message_buffer_pointers(this->config.num_nodes); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - /* Construct the arrays of pointers for each node */ - for (size_t i = 0; i < this->config.num_nodes; i++) { - h_message_buffer_pointers[i].resize(instances_per_node); - } - for (size_t i = 0; i < this->num_instances; i++) { - h_message_buffer_pointers[i / instances_per_node][i % instances_per_node] = - sent_messages[i].get_ptr(); - } - - /* Copy the arrays to device */ - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW(cudaMemcpyAsync(message_buffer_pointers[i].get_ptr(), - h_message_buffer_pointers[i / instances_per_node].data(), - instances_per_node * sizeof(emtype*), cudaMemcpyHostToDevice, - this->stream)); - } - - /* Fill buffers with zeroes */ - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW( - cudaMemsetAsync(sent_messages[i].get_ptr(), 0, - this->num_instances * local_batch_size * this->config.num_tables * - this->config.embedding_vec_size * sizeof(emtype), - this->stream)); - } - } - - /* Infrequent update_network */ - for (size_t i = 0; i < this->num_instances; i++) { - if (this->config.comm_type == CommunicationType::NVLink_SingleNode) { - this->infrequent_embeddings_single_node[i].set_current_indices( - 
&this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - this->infrequent_embeddings_single_node[i].indices_->calculate_network_indices( - 80, this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_network_indices(80, - this->stream); - - this->infrequent_embeddings_ib_nvlink[i].update_network( - gradients[i].get_ptr(), sent_messages[i].get_ptr(), this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_network_indices( - 80, this->stream); - - this->infrequent_embeddings_ib_nvlink_hier[i].fused_intra_update_network( - gradients[i].get_ptr(), message_buffer_pointers[i].get_ptr(), this->stream); - } - } - - for (size_t i = 0; i < this->num_instances; i++) { - download_tensor(h_sent_messages[i], sent_messages[i], this->stream); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - uint32_t message_size = this->config.comm_type == CommunicationType::IB_NVLink_Hier - ? (this->num_instances * local_batch_size * - this->config.num_tables * this->config.embedding_vec_size) - : (this->config.embedding_vec_size * - cpu_embedding.network_indices_offsets[i][this->num_instances]); - ASSERT_TRUE(compare_array(message_size, h_sent_messages[i].data(), - cpu_embedding.backward_sent_messages[i].data(), 1e-2)); - } - } -}; - -/**************************** Test instantiations ****************************/ - -static const HybridEmbeddingConfig config_uint32 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_int64 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; - -// Edge cases: no frequent, all frequent -static const HybridEmbeddingConfig config_no_freq = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_all_freq = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink}; - -// Hierarchical A2A -static const HybridEmbeddingConfig config_uint32_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_int64_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_no_freq_hier = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_all_freq_hier = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink_Hier}; - -/* hybrid_embedding_forward_sent_message_test */ - -TEST(hybrid_embedding_forward_sent_message_test, uint32_half_64) { - ForwardSentMessageTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, int64_half_64) { - ForwardSentMessageTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, uint32_half_2048) { - ForwardSentMessageTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, int64_half_2048) { - 
ForwardSentMessageTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, uint32_float_64) { - ForwardSentMessageTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, int64_float_64) { - ForwardSentMessageTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, uint32_float_2048) { - ForwardSentMessageTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, int64_float_2048) { - ForwardSentMessageTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, uint32_float_128_no_freq) { - ForwardSentMessageTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, uint32_float_128_all_freq) { - ForwardSentMessageTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_forward_sent_message_hier_test */ - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_half_64) { - ForwardSentMessageTest(config_uint32_hier, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, int64_half_64) { - ForwardSentMessageTest(config_int64_hier, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_half_2048) { - ForwardSentMessageTest(config_uint32_hier, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, int64_half_2048) { - ForwardSentMessageTest(config_int64_hier, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_float_64) { - ForwardSentMessageTest(config_uint32_hier, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, int64_float_64) { - ForwardSentMessageTest(config_int64_hier, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_float_2048) { - ForwardSentMessageTest(config_uint32_hier, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, int64_float_2048) { - ForwardSentMessageTest(config_int64_hier, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_float_128_no_freq) { - ForwardSentMessageTest(config_no_freq_hier, 128).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_float_128_all_freq) { - ForwardSentMessageTest(config_all_freq_hier, 128).run(); -} - -/* hybrid_embedding_backward_sent_message_test */ - -TEST(hybrid_embedding_backward_sent_message_test, uint32_half_64) { - BackwardSentMessageTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, int64_half_64) { - BackwardSentMessageTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, uint32_half_2048) { - BackwardSentMessageTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, int64_half_2048) { - BackwardSentMessageTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, uint32_float_64) { - BackwardSentMessageTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, int64_float_64) { - BackwardSentMessageTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, uint32_float_2048) { - BackwardSentMessageTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, int64_float_2048) { - BackwardSentMessageTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, uint32_float_128_no_freq) { - BackwardSentMessageTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, 
uint32_float_128_all_freq) { - BackwardSentMessageTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_backward_sent_message_hier_test */ - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_half_64) { - BackwardSentMessageTest(config_uint32_hier, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, int64_half_64) { - BackwardSentMessageTest(config_int64_hier, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_half_2048) { - BackwardSentMessageTest(config_uint32_hier, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, int64_half_2048) { - BackwardSentMessageTest(config_int64_hier, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_float_64) { - BackwardSentMessageTest(config_uint32_hier, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, int64_float_64) { - BackwardSentMessageTest(config_int64_hier, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_float_2048) { - BackwardSentMessageTest(config_uint32_hier, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, int64_float_2048) { - BackwardSentMessageTest(config_int64_hier, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_float_128_no_freq) { - BackwardSentMessageTest(config_no_freq_hier, 128).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_float_128_all_freq) { - BackwardSentMessageTest(config_all_freq_hier, 128).run(); -} diff --git a/test/utest/embedding/hybrid_embedding/model_test.cpp b/test/utest/embedding/hybrid_embedding/model_test.cpp deleted file mode 100644 index 5a04e4a286..0000000000 --- a/test/utest/embedding/hybrid_embedding/model_test.cpp +++ /dev/null @@ -1,630 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace hybrid_embedding; - -namespace { - -template -void print_vector(const std::vector &vec) { - for (auto v : vec) { - std::cout << v << " ,"; - } - std::cout << std::endl; -} - -template -void model_test() { - Tensor2 tmp_categories; - size_t batch_size = 4; - size_t num_iterations = 2; - CommunicationType comm_type = CommunicationType::IB_NVLink; - uint32_t global_instance_id = 1; - std::vector num_instances_per_node{2, 2}; - std::vector table_sizes{100, 10, 10, 20}; - std::vector data_in{99, 3, 7, 19, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, - 3, 3, 3, 3, 50, 2, 4, 10, 2, 2, 2, 2, 1, 1, 1, 1}; - std::vector data_to_unique_categories_ref{ - 99, 103, 117, 139, 0, 100, 110, 120, 1, 101, 111, 121, 2, 102, 112, 122, - 3, 103, 113, 123, 50, 102, 114, 130, 2, 102, 112, 122, 1, 101, 111, 121}; - - Tensor2 d_data_in; - std::shared_ptr> buff = GeneralBuffer2::create(); - size_t num_categories = EmbeddingTableFunctors::get_num_categories(table_sizes); - buff->reserve({batch_size * num_iterations * table_sizes.size()}, &d_data_in); - buff->reserve({num_categories, 1}, &tmp_categories); - buff->allocate(); - upload_tensor(data_in, d_data_in, 0); - - /*1. Data() and data.data_to_unique_categories()*/ - Data data(table_sizes, batch_size, num_iterations); - data.data_to_unique_categories(d_data_in, 0); - std::vector data_to_unique_categories_ret; - download_tensor(data_to_unique_categories_ret, data.samples, 0); - EXPECT_THAT(data_to_unique_categories_ret, - ::testing::ElementsAreArray(data_to_unique_categories_ref)); - - /*2. Model()*/ - // std::cout << "debug0:" << num_categories << std::endl; - Model model(comm_type, global_instance_id, num_instances_per_node, num_categories); - - /*3. CalibrationData()*/ - size_t num_nodes = num_instances_per_node.size(); - CalibrationData calibration_data(num_nodes, 1.0 / 10.0, 4.0, 1.0, 1.0); - - /*4. Statistics()*/ - Statistics statistics(data.batch_size * data.num_iterations * data.table_sizes.size(), - data.table_sizes.size(), model.num_instances, num_categories); - statistics.sort_categories_by_count(data.samples, 0); - std::vector categories_sorted_ret; - std::vector counts_sorted_ret; - download_tensor(categories_sorted_ret, statistics.categories_sorted, 0); - download_tensor(counts_sorted_ret, statistics.counts_sorted, 0); - std::vector categories_sorted_ref{102, 1, 2, 101, 103, 111, 112, 121, 122, 0, 3, - 50, 99, 100, 110, 113, 114, 117, 120, 123, 130, 139}; - std::vector counts_sorted_ref{3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - EXPECT_THAT(categories_sorted_ret, ::testing::ElementsAreArray(categories_sorted_ref)); - EXPECT_THAT(counts_sorted_ret, ::testing::ElementsAreArray(counts_sorted_ref)); - // print_vector(counts_sorted_ret); - // print_vector(categories_sorted_ret); - - /*5. 
Model::init_hybrid_model*/ - model.init_hybrid_model(calibration_data, statistics, data, tmp_categories, 0); - EXPECT_EQ(model.num_frequent, 12); - std::vector category_location_ret; - download_tensor(category_location_ret, model.category_location, 0); - - std::vector category_location_ref{ - 4, 4, 4, 3, 4, 6, 4, 7, 0, 0, 1, 0, 2, 0, 3, 0, 0, 1, 1, 1, 2, 1, 3, 1, 0, 2, - 1, 2, 2, 2, 3, 2, 0, 3, 1, 3, 2, 3, 3, 3, 0, 4, 1, 4, 2, 4, 3, 4, 0, 5, 1, 5, - 2, 5, 3, 5, 0, 6, 1, 6, 2, 6, 3, 6, 0, 7, 1, 7, 2, 7, 3, 7, 0, 8, 1, 8, 2, 8, - 3, 8, 0, 9, 1, 9, 2, 9, 3, 9, 0, 10, 1, 10, 2, 10, 3, 10, 0, 11, 1, 11, 4, 9, 2, 11, - 3, 11, 0, 12, 1, 12, 2, 12, 3, 12, 0, 13, 1, 13, 2, 13, 3, 13, 0, 14, 1, 14, 2, 14, 3, 14, - 0, 15, 1, 15, 2, 15, 3, 15, 0, 16, 1, 16, 2, 16, 3, 16, 0, 17, 1, 17, 2, 17, 3, 17, 0, 18, - 1, 18, 2, 18, 3, 18, 0, 19, 1, 19, 2, 19, 3, 19, 0, 20, 1, 20, 2, 20, 3, 20, 0, 21, 1, 21, - 2, 21, 3, 21, 0, 22, 1, 22, 2, 22, 3, 22, 0, 23, 1, 23, 2, 23, 3, 23, 4, 10, 4, 0, 4, 1, - 0, 24, 1, 24, 2, 24, 3, 24, 0, 25, 1, 25, 2, 25, 4, 5, 4, 8, 3, 25, 0, 26, 1, 26, 2, 26, - 3, 26, 0, 27, 1, 27, 2, 27, 4, 11, 4, 2, 3, 27, 0, 28, 1, 28, 2, 28, 3, 28, 0, 29, 1, 29, - 2, 29, 3, 29, 0, 30, 1, 30, 2, 30, 3, 30, 0, 31, 1, 31, 2, 31, 3, 31, 140, 140}; - EXPECT_THAT(category_location_ret, ::testing::ElementsAreArray(category_location_ref)); - - std::vector h_frequent_model_table_offsets_ref{0, 0, 2, 2, 3, 3, 5, 5, 6, 6, - 6, 8, 8, 9, 9, 9, 10, 11, 11, 12}; - std::vector h_infrequent_model_table_offsets_ref{0, 24, 26, 28, 32}; - EXPECT_THAT(model.h_frequent_model_table_offsets, - ::testing::ElementsAreArray(h_frequent_model_table_offsets_ref)); - EXPECT_THAT(model.h_infrequent_model_table_offsets, - ::testing::ElementsAreArray(h_infrequent_model_table_offsets_ref)); -}; - -template -void model_init_test(const size_t num_instances, const size_t num_tables, const size_t batch_size, - CommunicationType ctype) { - // 1. generate the reference model from reference stats and corresponding data - // std::vector categories; - // std::vector counts; - - const size_t num_iterations = 1; - std::cout << "Model init test ... " << std::endl << std::endl; - std::cout << "number of instances : " << num_instances << std::endl; - std::cout << "Number of tables : " << num_tables << std::endl; - std::cout << "Batch size : " << batch_size << std::endl; - std::cout << "Number of iterations : " << num_iterations << std::endl; - - HybridEmbeddingInputGenerator input_generator(848484); - std::vector raw_data = input_generator.generate_categorical_input(batch_size, num_tables); - std::vector table_sizes = input_generator.get_table_sizes(); - const size_t num_categories = - std::accumulate(table_sizes.begin(), table_sizes.end(), static_cast(0)); - std::cout << "Table sizes : "; - for (size_t embedding = 0; embedding < table_sizes.size(); ++embedding) - std::cout << '\t' << table_sizes[embedding]; - std::cout << std::endl; - - // create the gpu tensor for the raw data - cudaStream_t stream = 0; - Tensor2 d_raw_data; - std::shared_ptr> buf = GeneralBuffer2::create(); - std::cout << "number of samples : " << raw_data.size() << std::endl; - buf->reserve({raw_data.size(), 1}, &d_raw_data); - buf->allocate(); - upload_tensor(raw_data, d_raw_data, stream); - - std::cout << "Testing raw data..." << std::endl; - test_raw_data(d_raw_data.get_ptr(), batch_size, num_tables, num_iterations, table_sizes); - std::cout << "Done testing raw data..." << std::endl; - - // 2. 
perform model initialization, data - std::cout << "performing statistics and calibration initialization..." << std::endl; - Data data(table_sizes, batch_size, num_iterations); - data.data_to_unique_categories(d_raw_data, stream); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - std::cout << "Testing samples..." << std::endl; - test_samples(d_raw_data.get_ptr(), data); - std::cout << "Done testing samples!" << std::endl; - - Statistics statistics(data, num_instances); - std::cout << "Statistics construction " << std::endl; - CalibrationData calibration(1, 1. / 10., 130.e9, 190.e9, 1.0); - - // model creation - std::cout << "performing model initialization..." << std::endl; - std::vector num_instances_per_node(1); - num_instances_per_node[0] = (uint32_t)num_instances; - // Model model(ctype, 0, num_instances_per_node, num_categories); - // = {(uint32_t)num_instances}; - std::vector> models; - std::vector> frequent_infrequent_categories(num_instances); - for (size_t instance = 0; instance < num_instances; ++instance) { - models.emplace_back(ctype, (uint32_t)instance, num_instances_per_node, num_categories); - std::cout << "instance : " << instance << " out of " << num_instances << std::endl; - std::shared_ptr> buf = GeneralBuffer2::create(); - // std::shared_ptr> temp_block_buffer = buf->create_block<>(dtype); - // Tensor2 tmp_infrequent_categories; - buf->reserve({(size_t)num_categories, 1}, &frequent_infrequent_categories[instance]); - buf->allocate(); - // std::cout << "constructing instance, allocating memory..." << std::endl; - // std::cout << "initializing model..." << std::endl; - models[instance].init_hybrid_model(calibration, statistics, data, - frequent_infrequent_categories[instance], stream); - // std::cout << "done initializing model" << std::endl; - } - std::vector categories_sorted_stats; - std::vector counts_sorted_stats; - download_tensor(categories_sorted_stats, statistics.categories_sorted, stream); - download_tensor(counts_sorted_stats, statistics.counts_sorted, stream); - // TODO: check consistency of - // global_instance_id, - // num_instances_per_node, - // node_id, - - // Check defining properties - - std::cout << "Checking consistency and completeness of infrequent embedding..." 
<< std::endl; - // check order of categories for infrequent - // - assuming default distributed embedding - std::vector num_infrequent_model_vec(num_instances); - size_t num_infrequent_tables = 0; - for (size_t instance = 0; instance < num_instances; ++instance) { - Model &model = models[instance]; - - std::vector category_location; - download_tensor(category_location, model.category_location, stream); - - size_t indx_infrequent = 0; - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] < num_instances) { - size_t instance_location = category_location[2 * category]; - size_t buffer_index = category_location[2 * category + 1]; - - EXPECT_EQ(instance_location, indx_infrequent % num_instances); - EXPECT_EQ(buffer_index, indx_infrequent / num_instances); - - indx_infrequent++; - } - } - const size_t num_infrequent_model = indx_infrequent; - num_infrequent_model_vec[instance] = num_infrequent_model; - - // check consistency table offsets - size_t num_infrequent_tables_instance = 0; - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t cur_offset = model.h_infrequent_model_table_offsets[embedding]; - size_t next_offset = model.h_infrequent_model_table_offsets[embedding + 1]; - size_t indx_infrequent_instance = 0; - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] == instance) { - if (indx_infrequent_instance >= cur_offset && indx_infrequent_instance < next_offset) { - size_t embedding_category = - EmbeddingTableFunctors::get_embedding_table_index(table_sizes, category); - EXPECT_EQ(embedding_category, embedding); - } - indx_infrequent_instance++; - } - } - num_infrequent_tables_instance = indx_infrequent_instance; - } - num_infrequent_tables += num_infrequent_tables_instance; - } - // Check that the total number of embedding vectors in all instances for all tables equals - // the total number of infrequent embedding vectors - if (num_infrequent_model_vec.size() > 0) { - EXPECT_EQ(num_infrequent_tables, num_infrequent_model_vec[0]); - if (num_infrequent_tables != num_infrequent_model_vec[0]) { - std::cout << "num_infrequent_tables = " << num_infrequent_tables << std::endl; - std::cout << "num_infrequent_model_vec[0] = " << num_infrequent_model_vec[0] << std::endl; - } - } - // Check that the number of infrequent categories is the same for all instances. - for (size_t instance = 1; instance < num_instances; ++instance) { - EXPECT_EQ(num_infrequent_model_vec[instance], num_infrequent_model_vec[0]); - } - std::cout << "Checking consistency and completeness of frequent embedding..." << std::endl; - // Check that the frequent embedding model is complete and self-consistent - // - // - num_frequent is consistent with data and num_categories - i.e. 
table_sizes - // - category_frequent_index and frequent_categories are consistent - // - both are consistent with num_frequent - // - table offsets frequent embedding are consistent with frequent_categories array - // - for (size_t instance = 0; instance < num_instances; ++instance) { - Model &model = models[instance]; - const size_t num_categories = model.num_categories; - - std::vector &frequent_table_offsets = model.h_frequent_model_table_offsets; - std::vector category_location; - download_tensor(category_location, model.category_location, stream); - std::vector frequent_categories; - download_tensor(frequent_categories, model.frequent_categories, stream); - - // check that number of frequent categories in category_location == model.num_frequent - size_t num_frequent_model = 0; - for (size_t i = 0; i < num_categories; ++i) { - num_frequent_model += (size_t)(category_location[2 * i] == num_instances ? 1 : 0); - } - EXPECT_EQ(num_frequent_model, model.num_frequent); - - // check that category in frequent_categories has corresponding index in category_frequent_index - for (size_t i = 0; i < frequent_categories.size(); ++i) { - size_t category = frequent_categories[i]; - EXPECT_EQ(category_location[2 * category + 1], i); - } - - std::map category_to_stats_map; - for (size_t i = 0; i < categories_sorted_stats.size(); ++i) { - category_to_stats_map[categories_sorted_stats[i]] = i; - } - - // check that table offsets are consistent with the frequent_categories array - // - check that categories corresponding to embedding actually part of embedding - std::set set_categories_from_table_offsets; - std::set set_categories_frequent_categories_array(frequent_categories.begin(), - frequent_categories.end()); - for (size_t em_instance = 0; em_instance < num_instances; ++em_instance) { - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t cur_offset = frequent_table_offsets[em_instance * (num_tables + 1) + embedding]; - size_t next_offset = frequent_table_offsets[em_instance * (num_tables + 1) + embedding + 1]; - size_t counts_cur = 0; - size_t counts_prev = 0; - for (size_t frequent_category_index = cur_offset; frequent_category_index < next_offset; - ++frequent_category_index) { - size_t category = frequent_categories[frequent_category_index]; - size_t embedding_category = - EmbeddingTableFunctors::get_embedding_table_index(table_sizes, category); - - EXPECT_EQ(embedding, embedding_category); - - // find category in category_sorted_stats array - size_t indx_stats = category_to_stats_map[category]; - counts_cur = (size_t)counts_sorted_stats[indx_stats]; - if (frequent_category_index > cur_offset) { - // find category in category_sorted_stats array - EXPECT_TRUE(counts_prev >= counts_cur); - } - counts_prev = counts_cur; - - set_categories_from_table_offsets.insert(category); - } - } - } - // - check that the table offsets cover all frequent categories - EXPECT_TRUE(set_categories_from_table_offsets == set_categories_frequent_categories_array); - - // check that infrequent categories as per category_location are not present in - // frequent_categories array - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] < num_instances) { - EXPECT_TRUE(set_categories_frequent_categories_array.find(category) == - set_categories_frequent_categories_array.end()); - } - } - } - - // TODO: - // // Check that the models of all the instances are identical - // std::vector category_frequent_index; - // std::vector category_location; - // 
download_tensor(category_frequent_index, models[0].category_frequent_index, stream); - // download_tensor(category_location, models[0].category_location, stream); - // for (size_t instance = 0; instance < num_instances; ++instance) { - // for (size_t category = 0; category < num_categories; ++category) { - - // } - // } - - std::cout << "Finished the unit test for model init()!" << std::endl; -} -template -void model_init_test(const size_t batch_size, HybridEmbeddingConfig config, - std::vector &table_sizes) { - // 1. generate the reference model from reference stats and corresponding data - // std::vector categories; - // std::vector counts; - long long num_instances = config.num_instances; - size_t num_tables = config.num_tables; - auto ctype = config.comm_type; - const size_t num_iterations = 1; - std::cout << "Model init test ... " << std::endl << std::endl; - std::cout << "number of instances : " << num_instances << std::endl; - std::cout << "Number of tables : " << num_tables << std::endl; - std::cout << "Batch size : " << batch_size << std::endl; - std::cout << "Number of iterations : " << num_iterations << std::endl; - - HybridEmbeddingInputGenerator input_generator(config, table_sizes, 848484); - std::vector raw_data = input_generator.generate_categorical_input(batch_size); - // std::vector table_sizes = input_generator.get_table_sizes(); - const size_t num_categories = - std::accumulate(table_sizes.begin(), table_sizes.end(), static_cast(0)); - std::cout << "Table sizes : "; - for (size_t embedding = 0; embedding < table_sizes.size(); ++embedding) - std::cout << '\t' << table_sizes[embedding]; - std::cout << std::endl; - - // create the gpu tensor for the raw data - cudaStream_t stream = 0; - Tensor2 d_raw_data; - std::shared_ptr> buf = GeneralBuffer2::create(); - std::cout << "number of samples : " << raw_data.size() / num_tables << std::endl; - buf->reserve({raw_data.size(), 1}, &d_raw_data); - buf->allocate(); - upload_tensor(raw_data, d_raw_data, stream); - - std::cout << "Testing raw data..." << std::endl; - test_raw_data(d_raw_data.get_ptr(), batch_size, num_tables, num_iterations, table_sizes); - std::cout << "Done testing raw data..." << std::endl; - - // 2. perform model initialization, data - std::cout << "performing statistics and calibration initialization..." << std::endl; - Data data(table_sizes, batch_size, num_iterations); - data.data_to_unique_categories(d_raw_data, stream); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - std::cout << "Testing samples..." << std::endl; - test_samples(d_raw_data.get_ptr(), data); - std::cout << "Done testing samples!" << std::endl; - - Statistics statistics(data, num_instances); - CalibrationData calibration(1, 1. / 10., 130.e9, 190.e9, 1.0); - - // model creation - std::cout << "performing model initialization..." 
<< std::endl; - std::vector num_instances_per_node(1); - num_instances_per_node[0] = (uint32_t)num_instances; - // Model model(ctype, 0, num_instances_per_node, num_categories); - // = {(uint32_t)num_instances}; - std::vector> models; - std::vector> frequent_infrequent_categories(num_instances); - for (long long instance = 0; instance < num_instances; ++instance) { - std::shared_ptr> buf = GeneralBuffer2::create(); - // std::shared_ptr> temp_block_buffer = buf->create_block<>(dtype); - // Tensor2 tmp_infrequent_categories; - buf->reserve({(size_t)num_categories, 1}, &frequent_infrequent_categories[instance]); - buf->allocate(); - - // std::cout << "instance : " << instance << std::endl; - std::cout << "constructing instance, allocating memory..." << std::endl; - models.emplace_back(ctype, (uint32_t)instance, num_instances_per_node, num_categories); - std::cout << "initializing model..." << std::endl; - models[instance].init_hybrid_model(calibration, statistics, data, - frequent_infrequent_categories[instance], stream); - std::cout << "done initializing model" << std::endl; - } - std::vector categories_sorted_stats; - std::vector counts_sorted_stats; - download_tensor(categories_sorted_stats, statistics.categories_sorted, stream); - download_tensor(counts_sorted_stats, statistics.counts_sorted, stream); - - // TODO: check consistency of - // global_instance_id, - // num_instances_per_node, - // node_id, - - // Check defining properties - - std::cout << "Checking consistency and completeness of infrequent embedding..." << std::endl; - // check order of categories for infrequent - // - assuming default distributed embedding - std::vector num_infrequent_model_vec(num_instances); - size_t num_infrequent_tables = 0; - for (long long instance = 0; instance < num_instances; ++instance) { - Model &model = models[instance]; - - std::vector category_location; - download_tensor(category_location, model.category_location, stream); - - size_t indx_infrequent = 0; - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] < num_instances) { - size_t instance_location = category_location[2 * category]; - size_t buffer_index = category_location[2 * category + 1]; - - EXPECT_EQ(instance_location, indx_infrequent % num_instances); - EXPECT_EQ(buffer_index, indx_infrequent / num_instances); - - indx_infrequent++; - } - } - const size_t num_infrequent_model = indx_infrequent; - num_infrequent_model_vec[instance] = num_infrequent_model; - - // check consistency table offsets - size_t num_infrequent_tables_instance = 0; - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t cur_offset = model.h_infrequent_model_table_offsets[embedding]; - size_t next_offset = model.h_infrequent_model_table_offsets[embedding + 1]; - size_t indx_infrequent_instance = 0; - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] == (dtype)instance) { - if (indx_infrequent_instance >= cur_offset && indx_infrequent_instance < next_offset) { - size_t embedding_category = - EmbeddingTableFunctors::get_embedding_table_index(table_sizes, category); - EXPECT_EQ(embedding_category, embedding); - } - indx_infrequent_instance++; - } - } - num_infrequent_tables_instance = indx_infrequent_instance; - } - num_infrequent_tables += num_infrequent_tables_instance; - } - // Check that the total number of embedding vectors in all instances for all tables equals - // the total number of infrequent embedding vectors - if 
(num_infrequent_model_vec.size() > 0) { - EXPECT_EQ(num_infrequent_tables, num_infrequent_model_vec[0]); - if (num_infrequent_tables != num_infrequent_model_vec[0]) { - std::cout << "num_infrequent_tables = " << num_infrequent_tables << std::endl; - std::cout << "num_infrequent_model_vec[0] = " << num_infrequent_model_vec[0] << std::endl; - } - } - // Check that the number of infrequent categories is the same for all instances. - for (long long instance = 1; instance < num_instances; ++instance) { - EXPECT_EQ(num_infrequent_model_vec[instance], num_infrequent_model_vec[0]); - } - - std::cout << "Checking consistency and completeness of frequent embedding..." << std::endl; - // Check that the frequent embedding model is complete and self-consistent - // - // - num_frequent is consistent with data and num_categories - i.e. table_sizes - // - category_frequent_index and frequent_categories are consistent - // - both are consistent with num_frequent - // - table offsets frequent embedding are consistent with frequent_categories array - // - for (long long instance = 0; instance < num_instances; ++instance) { - Model &model = models[instance]; - const size_t num_categories = model.num_categories; - - std::vector &frequent_table_offsets = model.h_frequent_model_table_offsets; - std::vector category_location; - download_tensor(category_location, model.category_location, stream); - std::vector frequent_categories; - download_tensor(frequent_categories, model.frequent_categories, stream); - - // check that number of frequent categories in category_location == model.num_frequent - size_t num_frequent_model = 0; - for (size_t i = 0; i < num_categories; ++i) { - num_frequent_model += (size_t)(category_location[2 * i] == num_instances ? 1 : 0); - } - EXPECT_EQ(num_frequent_model, model.num_frequent); - - // check that category in frequent_categories has corresponding index in category_frequent_index - for (size_t i = 0; i < frequent_categories.size(); ++i) { - size_t category = frequent_categories[i]; - EXPECT_EQ(category_location[2 * category + 1], i); - } - - std::map category_to_stats_map; - for (size_t i = 0; i < categories_sorted_stats.size(); ++i) { - category_to_stats_map[categories_sorted_stats[i]] = i; - } - - // check that table offsets are consistent with the frequent_categories array - // - check that categories corresponding to embedding actually part of embedding - std::set set_categories_from_table_offsets; - std::set set_categories_frequent_categories_array(frequent_categories.begin(), - frequent_categories.end()); - for (long long em_instance = 0; em_instance < num_instances; ++em_instance) { - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t cur_offset = frequent_table_offsets[em_instance * (num_tables + 1) + embedding]; - size_t next_offset = frequent_table_offsets[em_instance * (num_tables + 1) + embedding + 1]; - size_t counts_cur = 0; - size_t counts_prev = 0; - for (size_t frequent_category_index = cur_offset; frequent_category_index < next_offset; - ++frequent_category_index) { - size_t category = frequent_categories[frequent_category_index]; - size_t embedding_category = - EmbeddingTableFunctors::get_embedding_table_index(table_sizes, category); - - EXPECT_EQ(embedding, embedding_category); - - // find category in category_sorted_stats array - size_t indx_stats = category_to_stats_map[category]; - counts_cur = (size_t)counts_sorted_stats[indx_stats]; - if (frequent_category_index > cur_offset) { - // find category in category_sorted_stats array - 
EXPECT_TRUE(counts_prev >= counts_cur); - } - counts_prev = counts_cur; - - set_categories_from_table_offsets.insert(category); - } - } - } - // - check that the table offsets cover all frequent categories - EXPECT_TRUE(set_categories_from_table_offsets == set_categories_frequent_categories_array); - - // check that infrequent categories as per category_location are not present in - // frequent_categories array - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] < num_instances) { - EXPECT_TRUE(set_categories_frequent_categories_array.find(category) == - set_categories_frequent_categories_array.end()); - } - } - } - std::cout << "Finished the unit test for model init()!" << std::endl; -} - -} // namespace - -// TEST(hybrid_embedding_model_test, uint32) { model_test(); } -// TEST(hybrid_embedding_model_test, long_long) { model_test(); } -TEST(hybrid_embedding_model_test, init_model) { - const size_t N = 5; - const size_t batch_size = 15 * 64 * 1024; - - for (size_t num_instances = 1; num_instances <= 16; num_instances = 4 * num_instances) { - for (size_t num_tables = 1; num_tables <= 32; num_tables = 4 * num_tables) { - for (size_t i = 0; i < N; ++i) { - model_init_test(num_instances, num_tables, batch_size, - CommunicationType::NVLink_SingleNode); - } - } - } -} -TEST(hybrid_embedding_model_test, large_table_sizes) { - const size_t batch_size = 64 * 1024; - size_t num_nodes = 1; - size_t num_instances = 1; - size_t num_tables = 2; - size_t embedding_vec_size = 4; - long long num_categories = -1; - long long num_frequent = -1; - float lr = 0.0001f; - std::vector table_sizes{(1ul << 28), 1ul << 10}; - HybridEmbeddingConfig config{ - num_nodes, num_instances, num_tables, embedding_vec_size, num_categories, num_frequent, lr}; - model_init_test(batch_size, config, table_sizes); -} -TEST(hybrid_embedding_model_test, debug) { - const size_t batch_size = 64 * 1024; - model_init_test(2, 1, batch_size, CommunicationType::NVLink_SingleNode); -} diff --git a/test/utest/embedding/hybrid_embedding/select_test.cu b/test/utest/embedding/hybrid_embedding/select_test.cu deleted file mode 100644 index efc379c7ff..0000000000 --- a/test/utest/embedding/hybrid_embedding/select_test.cu +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; - -namespace Predict { -template -struct is_odd { - __host__ __device__ __forceinline__ bool operator()(const T &a) const { return (a & 1); } - is_odd() = default; -}; -} // namespace Predict - -template -void check(std::vector &h_ref, std::vector &h_gpu) { - for (size_t i = 0; i < h_ref.size(); i++) { - if (h_ref[i] != h_gpu[i]) { - std::cerr << " error at index " << i << std::endl; - exit(-1); - } - } - std::cout << "check pass" << std::endl; -} -template -struct SelectTest { - Pred Op_; - size_t len_; - std::vector keys_; - std::vector ref_cpu_; - std::vector ref_gpu_; - T *d_keys_; - T *d_output_; - T *d_num_selected_out_; - T ref_count_; - - void gather_if(const std::vector &input, std::vector &output) { - output.clear(); - if (input.empty()) { - for (size_t i = 0; i < len_; i++) { - if (Op_(i)) { - output.push_back(i); - } - } - } else { - for (auto in : input) { - if (Op_(in)) { - output.push_back(in); - } - } - } - } - - SelectTest(size_t len, Pred Op, bool no_input = false) : len_(len), Op_(Op), ref_count_(0) { - if (!no_input) { - cudaMalloc((void **)(&d_keys_), sizeof(T) * len); - keys_.resize(len, 0); - for (size_t i = 0; i < keys_.size(); i++) { - keys_[i] = std::rand(); - } - std::cout << "keys init done" << std::endl; - } else { - d_keys_ = nullptr; - keys_.clear(); - } - cudaMalloc((void **)(&d_num_selected_out_), sizeof(T)); - cudaMalloc((void **)(&d_output_), sizeof(T) * len); - } - - void test() { - if (d_keys_) { - cudaMemcpy(d_keys_, keys_.data(), sizeof(T) * len_, cudaMemcpyHostToDevice); - } - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - HugeCTR::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_keys_, d_output_, - d_num_selected_out_, len_, Op_); - std::cout << "temp storage bytes\n" << temp_storage_bytes << std::endl; - cudaMalloc((void **)&d_temp_storage, temp_storage_bytes); - HugeCTR::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_keys_, d_output_, - d_num_selected_out_, len_, Op_); - cudaDeviceSynchronize(); - cudaMemcpy(&ref_count_, d_num_selected_out_, sizeof(T), cudaMemcpyDeviceToHost); - gather_if(keys_, ref_cpu_); - if (ref_count_ != static_cast(ref_cpu_.size())) { - std::cerr << "selected num mismatches\n" << std::endl; - std::cerr << "expected: " << ref_cpu_.size() << " got " << ref_count_ << std::endl; - exit(-1); - } - std::cout << "get num_selected " << ref_count_ << std::endl; - ref_gpu_.resize(ref_count_); - cudaMemcpy(ref_gpu_.data(), d_output_, sizeof(T) * ref_gpu_.size(), cudaMemcpyDeviceToHost); - check(ref_cpu_, ref_gpu_); - cudaFree(d_temp_storage); - } - ~SelectTest() { - if (d_keys_) { - cudaFree(d_keys_); - } - cudaFree(d_num_selected_out_); - cudaFree(d_output_); - } -}; - -TEST(select, is_odd_31) { - SelectTest> select_test((1ul << 32), Predict::is_odd()); - select_test.test(); -} -TEST(select, counting) { - SelectTest> select_test((1ul << 20), Predict::is_odd(), - true); - select_test.test(); -} -TEST(select, large_counting) { - SelectTest> select_test((1ul << 31), Predict::is_odd(), - true); - select_test.test(); -} diff --git a/test/utest/embedding/hybrid_embedding/statistics_test.cpp b/test/utest/embedding/hybrid_embedding/statistics_test.cpp deleted file mode 100644 index 0d900bce5b..0000000000 --- a/test/utest/embedding/hybrid_embedding/statistics_test.cpp +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace HugeCTR::hybrid_embedding; - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -void arg_sort(const std::vector &v, std::vector &arg) { - arg.resize(v.size()); - std::iota(arg.begin(), arg.end(), (size_t)0); - std::stable_sort(arg.begin(), arg.end(), [&v](size_t i1, size_t i2) { return v[i1] > v[i2]; }); -} - -template -void generate_reference_stats(const std::vector &data, std::vector &samples, - std::vector &categories_stats, - std::vector &counts_stats, - const std::vector &table_sizes, const size_t batch_size) { - const size_t num_embeddings = table_sizes.size(); - - std::vector embedding_offsets; - EmbeddingTableFunctors::get_embedding_offsets(embedding_offsets, table_sizes); - - samples.resize(data.size()); - for (size_t sample = 0; sample < batch_size; ++sample) { - for (size_t embedding = 0; embedding < num_embeddings; ++embedding) { - size_t indx = sample * num_embeddings + embedding; - samples[indx] = embedding_offsets[embedding] + data[indx]; - } - } - - // create statistics - std::set category_set(samples.begin(), samples.end()); - const size_t num_unique_categories = category_set.size(); - - // helper structures - std::map category_index; - std::vector categories(num_unique_categories); - size_t indx = (size_t)0; - for (const auto &category : category_set) { - category_index[category] = indx; - categories[indx] = category; - indx++; - } - - std::vector counts(num_unique_categories, (size_t)0); - for (size_t sample = 0; sample < samples.size(); ++sample) { - size_t indx = category_index[samples[sample]]; - counts[indx]++; - } - - // sort categories and counts by argument - std::vector arg; - arg_sort(counts, arg); - categories_stats.resize(num_unique_categories); - counts_stats.resize(num_unique_categories); - for (indx = 0; indx < num_unique_categories; ++indx) { - categories_stats[indx] = categories[arg[indx]]; - counts_stats[indx] = counts[arg[indx]]; - - // check order counts - if (indx > 0 && counts_stats[indx] > counts_stats[indx - 1]) { - HCTR_LOG_S(DEBUG, WORLD) << "incorrect counts order!" << std::endl; - } - } -} - -} // namespace hybrid_embedding - -} // namespace HugeCTR - -template -void statistics_test(const size_t batch_size, const size_t num_tables) { - // 1. 
generate reference samples and stats - cudaStream_t stream = 0; - - std::vector categories; - std::vector counts; - - HugeCTR::hybrid_embedding::HybridEmbeddingInputGenerator input_generator(848484); - std::vector raw_data = input_generator.generate_categorical_input(batch_size, num_tables); - std::vector table_sizes = input_generator.get_table_sizes(); - HCTR_LOG_S(DEBUG, WORLD) << "Number of tables : " << num_tables << std::endl; - { - auto log = HCTR_LOG_S(DEBUG, WORLD); - log << "Table sizes : "; - for (size_t embedding = 0; embedding < table_sizes.size(); ++embedding) { - log << '\t' << table_sizes[embedding]; - } - log << std::endl; - } - - std::vector samples_ref; - HugeCTR::hybrid_embedding::generate_reference_stats(raw_data, samples_ref, categories, - counts, table_sizes, batch_size); - - size_t tot_count = 0; - for (size_t c = 0; c < categories.size(); ++c) { - tot_count += counts[c]; - } - EXPECT_EQ(tot_count, raw_data.size()); - - // create the gpu tensor for the raw data - HCTR_LOG_S(DEBUG, WORLD) << "placing raw data on gpu..." << std::endl; - Tensor2 d_raw_data; - std::shared_ptr> buf = GeneralBuffer2::create(); - EXPECT_EQ(raw_data.size(), batch_size * num_tables); - HCTR_LOG_S(DEBUG, WORLD) << "number of samples : " << raw_data.size() << std::endl; - HCTR_LOG_S(DEBUG, WORLD) << "number of unique categories : " << categories.size() << std::endl; - buf->reserve({raw_data.size(), 1}, &d_raw_data); - buf->allocate(); - upload_tensor(raw_data, d_raw_data, stream); - - // 2. perform hybrid_embedding statistics on gpu - Data data(table_sizes, batch_size, 1); - data.data_to_unique_categories(d_raw_data, stream); - size_t num_instances = 8; // not important here - HugeCTR::hybrid_embedding::Statistics statistics(data, num_instances); - statistics.sort_categories_by_count(data.samples, stream); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - EXPECT_EQ(statistics.num_samples, raw_data.size()); - EXPECT_EQ(categories.size(), statistics.num_unique_categories); - - // check that the samples are the same.. - std::vector h_samples(samples_ref.size()); - download_tensor(h_samples, data.samples, stream); - EXPECT_EQ(h_samples.size(), samples_ref.size()); - for (size_t sample = 0; sample < samples_ref.size(); ++sample) { - EXPECT_EQ(h_samples[sample], samples_ref[sample]); - } - - // 3. check that hybrid_embedding calculated stats == ref stats - std::vector h_categories_sorted; - std::vector h_counts_sorted; - download_tensor(h_categories_sorted, statistics.categories_sorted, stream); - download_tensor(h_counts_sorted, statistics.counts_sorted, stream); - - size_t tot_count_stats = 0; - for (size_t c = 0; c < categories.size(); ++c) { - tot_count_stats += h_counts_sorted[c]; - } - EXPECT_EQ(tot_count_stats, raw_data.size()); - - for (size_t c = 0; c < categories.size(); ++c) { - EXPECT_EQ(h_categories_sorted[c], categories[c]); - EXPECT_EQ(h_counts_sorted[c], counts[c]); - } - - const size_t num_categories_sorted_test = statistics.num_unique_categories; - if (num_categories_sorted_test != categories.size()) { - HCTR_LOG_S(DEBUG, WORLD) << "Number of categories_sorted is NOT the same as the reference!" - << std::endl; - } else { - HCTR_LOG_S(DEBUG, WORLD) << "Number of categories_sorted is the same as the reference!" 
- << std::endl; - } - EXPECT_EQ(num_categories_sorted_test, categories.size()); - std::unordered_set category_set_test( - h_categories_sorted.begin(), h_categories_sorted.begin() + num_categories_sorted_test); - std::unordered_set category_set_samples_test(h_samples.begin(), h_samples.end()); - if (category_set_test == category_set_samples_test) { - HCTR_LOG_S(DEBUG, WORLD) - << "The sorted categories are the same as in the samples and cover all samples!" - << std::endl; - } else { - HCTR_LOG_S(DEBUG, WORLD) - << "The sorted categories are NOT the same as in the samples and cover all samples!" - << std::endl; - } - EXPECT_TRUE(category_set_test == category_set_samples_test); - std::unordered_set category_set_ref(categories.begin(), categories.end()); - if (category_set_test == category_set_ref) { - HCTR_LOG_S(DEBUG, WORLD) << "The sorted categories are the same as the reference sorted!" - << std::endl; - } else { - HCTR_LOG_S(DEBUG, WORLD) << "The sorted categories are NOT the same as the reference sorted!" - << std::endl; - } - EXPECT_TRUE(category_set_test == category_set_ref); - size_t count_ne = (size_t)0; - for (size_t c = 0; c < categories.size(); ++c) { - count_ne += ((size_t)h_categories_sorted[c] != (size_t)categories[c] ? 1 : 0); - } - if (count_ne > 0) - HCTR_LOG_S(DEBUG, WORLD) << "Number of different categories : " - << static_cast(count_ne) / - static_cast(categories.size()) * 100. - << " %" << std::endl; - EXPECT_EQ(count_ne, 0); -} - -TEST(calculate_statistics_test, dtype_uint32) { - const size_t N = 5; - for (size_t batch_size = 128; batch_size < 15 * 64 * 1024; batch_size = 4 * batch_size) { - for (size_t num_tables = 1; num_tables <= 32; num_tables = 4 * num_tables) { - for (size_t i = 0; i < N; ++i) { - statistics_test(batch_size, num_tables); - } - } - } -} - -TEST(calculate_statistics_test, dtype_long_long) { - const size_t N = 5; - for (size_t batch_size = 128; batch_size < 15 * 64 * 1024; batch_size = 4 * batch_size) { - for (size_t num_tables = 1; num_tables <= 32; num_tables = 4 * num_tables) { - for (size_t i = 0; i < N; ++i) { - statistics_test(batch_size, num_tables); - } - } - } -} diff --git a/test/utest/embedding/hybrid_embedding/statistics_test.hpp b/test/utest/embedding/hybrid_embedding/statistics_test.hpp deleted file mode 100644 index 9e8f0a26a2..0000000000 --- a/test/utest/embedding/hybrid_embedding/statistics_test.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -void generate_reference_stats(const std::vector &data, std::vector &samples, - std::vector &categories_stats, - std::vector &counts_stats, - const std::vector &table_sizes, const size_t batch_size); - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/test/utest/embedding/hybrid_embedding/test_common.cuh b/test/utest/embedding/hybrid_embedding/test_common.cuh deleted file mode 100644 index 4c07abed3c..0000000000 --- a/test/utest/embedding/hybrid_embedding/test_common.cuh +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include - -template -constexpr inline IntType ceildiv(IntType a, IntType b) { - return (a + b - 1) / b; -} - -using namespace HugeCTR; -using namespace HugeCTR::hybrid_embedding; - -template -class HybridEmbeddingUnitTest { - protected: - const HybridEmbeddingConfig config; - HybridEmbeddingInputGenerator input_generator; - const uint32_t batch_size; - const uint32_t num_instances; - const uint32_t embedding_vec_size; - - const std::vector category_location; - const std::vector samples; - const std::vector table_sizes; - - cudaStream_t stream; - // std::shared_ptr fake_resource_manager; - GPUResource fake_resource; - std::vector> model_list; - std::vector> data_list; - std::vector> frequent_embeddings_single_node; - std::vector> frequent_embeddings_multi_node; - - // std::vector> infrequent_embeddings; - std::vector> - infrequent_embeddings_single_node; - std::vector> infrequent_embeddings_ib_nvlink; - std::vector> - infrequent_embeddings_ib_nvlink_hier; - - std::vector> frequent_embedding_indices; - std::vector> infrequent_embedding_indices; - - float *dev_lr; - - FrequentEmbeddingData &get_frequent_embedding_data(size_t i) { - if (frequent_embeddings_single_node.size()) { - return frequent_embeddings_single_node[i].frequent_data_; - } else { - return frequent_embeddings_multi_node[i].frequent_data_; - } - } - - FrequentEmbeddingBase &get_frequent_embedding(size_t i) { - if (frequent_embeddings_single_node.size()) { - return frequent_embeddings_single_node[i]; - } else { - return frequent_embeddings_multi_node[i]; - } - } - - public: - void build_model() { - model_list.reserve(num_instances); - std::vector num_instances_per_node_list(config.num_nodes, - config.num_instances / config.num_nodes); - for (size_t i = 0; i < num_instances; i++) { - model_list.emplace_back(config.comm_type, i, num_instances_per_node_list, - config.num_categories); - model_list[i].num_frequent = config.num_frequent; - } - - for (size_t i = 0; i < num_instances; i++) { - upload_tensor(category_location, model_list[i].category_location, stream); - } - } - - void build_data() { - data_list.reserve(num_instances); - for (size_t i = 0; i < num_instances; i++) { - data_list.emplace_back(table_sizes, batch_size, 1); - } - - for (size_t i = 0; i < 
num_instances; i++) { - upload_tensor(samples, data_list[i].samples, stream); - upload_tensor(samples, data_list[i].samples, stream); - } - - HCTR_LIB_THROW(cudaMalloc(&dev_lr, sizeof(float))); - HCTR_LIB_THROW(cudaMemcpy(dev_lr, &config.lr, sizeof(float), cudaMemcpyHostToDevice)); - } - - void build_frequent() { - if (config.comm_type == CommunicationType::NVLink_SingleNode) { - frequent_embeddings_single_node.reserve(num_instances); - } else { - frequent_embeddings_multi_node.reserve(num_instances); - } - - for (size_t i = 0; i < num_instances; i++) { - std::shared_ptr> placeholder = NULL; - if (config.comm_type == CommunicationType::NVLink_SingleNode) { - frequent_embeddings_single_node.emplace_back(model_list[i], fake_resource, placeholder, - embedding_vec_size, config.num_frequent); - } else { - frequent_embeddings_multi_node.emplace_back(model_list[i], fake_resource, placeholder, - embedding_vec_size, config.num_frequent); - } - frequent_embedding_indices.emplace_back(config.num_frequent, data_list[i], model_list[i]); - } - - if (config.comm_type == CommunicationType::NVLink_SingleNode) { - std::vector h_vectors_cache_pointers(num_instances); - for (uint32_t i = 0; i < num_instances; i++) { - h_vectors_cache_pointers[i] = - frequent_embeddings_single_node[i].get_embedding_vectors_cache().get_ptr(); - } - for (uint32_t i = 0; i < num_instances; i++) { - HCTR_LIB_THROW(cudaMemcpyAsync( - frequent_embeddings_single_node[i].embedding_vectors_cache_pointers_.get_ptr(), - h_vectors_cache_pointers.data(), num_instances * sizeof(emtype *), - cudaMemcpyHostToDevice, stream)); - } - } - } - - void build_infrequent() { - if (config.comm_type == CommunicationType::NVLink_SingleNode) { - infrequent_embeddings_single_node.reserve(num_instances); - for (size_t i = 0; i < num_instances; i++) { - infrequent_embeddings_single_node.emplace_back(model_list[i], fake_resource, - embedding_vec_size); - infrequent_embedding_indices.emplace_back(data_list[i], model_list[i]); - } - } - - if (config.comm_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink.reserve(num_instances); - for (size_t i = 0; i < num_instances; i++) { - infrequent_embeddings_ib_nvlink.emplace_back(model_list[i], fake_resource, - embedding_vec_size); - infrequent_embedding_indices.emplace_back(data_list[i], model_list[i]); - } - } - - if (config.comm_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier.reserve(num_instances); - for (size_t i = 0; i < num_instances; i++) { - infrequent_embeddings_ib_nvlink_hier.emplace_back(model_list[i], fake_resource, - embedding_vec_size); - infrequent_embedding_indices.emplace_back(data_list[i], model_list[i]); - uint32_t samples_size = data_list[i].batch_size * data_list[i].table_sizes.size(); - infrequent_embeddings_ib_nvlink_hier[i].max_num_infrequent_per_batch_ = samples_size; - infrequent_embeddings_ib_nvlink_hier[i].max_num_infrequent_per_train_batch_ = samples_size; - } - } - } - - ncclComm_t get_fake_comm() { - ncclComm_t comm; - int device_list[1] = {0}; - ncclCommInitAll(&comm, 1, device_list); - return comm; - } - - HybridEmbeddingUnitTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : config(config), - input_generator(config, seed), - batch_size(batch_size), - num_instances(config.num_instances), - embedding_vec_size(config.embedding_vec_size), - category_location((input_generator.generate_category_location(), - input_generator.get_category_location())), - 
samples(input_generator.generate_flattened_categorical_input(batch_size)), - table_sizes(input_generator.get_table_sizes()), - fake_resource(0, 0, 0, seed, seed, get_fake_comm()) { - HCTR_LIB_THROW(cudaStreamCreate(&stream)); - build_model(); - build_data(); - } -}; - -inline bool compare_element(float a, float b, float epsilon) { - // compare absolute error - if (fabs(a - b) < epsilon) return true; - - // compare relative error - if (fabs(a) >= fabs(b)) - if (fabs((a - b) / a) < epsilon) - return true; - else - return false; - else if (fabs((a - b) / b) < epsilon) - return true; - else - return false; -} - -inline bool compare_array(size_t len, const float *a, const float *b, float epsilon) { - for (size_t i = 0; i < len; i++) { - if (!compare_element(a[i], b[i], epsilon)) { - HCTR_LOG(INFO, WORLD, "Error in compare_array: i=%zu, a=%.8f, b=%.8f\n", i, a[i], b[i]); - return false; - } - } - - return true; -} - -// overload for fp16 on GPU -inline bool compare_array(size_t len, const __half *a, const __half *b, float epsilon) { - for (size_t i = 0; i < len; i++) { - float fa = __half2float(a[i]); - float fb = __half2float(b[i]); - if (!compare_element(fa, fb, epsilon)) { - HCTR_LOG(INFO, WORLD, "Error in compare_array: i=%zu, a=%.8f, b=%.8f\n", i, fa, fb); - return false; - } - } - - return true; -} diff --git a/test/utest/embedding/hybrid_embedding/update_test.cpp b/test/utest/embedding/hybrid_embedding/update_test.cpp deleted file mode 100644 index 4224a24b07..0000000000 --- a/test/utest/embedding/hybrid_embedding/update_test.cpp +++ /dev/null @@ -1,488 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -/************************ Infrequent embedding update ************************/ - -template -class InfrequentUpdateTest : public HybridEmbeddingUnitTest { - protected: - bool single_node; - - public: - InfrequentUpdateTest(const HybridEmbeddingConfig config, size_t batch_size, - bool single_node, size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed), - single_node(single_node) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_infrequent_model_indices(); - cpu_embedding.calculate_infrequent_network_indices(); - cpu_embedding.generate_embedding_vectors(); - cpu_embedding.generate_gradients(); - if (this->config.comm_type == CommunicationType::IB_NVLink) { - cpu_embedding.backward_a2a_messages(); - } else if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - cpu_embedding.backward_a2a_messages_hier(); - } - - /* Tensors for the messages and gradients */ - std::shared_ptr> buff = GeneralBuffer2::create(); - std::vector> received_messages(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({this->batch_size * this->config.num_tables, this->config.embedding_vec_size}, - &received_messages[i]); - } - std::vector> gradients(this->num_instances); - if (single_node) { - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({this->num_instances * local_batch_size * this->config.num_tables, - this->config.embedding_vec_size}, - &gradients[i]); - } - } - buff->allocate(); - - /* Single-node: upload gradients */ - this->build_infrequent(); - if (single_node) { - for (size_t i = 0; i < this->num_instances; i++) { - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - } - } - - /* Infrequent update_model */ - std::vector> updated_vectors(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - if (this->config.comm_type == CommunicationType::NVLink_SingleNode) { - this->infrequent_embeddings_single_node[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_single_node[i].infrequent_embedding_vectors_, - this->stream); - this->infrequent_embeddings_single_node[i].indices_->calculate_model_indices(this->stream); - - std::vector gradients_pointers(this->num_instances); - for (uint32_t network_id = 0; network_id < this->num_instances; network_id++) - gradients_pointers[network_id] = gradients[network_id].get_ptr(); - HCTR_LIB_THROW(cudaMemcpyAsync( - this->infrequent_embeddings_single_node[i].gradients_pointers_.get_ptr(), - gradients_pointers.data(), this->num_instances * sizeof(emtype *), - cudaMemcpyHostToDevice, this->stream)); - this->infrequent_embeddings_single_node[i].update_model_direct(this->dev_lr, 1.f, - this->stream); - - download_tensor(updated_vectors[i], - this->infrequent_embeddings_single_node[i].infrequent_embedding_vectors_, - this->stream); - } - if (this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink[i].infrequent_embedding_vectors_, - this->stream); - 
this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_model_indices(this->stream); - - upload_tensor(cpu_embedding.backward_received_messages[i], received_messages[i], - this->stream); - this->infrequent_embeddings_ib_nvlink[i].update_model(received_messages[i].get_ptr(), - this->dev_lr, 1.f, this->stream); - - download_tensor(updated_vectors[i], - this->infrequent_embeddings_ib_nvlink[i].infrequent_embedding_vectors_, - this->stream); - } - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink_hier[i].infrequent_embedding_vectors_, - this->stream); - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_model_indices( - this->stream); - - upload_tensor(cpu_embedding.backward_received_messages[i], received_messages[i], - this->stream); - this->infrequent_embeddings_ib_nvlink_hier[i].hier_update_model( - received_messages[i].get_ptr(), this->dev_lr, 1.f, this->stream); - - download_tensor(updated_vectors[i], - this->infrequent_embeddings_ib_nvlink_hier[i].infrequent_embedding_vectors_, - this->stream); - } - } - - /* Reference update_model */ - cpu_embedding.infrequent_update(); - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - updated_vectors[i].resize( - ceildiv(this->config.num_categories - this->config.num_frequent, - this->num_instances) * - this->config.embedding_vec_size); - EXPECT_THAT(updated_vectors[i], - ::testing::Pointwise(::testing::FloatNear(1e-2), - cpu_embedding.infrequent_embedding_vectors[i])); - } - } -}; - -/************************* Frequent embedding update *************************/ - -template -class FrequentUpdateTest : public HybridEmbeddingUnitTest { - protected: - bool single_node; - - public: - FrequentUpdateTest(const HybridEmbeddingConfig config, size_t batch_size, bool single_node, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed), - single_node(single_node) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_frequent_network_cache_indices(); - cpu_embedding.generate_embedding_vectors(); - cpu_embedding.generate_gradients(); - cpu_embedding.frequent_reduce_gradients(); - - /* Tensors for the gradients (single-node) */ - std::shared_ptr> buff = GeneralBuffer2::create(); - std::vector> gradients(this->num_instances); - if (single_node) { - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({local_batch_size * this->config.num_tables, this->config.embedding_vec_size}, - &gradients[i]); - } - } - buff->allocate(); - - /* Frequent update_model */ - this->build_frequent(); - std::vector> updated_vectors(this->num_instances); - std::vector frequent_partial_gradients_pointers(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - this->get_frequent_embedding(i).set_current_indices(&this->frequent_embedding_indices[i]); - upload_tensor(cpu_embedding.frequent_embedding_vectors[i], - this->get_frequent_embedding_data(i).frequent_embedding_vectors_, this->stream); - if (single_node) { - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - frequent_partial_gradients_pointers[i] = - 
this->get_frequent_embedding_data(i).get_gradients().get_ptr(); - } else - upload_tensor(cpu_embedding.reduced_gradients, - this->get_frequent_embedding_data(i).get_gradients(), this->stream); - } - for (size_t i = 0; i < this->num_instances; i++) { - if (single_node) { - this->get_frequent_embedding(i).indices_->calculate_cache_masks(this->stream); - this->get_frequent_embedding(i).indices_->calculate_network_cache_indices(this->stream); - this->get_frequent_embedding(i).indices_->calculate_model_cache_indices(80, this->stream); - this->get_frequent_embedding(i).indices_->calculate_frequent_sample_indices(this->stream); - this->frequent_embeddings_single_node[i].local_reduce(gradients[i].get_ptr(), this->stream); - } else { - this->frequent_embeddings_multi_node[i].update_model(this->dev_lr, 1.f, this->stream); - } - } - for (size_t i = 0; i < this->num_instances; i++) { - if (single_node) { - HCTR_LIB_THROW(cudaMemcpyAsync( - this->frequent_embeddings_single_node[i].partial_gradients_pointers_.get_ptr(), - frequent_partial_gradients_pointers.data(), this->num_instances * sizeof(emtype *), - cudaMemcpyHostToDevice, this->stream)); - this->frequent_embeddings_single_node[i].update_model_direct(this->dev_lr, 1.f, - this->stream); - } - download_tensor(updated_vectors[i], - this->get_frequent_embedding_data(i).frequent_embedding_vectors_, - this->stream); - } - - /* Reference update_model */ - if (single_node) - cpu_embedding.frequent_update_single_node(); - else - cpu_embedding.frequent_update(); - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - updated_vectors[i].resize(this->config.num_frequent * this->config.embedding_vec_size); - EXPECT_THAT(updated_vectors[i], - ::testing::Pointwise(::testing::FloatNear(5e-2), - cpu_embedding.frequent_embedding_vectors[i])); - } - } -}; - -/**************************** Test instantiations ****************************/ - -static const HybridEmbeddingConfig config_uint32 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_int64 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_uint32_single_node = { - 1, 8, 10, 128, 1000, 128, 0.5f, CommunicationType::NVLink_SingleNode}; -static const HybridEmbeddingConfig config_int64_single_node = { - 1, 8, 10, 128, 1000, 128, 0.5f, CommunicationType::NVLink_SingleNode}; - -// Edge cases: no frequent, all frequent -static const HybridEmbeddingConfig config_no_freq = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_all_freq = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_no_freq_single_node = { - 1, 8, 10, 128, 1000, 0, 0.5f, CommunicationType::NVLink_SingleNode}; -static const HybridEmbeddingConfig config_all_freq_single_node = { - 1, 8, 10, 128, 1000, 1000, 0.5f, CommunicationType::NVLink_SingleNode}; - -// Hierarchical A2A -static const HybridEmbeddingConfig config_uint32_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_int64_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_no_freq_hier = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_all_freq_hier = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink_Hier}; - -/* 
hybrid_embedding_infrequent_update_test */ - -TEST(hybrid_embedding_infrequent_update_test, uint32_half_64) { - InfrequentUpdateTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, int64_half_64) { - InfrequentUpdateTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, uint32_half_2048) { - InfrequentUpdateTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, int64_half_2048) { - InfrequentUpdateTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, uint32_float_64) { - InfrequentUpdateTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, int64_float_64) { - InfrequentUpdateTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, uint32_float_2048) { - InfrequentUpdateTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, int64_float_2048) { - InfrequentUpdateTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, uint32_float_128_no_freq) { - InfrequentUpdateTest(config_no_freq, 128, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, uint32_float_128_all_freq) { - InfrequentUpdateTest(config_all_freq, 128, false).run(); -} - -/* hybrid_embedding_infrequent_update_single_node_test */ - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_half_64) { - InfrequentUpdateTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, int64_half_64) { - InfrequentUpdateTest(config_int64_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_half_2048) { - InfrequentUpdateTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, int64_half_2048) { - InfrequentUpdateTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_float_64) { - InfrequentUpdateTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, int64_float_64) { - InfrequentUpdateTest(config_int64_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_float_2048) { - InfrequentUpdateTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, int64_float_2048) { - InfrequentUpdateTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_float_128_no_freq) { - InfrequentUpdateTest(config_no_freq_single_node, 128, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_float_128_all_freq) { - InfrequentUpdateTest(config_all_freq_single_node, 128, true).run(); -} - -/* hybrid_embedding_infrequent_update_hier_test */ - -TEST(hybrid_embedding_infrequent_update_hier_test, uint32_half_64) { - InfrequentUpdateTest(config_uint32_hier, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, int64_half_64) { - InfrequentUpdateTest(config_int64_hier, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, uint32_half_2048) { - InfrequentUpdateTest(config_uint32_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, int64_half_2048) { - InfrequentUpdateTest(config_int64_hier, 2048, false).run(); -} - 
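
The update tests above and below all key off the category_location layout produced by Model::init_hybrid_model: two entries per category, where a frequent category stores num_instances in its first slot and its position in frequent_categories in the second, while an infrequent category stores the owning model instance and its local buffer slot. A minimal decoding sketch follows, assuming exactly that layout; the helper and struct names are illustrative only and are not part of HugeCTR.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical helper, for illustration only: decodes the two-entry-per-category
    // category_location array that the removed tests assert against.
    struct CategoryPlacement {
      bool frequent;          // true when the first entry equals num_instances
      uint32_t instance;      // owning model instance (infrequent categories only)
      uint32_t buffer_index;  // frequent: index into frequent_categories;
                              // infrequent: slot in the owning instance's buffer
    };

    inline CategoryPlacement decode_category_location(
        const std::vector<uint32_t> &category_location, size_t category,
        uint32_t num_instances) {
      const uint32_t first = category_location[2 * category];
      const uint32_t second = category_location[2 * category + 1];
      if (first == num_instances) {
        return {true, 0u, second};
      }
      return {false, first, second};
    }

Infrequent categories are distributed round-robin, so the i-th infrequent category is expected on instance i % num_instances at buffer slot i / num_instances, which is exactly what the consistency loops in the removed model_test.cpp assert.
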
-TEST(hybrid_embedding_infrequent_update_hier_test, uint32_float_64) { - InfrequentUpdateTest(config_uint32_hier, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, int64_float_64) { - InfrequentUpdateTest(config_int64_hier, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, uint32_float_2048) { - InfrequentUpdateTest(config_uint32_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, int64_float_2048) { - InfrequentUpdateTest(config_int64_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, uint32_float_128_no_freq) { - InfrequentUpdateTest(config_no_freq_hier, 128, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, uint32_float_128_all_freq) { - InfrequentUpdateTest(config_all_freq_hier, 128, false).run(); -} - -/* hybrid_embedding_frequent_update_test */ - -TEST(hybrid_embedding_frequent_update_test, uint32_half_64) { - FrequentUpdateTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, int64_half_64) { - FrequentUpdateTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, uint32_half_2048) { - FrequentUpdateTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, int64_half_2048) { - FrequentUpdateTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, uint32_float_64) { - FrequentUpdateTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, int64_float_64) { - FrequentUpdateTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, uint32_float_2048) { - FrequentUpdateTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, int64_float_2048) { - FrequentUpdateTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, uint32_float_128_no_freq) { - FrequentUpdateTest(config_no_freq, 128, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, uint32_float_128_all_freq) { - FrequentUpdateTest(config_all_freq, 128, false).run(); -} - -/* hybrid_embedding_frequent_update_single_node_test */ - -TEST(hybrid_embedding_frequent_update_single_node_test, uint32_half_64) { - FrequentUpdateTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, int64_half_64) { - FrequentUpdateTest(config_int64_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, uint32_half_2048) { - FrequentUpdateTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, int64_half_2048) { - FrequentUpdateTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, uint32_float_64) { - FrequentUpdateTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, int64_float_64) { - FrequentUpdateTest(config_int64_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, uint32_float_2048) { - FrequentUpdateTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, int64_float_2048) { - FrequentUpdateTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, uint32_float_128_no_freq) { - FrequentUpdateTest(config_no_freq_single_node, 128, true).run(); -} - 
-TEST(hybrid_embedding_frequent_update_single_node_test, uint32_float_128_all_freq) { - FrequentUpdateTest(config_all_freq_single_node, 128, true).run(); -} diff --git a/test/utest/embedding/hybrid_sparse_embedding_test.cpp b/test/utest/embedding/hybrid_sparse_embedding_test.cpp deleted file mode 100644 index 0ad6d1dfe1..0000000000 --- a/test/utest/embedding/hybrid_sparse_embedding_test.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#define private public -#define protected public -#include -#include -#include - -using namespace HugeCTR; -using namespace hybrid_embedding; - -namespace { -// const int numprocs = 8; -// const size_t train_batch_size = 55296; -// const size_t evaluate_batch_size = 55296; -const size_t num_iterations_statistics = 100; -const size_t max_num_frequent_categories = 10; -const double p_dup_max = 1. / 100; -const double max_all_reduce_bandwidth = 1.3e11; -const double max_all_to_all_bandwidth = 1.9e11; -const size_t slot_num = 26; -const size_t embedding_vec_size = 128; -std::vector slot_size_array{39884406, 39043, 17289, 7420, 20263, 3, 7120, - 1543, 63, 38532951, 2953546, 403346, 10, 2208, - 11938, 155, 4, 976, 14, 39979771, 25641295, - 39664984, 585935, 12972, 108, 36}; -const float scaler = 1.0f; -const float lr = 0.01f; -const DeviceMap::Layout layout = DeviceMap::LOCAL_FIRST; -template -void print_vector(const std::vector &vec, size_t num_elment, const std::string &vec_name) { - auto log = HCTR_LOG_S(INFO, WORLD); - log << "vector name: " << vec_name << ",vector size: " << vec.size() << std::endl; - for (size_t i = 0; i < std::min(num_elment, vec.size()); ++i) { - log << vec[i] << ","; - } - log << std::endl; -} -template -void hybrid_sparse_embedding_construct(const std::vector &device_list, size_t train_batch_size, - size_t evaluate_batch_size, int numprocs, - hybrid_embedding::CommunicationType communication_type, - hybrid_embedding::HybridEmbeddingType hybrid_embedding_type, - const Optimizer_t &optimizer, const Update_t &update_type) { - // HCTR_LIB_THROW(nvmlInit_v2()); - std::vector> vvgpu; - for (int i = 0; i < numprocs; i++) { - vvgpu.push_back(device_list); - } - - auto resource_manager = ResourceManagerExt::create(vvgpu, (unsigned long long)1234); - size_t total_gpu_count = resource_manager->get_global_gpu_count(); - size_t local_gpu_count = resource_manager->get_local_gpu_count(); - size_t total_categories = 0; - for (size_t i = 0; i < slot_size_array.size(); ++i) { - // slot_size_array[i] = (slot_size_array[i] + 8)/8; - total_categories += slot_size_array[i]; - } - - HybridEmbeddingConfig test_config = { - (size_t)numprocs, - total_gpu_count, - slot_num, - embedding_vec_size, - (TypeKey)total_categories, - (TypeKey)0, // irrelevant here - 1.0 // irrelevant here - }; - HybridEmbeddingInputGenerator generator(test_config, slot_size_array, 848484); - - OptHyperParams hyper_params; - hyper_params.sgd.atomic_update = true; - const 
OptParams opt_params = {optimizer, lr, hyper_params, update_type, scaler}; - const HybridSparseEmbeddingParams embedding_params = { - train_batch_size, - evaluate_batch_size, - num_iterations_statistics, - max_num_frequent_categories * train_batch_size, - p_dup_max, - embedding_vec_size, - slot_num, - slot_size_array, - communication_type, - max_all_reduce_bandwidth, - max_all_to_all_bandwidth, - false, - hybrid_embedding_type, - opt_params}; - - Tensors2 train_input_tensors; - Tensors2 evaluate_input_tensors; - Tensors2 inits; - auto initial_input = - generator.generate_categorical_input(train_batch_size * num_iterations_statistics); - auto input = generator.generate_categorical_input(train_batch_size); - CudaDeviceContext context; - - GpuLearningRateSchedulers lr_scheds; - - for (size_t lgpu = 0; lgpu < local_gpu_count; ++lgpu) { - std::shared_ptr> buf = GeneralBuffer2::create(); - int cur_device = resource_manager->get_local_gpu(lgpu)->get_device_id(); - - context.set_device(cur_device); - - auto stream = resource_manager->get_local_gpu(lgpu)->get_stream(); - - Tensor2 tensor0; - buf->reserve({train_batch_size, slot_num}, &tensor0); - train_input_tensors.push_back(tensor0); - - Tensor2 tensor1; - buf->reserve({evaluate_batch_size, slot_num}, &tensor1); - evaluate_input_tensors.push_back(tensor1); - Tensor2 tensor2; - buf->reserve({train_batch_size * num_iterations_statistics, slot_num}, &tensor2); - inits.push_back(tensor2); - buf->allocate(); - // print_vector(initial_input, 26, "initial_input"); - // print_vector(input, 26, "input"); - upload_tensor(initial_input, inits[lgpu], stream); - upload_tensor(input, train_input_tensors[lgpu], stream); - - lr_scheds.emplace_back( - new GpuLearningRateScheduler(lr, 1, 0, 1, 2.f, 0.f, resource_manager->get_local_gpu(lgpu))); - } - HCTR_LOG_S(INFO, WORLD) << "hybridEmbdeding" << std::endl; - std::vector>> placeholder( - resource_manager->get_local_gpu_count(), NULL); - std::unique_ptr> embedding( - new HybridSparseEmbedding(train_input_tensors, evaluate_input_tensors, - embedding_params, placeholder, lr_scheds, false, - resource_manager)); - HCTR_LOG_S(INFO, WORLD) << "init_model" << std::endl; - embedding->init_model(inits); - // HCTR_LOG_S(DEBUG, WORLD) << "forward" << std::endl; - HCTR_LOG_S(INFO, WORLD) << "batch size = " << train_batch_size << std::endl; - HCTR_LOG_S(INFO, WORLD) << "total_categories = " << total_categories - << ", num_frequent = " << embedding->model_[0].num_frequent << std::endl; - for (size_t lgpu = 0; lgpu < local_gpu_count; ++lgpu) { - HCTR_LOG_S(INFO, WORLD) << "GPU[" << lgpu << "]" - << " num_infrequent = " - << embedding->model_[lgpu].h_infrequent_model_table_offsets[slot_num] - << std::endl; - } - - std::chrono::time_point check; - for (int j = 0; j < 10000; ++j) { - for (int i = 0; i < int(resource_manager->get_local_gpu_count()); i++) { - auto device_id = resource_manager->get_local_gpu(i)->get_device_id(); - context.set_device(device_id); - HCTR_LIB_THROW(cudaDeviceSynchronize()); - } - if (j % 100 == 0) { - auto cost = std::chrono::duration_cast( - std::chrono::steady_clock::now() - check) - .count() / - 1000000.0; - HCTR_LOG_S(INFO, ROOT) << "100 iter time: " << cost << std::endl; - check = std::chrono::steady_clock::now(); - } - - embedding->forward(true); - // HCTR_LOG_S(DEBUG, WORLD) << i << ": fwd" << std::endl; - embedding->backward(); - // HCTR_LOG_S(DEBUG, WORLD) << i << ": bwd" << std::endl; - embedding->update_params(); - // HCTR_LOG_S(DEBUG, WORLD) << i << ": update" << std::endl; - // 
HCTR_LOG_S(DEBUG, WORLD) << "forward, i = " << i << std::endl; - } - // HCTR_LOG_S(DEBUG, WORLD) << "backward" << std::endl; -} - -} // namespace - -// TEST(hybrid_sparse_embedding_profile, multi_node_uin32_float) { -// std::vector local_batch_sizes{1024, 2048, 3072, 4096, 6144, 8192}; -// // std::vector local_batch_sizes{6912}; -// size_t num_procs = 8; -// for (auto local_batch : local_batch_sizes) { -// hybrid_sparse_embedding_construct( -// {0}, local_batch * num_procs, local_batch * num_procs, num_procs, -// hybrid_embedding::CommunicationType::IB_NVLink, -// hybrid_embedding::HybridEmbeddingType::Distributed, Optimizer_t::SGD, Update_t::Local); -// } -// } - -// TEST(hybrid_sparse_embedding_profile, single_node_uin32_float) { -// std::vector local_batch_sizes{1024, 2048, 3072, 4096, 6144, 8192}; -// // std::vector local_batch_sizes{6912}; -// size_t num_procs = 1; -// for (auto local_batch : local_batch_sizes) { -// hybrid_sparse_embedding_construct( -// {0, 1, 2, 3, 4, 5, 6, 7}, local_batch * 8, local_batch * 8, num_procs, -// hybrid_embedding::CommunicationType::NVLink_SingleNode, -// hybrid_embedding::HybridEmbeddingType::Distributed, Optimizer_t::SGD, Update_t::Local); -// } -// } diff --git a/test/utest/embedding/localized_slot_sparse_embedding_hash_test.cu b/test/utest/embedding/localized_slot_sparse_embedding_hash_test.cu index 378fa84cb1..305422795e 100644 --- a/test/utest/embedding/localized_slot_sparse_embedding_hash_test.cu +++ b/test/utest/embedding/localized_slot_sparse_embedding_hash_test.cu @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -209,7 +209,7 @@ void train_and_test(const std::vector &device_list, const Optimizer_t &opti for (int i = 0; i < numprocs; i++) { vvgpu.push_back(device_list); } - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); if (resource_manager->is_master_process()) { HCTR_LOG_S(DEBUG, WORLD) << "rank " << resource_manager->get_process_id() << " is generating data" << std::endl; @@ -550,7 +550,7 @@ void load_and_dump(const std::vector &device_list, const Optimizer_t &optim std::vector> vvgpu; vvgpu.push_back(device_list); - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); // re-generate the dataset files { @@ -755,7 +755,7 @@ void load_and_dump_file(const std::vector &device_list, const Optimizer_t & for (int i = 0; i < numprocs; i++) { vvgpu.push_back(device_list); } - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); if (pid == 0) { // re-generate the dataset files diff --git a/test/utest/embedding/localized_slot_sparse_embedding_one_hot_test.cu b/test/utest/embedding/localized_slot_sparse_embedding_one_hot_test.cu deleted file mode 100644 index 9b853baebc..0000000000 --- a/test/utest/embedding/localized_slot_sparse_embedding_one_hot_test.cu +++ /dev/null @@ -1,847 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace embedding_test; - -namespace { - -//--------------------------------------------------------------------------------------- -// global params for all testing -const int train_batch_num = 10; // can not more than 32 -const int test_batch_num = 1; -const int train_batchsize = 1024; -const int test_batchsize = 2560; -const int slot_num = 26; -const int max_nnz_per_slot = 1; -const int max_feature_num = max_nnz_per_slot * slot_num; // max_feature_num in a sample -const long long vocabulary_size = slot_num * 100; -const int embedding_vec_size = 128; -const int combiner = 0; // 0-sum, 1-mean -const long long label_dim = 1; -const long long dense_dim = 0; -typedef long long T; -using SparseTensor23s = std::vector; - -const float scaler = 1.0f; // used in mixed precision training -const float lr = 0.01f; - -// In order to not allocate the total size of hash table on each GPU, the users need to set the -// size of max_vocabulary_size_per_gpu, which should be more than vocabulary_size/gpu_count, -// eg: 1.25x of that. - -const int num_chunk_threads = 1; // must be 1 for CPU and GPU results comparison -const int num_files = 1; -const Check_t CHK = Check_t::Sum; // Check_t::Sum -const char *train_file_list_name = "train_file_list.txt"; -const char *test_file_list_name = "test_file_list.txt"; - -const char *train_file_list_parquet_name = "train_file_list_parquet.txt"; -const char *test_file_list_parquet_name = "test_file_list_parquet.txt"; - -const char *prefix = "./data_reader_test_data/temp_dataset_"; - -const char *sparse_model_file = "localized_hash_table"; - -// std::vector slot_sizes; // null means use vocabulary_size/gpu_count/load_factor as -// max_vocabulary_size_per_gpu - -// CAUTION: must match vocabulary_size -// std::vector slot_sizes = {39884406,39043,17289,7420,20263,3,7120,1543,63,38532951, -// 2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36}; // -// for cretio dataset -std::vector slot_sizes = {100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100}; // just for verify - -//----------------------------------------------------------------------------------------- - -auto load_sparse_model_to_map = [](std::vector &key_vec, std::vector &slot_vec, - std::vector &vec_vec, const std::string &sparse_model) { - const std::string key_file(sparse_model + "/key"); - const std::string slot_file(sparse_model + "/slot_id"); - const std::string vec_file(sparse_model + "/emb_vector"); - - std::ifstream fs_key(key_file, std::ifstream::binary); - std::ifstream fs_slot(slot_file, std::ifstream::binary); - std::ifstream fs_vec(vec_file, std::ifstream::binary); - - const size_t key_file_size_in_B = std::filesystem::file_size(key_file); - const size_t slot_file_size_in_B = std::filesystem::file_size(slot_file); - const size_t vec_file_size_in_B = 
std::filesystem::file_size(vec_file); - - const long long num_key = key_file_size_in_B / sizeof(long long); - const long long num_slot = slot_file_size_in_B / sizeof(size_t); - const long long num_vec = vec_file_size_in_B / (sizeof(float) * embedding_vec_size); - if (num_key != num_vec || num_key != num_slot || num_key != vocabulary_size) { - HCTR_OWN_THROW(Error_t::BrokenFile, - "num_key != num_vec (num_slot) || num_key != vocabulary_size"); - } - key_vec.clear(); - key_vec.resize(num_key); - slot_vec.clear(); - slot_vec.resize(num_key); - vec_vec.clear(); - vec_vec.resize(num_vec * embedding_vec_size); - - using TypeKey = typename std::decay::type; - if (std::is_same::value) { - fs_key.read(reinterpret_cast(key_vec.data()), key_file_size_in_B); - } else { - std::vector i64_key_vec(num_key, 0); - fs_key.read(reinterpret_cast(i64_key_vec.data()), key_file_size_in_B); - std::transform(i64_key_vec.begin(), i64_key_vec.end(), key_vec.begin(), - [](long long key) { return static_cast(key); }); - } - fs_slot.read(reinterpret_cast(slot_vec.data()), slot_file_size_in_B); - fs_vec.read(reinterpret_cast(vec_vec.data()), vec_file_size_in_B); -}; - -void init_sparse_model(const char *sparse_model) { - HCTR_LOG_S(DEBUG, WORLD) << "Init hash table" << std::endl; - // init hash table file: - if (!std::filesystem::exists(sparse_model)) { - std::filesystem::create_directories(sparse_model); - } - const std::string key_file = std::string(sparse_model) + "/key"; - const std::string slot_file = std::string(sparse_model) + "/slot_id"; - const std::string vec_file = std::string(sparse_model) + "/emb_vector"; - std::ofstream fs_key(key_file); - std::ofstream fs_slot(slot_file); - std::ofstream fs_vec(vec_file); - if (!fs_key.is_open() || !fs_slot.is_open() || !fs_vec.is_open()) { - HCTR_LOG_S(ERROR, WORLD) << "File not open for writing. 
" << HCTR_LOCATION() << std::endl; - } - - // UnifiedDataSimulator ldata_sim(0, slot_num-1); // for slot_id - test::UniformDataSimulator fdata_sim; // for value - std::unique_ptr buf(new float[embedding_vec_size]); - for (long long i = 0; i < vocabulary_size; i++) { - T key = (T)i; - // T key = ldata_sim.get_num(); - // CAUTION: can not set random keys here, because we need to ensure that: - // 1) we can find keys in the data file from this hash table - // 2) there are no repeated keys - fs_key.write((char *)&key, sizeof(T)); - T slot_id; - if (slot_sizes.size() == 0) { - // slot_id = key % slot_num; // CAUTION: need to dedicate the slot_id for each key for - // // correctness verification - HCTR_OWN_THROW( - Error_t::WrongInput, - "Must set slot_sizes since there is no hashtable in LocalizedSlotSpasrseEmbeddingOneHot"); - } else { - size_t offset = 0; - for (size_t j = 0; j < slot_sizes.size(); j++) { - if ((key >= static_cast(offset)) && (key < static_cast(offset + slot_sizes[j]))) { - slot_id = (T)j; - break; - } - offset += slot_sizes[j]; - } - } - fs_slot.write((char *)&slot_id, sizeof(T)); - // float val = (float)i; - // float val = 0.1f; - fdata_sim.fill(buf.get(), embedding_vec_size, -0.1f, 0.1f); - fs_vec.write(reinterpret_cast(buf.get()), embedding_vec_size * sizeof(float)); - } - HCTR_LOG_S(DEBUG, WORLD) << " Done" << std::endl; -} - -template -void train_and_test(const std::vector &device_list, const Optimizer_t &optimizer, - const Update_t &update_type, - const DeviceMap::Layout layout = DeviceMap::LOCAL_FIRST) { - OptHyperParams hyper_params; - hyper_params.sgd.atomic_update = true; - const OptParams opt_params = {optimizer, lr, hyper_params, update_type, scaler}; - float tolerance; - if (std::is_same::value) { - tolerance = 5e-3f; - } else { - tolerance = 1e-4f; - } - - test::mpi_init(); - const int numprocs{core23::MpiInitService::get().world_size()}; - - // if there are multi-node, we assume each node has the same gpu device_list - std::vector> vvgpu; - for (int i = 0; i < numprocs; i++) { - vvgpu.push_back(device_list); - } - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0, layout); - - if (resource_manager->is_master_process()) { - HCTR_LOG_S(DEBUG, WORLD) << "rank " << resource_manager->get_process_id() - << " is generating data" << std::endl; - { - // re-generate the dataset files - std::ifstream file(train_file_list_name); - if (file.good()) { - std::remove(train_file_list_name); - } - } - { - // re-generate the dataset files - std::ifstream file(test_file_list_name); - if (file.good()) { - std::remove(test_file_list_name); - } - } - std::vector>> train_generated_value; - std::vector>> train_generated_rowoffset; - std::vector>> train_generated_label; - std::vector>> train_generated_dense; - - std::vector>> test_generated_value; - std::vector>> test_generated_rowoffset; - std::vector>> test_generated_label; - std::vector>> test_generated_dense; - // data generation: key's corresponding slot_id=(key%slot_num) - // TODO currently, generate norm file for CPU reference while parquet for GPU reader - if (slot_sizes.size() > 0) { - HugeCTR::data_generation_for_localized_test( - train_file_list_name, prefix, num_files, train_batch_num * train_batchsize, slot_num, - vocabulary_size, label_dim, dense_dim, max_nnz_per_slot, slot_sizes, false, 0.0, - &train_generated_value, &train_generated_rowoffset, &train_generated_label, - &train_generated_dense); - HugeCTR::data_generation_for_localized_test( - test_file_list_name, prefix, num_files, test_batch_num * 
test_batchsize, slot_num, - vocabulary_size, label_dim, dense_dim, max_nnz_per_slot, slot_sizes, false, 0.0, - &test_generated_value, &test_generated_rowoffset, &test_generated_label, - &test_generated_dense); - } else { - HCTR_OWN_THROW( - Error_t::WrongInput, - "Must set slot_sizes since there is no hashtable in LocalizedSlotSpasrseEmbeddingOneHot"); - } - HugeCTR::data_generation_for_parquet(train_file_list_parquet_name, prefix, - train_generated_value, train_generated_rowoffset, - train_generated_label, train_generated_dense); - HugeCTR::data_generation_for_parquet(test_file_list_parquet_name, prefix, - test_generated_value, test_generated_rowoffset, - test_generated_label, test_generated_dense); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); - HCTR_LOG_S(DEBUG, WORLD) << "This is rank: " << resource_manager->get_process_id() << std::endl; -#endif - - // setup a data reader - const DataReaderSparseParam param = {"localized", max_nnz_per_slot, true, slot_num}; - std::vector params; - params.push_back(param); - - std::unique_ptr> train_data_reader( - new DataReader(train_batchsize, label_dim, dense_dim, params, resource_manager, true, - num_chunk_threads, false)); - - train_data_reader->create_drwg_parquet( - train_file_list_parquet_name, false, std::vector(slot_num, 0), true, - std::max(train_batch_num * train_batchsize, test_batch_num * test_batchsize), - label_dim + dense_dim, label_dim + dense_dim); - - std::unique_ptr> test_data_reader( - new DataReader(test_batchsize, label_dim, dense_dim, params, resource_manager, true, - num_chunk_threads, false)); - - test_data_reader->create_drwg_parquet( - test_file_list_parquet_name, false, std::vector(slot_num, 0), true, - std::max(train_batch_num * train_batchsize, test_batch_num * test_batchsize), - label_dim + dense_dim, label_dim + dense_dim); - - // generate hashtable - if (resource_manager->is_master_process()) { - init_sparse_model(sparse_model_file); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - const SparseEmbeddingHashParams embedding_params = {train_batchsize, - test_batchsize, - 0, - slot_sizes, - embedding_vec_size, - max_feature_num, - slot_num, - combiner, - opt_params, - true, - false}; - - SparseTensor23s train_input = train_data_reader->get_sparse_tensor23s("localized"); - SparseTensor23s test_input = test_data_reader->get_sparse_tensor23s("localized"); - - std::unique_ptr> embedding( - new LocalizedSlotSparseEmbeddingOneHot( - core_helper::convert_sparse_tensors23_to_sparse_tensors(train_input), - core_helper::convert_sparse_tensors23_to_sparse_tensors(test_input), embedding_params, - resource_manager)); - - // upload hash table to device - embedding->load_parameters(sparse_model_file); - - // for SparseEmbeddingCpu - std::unique_ptr> embedding_cpu( - new SparseEmbeddingHashCpu( - train_batchsize, max_feature_num, vocabulary_size, embedding_vec_size, slot_num, - label_dim, dense_dim, CHK, train_batch_num * train_batchsize, combiner, opt_params, - train_file_list_name, sparse_model_file, SparseEmbedding_t::Localized)); - - TypeEmbeddingComp *embedding_feature_from_cpu = embedding_cpu->get_forward_results(); - TypeEmbeddingComp *wgrad_from_cpu = embedding_cpu->get_backward_results(); - T *hash_table_key_from_cpu = embedding_cpu->get_hash_table_key_ptr(); - float *hash_table_value_from_cpu = embedding_cpu->get_hash_table_value_ptr(); - - // for results check - std::shared_ptr> buf = GeneralBuffer2::create(); - - Tensor2 embedding_feature_from_gpu; - 
buf->reserve({train_batchsize * slot_num * embedding_vec_size}, &embedding_feature_from_gpu); - Tensor2 wgrad_from_gpu; - buf->reserve({train_batchsize * slot_num * embedding_vec_size}, &wgrad_from_gpu); - Tensor2 embedding_feature_from_gpu_eval; - buf->reserve({test_batchsize * slot_num * embedding_vec_size}, &embedding_feature_from_gpu_eval); - - buf->allocate(); - - typedef struct TypeHashValue_ { - float data[embedding_vec_size]; - } TypeHashValue; - - for (int i = 0; i < train_batch_num; i++) { - HCTR_LOG(INFO, WORLD, "Rank%d: Round %d start training:\n", resource_manager->get_process_id(), - i); - - // call read a batch - HCTR_LOG(INFO, WORLD, "Rank%d: data_reader->read_a_batch_to_device()\n", - resource_manager->get_process_id()); - train_data_reader->read_a_batch_to_device(); - - // GPU forward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding->forward()\n", resource_manager->get_process_id()); - embedding->forward(true); - - // check the result of forward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding->get_forward_results()\n", - resource_manager->get_process_id()); - embedding->get_forward_results(true, embedding_feature_from_gpu); // memcpy from GPU to CPU - - if (resource_manager->is_master_process()) { - // CPU forward - HCTR_LOG(INFO, WORLD, "Rank0: embedding_cpu->forward()\n"); - embedding_cpu->forward(); - - HCTR_LOG(INFO, WORLD, "Rank0: check forward results\n"); - ASSERT_TRUE(compare_embedding_feature(train_batchsize * slot_num * embedding_vec_size, - embedding_feature_from_gpu.get_ptr(), - embedding_feature_from_cpu, tolerance)); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - // GPU backward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding->backward()\n", resource_manager->get_process_id()); - embedding->backward(); - - // check the result of backward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding->get_backward_results()\n", - resource_manager->get_process_id()); - embedding->get_backward_results(wgrad_from_gpu, 0); - - if (resource_manager->is_master_process()) { - // CPU backward - HCTR_LOG(INFO, WORLD, "Rank0: embedding_cpu->backward()\n"); - embedding_cpu->backward(); - - HCTR_LOG(INFO, WORLD, "Rank0: check backward results: GPU and CPU\n"); - ASSERT_TRUE(compare_wgrad(train_batchsize * slot_num * embedding_vec_size, - wgrad_from_gpu.get_ptr(), wgrad_from_cpu, tolerance)); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - // GPU update_params - HCTR_LOG(INFO, WORLD, "Rank%d: embedding->update_params()\n", - resource_manager->get_process_id()); - embedding->update_params(); - - if (resource_manager->is_master_process()) { - // CPU update_params - HCTR_LOG(INFO, WORLD, "Rank0: embedding_cpu->update_params()\n"); - embedding_cpu->update_params(); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - HCTR_LOG(INFO, WORLD, "Rank%d: Round %d end:\n", resource_manager->get_process_id(), i); - } - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // create new obj for eval() - embedding->dump_parameters(sparse_model_file); - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - // for SparseEmbeddingCpu eval - std::unique_ptr> test_embedding_cpu( - new SparseEmbeddingHashCpu( - test_batchsize, max_feature_num, vocabulary_size, embedding_vec_size, slot_num, label_dim, - dense_dim, CHK, test_batch_num * test_batchsize, combiner, opt_params, - test_file_list_name, sparse_model_file, 
SparseEmbedding_t::Localized)); - - TypeEmbeddingComp *embedding_feature_from_cpu_eval = test_embedding_cpu->get_forward_results(); - - { - ///////////////////////////////////////////////////////////////////////////////////////////// - // eval - HCTR_LOG(INFO, WORLD, "\nRank%d: Round start eval:\n", resource_manager->get_process_id()); - - // call read a batch - HCTR_LOG(INFO, WORLD, "Rank%d: data_reader_eval->read_a_batch_to_device()\n", - resource_manager->get_process_id()); - test_data_reader->read_a_batch_to_device(); - - // GPU forward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding_eval->forward()\n", - resource_manager->get_process_id()); - embedding->forward(false); - - // check the result of forward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding_eval->get_forward_results()\n", - resource_manager->get_process_id()); - embedding->get_forward_results(false, - embedding_feature_from_gpu_eval); // memcpy from GPU to CPU - - if (resource_manager->is_master_process()) { - // CPU forward - HCTR_LOG(INFO, WORLD, "Rank0: embedding_cpu_eval->forward()\n"); - test_embedding_cpu->forward(); - - HCTR_LOG(INFO, WORLD, "Rank0: check forward results\n"); - ASSERT_TRUE(compare_embedding_feature(test_batchsize * slot_num * embedding_vec_size, - embedding_feature_from_gpu_eval.get_ptr(), - embedding_feature_from_cpu_eval, tolerance)); - } - } - - test::mpi_finalize(); -} - -template -void load_and_dump(const std::vector &device_list, const Optimizer_t &optimizer, - const Update_t &update_type) { - float tolerance = 1e-4f; - OptHyperParams hyper_params; - hyper_params.sgd.atomic_update = true; - const OptParams opt_params = {optimizer, lr, hyper_params, update_type, scaler}; - std::vector> vvgpu; - vvgpu.push_back(device_list); - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); - - // re-generate the dataset files - { - std::ifstream fs(train_file_list_name); - if (fs.good()) { - std::remove(train_file_list_name); - } - } - std::vector>> train_generated_value; - std::vector>> train_generated_rowoffset; - std::vector>> train_generated_label; - std::vector>> train_generated_dense; - - // data generation: key's corresponding slot_id=(key%slot_num) - if (slot_sizes.size() > 0) { - HugeCTR::data_generation_for_localized_test( - train_file_list_name, prefix, num_files, train_batch_num * train_batchsize, slot_num, - vocabulary_size, label_dim, dense_dim, max_nnz_per_slot, slot_sizes, false, 0.0, - &train_generated_value, &train_generated_rowoffset, &train_generated_label, - &train_generated_dense); - } else { - HCTR_OWN_THROW( - Error_t::WrongInput, - "Must set slot_sizes since there is no hashtable in LocalizedSlotSpasrseEmbeddingOneHot"); - HugeCTR::data_generation_for_parquet(train_file_list_parquet_name, prefix, - train_generated_value, train_generated_rowoffset, - train_generated_label, train_generated_dense); - } - - // setup a data reader - const DataReaderSparseParam param = {"localized", max_nnz_per_slot, true, slot_num}; - std::vector params; - params.push_back(param); - - std::unique_ptr> train_data_reader( - new DataReader(train_batchsize, label_dim, dense_dim, params, resource_manager, true, - num_chunk_threads, false)); - - train_data_reader->create_drwg_parquet( - train_file_list_parquet_name, false, std::vector(slot_num, 0), true, - std::max(train_batch_num * train_batchsize, test_batch_num * test_batchsize), - label_dim + dense_dim, label_dim + dense_dim); - - // generate hashtable - init_sparse_model(sparse_model_file); - - const SparseEmbeddingHashParams embedding_params = 
{train_batchsize, - test_batchsize, - 0, - slot_sizes, - embedding_vec_size, - max_feature_num, - slot_num, - combiner, - opt_params, - true, - false}; - - SparseTensor23s train_input = train_data_reader->get_sparse_tensor23s("localized"); - - std::unique_ptr> embedding( - new LocalizedSlotSparseEmbeddingOneHot( - core_helper::convert_sparse_tensors23_to_sparse_tensors(train_input), - core_helper::convert_sparse_tensors23_to_sparse_tensors(train_input), embedding_params, - resource_manager)); - - // upload hash table to device - embedding->load_parameters(sparse_model_file); - - HCTR_LOG(INFO, WORLD, "max_vocabulary_size=%zu, vocabulary_size=%zu\n", - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - std::shared_ptr> blobs_buff = - GeneralBuffer2::create(); - - Tensor2 keys; - blobs_buff->reserve({embedding->get_max_vocabulary_size()}, &keys); - - Tensor2 slot_id; - blobs_buff->reserve({embedding->get_max_vocabulary_size()}, &slot_id); - - Tensor2 embeddings; - blobs_buff->reserve({embedding->get_max_vocabulary_size(), embedding_vec_size}, &embeddings); - - blobs_buff->allocate(); - - BufferBag buf_bag; - buf_bag.keys = keys.shrink(); - buf_bag.slot_id = slot_id.shrink(); - buf_bag.embedding = embeddings; - - size_t dump_size; - embedding->dump_parameters(buf_bag, &dump_size); - - HCTR_LOG(INFO, WORLD, "dump_size=%zu, max_vocabulary_size=%zu, vocabulary_size=%zu\n", dump_size, - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - embedding->dump_parameters(buf_bag, &dump_size); - - HCTR_LOG(INFO, WORLD, "dump_size=%zu, max_vocabulary_size=%zu, vocabulary_size=%zu\n", dump_size, - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - embedding->reset(); - - HCTR_LOG(INFO, WORLD, "max_vocabulary_size=%zu, vocabulary_size=%zu\n", - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - embedding->load_parameters(buf_bag, dump_size); - - HCTR_LOG(INFO, WORLD, "max_vocabulary_size=%zu, vocabulary_size=%zu\n", - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - embedding->dump_parameters(buf_bag, &dump_size); - - HCTR_LOG(INFO, WORLD, "dump_size=%zu, max_vocabulary_size=%zu, vocabulary_size=%zu\n", dump_size, - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - std::string tmp_sparse_model_file{"tmp_sparse_model"}; - embedding->dump_parameters(tmp_sparse_model_file); - - std::vector hash_table_key_from_cpu; - std::vector slot_id_from_cpu; - std::vector hash_table_value_from_cpu; - load_sparse_model_to_map(hash_table_key_from_cpu, slot_id_from_cpu, hash_table_value_from_cpu, - sparse_model_file); - - std::vector hash_table_key_from_gpu; - std::vector slot_id_from_gpu; - std::vector hash_table_value_from_gpu; - load_sparse_model_to_map(hash_table_key_from_gpu, slot_id_from_gpu, hash_table_value_from_gpu, - tmp_sparse_model_file); - - typedef struct TypeHashValue_ { - float data[embedding_vec_size]; - } TypeHashValue; - - ASSERT_TRUE(compare_hash_table( - vocabulary_size, hash_table_key_from_gpu.data(), - reinterpret_cast(hash_table_value_from_gpu.data()), - hash_table_key_from_cpu.data(), - reinterpret_cast(hash_table_value_from_cpu.data()), tolerance)); - - ASSERT_TRUE(compare_key_slot(vocabulary_size, hash_table_key_from_gpu.data(), - slot_id_from_gpu.data(), hash_table_key_from_cpu.data(), - slot_id_from_cpu.data())); -} - -template -void load_and_dump_file(const std::vector &device_list, const Optimizer_t &optimizer, - const Update_t 
&update_type) { - std::string sparse_model_src("sparse_model_src"); - std::string sparse_model_dst("sparse_model_dst"); - - float tolerance = 1e-4f; - OptHyperParams hyper_params; - hyper_params.sgd.atomic_update = true; - const OptParams opt_params = {optimizer, lr, hyper_params, update_type, scaler}; - - int numprocs = 1, pid = 0; - std::vector> vvgpu; - test::mpi_init(); - for (int i = 0; i < numprocs; i++) { - vvgpu.push_back(device_list); - } - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); - - if (pid == 0) { - // re-generate the dataset files - if (std::filesystem::exists(train_file_list_name)) { - std::filesystem::remove(train_file_list_name); - } - std::vector>> train_generated_value; - std::vector>> train_generated_rowoffset; - std::vector>> train_generated_label; - std::vector>> train_generated_dense; - // data generation: key's corresponding slot_id=(key%slot_num) - if (slot_sizes.size() > 0) { - HugeCTR::data_generation_for_localized_test( - train_file_list_name, prefix, num_files, train_batch_num * train_batchsize, slot_num, - vocabulary_size, label_dim, dense_dim, max_nnz_per_slot, slot_sizes, false, 0.0, - &train_generated_value, &train_generated_rowoffset, &train_generated_label, - &train_generated_dense); - } else { - HCTR_OWN_THROW( - Error_t::WrongInput, - "Must set slot_sizes since there is no hashtable in LocalizedSlotSpasrseEmbeddingOneHot"); - } - HugeCTR::data_generation_for_parquet(train_file_list_parquet_name, prefix, - train_generated_value, train_generated_rowoffset, - train_generated_label, train_generated_dense); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - // setup a data reader - const DataReaderSparseParam param = {"localized", max_nnz_per_slot, true, slot_num}; - std::vector params; - params.push_back(param); - - std::unique_ptr> train_data_reader( - new DataReader(train_batchsize, label_dim, dense_dim, params, resource_manager, true, - num_chunk_threads, false)); - - train_data_reader->create_drwg_parquet( - train_file_list_parquet_name, false, std::vector(slot_num, 0), true, - std::max(train_batch_num * train_batchsize, test_batch_num * test_batchsize), - label_dim + dense_dim, label_dim + dense_dim); - - const SparseEmbeddingHashParams embedding_params = {train_batchsize, - test_batchsize, - 0, - slot_sizes, - embedding_vec_size, - max_feature_num, - slot_num, - combiner, - opt_params, - true, - false}; - - SparseTensor23s train_input = train_data_reader->get_sparse_tensor23s("localized"); - std::unique_ptr> embedding( - new LocalizedSlotSparseEmbeddingOneHot( - core_helper::convert_sparse_tensors23_to_sparse_tensors(train_input), - core_helper::convert_sparse_tensors23_to_sparse_tensors(train_input), embedding_params, - resource_manager)); - - // init hash table file - if (pid == 0) { - init_sparse_model(sparse_model_src.c_str()); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - // upload hash table to device - embedding->load_parameters(sparse_model_src); - - if (pid == 0) { - HCTR_LOG(INFO, WORLD, "max_vocabulary_size=%zu, vocabulary_size=%zu\n", - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - } - - // dump sparse model to file - embedding->dump_parameters(sparse_model_dst); - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - std::vector hash_table_key_from_cpu; - std::vector slot_id_from_cpu; - std::vector hash_table_value_from_cpu; - load_sparse_model_to_map(hash_table_key_from_cpu, 
slot_id_from_cpu, hash_table_value_from_cpu, - sparse_model_src); - - std::vector hash_table_key_from_gpu; - std::vector slot_id_from_gpu; - std::vector hash_table_value_from_gpu; - load_sparse_model_to_map(hash_table_key_from_gpu, slot_id_from_gpu, hash_table_value_from_gpu, - sparse_model_dst); - - typedef struct TypeHashValue_ { - float data[embedding_vec_size]; - } TypeHashValue; - - ASSERT_TRUE(compare_hash_table( - vocabulary_size, hash_table_key_from_gpu.data(), - reinterpret_cast(hash_table_value_from_gpu.data()), - hash_table_key_from_cpu.data(), - reinterpret_cast(hash_table_value_from_cpu.data()), tolerance)); - - ASSERT_TRUE(compare_key_slot(vocabulary_size, hash_table_key_from_gpu.data(), - slot_id_from_gpu.data(), hash_table_key_from_cpu.data(), - slot_id_from_cpu.data())); - - test::mpi_finalize(); -} - -} // namespace - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_1gpu) { - train_and_test({0}, Optimizer_t::SGD, Update_t::Local); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_4gpu) { - train_and_test({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Local); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_global_update_1gpu) { - train_and_test({0}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_global_update_4gpu) { - train_and_test({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_1gpu) { - train_and_test<__half>({0}, Optimizer_t::SGD, Update_t::Local); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_4gpu) { - train_and_test<__half>({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Local); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_global_update_1gpu) { - train_and_test<__half>({0}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_global_update_4gpu) { - train_and_test<__half>({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, load_and_dump_1gpu) { - load_and_dump({0}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, load_and_dump_4gpu) { - load_and_dump({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, load_and_dump_file_1gpu) { - load_and_dump_file({0}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, load_and_dump_file_4gpu) { - load_and_dump_file({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global); -} -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_1gpu_nf) { - train_and_test({0}, Optimizer_t::SGD, Update_t::Local, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_4gpu_nf) { - train_and_test({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Local, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_global_update_1gpu_nf) { - train_and_test({0}, Optimizer_t::SGD, Update_t::Global, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_global_update_4gpu_nf) { - train_and_test({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_1gpu_nf) { - train_and_test<__half>({0}, Optimizer_t::SGD, Update_t::Local, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_4gpu_nf) { - train_and_test<__half>({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Local, DeviceMap::NODE_FIRST); -} - 
-TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_global_update_1gpu_nf) { - train_and_test<__half>({0}, Optimizer_t::SGD, Update_t::Global, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_global_update_4gpu_nf) { - train_and_test<__half>({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global, DeviceMap::NODE_FIRST); -} diff --git a/test/utest/embedding/localized_slot_sparse_embedding_one_hot_update_test.cu b/test/utest/embedding/localized_slot_sparse_embedding_one_hot_update_test.cu deleted file mode 100644 index 3ff50682b7..0000000000 --- a/test/utest/embedding/localized_slot_sparse_embedding_one_hot_update_test.cu +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace embedding_test; - -template -class GpuData { - public: - GpuData() {} - ~GpuData() {} - GpuData(const std::vector& h_value_index, const size_t max_vocabulary_size, - const size_t embedding_vec_size) { - size_t num_samples = h_value_index.size(); - init_data(num_samples, max_vocabulary_size, embedding_vec_size); - HCTR_LIB_THROW(cudaMemcpy(value_index.get_ptr(), h_value_index.data(), - sizeof(size_t) * num_samples, cudaMemcpyHostToDevice)); - } - - void init_data(const size_t num_samples, const size_t max_vocabulary_size, - const size_t embedding_vec_size) { - std::shared_ptr> buf = GeneralBuffer2::create(); - - buf->reserve({num_samples}, &value_index); - buf->reserve({max_vocabulary_size * embedding_vec_size}, &weights); - buf->reserve({num_samples * embedding_vec_size}, &wgrad); - - const size_t max_top_categories = get_max_size_top_categories(); - buf->reserve({max_top_categories}, &top_categories); - size_top_categories = 0; - - buf->allocate(); - } - - Tensor2 value_index; - Tensor2 top_categories; - size_t size_top_categories; - - Tensor2 wgrad; - Tensor2 weights; - - void init_weights(size_t num_samples, size_t max_vocabulary_size, size_t embedding_vec_size, - const std::vector& h_wgrad) { - HCTR_LIB_THROW(cudaMemcpy(wgrad.get_ptr(), h_wgrad.data(), - sizeof(TypeEmbeddingComp) * num_samples * embedding_vec_size, - cudaMemcpyHostToDevice)); - HCTR_LIB_THROW(cudaMemset(weights.get_ptr(), 0.f, - sizeof(float) * max_vocabulary_size * embedding_vec_size)); - } -}; - -template -void update_test(const std::vector& value_index, size_t max_vocabulary_size, - size_t embedding_vec_size, const std::vector& wgrad) { - HCTR_LOG_S(DEBUG, WORLD) << "Starting embedding update test..." 
<< std::endl; - cudaStream_t stream = 0; - - // get number of sms - cudaDeviceProp device_prop; - cudaGetDeviceProperties(&device_prop, 0); - - // test sorting - std::map> ref_categorize; - size_t num_samples = value_index.size(); - for (size_t i = 0; i < num_samples; ++i) { - ref_categorize[value_index[i]].insert(i); - } - size_t num_unique_categories_ref = ref_categorize.size(); - - std::vector value_index_sort; - std::vector sample_id_sort; - std::vector sorted_sample_offset_category; - - GpuData gpu_data(value_index, max_vocabulary_size, embedding_vec_size); - - // now for the update - size_t weight_size = max_vocabulary_size * embedding_vec_size; - std::vector weights_test(weight_size, 0.0f); - std::vector weights_ref(weight_size, 0.0f); - - // ref weight update : - for (auto const& pair : ref_categorize) { - for (size_t j = 0; j < embedding_vec_size; ++j) { - float sum_j = 0.f; - for (auto const& sample_index : pair.second) { - sum_j += (float)wgrad[sample_index * embedding_vec_size + j]; - } - weights_ref[pair.first * embedding_vec_size + j] = -sum_j; - } - } - // done with calculating ref weights - - // init wgrad and weights on gpu: - gpu_data.init_weights(num_samples, max_vocabulary_size, embedding_vec_size, wgrad); - - HCTR_LOG_S(DEBUG, WORLD) << "performing atomic cached kernel..." << std::endl; - SparseEmbeddingFunctors::opt_sgd_atomic_cached( - num_samples, embedding_vec_size, gpu_data.value_index.get_ptr(), 1.0f, 1.0f, - gpu_data.wgrad.get_ptr(), gpu_data.weights.get_ptr(), gpu_data.top_categories.get_ptr(), - gpu_data.size_top_categories, stream, true); - - HCTR_LOG_S(DEBUG, WORLD) << "done performing kernel, testing results.." << std::endl; - HCTR_LIB_THROW(cudaMemcpy(weights_test.data(), gpu_data.weights.get_ptr(), - sizeof(float) * embedding_vec_size * max_vocabulary_size, - cudaMemcpyDeviceToHost)); - - const float epsilon = 1.0e-4; - double diff_ave = 0.0; - - size_t count_neq = 0; - size_t count_all = 0; - bool all_el_equal = true; - for (auto const& pair : ref_categorize) { - const size_t& category = pair.first; - bool category_equal = true; - for (size_t j = 0; j < embedding_vec_size; ++j) { - size_t index = category * embedding_vec_size + j; - float diff = weights_ref[index] - weights_test[index]; - diff = (diff > 0.f ? diff : -diff); - diff_ave += (double)diff; - all_el_equal = (all_el_equal && (diff < epsilon)); - category_equal = category_equal && (diff < epsilon); - - count_neq += (size_t)(diff >= epsilon); - count_all++; - } - if (!category_equal) { - HCTR_LOG_S(DEBUG, WORLD) << "Fail : the weights of category " << category - << " are wrongly computed." 
<< std::endl; - HCTR_LOG_S(DEBUG, WORLD) << "Weight expected : " << weights_ref[category * embedding_vec_size] - << "\t weight calculated : " - << weights_test[category * embedding_vec_size] << std::endl; - } - } - - diff_ave /= static_cast(count_all); - HCTR_LOG_S(DEBUG, WORLD) << "number of correct elements : " << count_all - count_neq - << " out of " << count_all << " = " - << (double)(count_all - count_neq) / (double)count_all * 100.0 << " % " - << std::endl; - if (!all_el_equal) { - HCTR_LOG_S(DEBUG, WORLD) << "average diff : " << diff_ave << std::endl; - { - auto log = HCTR_LOG_S(DEBUG, WORLD); - log << "CPU : "; - for (size_t i = 0; i < 10; ++i) { - log << '\t' << weights_ref[128 + i]; - } - log << std::endl; - } - { - auto log = HCTR_LOG_S(DEBUG, WORLD); - log << "GPU : "; - for (size_t i = 0; i < 10; ++i) { - log << '\t' << weights_test[128 + i]; - } - log << std::endl; - } - } - ASSERT_TRUE(all_el_equal && "not all embedding vector weights are updated correctly!"); - - bool all_el_zero = true; - for (size_t i = 0; i < max_vocabulary_size; ++i) { - if (ref_categorize.find(i) == ref_categorize.end()) { - for (size_t j = 0; j < embedding_vec_size; ++j) { - all_el_zero = all_el_zero && (weights_test[i * embedding_vec_size + j] == 0.f); - } - } - } - ASSERT_TRUE(all_el_zero && "some embedding vectors that shouldn't be updated were modified!"); - - HCTR_LOG_S(DEBUG, WORLD) << "Finished embedding update test SUCCESSFULLY!" << std::endl; -} - -template -void setup_and_run_randomized_test(const int N_test, const int embedding_vec_size, - const int num_samples) { - std::vector category_size{39884, 3, 63, 10}; - std::vector category_offset(4); - - size_t max_vocabulary_size = 0; - for (size_t i = 0; i < category_size.size(); ++i) { - category_offset[i] = max_vocabulary_size; - max_vocabulary_size += category_size[i]; - } - - std::vector wgrad(num_samples * embedding_vec_size, (etype)1.); - - for (int n = 0; n < N_test; ++n) { - // create test input - std::vector value_index; - for (int i = 0; i < num_samples; ++i) { - int embedding = rand() % 4; - size_t category = category_offset[embedding] + (size_t)rand() % category_size[embedding]; - value_index.push_back(category); - } - - // perform test - update_test(value_index, max_vocabulary_size, embedding_vec_size, wgrad); - } -} - -TEST(localized_one_hot_update_test, fp16_sgd_atomic_cached) { - const int N_test = 5; - const int embedding_vec_size = 128; - const int num_samples = 64 * 1024; - - for (size_t multiplier = 1; multiplier < 32; multiplier *= 2) { - setup_and_run_randomized_test<__half>(N_test, embedding_vec_size, num_samples); - } -} - -TEST(localized_one_hot_update_test, fp32_sgd_atomic_cached) { - const int N_test = 5; - const int embedding_vec_size = 128; - const int num_samples = 64 * 1024; - - for (size_t multiplier = 1; multiplier < 32; multiplier *= 2) { - setup_and_run_randomized_test(N_test, embedding_vec_size, num_samples); - } -} diff --git a/test/utest/embedding/unified_embedding.hpp b/test/utest/embedding/unified_embedding.hpp index ed7d8515d5..920ef2d780 100644 --- a/test/utest/embedding/unified_embedding.hpp +++ b/test/utest/embedding/unified_embedding.hpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/test/utest/embedding/unified_embedding_test.cpp b/test/utest/embedding/unified_embedding_test.cpp index fc1859e25a..a084ad0af7 100644 --- a/test/utest/embedding/unified_embedding_test.cpp +++ b/test/utest/embedding/unified_embedding_test.cpp @@ -15,7 +15,7 @@ */ #include 
-#include +#include #include using namespace HugeCTR; @@ -33,7 +33,7 @@ void unified_embedding_forward(const TestParams &test_param, const std::vectorget_global_gpu_count(); size_t local_gpu_count = resource_manager->get_local_gpu_count(); diff --git a/test/utest/embedding_collection/configuration.hpp b/test/utest/embedding_collection/configuration.hpp index d9f4217fef..26c5e4f94c 100644 --- a/test/utest/embedding_collection/configuration.hpp +++ b/test/utest/embedding_collection/configuration.hpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/test/utest/embedding_collection/embedding_collection_utils.hpp b/test/utest/embedding_collection/embedding_collection_utils.hpp index ebb922a766..c8bab4aae5 100644 --- a/test/utest/embedding_collection/embedding_collection_utils.hpp +++ b/test/utest/embedding_collection/embedding_collection_utils.hpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include using namespace embedding; diff --git a/test/utest/embedding_collection/test_compress_offset.cpp b/test/utest/embedding_collection/test_compress_offset.cpp index 2aa2c52cf8..56bf9a8982 100644 --- a/test/utest/embedding_collection/test_compress_offset.cpp +++ b/test/utest/embedding_collection/test_compress_offset.cpp @@ -21,13 +21,13 @@ #include #include #include -#include +#include #include using namespace embedding; TEST(test_compress_offset, test_compress_offset) { - auto resource_manager = HugeCTR::ResourceManagerExt::create({{0}}, 0); + auto resource_manager = HugeCTR::ResourceManagerCore::create({{0}}, 0); auto core = std::make_shared(resource_manager, 0); HugeCTR::CudaDeviceContext context(core->get_device_id()); HugeCTR::core23::Device device(core23::DeviceType::GPU, core->get_device_id()); diff --git a/test/utest/embedding_collection/test_embedding_collection_load_dump.cpp b/test/utest/embedding_collection/test_embedding_collection_load_dump.cpp index 3401d24806..5f6a5724c4 100644 --- a/test/utest/embedding_collection/test_embedding_collection_load_dump.cpp +++ b/test/utest/embedding_collection/test_embedding_collection_load_dump.cpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include @@ -283,7 +283,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params, {}}; auto table_param_list = get_table_param_list_io(ebc_param.emb_type); - auto resource_manager = HugeCTR::ResourceManagerExt::create({device_list}, 0); + auto resource_manager = HugeCTR::ResourceManagerCore::create({device_list}, 0); EmbeddingIO emb_io = EmbeddingIO(resource_manager); int num_gpus = static_cast(device_list.size()); int batch_size_per_gpu = batch_size / num_gpus; @@ -374,6 +374,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params, std::vector data_distributor_outputs; for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) { + HugeCTR::CudaDeviceContext context(core_resource_manager_list[gpu_id]->get_device_id()); data_distributor_outputs.push_back(HugeCTR::allocate_output_for_data_distributor( core_resource_manager_list[gpu_id], ebc_param)); } @@ -443,6 +444,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params, auto sync_gpus = [&]() { for (auto core : core_resource_manager_list) { + HugeCTR::CudaDeviceContext context(core->get_device_id()); HCTR_LIB_THROW(cudaStreamSynchronize(core->get_local_gpu()->get_stream())); } }; @@ -486,6 +488,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params, emb_ref.embedding_forward_cpu(key_list, bucket_range); #pragma omp parallel for 
   for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) {
+    HugeCTR::CudaDeviceContext context(core_resource_manager_list[gpu_id]->get_device_id());
     data_distributor->distribute(gpu_id, sparse_dp_tensors[gpu_id],
                                  sparse_dp_num_keys_per_bucket[gpu_id],
                                  data_distributor_outputs[gpu_id], batch_size);
@@ -508,6 +511,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params,
   emb_ref.embedding_backward_cpu(top_grads, key_list, bucket_range);
 #pragma omp parallel for num_threads(num_gpus)
   for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) {
+    HugeCTR::CudaDeviceContext context(core_resource_manager_list[gpu_id]->get_device_id());
     ebc->backward_per_gpu(gpu_id, data_distributor_outputs[gpu_id], ebc_top_grads[gpu_id],
                           batch_size);
   }
@@ -517,6 +521,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params,
   emb_ref.embedding_update_cpu();
 #pragma omp parallel for num_threads(num_gpus)
   for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) {
+    HugeCTR::CudaDeviceContext context(core_resource_manager_list[gpu_id]->get_device_id());
     ebc->update_per_gpu(gpu_id);
   }
   sync_gpus();
diff --git a/test/utest/embedding_collection/test_embedding_collection_v2.cu b/test/utest/embedding_collection/test_embedding_collection_v2.cu
index dd0e0d233c..fdba5e744f 100644
--- a/test/utest/embedding_collection/test_embedding_collection_v2.cu
+++ b/test/utest/embedding_collection/test_embedding_collection_v2.cu
@@ -25,7 +25,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -280,7 +280,7 @@ void embedding_collection_e2e(const Configuration &config) {
   std::iota(device_list_per_node.begin(), device_list_per_node.end(), 0);
   std::vector> device_list(num_nodes, device_list_per_node);
-  auto resource_manager = HugeCTR::ResourceManagerExt::create(device_list, 0);
+  auto resource_manager = HugeCTR::ResourceManagerCore::create(device_list, 0);
   std::vector> core_resource_manager_list;
   for (int gpu_id = 0; gpu_id < num_local_gpus; ++gpu_id) {
diff --git a/test/utest/embedding_collection/test_embedding_table.cpp b/test/utest/embedding_collection/test_embedding_table.cpp
index 9fe943be17..71f77d5c88 100644
--- a/test/utest/embedding_collection/test_embedding_table.cpp
+++ b/test/utest/embedding_collection/test_embedding_table.cpp
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include
+#include
 using namespace embedding;
 int num_embedding_table = 3;
@@ -48,7 +48,7 @@ template
 void test_embedding_table(int device_id, int table_type) {
   std::vector device_list{device_id};
   HugeCTR::CudaDeviceContext context(device_id);
-  auto resource_manager = HugeCTR::ResourceManagerExt::create({device_list}, 0);
+  auto resource_manager = HugeCTR::ResourceManagerCore::create({device_list}, 0);
   auto core = std::make_shared(resource_manager, 0);
   auto key_type = HugeCTR::core23::ToScalarType::value;
diff --git a/test/utest/embedding_collection/test_embedding_table_optimizer.cpp b/test/utest/embedding_collection/test_embedding_table_optimizer.cpp
index c99ecc346b..cd72f15c46 100644
--- a/test/utest/embedding_collection/test_embedding_table_optimizer.cpp
+++ b/test/utest/embedding_collection/test_embedding_table_optimizer.cpp
@@ -23,7 +23,7 @@
 #include
 #include
 #include
-#include
+#include
 using namespace embedding;
@@ -41,7 +41,7 @@ void test_embedding_table_optimizer(int device_id, const char table_type[],
   std::vector device_list{device_id};
   HugeCTR::CudaDeviceContext context(device_id);
-  auto resource_manager = HugeCTR::ResourceManagerExt::create({device_list}, 0);
+  auto resource_manager = HugeCTR::ResourceManagerCore::create({device_list}, 0);
   auto core = std::make_shared(resource_manager, 0);
   const auto key_type = core23::ToScalarType::value;
diff --git a/test/utest/metrics/auc_test.cpp b/test/utest/metrics/auc_test.cpp
index 4643caf561..e974a5e9f6 100644
--- a/test/utest/metrics/auc_test.cpp
+++ b/test/utest/metrics/auc_test.cpp
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -217,7 +217,7 @@ void metric_test(std::vector device_list, size_t batch_size, size_t num_tot
   for (int i = 0; i < num_procs; i++) {
     vvgpu.push_back(device_list);
   }
-  const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242);
+  const auto resource_manager = ResourceManagerCore::create(vvgpu, 424242);
   // Setup the containers
   core23::Shape dims = {static_cast(batch_size / num_classes),
diff --git a/test/utest/metrics/averageloss_test.cpp b/test/utest/metrics/averageloss_test.cpp
index 54c5da6ba0..7decb7b6d7 100644
--- a/test/utest/metrics/averageloss_test.cpp
+++ b/test/utest/metrics/averageloss_test.cpp
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -129,7 +129,7 @@ void averageloss_test(std::vector device_list, size_t batch_size, size_t nu
   for (int i = 0; i < num_procs; i++) {
     vvgpu.push_back(device_list);
   }
-  const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242);
+  const auto resource_manager = ResourceManagerCore::create(vvgpu, 424242);
   // Create AverageLoss metric
   auto metric = std::make_unique>(resource_manager);
diff --git a/test/utest/network/network_build_test.cpp b/test/utest/network/network_build_test.cpp
index a1f089d8ca..421e1d5cac 100644
--- a/test/utest/network/network_build_test.cpp
+++ b/test/utest/network/network_build_test.cpp
@@ -30,7 +30,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
@@ -117,7 +117,7 @@ void network_build_test() {
   std::vector device_vec(core23::Device::count());
   std::generate(device_vec.begin(), device_vec.end(), [dev = 0]() mutable { return dev++; });
   std::vector> vvgpu(1, device_vec);
-  const auto& resource_manager = ResourceManagerExt::create(vvgpu, 0);
+  const auto& resource_manager = ResourceManagerCore::create(vvgpu, 0);
   std::vector> networks;
   std::vector> train_label_and_first_tensors;
diff --git a/test/utest/pipeline/pipeline_test.cu b/test/utest/pipeline/pipeline_test.cu
index 6587f31e4d..b71a2081af 100644
--- a/test/utest/pipeline/pipeline_test.cu
+++ b/test/utest/pipeline/pipeline_test.cu
@@ -19,7 +19,7 @@
 #include
 #include
-#include
+#include
 #include
 using namespace HugeCTR;
@@ -43,7 +43,7 @@ __global__ void setC(float *var, int count) {
 }
 void pipeline_test(const std::vector &device_list, bool use_graph) {
-  const auto &resource_manager = ResourceManager::create({device_list}, 0);
+  const auto &resource_manager = ResourceManagerCore::create({device_list}, 0);
   cudaProfilerStart();
   std::vector pipeline_list;
   std::vector dup_pipeline_list;
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 0e7c96ade7..dc0492db15 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -17,7 +17,6 @@ cmake_minimum_required(VERSION 3.20)
 if(NOT DISABLE_CUDF)
   add_subdirectory(raw_script)
   add_subdirectory(dlrm_script)
-  add_subdirectory(io_benchmark)
   add_subdirectory(db_benchmark)
   add_subdirectory(inference_test_scripts)
 endif()
\ No newline at end of file
diff --git a/tools/io_benchmark/main.cpp b/tools/io_benchmark/main.cpp
deleted file mode 100644
index 0b180cd756..0000000000
--- a/tools/io_benchmark/main.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-using namespace HugeCTR;
-
-std::vector str_to_vec(const std::string& str) {
-  std::istringstream is(str);
-  std::vector tokens{std::istream_iterator{is},
-                     std::istream_iterator{}};
-  std::vector res;
-  for (auto& s : tokens) {
-    res.push_back(std::stoi(s));
-  }
-  return res;
-}
-
-int main(int argc, char** argv) {
-  argparse::ArgumentParser args("read_upload_bench");
-
-  args.add_argument("--num_dense").default_value(13).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--num_categorical").default_value(26).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--batch_size").default_value(65536).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--gpus")
-      .default_value(std::string("0"))
-      .help("Space-delimited list of GPUs to upload the data onto");
-
-  args.add_argument("--num_threads").default_value(1).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--num_batches_per_thread")
-      .default_value(1)
-      .action([](const std::string& value) { return std::stoi(value); });
-
-  args.add_argument("--io_block_size").default_value(524288).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--io_depth").default_value(2).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--io_alignment").default_value(512).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("file").remaining();
-
-  try {
-    args.parse_args(argc, argv);
-  } catch (const std::runtime_error& err) {
-    std::cout << err.what() << std::endl;
-    std::cout << args;
-    exit(1);
-  }
-
-  std::string fname;
-  try {
-    fname = args.get("file");
-  } catch (std::logic_error& e) {
-    std::cout << "No input file provided" << std::endl;
-    exit(1);
-  }
-
-  const int sample_dim = args.get("--num_dense") + args.get("--num_categorical") + 1;
-  const int batch_size_bytes = args.get("--batch_size") * sample_dim * sizeof(int);
-
-#ifdef ENABLE_MPI
-  HCTR_MPI_THROW(MPI_Init(&argc, &argv));
-#endif
-  HCTR_LIB_THROW(nvmlInit_v2());
-
-  std::vector> vvgpu;
-  vvgpu.push_back(str_to_vec(args.get("--gpus")));
-  const auto resource_manager = ResourceManager::create(vvgpu, 424242);
-
-  AsyncReaderImpl reader_impl(
-      fname, batch_size_bytes, resource_manager.get(), args.get("--num_threads"),
-      args.get("--num_batches_per_thread"), args.get("--io_block_size"),
-      args.get("--io_depth"), args.get("--io_alignment"));
-
-  HCTR_LOG(INFO, WORLD, "Initialization done, starting to read...\n");
-  fflush(stdout);
-  auto start = std::chrono::high_resolution_clock::now();
-
-  reader_impl.load_async();
-
-  size_t sz = 1;
-  while (sz > 0) {
-    BatchDesc desc = reader_impl.get_batch();
-    sz = desc.size_bytes;
-    // usleep(200);
-    reader_impl.finalize_batch();
-  }
-
-  auto end = std::chrono::high_resolution_clock::now();
-  auto elapsed = std::chrono::duration_cast(end - start);
-  HCTR_LOG(INFO, WORLD, "Reading took %.3fs, B/W %.2f GB/s\n", elapsed.count() / 1000.0,
-           std::filesystem::file_size(fname) / ((double)elapsed.count() * 1e6));
-
-#ifdef ENABLE_MPI
-  HCTR_MPI_THROW(MPI_Finalize());
-#endif
-
-  return 0;
-}