From 80925331117a93cf804db9a55d4d9ab4f4edb163 Mon Sep 17 00:00:00 2001
From: "Alex Liu (Engrg-Hardware 1)"
Date: Thu, 18 Jan 2024 21:59:24 -0800
Subject: [PATCH] [READY] Deprecate AsyncDataReader, LocalizedOneHot and Hybrid Embedding

---
 .gitlab-ci.yml | 55 -
 HugeCTR/include/collectives/collective.hpp | 53 +
 HugeCTR/include/common.hpp | 46 +-
 HugeCTR/include/data_reader.hpp | 2 +-
 .../async_reader/async_reader.hpp | 74 -
 .../async_reader/async_reader_adapter.hpp | 151 --
 .../async_reader/async_reader_common.hpp | 104 --
 .../async_reader/split_label_dense_sparse.hpp | 26 -
 .../async_reader/thread_async_reader.hpp | 75 -
 .../async_reader_common.hpp} | 61 +-
 .../parquet_data_reader_worker.hpp | 2 +-
 .../data_readers/row_group_reading_thread.hpp | 2 +-
 HugeCTR/include/embedding.hpp | 23 -
 .../embeddings/hybrid_embedding/allocator.hpp | 45 -
 .../hybrid_embedding/calibration_data.hpp | 118 --
 .../hybrid_embedding/communication.hpp | 135 --
 .../embeddings/hybrid_embedding/data.hpp | 97 --
 .../hybrid_embedding/frequent_embedding.hpp | 168 ---
 .../hybrid_embedding/hybrid_indices.hpp | 230 ---
 .../hybrid_embedding/indices_container.hpp | 65 -
 .../hybrid_embedding/infrequent_embedding.hpp | 195 ---
 .../embeddings/hybrid_embedding/model.hpp | 102 --
 .../embeddings/hybrid_embedding/select.cuh | 125 --
 .../hybrid_embedding/statistics.hpp | 131 --
 .../embeddings/hybrid_embedding/update.cuh | 90 --
 .../embeddings/hybrid_embedding/utils.cuh | 33 -
 .../embeddings/hybrid_embedding/utils.hpp | 38 -
 .../embeddings/hybrid_sparse_embedding.hpp | 273 ----
 ...ocalized_slot_sparse_embedding_one_hot.hpp | 485 ------
 .../embeddings/sparse_embedding_functors.hpp | 41 -
 HugeCTR/include/exchange_wgrad.hpp | 11 +-
 HugeCTR/include/parser.hpp | 13 +-
 HugeCTR/include/pybind/common_wrapper.hpp | 25 -
 HugeCTR/include/pybind/model.hpp | 71 +-
 HugeCTR/include/pybind/model_wrapper.hpp | 9 +-
 HugeCTR/include/resource_manager.hpp | 12 +-
 HugeCTR/include/resource_manager_base.hpp | 1 +
 .../resource_manager_core.hpp | 27 +-
 .../resource_manager_ext.hpp | 112 --
 HugeCTR/include/scheduleable.hpp | 18 -
 HugeCTR/src/collectives/collective.cpp | 50 +
 .../async_reader/async_reader.cpp | 253 ----
 .../async_reader/async_reader_adapter.cpp | 514 -------
 .../data_readers/async_reader/broadcast.cu | 88 --
 .../async_reader/split_label_dense_sparse.cu | 226 ---
 .../async_reader/thread_async_reader.cpp | 336 -----
 .../multi_hot/async_data_reader.cpp | 3 +-
 .../hybrid_embedding/calibration_data.cu | 211 ---
 .../hybrid_embedding/communication.cu | 174 ---
 .../src/embeddings/hybrid_embedding/data.cu | 148 --
 .../hybrid_embedding/frequent_embedding.cu | 487 ------
 .../hybrid_embedding/hybrid_indices.cu | 541 -------
 .../hybrid_embedding/indices_container.cu | 66 -
 .../hybrid_embedding/infrequent_embedding.cu | 670 ---------
 .../src/embeddings/hybrid_embedding/model.cu | 154 --
 .../embeddings/hybrid_embedding/statistics.cu | 412 -----
 .../src/embeddings/hybrid_embedding/utils.cu | 127 --
 .../src/embeddings/hybrid_sparse_embedding.cu | 820 ----------
 ...localized_slot_sparse_embedding_one_hot.cu | 1334 -----------------
 .../src/embeddings/update_params_functor.cu | 256 ----
 HugeCTR/src/exchange_wgrad.cpp | 22 +-
 HugeCTR/src/pybind/add_dense_layer.cpp | 22 -
 HugeCTR/src/pybind/add_input.cpp | 146 +-
 HugeCTR/src/pybind/add_sparse_embedding.cpp | 110 +-
 HugeCTR/src/pybind/model.cpp | 1177 +--------------
 HugeCTR/src/pybind/model_compile.cpp | 977 ++++++++++++
 HugeCTR/src/pybind/model_pipeline.cpp | 489 +-----
 HugeCTR/src/resource_manager.cpp | 2 +-
 .../resource_manager_ext.cpp | 76 -
 ci/integration_test/dlrm/benchmark_14node.sub | 8 -
 ci/integration_test/dlrm/benchmark_1node.sub | 8 -
 ci/integration_test/dlrm/dlrm.sub | 8 -
 ci/integration_test/dlrm/ib_nvlink_1node.sub | 10 -
 ci/integration_test/dlrm/ib_nvlink_8node.sub | 10 -
 .../overlapped_pipeline.sub | 5 -
 ci/selene/ci.yml | 48 -
 ci/template.yml | 2 +-
 ci/utest/utest.sub | 1 -
 samples/dlrm/README.md | 142 +-
 samples/dlrm/config_DGXH100_16x8x1056.sh | 32 +
 samples/dlrm/config_DGXH100_1x8x6912.sh | 28 +
 samples/dlrm/config_DGXH100_8x8x2112.sh | 32 +
 samples/dlrm/dgx_a100.py | 229 ---
 samples/dlrm/dgx_a100_14x8x640.py | 248 ---
 samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py | 244 ---
 samples/dlrm/dgx_a100_ib_nvlink.py | 228 ---
 samples/dlrm/mlperf_logger/__init__.py | 3 +
 samples/dlrm/mlperf_logger/callbacks.py | 118 ++
 samples/dlrm/mlperf_logger/param_info.py | 70 +
 .../dlrm/mlperf_logger/utils.py | 21 +-
 samples/dlrm/preprocessing/convert_to_raw.py | 251 ++++
 .../preprocessing/md5sums_raw_dataset.txt | 3 +
 samples/dlrm/requirements.txt | 3 +
 samples/dlrm/run.sub | 140 ++
 samples/dlrm/run_and_time.sh | 68 +
 samples/dlrm/run_with_docker.sh | 95 ++
 samples/dlrm/sharding/__init__.py | 2 +
 samples/dlrm/sharding/generate_plan.py | 131 ++
 samples/dlrm/sharding/planner.py | 327 ++++
 samples/dlrm/train.py | 485 ++++++
 .../sparse_operation_kit/dynamic_variable.py | 1 -
 test/utest/communication/ar_oneshot_test.cu | 11 +-
 .../ib_comms_a2a_v_integ_test.cu | 11 +-
 .../communication/ib_comms_a2a_v_test.cu | 13 +-
 test/utest/communication/ib_comms_ar_test.cu | 11 +-
 .../data_distributor_tests.cpp | 4 +-
 test/utest/data_reader/CMakeLists.txt | 7 -
 .../data_reader_async_adapter_test.cpp | 242 ---
 .../data_reader/data_reader_async_test.cpp | 127 --
 .../data_reader/data_reader_benchmark.cu | 5 +-
 .../data_reader/data_reader_parquet_test.cpp | 12 +-
 .../data_reader/data_reader_v2_async_test.cpp | 4 +-
 .../multi_hot_async_data_reader_test.cpp | 6 +-
 ...ributed_slot_sparse_embedding_hash_test.cu | 8 +-
 .../embedding/hybrid_embedding/data_test.cpp | 181 ---
 .../embedding/hybrid_embedding/data_test.hpp | 62 -
 .../hybrid_embedding/end_to_end_test.cpp | 766 ----------
 .../hybrid_embedding/forward_test.cpp | 475 ------
 .../hybrid_embedding/hybrid_embedding_cpu.cpp | 591 --------
 .../hybrid_embedding/hybrid_embedding_cpu.hpp | 113 --
 .../hybrid_embedding/indices_test.cpp | 445 ------
 .../hybrid_embedding/input_generator.cpp | 296 ----
 .../hybrid_embedding/input_generator.hpp | 93 --
 .../hybrid_embedding/messages_test.cpp | 466 ------
 .../embedding/hybrid_embedding/model_test.cpp | 630 --------
 .../embedding/hybrid_embedding/select_test.cu | 140 --
 .../hybrid_embedding/statistics_test.cpp | 248 ---
 .../hybrid_embedding/statistics_test.hpp | 31 -
 .../hybrid_embedding/test_common.cuh | 242 ---
 .../hybrid_embedding/update_test.cpp | 488 ------
 .../hybrid_sparse_embedding_test.cpp | 215 ---
 ...calized_slot_sparse_embedding_hash_test.cu | 8 +-
 ...ized_slot_sparse_embedding_one_hot_test.cu | 847 -----------
 ...ot_sparse_embedding_one_hot_update_test.cu | 259 ----
 test/utest/embedding/unified_embedding.hpp | 1 -
 .../embedding/unified_embedding_test.cpp | 4 +-
 .../embedding_collection/configuration.hpp | 2 +-
 .../embedding_collection_utils.hpp | 2 +-
 .../test_compress_offset.cpp | 4 +-
 .../test_embedding_collection_load_dump.cpp | 9 +-
 .../test_embedding_collection_v2.cu | 4 +-
 .../test_embedding_table.cpp | 4 +-
 .../test_embedding_table_optimizer.cpp | 4 +-
 test/utest/metrics/auc_test.cpp | 4 +-
 test/utest/metrics/averageloss_test.cpp | 4 +-
 test/utest/network/network_build_test.cpp | 4 +-
 test/utest/pipeline/pipeline_test.cu | 4 +-
 tools/CMakeLists.txt | 1 -
 tools/io_benchmark/main.cpp | 136 --
 149 files changed, 3214 insertions(+), 20266 deletions(-)
 create mode 100644 HugeCTR/include/collectives/collective.hpp
 delete mode 100644 HugeCTR/include/data_readers/async_reader/async_reader.hpp
 delete mode 100644 HugeCTR/include/data_readers/async_reader/async_reader_adapter.hpp
 delete mode 100644 HugeCTR/include/data_readers/async_reader/async_reader_common.hpp
 delete mode 100644 HugeCTR/include/data_readers/async_reader/split_label_dense_sparse.hpp
 delete mode 100644 HugeCTR/include/data_readers/async_reader/thread_async_reader.hpp
 rename HugeCTR/include/data_readers/{async_reader/broadcast.hpp => multi_hot/async_reader_common.hpp} (66%)
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/allocator.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/calibration_data.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/communication.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/data.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/frequent_embedding.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/hybrid_indices.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/indices_container.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/infrequent_embedding.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/model.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/select.cuh
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/statistics.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/update.cuh
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/utils.cuh
 delete mode 100644 HugeCTR/include/embeddings/hybrid_embedding/utils.hpp
 delete mode 100644 HugeCTR/include/embeddings/hybrid_sparse_embedding.hpp
 delete mode 100644 HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp
 delete mode 100644 HugeCTR/include/resource_managers/resource_manager_ext.hpp
 create mode 100644 HugeCTR/src/collectives/collective.cpp
 delete mode 100644 HugeCTR/src/data_readers/async_reader/async_reader.cpp
 delete mode 100644 HugeCTR/src/data_readers/async_reader/async_reader_adapter.cpp
 delete mode 100644 HugeCTR/src/data_readers/async_reader/broadcast.cu
 delete mode 100644 HugeCTR/src/data_readers/async_reader/split_label_dense_sparse.cu
 delete mode 100644 HugeCTR/src/data_readers/async_reader/thread_async_reader.cpp
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/calibration_data.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/communication.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/data.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/frequent_embedding.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/hybrid_indices.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/indices_container.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/infrequent_embedding.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/model.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/statistics.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_embedding/utils.cu
 delete mode 100644 HugeCTR/src/embeddings/hybrid_sparse_embedding.cu
 delete mode 100644 HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu
 delete mode 100644 HugeCTR/src/embeddings/update_params_functor.cu
 create mode 100644 HugeCTR/src/pybind/model_compile.cpp
 delete mode 100644 HugeCTR/src/resource_managers/resource_manager_ext.cpp
 delete mode 100644 ci/integration_test/dlrm/benchmark_14node.sub
 delete mode 100644 ci/integration_test/dlrm/benchmark_1node.sub
 delete mode 100644 ci/integration_test/dlrm/dlrm.sub
 delete mode 100644 ci/integration_test/dlrm/ib_nvlink_1node.sub
 delete mode 100644 ci/integration_test/dlrm/ib_nvlink_8node.sub
 delete mode 100644 ci/integration_test/mlperf_generalization/overlapped_pipeline.sub
 create mode 100644 samples/dlrm/config_DGXH100_16x8x1056.sh
 create mode 100644 samples/dlrm/config_DGXH100_1x8x6912.sh
 create mode 100644 samples/dlrm/config_DGXH100_8x8x2112.sh
 delete mode 100644 samples/dlrm/dgx_a100.py
 delete mode 100644 samples/dlrm/dgx_a100_14x8x640.py
 delete mode 100755 samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py
 delete mode 100755 samples/dlrm/dgx_a100_ib_nvlink.py
 create mode 100644 samples/dlrm/mlperf_logger/__init__.py
 create mode 100644 samples/dlrm/mlperf_logger/callbacks.py
 create mode 100644 samples/dlrm/mlperf_logger/param_info.py
 rename tools/io_benchmark/CMakeLists.txt => samples/dlrm/mlperf_logger/utils.py (56%)
 create mode 100644 samples/dlrm/preprocessing/convert_to_raw.py
 create mode 100644 samples/dlrm/preprocessing/md5sums_raw_dataset.txt
 create mode 100644 samples/dlrm/requirements.txt
 create mode 100755 samples/dlrm/run.sub
 create mode 100755 samples/dlrm/run_and_time.sh
 create mode 100755 samples/dlrm/run_with_docker.sh
 create mode 100644 samples/dlrm/sharding/__init__.py
 create mode 100644 samples/dlrm/sharding/generate_plan.py
 create mode 100644 samples/dlrm/sharding/planner.py
 create mode 100644 samples/dlrm/train.py
 delete mode 100644 test/utest/data_reader/data_reader_async_adapter_test.cpp
 delete mode 100644 test/utest/data_reader/data_reader_async_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/data_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/data_test.hpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/end_to_end_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/forward_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.hpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/indices_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/input_generator.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/input_generator.hpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/messages_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/model_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/select_test.cu
 delete mode 100644 test/utest/embedding/hybrid_embedding/statistics_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/statistics_test.hpp
 delete mode 100644 test/utest/embedding/hybrid_embedding/test_common.cuh
 delete mode 100644 test/utest/embedding/hybrid_embedding/update_test.cpp
 delete mode 100644 test/utest/embedding/hybrid_sparse_embedding_test.cpp
 delete mode 100644 test/utest/embedding/localized_slot_sparse_embedding_one_hot_test.cu
 delete mode 100644 test/utest/embedding/localized_slot_sparse_embedding_one_hot_update_test.cu
 delete mode 100644 tools/io_benchmark/main.cpp

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 757e2393ec..ed2bb603e1 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@
-294,47 +294,6 @@ criteo_multi_node: DGXNNODES: 2 TEST_CMD: ./ci/integration_test/criteo/criteo_multi_node.sub -dlrm_benchmark_14node: - extends: .cluster_test_job_daily - needs: - - build_train_multi_node - variables: - GPFSFOLDER: $LOGDIR/dlrm_benchmark_14node - GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} - CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED - MOUNTS: /raid:/raid - WALLTIME: "00:15:00" - SBATCH_OTHER_PARAMS: --network sharp - DGXNNODES: 14 - TEST_CMD: ./ci/integration_test/dlrm/benchmark_14node.sub - -dlrm_ib_nvlink_1node: - extends: .cluster_test_job_daily - needs: - - build_train_multi_node - variables: - GPFSFOLDER: $LOGDIR/dlrm_ib_nvlink_1node - GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} - CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED - MOUNTS: /raid/datasets/criteo/mlperf/40m.limit_preshuffled:/data - WALLTIME: "00:10:00" - DGXNNODES: 1 - TEST_CMD: ./ci/integration_test/dlrm/ib_nvlink_1node.sub - -dlrm_ib_nvlink_8node: - extends: .cluster_test_job_daily - needs: - - build_train_multi_node - variables: - GPFSFOLDER: $LOGDIR/dlrm_ib_nvlink_8node - GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} - CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED - MOUNTS: /raid/datasets/criteo/mlperf/40m.limit_preshuffled:/data - WALLTIME: "00:10:00" - SBATCH_OTHER_PARAMS: --comment=metrics - DGXNNODES: 8 - TEST_CMD: ./ci/integration_test/dlrm/ib_nvlink_8node.sub - dlrm_dcnv2_benchmark_8node: extends: .cluster_test_job_daily needs: @@ -576,20 +535,6 @@ inference_CPU_Memory_check: DGXNNODES: 1 TEST_CMD: ./ci/post_test/check_cpu_usage.sub -dlrm_14node_check: - # Push logs to gitlab - extends: .cluster_post_test_job_daily - needs: - - dlrm_benchmark_14node - variables: - GPFSFOLDER: $LOGDIR/dlrm_14node_check - GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: $LOGDIR/dlrm_benchmark_14node:/logs - WALLTIME: "00:15:00" - DGXNNODES: 1 - TEST_CMD: ./ci/post_test/check_dlrm_14node.sub - dlrm_dcnv2_8node_check: # Push logs to gitlab extends: .cluster_post_test_job_daily diff --git a/HugeCTR/include/collectives/collective.hpp b/HugeCTR/include/collectives/collective.hpp new file mode 100644 index 0000000000..565b5ee5c8 --- /dev/null +++ b/HugeCTR/include/collectives/collective.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +namespace HugeCTR { + +/** + * @brief GPU resources manager which holds all the resources required by training + * + * An extended GPU Resource manager + */ +class CollectiveManager { + std::shared_ptr core_; + +#ifdef ENABLE_MPI + std::unique_ptr ib_comm_ = NULL; +#endif + std::shared_ptr ar_comm_ = NULL; + + public: + CollectiveManager() = default; + CollectiveManager(const std::shared_ptr& core) : core_(core) {} + + HCTR_DISALLOW_COPY_AND_MOVE(CollectiveManager); + +#ifdef ENABLE_MPI + void init_ib_comm(); + IbComm* get_ib_comm() const { return ib_comm_.get(); } + void set_ready_to_transfer() { + if (ib_comm_) ib_comm_->set_ready_to_transfer(); + } +#endif + void set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision); + AllReduceInPlaceComm* get_ar_comm() const { return ar_comm_.get(); } +}; +} // namespace HugeCTR diff --git a/HugeCTR/include/common.hpp b/HugeCTR/include/common.hpp index 389d00f6bb..00bc55e692 100644 --- a/HugeCTR/include/common.hpp +++ b/HugeCTR/include/common.hpp @@ -64,17 +64,8 @@ namespace HugeCTR { #define WARP_SIZE 32 -namespace hybrid_embedding { - -enum class HybridEmbeddingType; -enum class CommunicationType; - -} // namespace hybrid_embedding - enum class Check_t { Sum, None, Unknown }; -enum class DataReaderSparse_t { Distributed, Localized }; - enum class DataReaderType_t { Norm, Raw, Parquet, RawAsync }; enum class SourceType_t { FileList, Mmap, Parquet }; @@ -154,36 +145,17 @@ enum class Layer_t { enum class Embedding_t { DistributedSlotSparseEmbeddingHash, LocalizedSlotSparseEmbeddingHash, - LocalizedSlotSparseEmbeddingOneHot, - HybridSparseEmbedding, None }; enum class Initializer_t { Default, Uniform, XavierNorm, XavierUniform, Sinusoidal, Zero }; -enum class TrainState_t { - Init, - BottomMLPFprop, - TopMLPFprop, - BottomMLPBprop, - TopMLPBprop, - MLPExchangeWgrad, - MLPUpdate, - Finalize -}; - enum class Distribution_t { Uniform, PowerLaw }; enum class PowerLaw_t { Long, Medium, Short, Specific }; enum class Tensor_t { Train, Evaluate }; -// TODO: Consider to move them into a separate file -struct TrainState { - TrainState_t state = TrainState_t::Init; - cudaEvent_t* event = nullptr; -}; - struct AsyncParam { int num_threads; int num_batches_per_thread; @@ -209,17 +181,6 @@ struct AsyncParam { is_dense_float(is_dense_float) {} }; -struct HybridEmbeddingParam { - size_t max_num_frequent_categories; - int64_t max_num_infrequent_samples; - double p_dup_max; - double max_all_reduce_bandwidth; - double max_all_to_all_bandwidth; - double efficiency_bandwidth_ratio; - hybrid_embedding::CommunicationType communication_type; - hybrid_embedding::HybridEmbeddingType hybrid_embedding_type; -}; - typedef struct DataSetHeader_ { long long error_check; // 0: no error check; 1: check_sum long long number_of_records; // the number of samples in this data file @@ -278,7 +239,6 @@ struct DataReaderSparseParam { std::vector is_slot_fixed_length; int slot_num; - DataReaderSparse_t type; int max_feature_num; int max_nnz; @@ -289,8 +249,7 @@ struct DataReaderSparseParam { nnz_per_slot(nnz_per_slot_), is_fixed_length(is_fixed_length_), is_slot_fixed_length(std::vector(slot_num_, is_fixed_length_)), - slot_num(slot_num_), - type(DataReaderSparse_t::Distributed) { + slot_num(slot_num_) { HCTR_CHECK_HINT(slot_num_ > 0, "Illegal value for slot_num!"); if (static_cast(slot_num_) != nnz_per_slot_.size()) { HCTR_OWN_THROW(Error_t::WrongInput, "slot num != nnz_per_slot.size()."); @@ -312,8 +271,7 @@ struct 
DataReaderSparseParam { nnz_per_slot(slot_num_, nnz_per_slot_), is_fixed_length(is_fixed_length_), is_slot_fixed_length(std::vector(slot_num_, is_fixed_length_)), - slot_num(slot_num_), - type(DataReaderSparse_t::Distributed) { + slot_num(slot_num_) { HCTR_CHECK_HINT(slot_num_ > 0, "Illegal value for slot_num!"); for (size_t i = 0; i < nnz_per_slot.size(); i++) { if (nnz_per_slot[i] == 1) { diff --git a/HugeCTR/include/data_reader.hpp b/HugeCTR/include/data_reader.hpp index db08757b6f..2e7bfc3c45 100644 --- a/HugeCTR/include/data_reader.hpp +++ b/HugeCTR/include/data_reader.hpp @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/HugeCTR/include/data_readers/async_reader/async_reader.hpp b/HugeCTR/include/data_readers/async_reader/async_reader.hpp deleted file mode 100644 index 664680dad3..0000000000 --- a/HugeCTR/include/data_readers/async_reader/async_reader.hpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -class ResourceManager; - -class AsyncReaderImpl { - public: - AsyncReaderImpl(std::string fname, size_t batch_size_bytes, - const ResourceManager* resource_manager, int num_threads, - int num_batches_per_thread, size_t io_block_size, int io_depth, int io_alignment, - bool shuffle = false, bool wait_for_gpu_idle = false); - - bool is_currently_loading(); - size_t get_num_buffers() const; - size_t get_num_batches() const; - void load_async(); - void reset(); - BatchDesc get_batch(); - void finalize_batch(); - void finalize_batch(cudaEvent_t* event); - int get_last_batch_device(); - void wait_for_gpu_events(const std::vector events); - void wait_for_gpu_event(cudaEvent_t* event, int raw_device_id); - ~AsyncReaderImpl(); - - private: - std::string fname_; - size_t batch_size_bytes_; - size_t num_batches_; - const ResourceManager* resource_manager_; - int num_devices_, num_threads_, num_batches_per_thread_; - size_t io_block_size_; - int io_depth_, io_alignment_; - InternalBatchBuffer* last_buffer_ = nullptr; - size_t total_file_size_; - bool wait_for_gpu_idle_; - int queue_id_; - bool loop_ = true; - cudaEvent_t event_success_; - - std::vector batch_ids_; - std::vector> buffers_; - std::vector threads_; - std::vector streams_; - std::vector> thread_batch_ids_; - std::vector> thread_buffer_ids_, gpu_thread_ids_; - std::vector> local_readers_; - - void create_workers(); -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/data_readers/async_reader/async_reader_adapter.hpp b/HugeCTR/include/data_readers/async_reader/async_reader_adapter.hpp deleted file mode 100644 index 65f8700e19..0000000000 --- a/HugeCTR/include/data_readers/async_reader/async_reader_adapter.hpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -namespace HugeCTR { -template -class AsyncReader : public SchedulableDataReader { - using LabelType = float; - using InputType = int; - - public: - // Default params: num_threads = num_local_gpus, io_block_size = 512000, io_depth = 2, - // io_alignment = 512 - AsyncReader(std::string fname, size_t batch_size, size_t label_dim, size_t dense_dim, - std::vector& params, bool mixed_precision, - const std::shared_ptr& resource_manager, int num_threads, - int num_batches_per_thread, size_t io_block_size, int io_depth, int io_alignment, - bool shuffle = false, bool wait_for_gpu_idle = false, - Alignment_t aligned = Alignment_t::None); - - long long read_a_batch_to_device_delay_release() override; - long long get_full_batchsize() const override; - - cudaStream_t get_split_3_way_stream(int raw_device_id) const { - return s3w_streams_.at(raw_device_id); - } - - cudaStream_t get_d2d_stream(int raw_device_id) const { return d2d_streams_.at(raw_device_id); } - - void set_schedule_streams(cudaStream_t s3w_stream, cudaStream_t d2d_stream, - int raw_device_id) override; - - void stream_wait_sparse_tensors(cudaStream_t stream, int raw_device_id, bool from_graph) override; - void stream_wait_dense_tensors(cudaStream_t stream, int raw_device_id, bool from_graph) override; - - /** - * @brief Once the batch is retrieved from the AsyncReaderImpl, the batch needs to be - * split into its respective tensor buffers. This allows us to buffer the last N batches - * with their respective tensors. 
- */ - void set_tensor_buffering(size_t num_batches_to_buffer); - - bool current_batch_incomplete() const override; - void ready_to_collect() override; - long long read_a_batch_to_device() override; - void schedule_split_3_way_here(cudaStream_t stream, int raw_device_id, bool from_graph) override; - void schedule_d2d_here(cudaStream_t stream, int raw_device_id, bool from_graph) override; - void schedule_here(cudaStream_t stream, int raw_device_id) override; - void schedule_here_graph(cudaStream_t stream, int raw_device_id) override; - void update_schedule_graph(int raw_device_id) override; - - size_t get_max_batches_inflight() const; - bool is_mixed_precision(); - // TODO: need to get rid of this, pass the dims directly from Model to the HybridEmbedding - void get_dimensions(size_t& label_dim, size_t& dense_dim, size_t& sparse_dim, - size_t& sample_size_items); - - long long get_current_batchsize_per_device(size_t local_id) override; - long long get_current_batchsize() override { return current_batch_size_; }; - TensorScalarType get_scalar_type() const override; - bool is_started() const override; - void start() override; - - std::vector get_label_tensor23s() const override; - std::vector get_dense_tensor23s() const override; - std::vector get_value_tensor23s() const; - std::vector> get_value_tensors() const; - - bool is_batch_cached() const { return current_batch_cached_; } - size_t get_current_inflight_id() const { return inflight_id_; } // TODO: remove? - - // FIXME: This is a temporary fix to get around the fact that HybridSpaseEmbedding - // needs to be constructed with the SparseTensor buffers - // std::vector> get_value_tensor_buffers() const; - std::vector>> get_value_tensor_buffers() const; - std::vector> get_value_tensor_buffer23s() const; -#ifndef DISABLE_CUDF - void create_drwg_parquet(std::string file_list, bool strict_order_of_batches, - const std::vector slot_offset, - bool start_reading_from_beginning = true, - long long max_samples_per_group = 0, int label_dense_num = 0, - int label_dense_dim = 0) override; -#endif - void set_source(std::string file_list = std::string()) override; - ~AsyncReader(); - - private: - std::vector temp_tensors_; - struct BatchTensors { - size_t tag; - std::vector label_tensors; - std::vector dense_tensors; - std::vector sparse_tensors; - }; - - void assign_dense_and_label_tensors(core23::Tensor& label_tensor, core23::Tensor& dense_tensor, - int raw_device_id, cudaStream_t stream); - - void init_batch_tensors(size_t num_inflight); - - const std::shared_ptr resource_manager_; - std::unique_ptr reader_impl_; - int64_t sample_size_items_, current_batch_size_; - bool mixed_precision_, wait_for_gpu_idle_; - int64_t batch_size_, batch_size_per_dev_; - int64_t label_dim_, dense_dim_, sparse_dim_; - - size_t inflight_id_ = 0; - std::vector inflight_batch_tensors_; // in-flight batches - - std::vector label_tensors_; - std::vector dense_tensors_; - std::vector current_sparse_tensors_; - - bool current_batch_cached_ = false; - - std::vector completion_events_; - std::vector schedule_events_; - std::vector split_schedule_events_; - std::vector d2d_schedule_events_; - - std::vector s3w_streams_; // split_3_way streams - std::vector d2d_streams_; // d2d copy streams - - bool cache_buffers_ = false; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/data_readers/async_reader/async_reader_common.hpp b/HugeCTR/include/data_readers/async_reader/async_reader_common.hpp deleted file mode 100644 index ea70e2d7de..0000000000 --- 
a/HugeCTR/include/data_readers/async_reader/async_reader_common.hpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -// For the tensor bags -#include -#include -#include -#include - -struct iocb; - -namespace HugeCTR { - -enum class BufferStatus : int { - IOReady = 0, - IOInProcess = 1, - UploadInProcess = 2, - UploadSubmitted = 3, - ReadReady = 4, - PermanentlyResident = 5, - Finished = 6 -}; - -struct InternalBatchBuffer { - int64_t id = -1; - size_t size; - int raw_device_id; - - std::vector dev_data; - char* raw_host_ptr = nullptr; - char* host_data; - - std::atomic status; - std::vector io_reqs; - int num_outstanding_reqs; - std::atomic ready_to_upload_event, safe_to_upload_event; - int num_submitted_h2d_chunks; - int num_submitted_broadcasts; - bool preload_done; - cudaEvent_t event; - - // Following the rule of 5 just in case - // Only need the destructor here - InternalBatchBuffer() { status.store(BufferStatus::IOReady); }; - InternalBatchBuffer(InternalBatchBuffer const& other) = delete; - InternalBatchBuffer& operator=(InternalBatchBuffer const& other) = delete; - - InternalBatchBuffer(InternalBatchBuffer&& other) = default; - InternalBatchBuffer& operator=(InternalBatchBuffer&& other) = default; - - ~InternalBatchBuffer() { - for (auto ptr : dev_data) { - HCTR_LIB_CHECK_(cudaFree(ptr)); - } - HCTR_LIB_CHECK_(cudaHostUnregister(raw_host_ptr)); - free(raw_host_ptr); - } -}; - -struct BatchDesc { - size_t size_bytes; - std::vector dev_data; - bool cached; - size_t id; -}; - -class RawPtrWrapper : public TensorBuffer2 { - public: - RawPtrWrapper(void* ptr) : ptr_(ptr) {} - bool allocated() const override { return true; } - void* get_ptr() override { return ptr_; } - - private: - void* ptr_; -}; - -class RawPtrBuffer : public TensorBuffer2 { - public: - RawPtrBuffer(size_t size_bytes) { HCTR_LIB_THROW(cudaMalloc(&ptr_, size_bytes)); } - bool allocated() const override { return true; } - void* get_ptr() override { return ptr_; } - ~RawPtrBuffer() override { cudaFree(ptr_); } - - private: - void* ptr_; -}; - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/data_readers/async_reader/split_label_dense_sparse.hpp b/HugeCTR/include/data_readers/async_reader/split_label_dense_sparse.hpp deleted file mode 100644 index 7d51bf28d7..0000000000 --- a/HugeCTR/include/data_readers/async_reader/split_label_dense_sparse.hpp +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -namespace HugeCTR { -template -void split_3_way(core23::Tensor& label_tensor_per_dev, core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, cudaStream_t stream); - -} // namespace HugeCTR diff --git a/HugeCTR/include/data_readers/async_reader/thread_async_reader.hpp b/HugeCTR/include/data_readers/async_reader/thread_async_reader.hpp deleted file mode 100644 index 98ba775300..0000000000 --- a/HugeCTR/include/data_readers/async_reader/thread_async_reader.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include -#include - -namespace HugeCTR { - -class InternalBatchBuffer; -class ResourceManager; -enum class WorkerStatus : int { OK, Terminate }; - -struct ThreadAsyncReaderParameters { - size_t io_block_size; - int io_alignment, io_depth; - int num_h2d_chunks; - bool wait_for_gpu_idle; - bool loop; -}; - -class ThreadAsyncReader { - public: - ThreadAsyncReader(std::string fname, const ResourceManager* resource_manager, - size_t batch_size_bytes, int device_id, cudaStream_t stream, - std::vector batch_ids, std::vector dest_buffers, - ThreadAsyncReaderParameters params, size_t total_file_size); - - void load(); - void reset(); - - ~ThreadAsyncReader(); - - private: - int fd_; - size_t batch_size_bytes_; - int device_id_; - cudaStream_t stream_; - int num_dest_buffers_; - int max_num_blocks_per_batch_; - size_t total_file_size_; - io_context_t ioctx_; - std::atomic status_; - - std::vector batch_ids_; - std::vector dest_buffers_; - ThreadAsyncReaderParameters params_; - int num_buffers_waiting_io_; - - void try_submit_io(size_t batch_id, int io_id); - void wait_io(); - bool wait_for_gpu_idle(InternalBatchBuffer* buffer); - void try_submit_upload(InternalBatchBuffer* buffer); - void try_submit_p2p(InternalBatchBuffer* buffer); - bool check_completion(InternalBatchBuffer* buffer); -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/data_readers/async_reader/broadcast.hpp b/HugeCTR/include/data_readers/multi_hot/async_reader_common.hpp similarity index 66% rename from HugeCTR/include/data_readers/async_reader/broadcast.hpp rename to HugeCTR/include/data_readers/multi_hot/async_reader_common.hpp index d45f937f86..af3785dd29 100644 --- a/HugeCTR/include/data_readers/async_reader/broadcast.hpp +++ b/HugeCTR/include/data_readers/multi_hot/async_reader_common.hpp @@ -1,25 +1,38 @@ -/* 
- * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace HugeCTR { - -void broadcast(float** dev_pointers, const bool* dev_p2p_accessible, int batch_size_floats, - int num_dests, int src_id, cudaStream_t stream); - +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +// For the tensor bags +#include +#include + +#include "HugeCTR/core23/tensor.hpp" +#include "HugeCTR/include/tensor2.hpp" + +namespace HugeCTR { + +class RawPtrWrapper : public TensorBuffer2 { + public: + RawPtrWrapper(void* ptr) : ptr_(ptr) {} + bool allocated() const override { return true; } + void* get_ptr() override { return ptr_; } + + private: + void* ptr_; +}; } // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/data_readers/parquet_data_reader_worker.hpp b/HugeCTR/include/data_readers/parquet_data_reader_worker.hpp index 28e741c03d..798294935a 100644 --- a/HugeCTR/include/data_readers/parquet_data_reader_worker.hpp +++ b/HugeCTR/include/data_readers/parquet_data_reader_worker.hpp @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/HugeCTR/include/data_readers/row_group_reading_thread.hpp b/HugeCTR/include/data_readers/row_group_reading_thread.hpp index 8391df6619..8684881631 100644 --- a/HugeCTR/include/data_readers/row_group_reading_thread.hpp +++ b/HugeCTR/include/data_readers/row_group_reading_thread.hpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/HugeCTR/include/embedding.hpp b/HugeCTR/include/embedding.hpp index ea6e55e5bf..8f7f46d28d 100644 --- a/HugeCTR/include/embedding.hpp +++ b/HugeCTR/include/embedding.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include #include @@ -93,16 +92,6 @@ struct SparseEmbeddingHashParams { } }; -static size_t get_slot_num(const SparseTensorBag& bag) { - const std::vector& dimension = bag.get_dimensions(); - if (dimension.size() == 2) { - return dimension[1]; - } - HCTR_OWN_THROW(Error_t::IllegalCall, - "slot_num is available when sparse tensor shape is (batchsize, slot_num)"); - return 0; -} - // TODO remove Tensor2 Based BufferBag struct BufferBag { TensorBag2 keys; @@ -127,16 +116,4 @@ struct SparseInput { SparseInput() {} }; -struct BufferBag23 { - core23::Tensor keys; - core23::Tensor slot_id; - core23::Tensor embedding; - std::vector opt_states; - - 
std::vector h_value_tensors; - std::vector h_slot_id_tensors; - std::vector uvm_key_tensor_bags; - std::vector d_value_index_tensors; -}; - } // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/allocator.hpp b/HugeCTR/include/embeddings/hybrid_embedding/allocator.hpp deleted file mode 100644 index afe7f83d42..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/allocator.hpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -class CudaPreAllocator { - void *ptr_; - size_t size_; - - public: - CudaPreAllocator() : ptr_(nullptr), size_(0) {} - - template - void reserve(const std::vector &dimensions) { - size_t s = sizeof(T); - for (size_t dimension : dimensions) { - s *= dimension; - } - size_ += s; - } - - void pre_allocate() { HCTR_LIB_THROW(cudaMalloc(&ptr_, size_)); } - - void *allocate(size_t size) const { - if (size > size_) { - HCTR_OWN_THROW(Error_t::OutOfMemory, "Out of memory"); - } - return ptr_; - } - void deallocate(void *ptr) const { HCTR_LIB_THROW(cudaFree(ptr)); } -}; diff --git a/HugeCTR/include/embeddings/hybrid_embedding/calibration_data.hpp b/HugeCTR/include/embeddings/hybrid_embedding/calibration_data.hpp deleted file mode 100644 index ca513918f0..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/calibration_data.hpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -// depends on : Statistics, Data - -/// -/// This class contains the calibrated measurements for all-to-all and all-reduce -/// for different data sizes. Each calibration consists of two arrays, -/// ._data_size array and the ._time array which represent a mapping. -/// -/// This class will be executed on the cpu instead of the gpu if no -/// gpu memory is allocated for the calibration data. 
-struct CalibrationData { - CalibrationData(size_t num_nodes_in, double p_dup_max_in, double max_all_reduce_bandwidth_in, - double max_all_to_all_bandwidth_in, double efficiency_bandwidth_ratio_in) - : num_nodes(num_nodes_in), - p_dup_max(p_dup_max_in), - max_all_reduce_bandwidth(max_all_reduce_bandwidth_in), - max_all_to_all_bandwidth(max_all_to_all_bandwidth_in), - efficiency_bandwidth_ratio(efficiency_bandwidth_ratio_in){ - // TBD - }; - ~CalibrationData() {} - - size_t num_nodes; - - // Calibration all-to-all : - // the following two arrays map data sizes to all-to-all times / latencies. - std::vector h_all_to_all_data_size; - std::vector h_all_to_all_times; - Tensor2 all_to_all_data_size; // data size of message per gpu - Tensor2 all_to_all_times; // calibrated all-to-all times - - // Calibration all-reduce : - // the following two arrays map data sizes to all-to-all times / latencies. - std::vector h_all_reduce_data_size; - std::vector h_all_reduce_times; - Tensor2 all_reduce_data_size; // data size of message per gpu - Tensor2 all_reduce_times; // calibrated all-reduce times - - // Alternative calibration: (if no calibration provided) - // the threshold for frequent categories is calculated from maximum bandwidths - // for the all-reduce and all-to-all respectively. - // This approximation assumes that the communications are bandwidth limited. - double p_dup_max; - double max_all_reduce_bandwidth; // algorithm bandwidth all-reduce [data size message per gpu in - // bytes / sec] - double max_all_to_all_bandwidth; // algorithm bandwidth all-to-all [data size message per gpu in - // bytes / sec] - double efficiency_bandwidth_ratio; - // cpu functions - double interpolate(const std::vector &calibrated_data_size, - const std::vector &calibrated_times, - const std::vector &data_size, - std::vector &communication_times); - double interpolate_all_reduce(const std::vector &data_size, - std::vector &communication_times); - double interpolate_all_to_all(const std::vector &data_size, - std::vector &communication_times); - - // gpu functions - void interpolate(const Tensor2 &calibrated_data_size, - const Tensor2 &calibrated_times, const Tensor2 &data_size, - Tensor2 &communication_times); - void interpolate_all_reduce(const Tensor2 &data_size, Tensor2 &communication_times); - void interpolate_all_to_all(const Tensor2 &data_size, Tensor2 &communication_times); -}; - -template -class ModelInitializationFunctors { - public: - static double calculate_threshold(const CommunicationType communication_type, double p_dup_max, - double all_to_all_bandwidth, double all_reduce_bandwidth, - double efficiency_bandwidth_ratio, size_t num_nodes, - size_t batch_size, size_t num_networks, size_t num_iterations, - size_t num_tables); - static dtype calculate_num_frequent_categories(const CommunicationType &communication_type, - const size_t num_networks, - const CalibrationData &calibration, - const Statistics &statistics, - const Data &data, dtype *d_num_frequent, - cudaStream_t stream); - static double calculate_frequent_probability(const Statistics &statistics, - const dtype num_frequent, - uint32_t *d_total_frequent_count, - cudaStream_t stream); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/embeddings/hybrid_embedding/communication.hpp b/HugeCTR/include/embeddings/hybrid_embedding/communication.hpp deleted file mode 100644 index 4ed51c9afb..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/communication.hpp +++ /dev/null @@ 
-1,135 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -class Communication { - public: - Communication(size_t width_data_field); - virtual ~Communication() = default; - virtual void communicate(cudaStream_t stream) = 0; - virtual void update_sizes(cudaStream_t stream){}; - virtual void initiate_communication(cudaStream_t stream){}; - virtual void wait_completion(cudaStream_t stream){}; - - protected: - size_t width_data_field_; -}; - -/* - * All to All communications - */ -template -struct AllToAllStorage { - AllToAllStorage(GeneralBuffer2* buf, size_t max_buffer_size) { - buf->reserve({max_buffer_size}, &send_buffer); - buf->reserve({max_buffer_size}, &recv_buffer); - } - Tensor2 send_buffer, recv_buffer; - Tensor2 send_buffer_ptrs; -}; - -template -class AllToAllVComm : public Communication { - public: - AllToAllVComm(Tensor2 send_buffer, Tensor2 recv_buffer, - const uint32_t* send_offsets, const uint32_t* recv_offsets, - const GPUResource* gpu_resource, size_t width_data_field); - - protected: - Tensor2 send_buffer_; - Tensor2 recv_buffer_; - - const uint32_t* send_offsets_; - const uint32_t* recv_offsets_; - - const GPUResource* gpu_resource_; -}; - -template -class AllToAll_Multi_NCCL : public AllToAllVComm { - public: - using AllToAllVComm::AllToAllVComm; - void communicate(cudaStream_t stream) final override; - ~AllToAll_Multi_NCCL() = default; -}; - -// template -// class AllToAll_Single : public AllToAllVComm { -// public: -// using AllToAllVComm::AllToAllVComm; -// void communicate() final override; -// ~AllToAll_Single() = default; -// }; - -/* - * All Reduce communications - */ -template -class AllReduceComm : public Communication { - public: - AllReduceComm(AllReduceInPlaceComm* ar_comm, AllReduceInPlaceComm::Handle ar_handle, - const GPUResource* gpu_resource); - void communicate(cudaStream_t stream) final override; - ~AllReduceComm() = default; - - private: - AllReduceInPlaceComm* ar_comm_; - AllReduceInPlaceComm::Handle ar_handle_; - const GPUResource* gpu_resource_; -}; - -#ifdef ENABLE_MPI -template -class HierAll2Allv_Multi_IB : public Communication { - public: - HierAll2Allv_Multi_IB(uint32_t instance_id, HierA2AvCollHandle coll_handle, size_t** send_sizes, - const GPUResource* gpu_resource, IbComm* ib_comm, cudaStream_t comm_stream); - - void update_sizes(cudaStream_t stream) final override; - void communicate(cudaStream_t stream) final override; - void initiate_communication(cudaStream_t stream) final override; - void wait_completion(cudaStream_t stream) final override; - ~HierAll2Allv_Multi_IB(); - - private: - uint32_t instance_id_; - HierA2AvCollHandle coll_handle_; - size_t** send_sizes_; - const GPUResource* gpu_resource_; - IbComm* ib_comm_; - cudaStream_t comm_stream_; - cudaEvent_t comm_event_; -}; -#endif - -} // namespace 
hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/data.hpp b/HugeCTR/include/embeddings/hybrid_embedding/data.hpp deleted file mode 100644 index 1147f0815a..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/data.hpp +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -struct EmbeddingTableFunctors { - static dtype get_num_categories(const std::vector &table_sizes); - static void get_embedding_offsets(std::vector &embedding_offsets, - const std::vector &table_sizes); - static size_t get_embedding_table_index(const std::vector &table_sizes, dtype category); -}; - -// depends on : data reader - or mock data - -template -struct Data { - std::vector table_sizes; - size_t batch_size; - size_t num_iterations; - size_t num_categories; - - Tensor2 embedding_offsets; - Tensor2 samples; - - Data(Tensor2 samples, const std::vector &table_sizes_in, size_t batch_size_in, - size_t num_iterations_in) - : samples(samples), - table_sizes(table_sizes_in), - batch_size(batch_size_in), - num_iterations(num_iterations_in) { - std::shared_ptr> buf = GeneralBuffer2::create(); - buf->reserve({table_sizes_in.size()}, &embedding_offsets); - buf->allocate(); - - std::vector h_embedding_offsets; - EmbeddingTableFunctors::get_embedding_offsets(h_embedding_offsets, table_sizes); - - num_categories = EmbeddingTableFunctors::get_num_categories(table_sizes); - HCTR_LIB_THROW(cudaMemcpy(embedding_offsets.get_ptr(), h_embedding_offsets.data(), - sizeof(dtype) * h_embedding_offsets.size(), cudaMemcpyHostToDevice)); - } - - Data(const std::vector &table_sizes_in, size_t batch_size_in, size_t num_iterations_in) - : table_sizes(table_sizes_in), batch_size(batch_size_in), num_iterations(num_iterations_in) { - std::shared_ptr> buf = GeneralBuffer2::create(); - reserve(buf); - buf->reserve({table_sizes_in.size()}, &embedding_offsets); - buf->allocate(); - - std::vector h_embedding_offsets; - EmbeddingTableFunctors::get_embedding_offsets(h_embedding_offsets, table_sizes); - - num_categories = EmbeddingTableFunctors::get_num_categories(table_sizes); - HCTR_LIB_THROW(cudaMemcpy(embedding_offsets.get_ptr(), h_embedding_offsets.data(), - sizeof(dtype) * h_embedding_offsets.size(), cudaMemcpyHostToDevice)); - } - - Data() {} - ~Data() {} - - void reserve(std::shared_ptr> buf) { - const size_t num_tables = table_sizes.size(); - buf->reserve({num_iterations * batch_size * num_tables, 1}, &samples); - } - - // convert raw input data such that categories of different - // categorical features have unique indices - void data_to_unique_categories(Tensor2 data, cudaStream_t stream); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/frequent_embedding.hpp 
b/HugeCTR/include/embeddings/hybrid_embedding/frequent_embedding.hpp deleted file mode 100644 index 76ec86fec0..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/frequent_embedding.hpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -// TODO sort out public/private fields -// In order to use it easier in the IndicesContainer -template -class FrequentEmbeddingBase { - public: - const Data *data_ = nullptr; - FrequentEmbeddingCompressionView *indices_view_ = nullptr; - - // Frequent indices and device pointer! - FrequentEmbeddingCompression *indices_; - - void set_current_indices(FrequentEmbeddingCompression *indices); - FrequentEmbeddingBase(); - virtual ~FrequentEmbeddingBase(); -}; - -template -class FrequentEmbeddingData { - public: - // copy of the model parameters and the input data - const Model &model_; - const GPUResource &gpu_resource_; - - // locally stored embedding vectors for the data-parallel part of the embedding for each table - Tensor2 frequent_embedding_vectors_; - - // locally stored reduced gradients into fp32 type - Tensor2 float_frequent_gradients_; - // buffer for communication can have fp16 type instead of fp32: input for all-reduce - Tensor2 frequent_gradients_; - template - using BuffPtr = std::shared_ptr>; - BuffPtr grouped_wgrad_buff_; - std::shared_ptr wgrad_core23_buffer_; - - uint32_t embedding_vec_size_; - size_t max_num_frequent_categories_; - - FrequentEmbeddingData(const Model &model, const GPUResource &gpu_resource, - BuffPtr &grouped_wgrad_buff, uint32_t embedding_vec_size, - size_t max_num_frequent_categories); - ~FrequentEmbeddingData() {} - - void initialize_embedding_vectors(const std::vector &table_sizes, - size_t grouped_wgrad_offset); - template - void forward_network(const vectype *embedding_vectors, emtype *interaction_layer_input, - FrequentEmbeddingBase *base, cudaStream_t stream); - void local_reduce(const emtype *gradients, FrequentEmbeddingBase *base, - cudaStream_t stream); - - template - typename std::enable_if::value, Tensor2>::type &get_gradients() { - return float_frequent_gradients_; - } - - template - typename std::enable_if::value, Tensor2>::type &get_gradients() { - return frequent_gradients_; - } - - class ExternalManagedBuffer : public HugeCTR::TensorBuffer2 { - public: - ExternalManagedBuffer(void *ptr) : ptr_(ptr) {} - bool allocated() const override { return true; } - void *get_ptr() override { return ptr_; } - - private: - void *ptr_; - }; -}; - -template -class FrequentEmbeddingSingleNode : public FrequentEmbeddingBase { - public: - using FrequentEmbeddingBase::data_; - FrequentEmbeddingData frequent_data_; - Tensor2 frequent_embedding_vectors_cache_; - Tensor2 embedding_vectors_cache_pointers_; - Tensor2 partial_gradients_pointers_; - 
template - using BuffPtr = std::shared_ptr>; - - FrequentEmbeddingSingleNode(const Model &model, const GPUResource &gpu_resource, - BuffPtr &grouped_wgrad_buff, uint32_t embedding_vec_size, - size_t max_num_frequent_categories); - - void init(); - void forward_model(cudaStream_t stream); - void forward_model_eval(cudaStream_t stream); - void forward_network(emtype *interaction_layer_input, cudaStream_t stream); - void local_reduce(const emtype *gradients, cudaStream_t stream); - void update_model_direct(float *dev_lr, float scale, cudaStream_t stream); - - template - typename std::enable_if::value, Tensor2>::type - get_embedding_vectors_cache() { - return frequent_data_.frequent_embedding_vectors_; - } - - template - typename std::enable_if::value, Tensor2>::type - get_embedding_vectors_cache() { - return frequent_embedding_vectors_cache_; - } -}; - -template -class FrequentEmbeddingMultiNode : public FrequentEmbeddingBase { - public: - using FrequentEmbeddingBase::data_; - FrequentEmbeddingData frequent_data_; - template - using BuffPtr = std::shared_ptr>; - std::unique_ptr ar_comm_; - - FrequentEmbeddingMultiNode(const Model &model, const GPUResource &gpu_resource, - BuffPtr &grouped_wgrad_buff, uint32_t embedding_vec_size, - size_t max_num_frequent_categories) - : frequent_data_(model, gpu_resource, grouped_wgrad_buff, embedding_vec_size, - max_num_frequent_categories) {} - - void init(); - void init_ar_comm(AllReduceInPlaceComm *ar_comm, AllReduceInPlaceComm::Handle &handle, - int local_id); - void communicate(cudaStream_t stream); - void forward_network(emtype *interaction_layer_input, cudaStream_t stream); - void local_reduce(const emtype *gradients, cudaStream_t stream); - void update_model(float *dev_lr, float scale, cudaStream_t stream); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/hybrid_indices.hpp b/HugeCTR/include/embeddings/hybrid_embedding/hybrid_indices.hpp deleted file mode 100644 index a42f668dff..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/hybrid_indices.hpp +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
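// The FrequentEmbeddingData class above keeps two gradient buffers (an fp32 local-reduction
// buffer and an emtype buffer handed to the all-reduce) and selects between them with the
// classic std::enable_if overload idiom, as does get_embedding_vectors_cache() in
// FrequentEmbeddingSingleNode. A standalone sketch of that idiom, with made-up buffer types
// standing in for Tensor2:
#include <type_traits>
#include <vector>

template <typename emtype>
struct GradientBuffers {
  std::vector<float> float_gradients;  // always-fp32 reduction buffer
  std::vector<emtype> comm_gradients;  // possibly reduced-precision all-reduce input

  // Chosen when the requested type is float: hand back the fp32 buffer.
  template <typename T>
  typename std::enable_if<std::is_same<T, float>::value, std::vector<float>>::type&
  get_gradients() {
    return float_gradients;
  }

  // Chosen for every other type: hand back the communication-precision buffer.
  template <typename T>
  typename std::enable_if<!std::is_same<T, float>::value, std::vector<emtype>>::type&
  get_gradients() {
    return comm_gradients;
  }
};

int main() {
  GradientBuffers<unsigned short> bufs;  // unsigned short stands in for __half, host-only
  bufs.get_gradients<float>().resize(8);           // selects the fp32 buffer
  bufs.get_gradients<unsigned short>().resize(8);  // selects the communication buffer
  return 0;
}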
- */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { -namespace hybrid_embedding { - -// =========================================================================================== -// Frequent Compression -// =========================================================================================== - -template -struct FrequentEmbeddingCompressionView { - const dtype* samples; - bool* cache_masks; - uint32_t *model_cache_indices, *model_cache_indices_offsets; - uint32_t *network_cache_indices, *network_cache_indices_offsets; - uint32_t *d_num_frequent_sample_indices, *frequent_sample_indices; -}; - -template -class FrequentEmbeddingCompression { - void calculate_frequent_sample_indices_temp_storage_bytes(const size_t local_samples_size); - void calculate_model_cache_indices_temp_storage_bytes(const size_t num_frequent); - void calculate_network_cache_indices_temp_storage_bytes(const size_t num_frequent); - - const Model& model_; - const Data& data_; - - FrequentEmbeddingCompressionView* device_indices_view_; - - public: - // Role: - // push from the locally reduced gradient buffer => update embedding vector - // pull embedding vector from the model => update local cache - // - // Def: - // 1 if frequent category is present in this network batch - // [size num_frequent] - Tensor2 cache_masks_; - - // model_cache_indices : list of cache indices of this frequent embedding model instance - // for each mlp deep learning network. - // Definition. - // given the frequent embedding model of frequent embedding vectors - // stored and updated by this instance, i.e. the range in - // frequent_embedding_vectors - // i * num_frequent /num_instances ... (i+1) * num_frequent /num_instances - // - 1 - // for each network n, the range within model_cache_indices specified by - // model_cache_indices_offsets_[n] .. model_cache_indices_offsets_[n] - 1 - // is the list of frequent cache indices that appear in network n. - // - // Role. - // - // 1. Forward-model : cache indices into the frequent_embedding_vector array - // for each send-message-buffer - per mlp network. - // 2. Backward-model : cache indices for each receive-message-buffer - mlp - // - Tensor2 model_cache_indices_; - Tensor2 model_cache_indices_offsets_; - - // network_cache_indices : list of cache indices contained in this network for each - // frequent embedding model instance - // Def. - // Given the mlp deep learning network samples for this instance, - // - network n, sample_ids starting with i * batch_size / num_instances - - // For each embedding model - model_id - list its cache indices that - // are present within network n's samples. The range of these indices is - // given by network_cache_indices_offsets_[i+1] ... - // network_cache_indices_offsets_[i+1] - // Role. - // 1. Forward-network : cache indices into the frequent_embedding_vector array - // for each receive-message-buffer - per frequent embedding model - // 2. 
Backward-network : cache indices into the frequent_gradient_vectors_ - // for each send-message-buffer - mlp - // - Tensor2 network_cache_indices_; - Tensor2 network_cache_indices_offsets_; - - // Role: - // from buffer => interaction layer - // sample gradients => gradient buffer - // - // Def: - // sample id's within this network batch - // containing frequent category [network batch size] - // "Network side" - Tensor2 d_num_frequent_sample_indices_; - Tensor2 frequent_sample_indices_; - - // scratch buffers for index calculations - Tensor2 frequent_sample_indices_temp_storage_; - Tensor2 model_cache_indices_temp_storage_; - Tensor2 network_cache_indices_temp_storage_; - size_t frequent_sample_indices_temp_storage_bytes_; - size_t model_cache_indices_temp_storage_bytes_; - size_t network_cache_indices_temp_storage_bytes_; - - FrequentEmbeddingCompression(size_t max_num_frequent_categories, const Data& data, - const Model& model); - - void calculate_frequent_sample_indices(cudaStream_t stream); - void calculate_model_cache_indices(size_t sm_count, cudaStream_t stream); - void calculate_network_cache_mask(cudaStream_t stream); - void calculate_network_cache_indices(cudaStream_t stream); - void calculate_cache_masks(cudaStream_t stream); - - FrequentEmbeddingCompressionView* get_device_view() { return device_indices_view_; }; - const Data* get_data() { return &data_; } -}; - -// =========================================================================================== -// Infrequent Selection -// =========================================================================================== - -template -struct InfrequentEmbeddingSelectionView { - const dtype* samples; - uint32_t *model_indices, *model_indices_offsets; - uint32_t *network_indices, *network_indices_offsets, *network_indices_src_model_id; -}; - -template -class InfrequentEmbeddingSelection { - void calculate_model_indices_temp_storage_bytes(size_t max_batch_size, size_t table_size); - void calculate_network_indices_temp_storage_bytes(size_t max_batch_size, size_t table_size, - const uint32_t num_instances); - - const Model& model_; - const Data& data_; - InfrequentEmbeddingSelectionView* device_indices_view_; - - public: - // model_indices : list of samples indices of categories for which the embedding vectors are - // stored in this infrequent embedding model instance. - // Sample-id's for entire batch, i.e. sorted by mlp deep learning network. - // Definition. - // Given the infrequent embedding model of infrequent embedding vectors - // stored and updated by this instance, sample indices for categories such - // that - // category_location[2*category] == model_id - // for each network n, the range within model_cache_indices specified by - // model_indices_offsets_[n] .. model_indices_offsets_[n+1] - 1 - // is the list of infrequent sample indices in network n. - // Role. - // 1. Forward-model : indices in the samples array for each send-message-buffer - // - per mlp network. - // 2. Backward-model : indices in the samples array for each receive-message-buffer - // - per mlp network. - Tensor2 model_indices_; - Tensor2 model_indices_offsets_; - // Tensor2 model_indices_sizes_; - // Tensor2 model_indices_sizes_ptrs_; - - // network_indices : list of sample indices of infrequent categories ordered per infrequent - // embedding model - model_id - where they're stored. - // Sample-id's for local batch (i.e sub-batch of this mlp network) - // Definition. 
- // Given the mlp deep learning network samples for this instance, - // - network n, sample_ids starting with i * batch_size / num_instances - - // For each embedding model - model_id - list its sample indices that - // are present within network n's samples. The range of these indices is given - // by - // network_indices_offsets_[n] .. network_indices_offsets_[n+1] - 1 - // Role. - // 1. Forward-network : local sample indices for each receive-message-buffer - // - per infrequent embedding model. - // 2. Backward-network : local sample indices for each send-message-buffer - // - mlp - Tensor2 network_indices_; - Tensor2 network_indices_offsets_; - Tensor2 network_indices_src_model_id_; - // Tensor2 network_indices_sizes_; - // Tensor2 network_indices_sizes_ptrs_; - - // scratch buffers for index calculations - /// TODO: if not overlapping, we can use the same storage - Tensor2 model_indices_temp_storage_; - size_t model_indices_temp_storage_bytes_; - Tensor2 network_indices_temp_storage_; - size_t network_indices_temp_storage_bytes_; - - InfrequentEmbeddingSelection(const Data& data, const Model& model); - - void calculate_model_indices(cudaStream_t stream); - void calculate_network_indices(size_t sm_count, cudaStream_t stream); - - // For now these functions stay in InfreqeuentEmbedding - // since the communications can only use one offsets tensor - // void calculate_model_indices_sizes_from_offsets( size_t embedding_vec_bytes, cudaStream_t - // stream); void calculate_network_indices_sizes_from_offsets(size_t embedding_vec_bytes, - // cudaStream_t stream); - - InfrequentEmbeddingSelectionView* get_device_view() { return device_indices_view_; } - const Data* get_data() { return &data_; } -}; - -// Single-stream version -template -void compute_indices(FrequentEmbeddingCompression& compression, - InfrequentEmbeddingSelection& selection, - CommunicationType communication_type, bool compute_network_cache_indices, - cudaStream_t main_stream, int sm_count); - -} // namespace hybrid_embedding -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/indices_container.hpp b/HugeCTR/include/embeddings/hybrid_embedding/indices_container.hpp deleted file mode 100644 index e0205878db..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/indices_container.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
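// Both index classes above use the same offsets layout: a flat index array plus an offsets
// array of (num_instances + 1) entries, where instance n owns the half-open slice
// [offsets[n], offsets[n+1]), exactly as the model_indices_offsets_ / network_indices_offsets_
// comments describe. A small host-side illustration with made-up numbers:
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical: 3 instances, 7 sample indices in total.
  const std::vector<unsigned int> indices = {0, 4, 9, 2, 5, 1, 8};
  const std::vector<unsigned int> offsets = {0, 3, 5, 7};  // num_instances + 1 entries

  for (size_t n = 0; n + 1 < offsets.size(); ++n) {
    std::printf("instance %zu:", n);
    for (unsigned int i = offsets[n]; i < offsets[n + 1]; ++i) {
      std::printf(" %u", indices[i]);
    }
    std::printf("\n");  // instance 0: 0 4 9 | instance 1: 2 5 | instance 2: 1 8
  }
  return 0;
}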
- */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { -namespace hybrid_embedding { - -template -class BatchIndices { - public: - BatchIndices(std::vector>& models, std::vector> data_source, - std::shared_ptr& resource_manager, size_t batch_size, - std::vector& slot_size_array, size_t max_num_frequent_categories, - CommunicationType communication_type); - - void compute(int raw_device_id, size_t batch_size, cudaStream_t stream); - - FrequentEmbeddingCompression& get_frequent(int raw_device_id) { - return frequent_compression_[raw_device_id]; - } - - InfrequentEmbeddingSelection& get_infrequent(int raw_device_id) { - return infrequent_selection_[raw_device_id]; - } - - private: - size_t num_slots_ = 0; - std::shared_ptr resource_manager_; - CommunicationType communication_type_; - std::vector> data_; - std::vector> frequent_compression_; - std::vector> infrequent_selection_; -}; - -} // namespace hybrid_embedding -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/infrequent_embedding.hpp b/HugeCTR/include/embeddings/hybrid_embedding/infrequent_embedding.hpp deleted file mode 100644 index 80b95d0567..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/infrequent_embedding.hpp +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -// In order to use it easier in the IndicesContainer -template -class InfrequentEmbeddingBase { - protected: - const Data *data_ = nullptr; - InfrequentEmbeddingSelectionView *indices_view_ = nullptr; - - public: - // Infrequent indices and device pointer! 
- InfrequentEmbeddingSelection *indices_; - - void set_current_indices(InfrequentEmbeddingSelection *indices); - InfrequentEmbeddingBase(); - virtual ~InfrequentEmbeddingBase(); - - InfrequentEmbeddingBase(const InfrequentEmbeddingBase &other); - - InfrequentEmbeddingBase &operator=(const InfrequentEmbeddingBase &other) { - if (this == &other) { - return *this; - } - - HCTR_LIB_THROW(cudaMalloc(&indices_view_, sizeof(*indices_view_))); - - HCTR_LIB_THROW(cudaMemcpy(indices_view_, other.indices_view_, sizeof(*indices_view_), - cudaMemcpyDeviceToDevice)); - - return *this; - } -}; - -template -class InfrequentEmbedding_NVLink_SingleNode : public InfrequentEmbeddingBase { - public: - using InfrequentEmbeddingBase::data_; - - // copy of the model parameters and the input data, managed by HybridSparseEmbedding - const Model &model_; - const GPUResource &gpu_resource_; - - // locally stored infrequent embedding vectors for the model-parallel part of the embedding for - // each table - Tensor2 infrequent_embedding_vectors_; - - Tensor2 interaction_layer_input_pointers_train_; - Tensor2 interaction_layer_input_pointers_eval_; - Tensor2 gradients_pointers_; - - // to do, we need to initialize it in the constructor - uint32_t embedding_vec_size_; - - void init_pointers(int local_gpu_count, const cudaStream_t stream, - std::vector &interaction_layer_input_pointers_train, - std::vector &interaction_layer_input_pointers_eval, - std::vector &gradients_pointers); - - InfrequentEmbedding_NVLink_SingleNode(Model &model, GPUResource &gpu_resource, - size_t embedding_vec_size); - - ~InfrequentEmbedding_NVLink_SingleNode() {} - - void initialize_embedding_vectors(const std::vector &table_sizes); - void forward_network_direct(bool is_train, cudaStream_t stream); - void update_model_direct(float *dev_lr, float scale, cudaStream_t stream); -}; - -template -class InfrequentEmbedding_IB_NVLINK : public InfrequentEmbeddingBase { - public: - using InfrequentEmbeddingBase::data_; - - // copy of the model parameters and the input data, managed by HybridSparseEmbedding - const Model &model_; - const GPUResource &gpu_resource_; - - // locally stored infrequent embedding vectors for the model-parallel part of the embedding for - // each table - Tensor2 infrequent_embedding_vectors_; - - // Tensors to be passed to the hierarchical comms - // TODO: move these to the index containers - Tensor2 network_indices_offsets_, model_indices_offsets_; - - // to do, we need to initialize it in the constructor - uint32_t embedding_vec_size_; - - // private: - std::unique_ptr> infrequent_forward_comm_buffers_, - infrequent_backward_comm_buffers_; - std::unique_ptr infrequent_forward_comms_, infrequent_backward_comms_; - - // requires model_ and data_ to be set - InfrequentEmbedding_IB_NVLINK(Model &model, GPUResource &gpu_resource, - size_t embedding_vec_size); - - //~InfrequentEmbedding_IB_NVLINK(){}; - - void init_comms(size_t embedding_vec_size, const GPUResource *gpu_resource, - GeneralBuffer2 *i_buf, size_t max_buf_size); - void initialize_embedding_vectors(const std::vector &table_sizes); - void forward_model(emtype *message_buffer, cudaStream_t stream); - void forward_network(const emtype *message_buffer, emtype *interaction_layer_input, - cudaStream_t stream); - void update_network(const emtype *gradients, emtype *message_buffer, cudaStream_t stream); - void update_model(const emtype *message_buffer, float *dev_lr, float scale, cudaStream_t stream); - - const uint32_t *get_model_indices_offsets_ptr() { return 
model_indices_offsets_.get_ptr(); } - const uint32_t *get_network_indices_offsets_ptr() { return network_indices_offsets_.get_ptr(); } -}; - -template -class InfrequentEmbedding_IB_NVLink_Hier : public InfrequentEmbeddingBase { - public: - using InfrequentEmbeddingBase::data_; - - // copy of the model parameters and the input data, managed by HybridSparseEmbedding - const Model &model_; - const GPUResource &gpu_resource_; - - // locally stored infrequent embedding vectors for the model-parallel part of the embedding for - // each table - Tensor2 infrequent_embedding_vectors_; - - // Communication buffer sizes - dtype max_num_infrequent_per_batch_; - dtype max_num_infrequent_per_train_batch_; - - // Tensors to be passed to the hierarchical comms - // TODO: move these to the index containers - Tensor2 network_indices_sizes_, model_indices_sizes_; - Tensor2 network_indices_sizes_ptrs_, model_indices_sizes_ptrs_; - - // to do, we need to initialize it in the constructor - uint32_t embedding_vec_size_; - - std::unique_ptr> infrequent_forward_comm_buffers_, - infrequent_backward_comm_buffers_; - std::unique_ptr infrequent_forward_comms_, infrequent_backward_comms_; - - // requires model_ and data_ to be set - InfrequentEmbedding_IB_NVLink_Hier(Model &model, GPUResource &gpu_resource, - size_t embedding_vec_size); - //~InfrequentEmbedding_IB_NVLink_Hier(){}; - - void init_comms(int64_t max_num_infrequent_samples, size_t slot_num, size_t embedding_vec_size, - GeneralBuffer2 *buf_ptr, size_t batch_size_true, - size_t batch_size_false, size_t local_gpu_count); - void initialize_embedding_vectors(const std::vector &table_sizes); - void calculate_model_indices_sizes_from_offsets(cudaStream_t stream); - void calculate_network_indices_sizes_from_offsets(cudaStream_t stream); - void fused_intra_forward_model(emtype **message_buffer, cudaStream_t stream); - void hier_forward_network(const emtype *message_buffer, emtype *output_ptr, cudaStream_t stream); - void fused_intra_update_network(const emtype *gradients, emtype **message_buffer, - cudaStream_t stream); - void hier_update_model(const emtype *message_buffer, float *dev_lr, float scale, - cudaStream_t stream); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/model.hpp b/HugeCTR/include/embeddings/hybrid_embedding/model.hpp deleted file mode 100644 index 71184dd492..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/model.hpp +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -// Depends on : Data, Statistics and CalibrationData - -/// -/// This class defines the hybrid embedding model: -/// it indicates which categories are frequent, which are infrequent -/// and it determines where the corresponding embedding vectors are stored. -/// -/// Also the mlp network - nodes topology is defined here: -/// The node_id, instance_id where the current model instance is -/// associated with is stored. However, keep in mind that these are the only -/// differentiating variables inside this class that differ from other -/// instances. As this model describes the same distribution across the nodes -/// and gpu's (networks). -/// -template -struct Model { - public: - uint32_t node_id; - uint32_t instance_id; - uint32_t global_instance_id; - - CommunicationType communication_type; - - Tensor2 d_num_frequent; - Tensor2 d_total_frequent_count; - dtype num_frequent; - dtype num_categories; - double frequent_probability; - - uint32_t num_instances; - std::vector h_num_instances_per_node; - Tensor2 - num_instances_per_node; // number of gpus for each node, .size() == number of nodes - - Tensor2 category_location; // indicator category => location in embedding vector - Tensor2 frequent_categories; - std::vector h_frequent_model_table_offsets; - std::vector h_infrequent_model_table_offsets; - - // constructors: overloaded for convenience / unit tests - // copy constructor - Model(const Model &model); - ~Model(){}; - Model(CommunicationType communication_type_in, uint32_t global_instance_id_in, - const std::vector &num_instances_per_node_in, size_t num_categories_in) { - std::shared_ptr> buf = GeneralBuffer2::create(); - init_params_and_reserve(communication_type_in, global_instance_id_in, num_instances_per_node_in, - num_categories_in, buf); - buf->allocate(); - } - Model(CommunicationType communication_type_in, uint32_t global_instance_id_in, - const std::vector &num_instances_per_node_in, size_t num_categories_in, - std::shared_ptr> buf) { - init_params_and_reserve(communication_type_in, global_instance_id_in, num_instances_per_node_in, - num_categories_in, buf); - } - - void init_params_and_reserve(CommunicationType communication_type_in, - uint32_t global_instance_id_in, - const std::vector &num_instances_per_node_in, - size_t num_categories_in, - std::shared_ptr> buf); - void init_hybrid_model(const CalibrationData &calibration, Statistics &statistics, - const Data &data, Tensor2 &tmp_categories, - cudaStream_t stream); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/embeddings/hybrid_embedding/select.cuh b/HugeCTR/include/embeddings/hybrid_embedding/select.cuh deleted file mode 100644 index 542fcbedd4..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/select.cuh +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
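// category_location in the Model struct above appears to hold two entries per category: the
// comment in hybrid_indices.hpp reads category_location[2 * category] == model_id, and the
// second entry is presumably the category's position inside that instance's embedding-vector
// storage. A toy lookup written under that assumption (all values are invented):
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // (model_id, local index) pairs for 3 hypothetical infrequent categories.
  const std::vector<uint32_t> category_location = {1, 0,   // category 0 -> instance 1, slot 0
                                                   0, 0,   // category 1 -> instance 0, slot 0
                                                   1, 1};  // category 2 -> instance 1, slot 1
  const uint32_t c = 2;
  std::printf("category %u -> instance %u, local index %u\n", c, category_location[2 * c],
              category_location[2 * c + 1]);
  return 0;
}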
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include - -namespace HugeCTR { -namespace DeviceSelect { -namespace detail { - -template -__global__ void pre_select_if(const T *d_input, unsigned short *d_offset, IndexType *d_block_sum, - size_t len, SelectOp op, T *d_num_selected_out = nullptr) { - unsigned short this_thread_sum = 0; - unsigned short this_block_sum = 0; - - unsigned int tid = threadIdx.x; - unsigned int bid = blockIdx.x; - size_t gtid = static_cast(blockIdx.x) * BlockSize + static_cast(threadIdx.x); - // a trick to - if (!gtid) { - *(d_block_sum - 1) = 0; - } - if (gtid < len) { - IndexType in = d_input ? static_cast(d_input[gtid]) : static_cast(gtid); - this_thread_sum = static_cast(op(in)); - } - __syncthreads(); - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - BlockScan(temp_storage).ExclusiveSum(this_thread_sum, this_thread_sum, this_block_sum); - __syncthreads(); - if (tid == 0) { - d_block_sum[bid] = static_cast(this_block_sum); - } - if (gtid < len) { - d_offset[gtid] = this_thread_sum; - } -} -template -__global__ void post_select_if(const T *d_input, const unsigned short *d_offset, - const IndexType *d_block_offset, size_t len, SelectOp Op, T *output, - T *d_num_selected_out) { - int64_t global_index = 0; - __shared__ IndexType src_data[BlockSize]; - - unsigned int tid = threadIdx.x; - unsigned int bid = blockIdx.x; - size_t gtid = static_cast(blockIdx.x) * BlockSize + static_cast(threadIdx.x); - if (gtid < len) { - // d_offset + d_block_offset to get the global index - global_index = static_cast(d_block_offset[bid] + static_cast(d_offset[gtid])); - // vectorized load - IndexType in = d_input ? 
static_cast(d_input[gtid]) : static_cast(gtid); - src_data[tid] = in; - } - __syncthreads(); - // warp divergence - if (gtid < len && Op(src_data[tid])) { - output[global_index] = src_data[tid]; - } - if (!gtid) { - *d_num_selected_out = d_block_offset[gridDim.x]; - } -}; - -} // namespace detail - -template -void If(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT input, T *output, - T *d_num_selected_out, IndexType num_items, SelectOp Op, cudaStream_t stream = 0) { - constexpr unsigned int blocksize = 1024; - unsigned int gridDim = (num_items - 1) / (blocksize) + 1; - using cubCountIt = cub::CountingInputIterator; - const T *input_ptr{nullptr}; - if constexpr (!std::is_same::value) { - input_ptr = reinterpret_cast(input); - } - - if (!d_temp_storage) { - temp_storage_bytes = 0; - temp_storage_bytes += sizeof(IndexType) * (gridDim + 1); - temp_storage_bytes += sizeof(unsigned short) * (num_items); - size_t cub_bytes = 0; - HCTR_LIB_THROW(cub::DeviceScan::InclusiveSum((void *)(nullptr), cub_bytes, - (IndexType *)(nullptr), (IndexType *)(nullptr), - gridDim, stream)); - temp_storage_bytes += cub_bytes; - return; - } - size_t temp_start = reinterpret_cast(d_temp_storage); - IndexType *d_block_sum = reinterpret_cast(temp_start); - temp_start += sizeof(IndexType) * (gridDim + 1); - unsigned short *d_offset = reinterpret_cast(temp_start); - temp_start += sizeof(unsigned short) * (num_items); - size_t cub_bytes = temp_storage_bytes + reinterpret_cast(d_temp_storage) - - reinterpret_cast(temp_start); - detail::pre_select_if<<>>( - input_ptr, d_offset, d_block_sum + 1, (size_t)num_items, Op, d_num_selected_out); - HCTR_LIB_THROW(cub::DeviceScan::InclusiveSum(reinterpret_cast(temp_start), cub_bytes, - d_block_sum + 1, d_block_sum + 1, gridDim, stream)); - detail::post_select_if<<>>( - input_ptr, d_offset, d_block_sum, (size_t)num_items, Op, output, d_num_selected_out); -} - -} // namespace DeviceSelect -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/embeddings/hybrid_embedding/statistics.hpp b/HugeCTR/include/embeddings/hybrid_embedding/statistics.hpp deleted file mode 100644 index 31535b9719..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/statistics.hpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
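// DeviceSelect::If above follows the CUB-style two-phase convention that its own body makes
// explicit: a first call with d_temp_storage == nullptr only reports temp_storage_bytes, and
// a second call performs the selection. A hedged usage sketch; the predicate, the buffer
// names, and the assumption that the deleted select.cuh plus HugeCTR's HCTR_LIB_THROW macro
// are still on the include path are all illustrative, not taken from actual call sites.
#include <cstdint>
#include <cuda_runtime.h>

struct IsEven {
  __host__ __device__ bool operator()(uint32_t x) const { return (x & 1u) == 0; }
};

void select_even(const uint32_t* d_in, uint32_t* d_out, uint32_t* d_num_out,
                 uint32_t num_items, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Phase 1: size query only, no kernels are launched.
  HugeCTR::DeviceSelect::If(nullptr, temp_bytes, d_in, d_out, d_num_out, num_items, IsEven{},
                            stream);
  void* d_temp = nullptr;
  HCTR_LIB_THROW(cudaMalloc(&d_temp, temp_bytes));
  // Phase 2: run the select; d_num_out receives the number of kept elements.
  HugeCTR::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_num_out, num_items, IsEven{},
                            stream);
  HCTR_LIB_THROW(cudaFree(d_temp));
}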
- */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -// depends on : data object -// => allocate(Data data) - -template -struct Statistics { - public: - Statistics() - : num_samples(0), - num_tables(0), - num_instances(0), - num_categories(0), - num_unique_categories(0) {} - ~Statistics() {} - Statistics(dtype num_samples_in, size_t num_tables_in, size_t num_instances_in, - dtype num_categories_in) - : num_samples(num_samples_in), - num_tables(num_tables_in), - num_instances(num_instances_in), - num_categories(num_categories_in), - num_unique_categories(0) { - std::shared_ptr> buf = GeneralBuffer2::create(); - reserve(buf); - buf->allocate(); - } - Statistics(dtype num_samples_in, size_t num_tables_in, size_t num_instances_in, - dtype num_categories_in, std::shared_ptr> buf) - : num_samples(num_samples_in), - num_tables(num_tables_in), - num_instances(num_instances_in), - num_unique_categories(0) { - reserve(buf); - } - Statistics(const Data &data, size_t num_instances_in) - : num_samples(data.batch_size * data.num_iterations * data.table_sizes.size()), - num_tables(data.table_sizes.size()), - num_instances(num_instances_in), - num_categories(std::accumulate(data.table_sizes.begin(), data.table_sizes.end(), (dtype)0)), - num_unique_categories(0) { - std::shared_ptr> buf = GeneralBuffer2::create(); - reserve(buf); - buf->allocate(); - } - Statistics(const Data &data, size_t num_instances_in, - std::shared_ptr> buf) - : num_samples(data.batch_size * data.num_iterations * data.table_sizes.size()), - num_tables(data.table_sizes.size()), - num_instances(num_instances_in), - num_categories(std::accumulate(data.table_sizes.begin(), data.table_sizes.end(), 0)), - num_unique_categories(0) { - reserve(buf); - } - - void reserve(std::shared_ptr> buf) { - buf->reserve({num_samples, 1}, &categories_sorted); - buf->reserve({num_samples, 1}, &counts_sorted); - buf->reserve({num_tables + 1, 1}, &table_offsets); - buf->reserve({num_tables + 1, 1}, &infrequent_model_table_offsets); - buf->reserve({num_instances * (num_tables + 1), 1}, &frequent_model_table_offsets); - reserve_temp_storage(buf); - } - - size_t num_samples; // input - size_t num_tables; - size_t num_instances; - dtype num_categories; - uint32_t num_unique_categories; // to be calculated - - // top categories sorted by count - Tensor2 categories_sorted; - Tensor2 counts_sorted; - Tensor2 table_offsets; // cumulative sum of table_sizes - Tensor2 infrequent_model_table_offsets; - Tensor2 frequent_model_table_offsets; - std::vector> sort_categories_by_count_temp_storages_; - std::vector> calculate_frequent_categories_temp_storages_; - std::vector> calculate_infrequent_categories_temp_storages_; - void reserve_temp_storage(std::shared_ptr> buf); - void sort_categories_by_count(const dtype *samples, size_t num_samples, dtype *categories_sorted, - uint32_t *counts_sorted, uint32_t &num_unique_categories, - cudaStream_t stream); - void sort_categories_by_count(const Tensor2 &samples, cudaStream_t stream); - void calculate_frequent_and_infrequent_categories( - dtype *frequent_categories, dtype *infrequent_categories, dtype *category_location, - const size_t num_frequent, const size_t num_infrequent, cudaStream_t stream); - void calculate_infrequent_model_table_offsets( - std::vector &h_infrequent_model_table_offsets, const dtype *infrequent_categories, - const Tensor2 &category_location, uint32_t global_instance_id, - const dtype num_infrequent, 
cudaStream_t stream); - void calculate_frequent_model_table_offsets(std::vector &h_frequent_model_table_offsets, - const dtype *frequent_categories, - const dtype num_frequent, cudaStream_t stream); - void revoke_temp_storage() { - sort_categories_by_count_temp_storages_.clear(); - calculate_frequent_categories_temp_storages_.clear(); - calculate_infrequent_categories_temp_storages_.clear(); - } -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/update.cuh b/HugeCTR/include/embeddings/hybrid_embedding/update.cuh deleted file mode 100644 index e2ed8acc4f..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/update.cuh +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -namespace { - -template -__global__ void sgd_global_update_kernel(const emtype *__restrict__ gradients, - float *__restrict__ embedding_vectors, - uint32_t embedding_vec_size, - const float *__restrict__ lr_ptr, const float scale) { - int bid = blockIdx.x; // block = one vector - int tid = threadIdx.x; // thread = one element in a vector - - float lr = __ldg(lr_ptr) / scale; - - /// TODO: vectorization possible? 
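 // The statement below is plain SGD applied element-wise: with j = bid * embedding_vec_size
 // + tid, it computes embedding_vectors[j] -= (lr / scale) * convert(gradients[j]); dividing
 // the learning rate by `scale` presumably undoes the gradient/loss scaling applied when
 // emtype is a reduced-precision type.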
- embedding_vectors[bid * embedding_vec_size + tid] -= - lr * TypeConvertFunc::convert(gradients[bid * embedding_vec_size + tid]); -} - -template -__global__ void sgd_atomic_update_kernel(const emtype *__restrict__ gradients, - float *__restrict__ embedding_vectors, - LambdaNum get_num_indices, LambdaIdx get_index, - uint32_t embedding_vec_size, - const float *__restrict__ lr_ptr, const float scale) { - const uint32_t num_indices = get_num_indices(); - - float lr = __ldg(lr_ptr) / scale; - - for (uint32_t i = blockIdx.x; i < num_indices; i += gridDim.x) { - auto index = get_index(i); - - atomicAdd(embedding_vectors + index * embedding_vec_size + threadIdx.x, - -lr * TypeConvertFunc::convert( - gradients[i * embedding_vec_size + threadIdx.x])); - } -} - -} // namespace - -template -void sgd_global_update(const emtype *gradients, float *embedding_vectors, - dtype num_embedding_vectors, uint32_t embedding_vec_size, float *lr_ptr, - float scale, cudaStream_t stream) { - if (num_embedding_vectors < 1) return; - sgd_global_update_kernel<<>>( - gradients, embedding_vectors, embedding_vec_size, lr_ptr, scale); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void sgd_atomic_update(const emtype *gradients, float *embedding_vectors, LambdaNum get_num_indices, - LambdaIdx get_index, uint32_t n_blocks, uint32_t embedding_vec_size, - float *lr_ptr, float scale, cudaStream_t stream) { - // Note: currently taking the number of blocks as an argument but we can also compute it here with - // some heuristics if we think it's better - sgd_atomic_update_kernel<<>>( - gradients, embedding_vectors, get_num_indices, get_index, embedding_vec_size, lr_ptr, scale); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_embedding/utils.cuh b/HugeCTR/include/embeddings/hybrid_embedding/utils.cuh deleted file mode 100644 index b62aea92b2..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/utils.cuh +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace HugeCTR { -namespace hybrid_embedding { - -__global__ void offsets_kernel(const uint32_t* indices, uint32_t* indices_offsets, - uint32_t num_instances, uint32_t multiplier); - -__global__ void model_id_kernel(const uint32_t* indices_offsets, uint32_t* src_model_id, - const uint32_t* d_num_elements); - -template -__global__ void modulo_kernel(dtype* buffer, const stype* d_num_elements, dtype divisor); - -} // namespace hybrid_embedding -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/include/embeddings/hybrid_embedding/utils.hpp b/HugeCTR/include/embeddings/hybrid_embedding/utils.hpp deleted file mode 100644 index 3fdee29842..0000000000 --- a/HugeCTR/include/embeddings/hybrid_embedding/utils.hpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -enum class HybridEmbeddingType { Distributed, Unknown }; -enum class CommunicationType { IB_NVLink_Hier, IB_NVLink, NVLink_SingleNode, Unknown }; -enum class CommunicationDirection { CommunicationForward, CommunicationBackward }; - -template -void download_tensor(std::vector& h_tensor, const Tensor2 tensor, - cudaStream_t stream); - -template -void upload_tensor(const std::vector& h_tensor, Tensor2 tensor, cudaStream_t stream); - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/hybrid_sparse_embedding.hpp b/HugeCTR/include/embeddings/hybrid_sparse_embedding.hpp deleted file mode 100644 index d35be06b79..0000000000 --- a/HugeCTR/include/embeddings/hybrid_sparse_embedding.hpp +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR::hybrid_embedding; - -namespace HugeCTR { - -struct HybridSparseEmbeddingParams { - size_t train_batch_size; - size_t evaluate_batch_size; - size_t num_iterations_statistics; - size_t max_num_frequent_categories; // max(train_batch_size, eval_batch_size) * # of batches for - // frequent categories - int64_t max_num_infrequent_samples; - double p_dup_max; - size_t embedding_vec_size; - size_t slot_num; // slot number - std::vector slot_size_array; - hybrid_embedding::CommunicationType communication_type; - double max_all_reduce_bandwidth; - double max_all_to_all_bandwidth; - double efficiency_bandwidth_ratio; - hybrid_embedding::HybridEmbeddingType hybrid_embedding_type; - OptParams opt_params; // optimizer params -}; - -/// -/// Interface class for the hybrid embedding to HugeCTR. It is responsible for -/// persistent gpu memory allocation. -/// -template -class HybridSparseEmbedding : public SchedulableEmbeding { - private: - // Embedding models, one instance per frequent and the infrequent embedding - // for each mlp-network in the train session. 
- // - - // data-parallel embedding model - std::vector> frequent_embeddings_single_node_; - std::vector> frequent_embeddings_multi_node_; - - // model-parallel embedding model - std::vector> - infrequent_embeddings_single_node_; - std::vector> infrequent_embeddings_ib_nvlink_; - std::vector> - infrequent_embeddings_ib_nvlink_hier_; - - // Hier A2Av / custom AR impl -#ifdef ENABLE_MPI - std::vector comm_stream_; - IbComm* ib_comm_; - AllReduceInPlaceComm::Handle barrier_handle_; -#endif - std::unique_ptr gpu_barrier_; - - AllReduceInPlaceComm::Handle frequent_embedding_handle_; - Tensors2 d_barrier_store_; - - // model_, data_, calibration_ and statistics_ are replications of the model - // and input data on each gpu. The HybridSparseEmbedding class manages - // it's scope / frees the memory. - std::vector> model_; - std::vector> data_statistics_; - std::vector calibration_; - std::vector> statistics_; - - // added by kefeng - // std::vector pre_alloc_bufs_; - std::vector>> bufs_; - - size_t train_inflight_id_ = 0; /**< Which BatchIndices to use. */ - size_t eval_inflight_id_ = 0; /**< Which BatchIndices to use. */ - HybridSparseEmbeddingParams embedding_params_; - std::shared_ptr resource_manager_; - - Tensors2 train_output_tensors_; /**< The output tensors. */ - Tensors2 evaluate_output_tensors_; /**< The output tensors. */ - template - using BuffPtr = std::shared_ptr>; - std::vector> grouped_wgrad_buff_; - bool grouped_all_reduce_ = false; - - std::vector opt_params_; /**< Optimizer params. */ - - GpuLearningRateSchedulers lr_scheds_; - bool graph_mode_; - - size_t current_train_batch_size_ = - 0; /**< Current batch size (since we need to handle incomplete batch). */ - size_t current_eval_batch_size_ = - 0; /**< Current batch size (since we need to handle incomplete batch). */ - bool current_train_batch_cached_ = false; /**< Used to check if BatchIndices already computed. */ - bool current_eval_batch_cached_ = false; /**< Used to check if BatchIndices already computed. */ - std::vector> train_batch_indices_; /**< Stores indices for Batch. */ - std::vector> eval_batch_indices_; /**< Stores indices for Batch. */ - - // TODO: this parameter is not used by HE at all. 
- // We should be in pursuit of merging SparseEmbeddingHashParams and HybridSparseEmbeddingParams - SparseEmbeddingHashParams dummy_params_; - - FrequentEmbeddingBase& get_frequent_embedding(size_t i) { - if (frequent_embeddings_single_node_.size()) { - return frequent_embeddings_single_node_[i]; - } else { - return frequent_embeddings_multi_node_[i]; - } - } - FrequentEmbeddingData& get_frequent_embedding_data(size_t i) { - if (frequent_embeddings_single_node_.size()) { - return frequent_embeddings_single_node_[i].frequent_data_; - } else { - return frequent_embeddings_multi_node_[i].frequent_data_; - } - } - - InfrequentEmbeddingBase& get_infrequent_embedding(size_t i) { - switch (embedding_params_.communication_type) { - case CommunicationType::NVLink_SingleNode: - return infrequent_embeddings_single_node_[i]; - case CommunicationType::IB_NVLink: - return infrequent_embeddings_ib_nvlink_[i]; - case CommunicationType::IB_NVLink_Hier: - return infrequent_embeddings_ib_nvlink_hier_[i]; - default: - throw std::runtime_error("Unsupported communication type"); - } - } - - protected: - size_t get_batch_size(bool is_train) const { - if (is_train) { - return embedding_params_.train_batch_size; - } else { - return embedding_params_.evaluate_batch_size; - } - } - size_t get_universal_batch_size() const { - return std::max(embedding_params_.train_batch_size, embedding_params_.evaluate_batch_size); - } - size_t get_batch_size_per_gpu(bool is_train) const { - return get_batch_size(is_train) / resource_manager_->get_global_gpu_count(); - } - size_t get_embedding_vec_size() const { return embedding_params_.embedding_vec_size; } - size_t get_slot_num() const { return embedding_params_.slot_num; } - void get_num_instances_per_node(std::vector& num_instances_per_node) { - uint32_t total_gpu_count = resource_manager_->get_global_gpu_count(); - for (uint32_t gid = 0; gid < total_gpu_count; ++gid) { - uint32_t nodeid = resource_manager_->get_process_id_from_gpu_global_id(gid); - num_instances_per_node[nodeid] = num_instances_per_node[nodeid] + 1; - } - return; - } - - GPUResource& get_local_gpu(int i) const { return *resource_manager_->get_local_gpu(i); } - - size_t get_categories_num() { - size_t num_categories = 0; - for (size_t i = 0; i < embedding_params_.slot_size_array.size(); ++i) { - num_categories += embedding_params_.slot_size_array[i]; - } - return num_categories; - } - - public: - HybridSparseEmbedding(const SparseTensors& train_input_tensors, - const SparseTensors& evaluate_input_tensors, - const HybridSparseEmbeddingParams& embedding_params, - const std::vector>& grouped_wgrad_buff, - const GpuLearningRateSchedulers lr_scheds, bool graph_mode, - const std::shared_ptr& resource_manager); - ~HybridSparseEmbedding() = default; - - // TODO: consider to merge it with init_params - void init_model(const SparseTensors& data, size_t& wgrad_offset); - - void setup_buffered_indices(bool is_train, AsyncReader* data_reader); - - void forward(bool is_train) override; - void backward() override; - void update_params() override; - void init_params() override; - void load_parameters(std::string sparse_model) override; - void dump_parameters(std::string sparse_model) const override; - void set_learning_rate(float lr) override; - // TODO: a workaround to enable GPU LR for HE only; need a better way - GpuLearningRateSchedulers get_learning_rate_schedulers() const override; - - size_t get_params_num() const override; - size_t get_vocabulary_size() const override; - size_t get_max_vocabulary_size() const override; 
- - Embedding_t get_embedding_type() const override { return Embedding_t::HybridSparseEmbedding; } - // TODO: implemented the empty virtual functions below and in the corresponding CU file. - void load_parameters(BufferBag& keys, size_t num) override {} - void dump_parameters(BufferBag& keys, size_t* num) const override {} - - void dump_opt_states(std::string sparse_model) override {} - void load_opt_states(std::string read_path) override {} - void reset_optimizer() override {} - void reset() override {} - - const SparseEmbeddingHashParams& get_embedding_params() const override { return dummy_params_; } - void check_overflow() const override {} - void get_forward_results_tf(const bool is_train, const bool on_gpu, - void* const forward_result) override {} - - std::vector get_train_output_tensors() const override; - std::vector get_evaluate_output_tensors() const override; - - cudaError_t update_top_gradients(const bool on_gpu, const void* const top_gradients) override { - throw; - } - - void freeze() override { HCTR_LOG(WARNING, ROOT, "Hybrid embedding cannot be freezed.\n"); } - - void unfreeze() override { - HCTR_LOG(WARNING, ROOT, "Hybrid embedding do not need to be unfreezed.\n"); - } - - bool is_trainable() const override { return true; } - - void assign_input_tensors(bool is_train, size_t batch_size, size_t inflight_id, - bool cached) override; - void index_calculation(bool is_train, int i) override; - void freq_forward(bool is_train, int i, bool is_first_eval_batch) override; - void freq_backward(int i) override; - void freq_update_params(int i) override; - void infreq_model_forward(int i) override; - void infreq_network_forward(bool is_train, int i) override; - void global_barrier(bool is_train, int i) override; - void infreq_network_backward(int i) override; - void infreq_model_backward(int i) override; -}; - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp b/HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp deleted file mode 100644 index 675aed0b5d..0000000000 --- a/HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp +++ /dev/null @@ -1,485 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include - -namespace HugeCTR { -/** - * The LocalizedSlotSparseEmbeddingOneHot class inherits from Embedding class, which is the base - * class for implementing all embedding layers. In this class, the slots in the embedding table - * are assigned to a single GPU separately, which are called localized slots. For example, slot-0 on - * GPU-0, slot-1 on GPU-1, slot-2 on GPU-0, slot-3 on GPU-1, etc. This class is very simple to the - * LocalizedSlotSparseEmbeddingHash, but optimized for performance according to the "one-hot" - * feature. 
So, there are several assumptions in this class: 1) The mapping method from keys to - * embedding row_indices is linear, so there is no hashtable in this class; 2) all the features are - * one-hot, while multi-hot is not supported in this class; 3) Implement P2P access in forward prop, - * fused forward_sum+all2all+reorder, so there is no all2all in forward and backward prop, and can - * only support single node. 4) only support SGD optimizer by now. - */ - -template -class LocalizedSlotSparseEmbeddingOneHot : public IEmbedding { - private: - // define tensors - EmbeddingData embedding_data_; - Tensors2 hash_table_value_tensors_; /**< Hash table value. */ - std::vector> value_table_tensors_; - - Tensors2 hash_table_slot_id_tensors_; /**< the tensors for storing slot ids */ - Tensors2 hash_value_index_tensors_; /**< Hash value index. The index is corresponding to - the line number of the value. */ - Tensors2 - embedding_feature_tensors_; /**< the output tensor of the forward(). */ - Tensor2 train_embedding_features_; - Tensor2 evaluate_embedding_features_; - Tensors2 wgrad_tensors_; /**< the input tensor of the backward(). */ - - Tensors2 top_categories_; - std::vector size_top_categories_; - - size_t max_vocabulary_size_; - size_t max_vocabulary_size_per_gpu_; /**< Max vocabulary size for each GPU. */ - std::vector slot_num_per_gpu_; /* slot_num per GPU */ - std::vector slot_size_array_; - - SparseEmbeddingFunctors functors_; - - Tensors2 all2all_tensors_; /**< the temple buffer to store all2all results */ - Tensors2 utest_all2all_tensors_; - Tensors2 utest_reorder_tensors_; - Tensors2 utest_backward_temp_tensors_; - Tensors2 utest_forward_temp_tensors_; - - Tensors2 mapping_offsets_per_gpu_tensors_; - - Tensor2 &get_embedding_features(bool is_train) { - if (is_train) { - return train_embedding_features_; - } else { - return evaluate_embedding_features_; - } - } - - /** - * Calculate the max vocabulary size per GPU. - * @param total_gpu_count total GPU count. - * @param local_gpu_count local GPU count. - * @param slot_sizes an array which stores the size of the slots to be initialized. - * @param device_resources GPU device resources. - */ - static size_t cal_max_voc_size_per_gpu(const std::vector slot_sizes, - const ResourceManager &resource_manager) { - size_t local_gpu_count = resource_manager.get_local_gpu_count(); - size_t total_gpu_count = resource_manager.get_global_gpu_count(); - - size_t max_voc_size = 0; - for (size_t id = 0; id < local_gpu_count; id++) { - size_t global_id = resource_manager.get_local_gpu(id)->get_global_id(); - - size_t total_size = 0; - for (size_t i = 0; i < slot_sizes.size(); i++) { - if ((i % total_gpu_count) == global_id) { - total_size += slot_sizes[i]; - } - } - - if (total_size > max_voc_size) { - max_voc_size = total_size; - } - } - - return max_voc_size; - } - - /** - * Initialize the hash table and embedding table on local GPUs. This function is only used - * by LocalizedSparseEmbeddingHash. - * @param slot_sizes an array which stores the size of the slots to be initialized. - * @param embedding_vec_size embedding vector size. - * @param hash_table_value_tensors embedding table tensors. - * @param hash_table_slot_id_tensors slot ids tensors. - */ - void init_embedding(const std::vector slot_sizes, size_t embedding_vec_size, - std::vector> &hash_table_value_tensors, - Tensors2 &hash_table_slot_id_tensors); - - /** - * load_parameters() for LocalizedSlotSparseEmbeddingOnehot - * @param keys the memory buffer storing keys. 
- * @param slot_id the memory buffer storing slot_id. - * @param embeddings the memory buffer storing embedding vectors. - * @param num the number of unique keys (embedding vectors) in keys (embeddings). - * @param embedding_vec_size embedding vector size. - * @param hash_table_value_tensors the hash table value on multi GPUs. - * @param slot_sizes the size for each slot - * @param mapping_offsets_per_gpu_tensors the mapping offset of each slot on every GPU - */ - void load_parameters(const Tensor2 &keys, const Tensor2 &slot_id, - const Tensor2 &embeddings, size_t num, size_t embedding_vec_size, - Tensors2 &hash_table_value_tensors, - const std::vector &slot_sizes, - const Tensors2 &mapping_offsets_per_gpu_tensors); - - /** - * dump_parameters for LocalizedSlotSparseEmbeddingOnehot. - * @param sparse_model the folder name of sparse model. - * @param embedding_vec_size embedding vector size. - * @param hash_table_value_tensors the hash table value on multi-GPU. - * @param slot_sizes the size for each slot - */ - void dump_parameters(const std::string &sparse_model, size_t embedding_vec_size, - const Tensors2 &hash_table_value_tensors, - const std::vector &slot_sizes) const; - - /** - * dump_parameters for LocalizedSlotSparseEmbeddingOnehot. - * @param keys the memory buffer to store keys. - * @param slot_id the memory buffer to store slot_id. - * @param embeddings the memory buffer to store embedding vectors. - * @param num pointer to store the number of unique keys (embedding vectors). - * @param embedding_vec_size embedding vector size. - * @param hash_table_value_tensors the hash table value on multi-GPU. - * @param slot_sizes the size for each slot - */ - void dump_parameters(Tensor2 &keys, Tensor2 &slot_id, - Tensor2 &embeddings, size_t *num, size_t embedding_vec_size, - const Tensors2 &hash_table_value_tensors, - const std::vector &slot_sizes) const; - - public: - /** - * The constructor of LocalizedSlotSparseEmbeddingOneHot. - * @param row_offsets_tensors row offsets of the input tensor(refer to row offset vector in sparse - * matrix CSR format). - * @param hash_key_tensors hash keys of the input tensor(refer to value vector in sparse matrix - * CSR format). - * @param embedding_params embedding params for initialization. - * @param resource_manager the GPU resource group - */ - LocalizedSlotSparseEmbeddingOneHot(const Tensors2 &train_row_offsets_tensors, - const Tensors2 &train_value_tensors, - const std::vector> &train_nnz_array, - const Tensors2 &evaluate_row_offsets_tensors, - const Tensors2 &evaluate_value_tensors, - const std::vector> &evaluate_nnz_array, - const SparseEmbeddingHashParams &embedding_params, - const std::shared_ptr &resource_manager); - - LocalizedSlotSparseEmbeddingOneHot(const SparseTensors &train_keys, - const SparseTensors &evaluate_keys, - const SparseEmbeddingHashParams &embedding_params, - const std::shared_ptr &resource_manager); - - void filter_keys_per_gpu(bool is_train, size_t id, size_t global_id, size_t global_num); - - void data_to_unique_categories_per_gpu(bool is_train, size_t id); - /** - * The forward propagation of embedding layer. 
- */ - void forward(bool is_train) override { - CudaDeviceContext context; - -#pragma omp parallel for num_threads(embedding_data_.get_resource_manager().get_local_gpu_count()) - for (size_t i = 0; i < embedding_data_.get_resource_manager().get_local_gpu_count(); i++) { - context.set_device( - embedding_data_.get_local_gpu(i).get_device_id()); // set device - // for forward_fuse method - if (embedding_data_.embedding_params_.do_unique_key_flag) { - data_to_unique_categories_per_gpu(is_train, i); - } - if (embedding_data_.embedding_params_.is_data_parallel) { - filter_keys_per_gpu(is_train, i, embedding_data_.get_local_gpu(i).get_global_id(), - embedding_data_.get_resource_manager().get_global_gpu_count()); - } - functors_.forward_mapping_per_gpu( - embedding_data_.embedding_params_.get_batch_size(is_train), slot_num_per_gpu_[i], - embedding_data_.get_value_tensors(is_train)[i], - *embedding_data_.get_nnz_array(is_train)[i], mapping_offsets_per_gpu_tensors_[i], - hash_value_index_tensors_[i], embedding_data_.get_local_gpu(i).get_stream()); - - // fuse forward+all2all+reorder into one kernel - functors_.forward_fuse_per_gpu( - i, embedding_data_.get_resource_manager().get_local_gpu_count(), - embedding_data_.embedding_params_.get_batch_size(is_train), - embedding_data_.get_batch_size_per_gpu(is_train), - embedding_data_.embedding_params_.slot_num, slot_num_per_gpu_[i], - embedding_data_.embedding_params_.embedding_vec_size, - embedding_data_.embedding_params_.combiner, - embedding_data_.get_row_offsets_tensors(is_train)[i], hash_value_index_tensors_[i], - hash_table_value_tensors_[i], get_embedding_features(is_train), - embedding_data_.get_local_gpu(i).get_sm_count(), - embedding_data_.get_local_gpu(i).get_stream()); - } - - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - return; - } - - /** - * The first stage of backward propagation of embedding layer, - * which computes the wgrad by the dgrad from the top layer. - */ - void backward() override { - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - CudaDeviceContext context; - for (size_t i = 0; i < embedding_data_.get_resource_manager().get_local_gpu_count(); i++) { - context.set_device(embedding_data_.get_local_gpu(i).get_device_id()); - - functors_.backward_fuse_per_gpu( - i, embedding_data_.get_resource_manager().get_local_gpu_count(), - embedding_data_.embedding_params_.get_batch_size(true), - embedding_data_.get_batch_size_per_gpu(true), embedding_data_.embedding_params_.slot_num, - slot_num_per_gpu_[i], embedding_data_.embedding_params_.embedding_vec_size, - embedding_data_.embedding_params_.combiner, get_embedding_features(true), - wgrad_tensors_[i], embedding_data_.get_local_gpu(i).get_sm_count(), - embedding_data_.get_local_gpu(i).get_stream()); - } - - return; - } - - /** - * The second stage of backward propagation of embedding layer, which - * updates the hash table by wgrad(from backward()) and optimizer. 
- */ - void update_params() override { - // accumulate times for adam optimizer - embedding_data_.embedding_params_.opt_params.hyperparams.adam.times++; -#pragma omp parallel num_threads(embedding_data_.get_resource_manager().get_local_gpu_count()) - { - size_t id = omp_get_thread_num(); - CudaDeviceContext context(embedding_data_.get_local_gpu(id).get_device_id()); - - // do update params operation: only support SGD - functors_.update_params( - embedding_data_.embedding_params_.embedding_vec_size, - embedding_data_.embedding_params_.opt_params, *embedding_data_.get_nnz_array(true)[id], - hash_value_index_tensors_[id], wgrad_tensors_[id], hash_table_value_tensors_[id], - top_categories_[id], size_top_categories_[id], - embedding_data_.get_local_gpu(id).get_sm_count(), - embedding_data_.get_local_gpu(id).get_stream()); - } - - return; - } - - /** - * Initialize the embedding table - */ - void init_params() override { - // do hash table value initialization - if (slot_size_array_.size() == embedding_data_.embedding_params_.slot_num) { - init_embedding(slot_size_array_, embedding_data_.embedding_params_.embedding_vec_size, - value_table_tensors_, hash_table_slot_id_tensors_); - } else { - throw std::runtime_error( - std::string("[HCDEBUG][ERROR] Runtime error: the size of slot_sizes != slot_num\n")); - } - } - - /** - * Read the hash table from the weight_stream on the host, and - * upload it onto multi-GPUs global memory. - * @param sparse_model the folder name of sparse model. - */ - void load_parameters(std::string sparse_model) override; - void load_parameters(BufferBag &buf_bag, size_t num) override; - /** - * Download the hash table from multi-GPUs global memory to CPU memory - * and write it to the weight_stream on the host. - * @param sparse_model the folder name of sparse model. - */ - void dump_parameters(std::string sparse_model) const override; - void dump_parameters(BufferBag &buf_bag, size_t *num) const override; - - void dump_opt_states(std::string sparse_model) override {} - void load_opt_states(std::string read_path) override {} - void reset_optimizer() override {} - - /** - * Reset the embedding - */ - void reset() override; - - /** - * Get the total size of hash tables on all GPUs. - */ - size_t get_params_num() const override { - return (max_vocabulary_size_ * embedding_data_.embedding_params_.embedding_vec_size); - } - - size_t get_vocabulary_size() const override { return max_vocabulary_size_; } - - size_t get_max_vocabulary_size() const override { return max_vocabulary_size_; } - - // only used for results check - /** - * Get the forward() results from GPUs and copy them to the host pointer - * embedding_feature. This function is only used for unit test. - * @param embedding_feature the host pointer for storing the forward() - * results. - */ - void get_forward_results(bool is_train, Tensor2 &embedding_feature) { - size_t memcpy_size = embedding_data_.get_batch_size_per_gpu(is_train) * - embedding_data_.embedding_params_.slot_num * - embedding_data_.embedding_params_.embedding_vec_size; - - functors_.get_forward_results(memcpy_size, embedding_data_.get_output_tensors(is_train), - embedding_feature, utest_forward_temp_tensors_, - embedding_data_.get_resource_manager()); - - return; - } - - /** - * Get the forward() results from GPUs and copy them to tensorflow's tensor. 
- */ - void get_forward_results_tf(const bool is_train, const bool on_gpu, - void *const forward_result) override { - size_t memcpy_size = embedding_data_.get_batch_size_per_gpu(is_train) * - embedding_data_.embedding_params_.slot_num * - embedding_data_.embedding_params_.embedding_vec_size; - functors_.get_forward_results(memcpy_size, embedding_data_.get_output_tensors(is_train), - forward_result, utest_forward_temp_tensors_, - embedding_data_.get_resource_manager(), on_gpu); - return; - } - - /** - * Get the backward() results from GPUs and copy them to the host pointer - * wgrad. The wgrad on each GPU should be the same. This function is only - * used for unit test. - * @param wgrad the host pointer for storing the backward() results. - * @param devIndex the GPU device id. - */ - void get_backward_results(Tensor2 &wgrad, int devIndex) { - CudaDeviceContext context(embedding_data_.get_local_gpu(0).get_device_id()); - -#ifndef ENABLE_MPI - if (embedding_data_.get_resource_manager().get_global_gpu_count() > 1) { - functors_.all2all_forward(embedding_data_.get_batch_size_per_gpu(true), slot_num_per_gpu_, - embedding_data_.embedding_params_.embedding_vec_size, - wgrad_tensors_, utest_all2all_tensors_, - embedding_data_.get_resource_manager()); - } else { - HCTR_LIB_THROW(cudaMemcpyAsync( - utest_all2all_tensors_[0].get_ptr(), wgrad_tensors_[0].get_ptr(), - embedding_data_.get_batch_size_per_gpu(true) * slot_num_per_gpu_[0] * - embedding_data_.embedding_params_.embedding_vec_size * sizeof(TypeEmbeddingComp), - cudaMemcpyDeviceToDevice, embedding_data_.get_local_gpu(0).get_stream())); - } -#else - if (embedding_data_.get_resource_manager().get_global_gpu_count() > 1) { - functors_.all2all_forward( - embedding_data_.get_batch_size_per_gpu(true), embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size, wgrad_tensors_, - utest_all2all_tensors_, embedding_data_.get_resource_manager()); - } else { - HCTR_LIB_THROW(cudaMemcpyAsync( - utest_all2all_tensors_[0].get_ptr(), wgrad_tensors_[0].get_ptr(), - embedding_data_.get_batch_size_per_gpu(true) * slot_num_per_gpu_[0] * - embedding_data_.embedding_params_.embedding_vec_size * sizeof(TypeEmbeddingComp), - cudaMemcpyDeviceToDevice, embedding_data_.get_local_gpu(0).get_stream())); - } -#endif - - // reorder - functors_.forward_reorder( - embedding_data_.get_batch_size_per_gpu(true), embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size, utest_all2all_tensors_, - utest_reorder_tensors_, embedding_data_.get_resource_manager()); - - // there are batch_size_per_gpu samples' wgard on each GPU - size_t memcpy_size = embedding_data_.get_batch_size_per_gpu(true) * - embedding_data_.embedding_params_.slot_num * - embedding_data_.embedding_params_.embedding_vec_size; - - // nccl gather - functors_.all_gather(memcpy_size, - utest_reorder_tensors_, // send - utest_backward_temp_tensors_, // recv - embedding_data_.get_resource_manager()); - - // memcpy H2D - functors_.get_backward_results( - devIndex, embedding_data_.get_resource_manager().get_global_gpu_count() * memcpy_size, - utest_backward_temp_tensors_, wgrad, embedding_data_.get_resource_manager()); - - return; - } - - /** - * Get the update_params() results(the hash table, including hash_table_keys - * and hash_table_values) from GPUs and copy them to the host pointers. - * This function is only used for unit test. - * @param hash_table_key the host pointer for storing the hash table keys. 
- * @param hash_table_value the host pointer for storing the hash table values. - */ - void get_update_params_results(Tensor2 &hash_table_key, - Tensor2 &hash_table_value) {} - - void check_overflow() const override {} - - /** only used in tf embedding plugin to distribute top_gradients to each GPUs' output tensor. - */ - cudaError_t update_top_gradients(const bool on_gpu, const void *const top_gradients) override { - auto output_tensors = embedding_data_.get_output_tensors(true); - CudaDeviceContext context; - - const auto top_gradients_internel = reinterpret_cast(top_gradients); - cudaMemcpyKind direction = (on_gpu ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice); - - cudaError_t error = cudaError_t::cudaSuccess; - for (size_t dev_id = 0; dev_id < embedding_data_.get_resource_manager().get_local_gpu_count(); - ++dev_id) { - context.set_device(embedding_data_.get_local_gpu(dev_id).get_device_id()); - - error = cudaMemcpyAsync( - output_tensors[dev_id].get_ptr(), - top_gradients_internel + dev_id * output_tensors[dev_id].get_num_elements(), - output_tensors[dev_id].get_size_in_bytes(), direction, - embedding_data_.get_local_gpu(dev_id).get_stream()); - if (error != cudaError_t::cudaSuccess) return error; - } - - for (size_t dev_id = 0; dev_id < embedding_data_.get_resource_manager().get_local_gpu_count(); - ++dev_id) { - context.set_device(embedding_data_.get_local_gpu(dev_id).get_device_id()); - error = cudaStreamSynchronize(embedding_data_.get_local_gpu(dev_id).get_stream()); - if (error != cudaError_t::cudaSuccess) return error; - } - - return cudaError_t::cudaSuccess; - } - - void freeze() override { embedding_data_.is_trainable_ = false; } - - void unfreeze() override { embedding_data_.is_trainable_ = true; } - - bool is_trainable() const override { return embedding_data_.is_trainable_; } - - USE_EMBEDDING_DATA_FUNCTION(embedding_data_) -}; // end of class LocalizedSlotSparseEmbeddingOneHot - -} // namespace HugeCTR diff --git a/HugeCTR/include/embeddings/sparse_embedding_functors.hpp b/HugeCTR/include/embeddings/sparse_embedding_functors.hpp index e00f00b90f..24b422f963 100644 --- a/HugeCTR/include/embeddings/sparse_embedding_functors.hpp +++ b/HugeCTR/include/embeddings/sparse_embedding_functors.hpp @@ -284,47 +284,6 @@ class SparseEmbeddingFunctors { const Tensor2 &embedding_features, Tensor2 &wgrad, size_t sm, cudaStream_t stream); - /** - * update_params for LocalizedSlotSparseEmbeddingOneHot. - * overload for fp16. Only support atomic SGD currently. - * The second step of backward propagation: update embedding tables(weights) - * @param stream cuda stream corresponding to the current GPU. - * @param embedding_vec_size embedding vector size. - * @param opt_params optimizer params. - * @param nnz non-zero feature number in one batch - * @param hash_value_index the pointer of hash value_index - * @param wgrad the pointer of wgrad - * @param hash_table_value the pointer of hash table value, which will be updated - */ - template - void update_params(size_t embedding_vec_size, const OptParams &opt_params, size_t nnz, - const Tensor2 &hash_value_index, - const Tensor2 &wgrad, Tensor2 &hash_table_value, - Tensor2 &top_categories, size_t &size_top_categories, size_t sm_count, - cudaStream_t stream, bool force_stats = false); - - /** - * Atomic cached sgd update. 
- * - * @param num_samples number of samples for which to accumulate the gradient - * @param embedding_vec_size size of the embedding vector per category - * @param hash_value_index - * @param lr - * @param scaler - * @param wgrad - * @param hash_table_value - * @param top_categories - * @param size_top_categories - * @param stream - * - */ - template - static void opt_sgd_atomic_cached(size_t num_samples, size_t embedding_vec_size, - const size_t *hash_value_index, float lr, float scaler, - const TypeEmbeddingComp *wgrad, float *hash_table_value, - size_t *top_categories, size_t &size_top_categories, - cudaStream_t stream, bool force_stats = false); - /** * collection communication: reduce_scatter f or DistributedSlotSparseEmbeddingHash * @param recv_count the count of elements will be received. diff --git a/HugeCTR/include/exchange_wgrad.hpp b/HugeCTR/include/exchange_wgrad.hpp index 9bfb1c01c6..3ced5f4aa7 100644 --- a/HugeCTR/include/exchange_wgrad.hpp +++ b/HugeCTR/include/exchange_wgrad.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -43,14 +44,15 @@ class NetworkExchangeWgrad : public ExchangeWgrad { void init_ar_comm(const std::vector& ptr, size_t size) final; void update_embed_wgrad_size(size_t size) final; void allreduce(size_t device_id, cudaStream_t stream); - NetworkExchangeWgrad(const std::shared_ptr& resource_manager); + NetworkExchangeWgrad(const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager); ~NetworkExchangeWgrad() = default; private: // TODO remove them after hybrid embedding is deprecated BuffPtrs network_wgrad_buffs_; BuffPtrs null_wgrad_buffs_; - std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; AllReduceInPlaceComm::Handle ar_handle_; @@ -67,7 +69,8 @@ class GroupedExchangeWgrad : public ExchangeWgrad { void init_ar_comm(const std::vector& ptr, size_t size) final; void update_embed_wgrad_size(size_t size) final; void allreduce(size_t device_id, cudaStream_t stream); - GroupedExchangeWgrad(const std::shared_ptr& resource_manager); + GroupedExchangeWgrad(const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager); ~GroupedExchangeWgrad() = default; private: @@ -75,7 +78,7 @@ class GroupedExchangeWgrad : public ExchangeWgrad { BuffPtrs network_wgrad_buffs_; BuffPtrs embed_wgrad_buffs_; - std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; AllReduceInPlaceComm::Handle ar_handle_; diff --git a/HugeCTR/include/parser.hpp b/HugeCTR/include/parser.hpp index 51b4e71c50..561b61174d 100644 --- a/HugeCTR/include/parser.hpp +++ b/HugeCTR/include/parser.hpp @@ -199,9 +199,7 @@ const std::map LAYER_TYPE_MAP_MP = { {"SequenceMask", Layer_t::SequenceMask}}; const std::map EMBEDDING_TYPE_MAP = { {"DistributedSlotSparseEmbeddingHash", Embedding_t::DistributedSlotSparseEmbeddingHash}, - {"LocalizedSlotSparseEmbeddingHash", Embedding_t::LocalizedSlotSparseEmbeddingHash}, - {"LocalizedSlotSparseEmbeddingOneHot", Embedding_t::LocalizedSlotSparseEmbeddingOneHot}, - {"HybridSparseEmbedding", Embedding_t::HybridSparseEmbedding}}; + {"LocalizedSlotSparseEmbeddingHash", Embedding_t::LocalizedSlotSparseEmbeddingHash}}; const std::map INITIALIZER_TYPE_MAP = { {"Uniform", Initializer_t::Uniform}, {"XavierNorm", Initializer_t::XavierNorm}, @@ -244,15 +242,6 @@ static const std::map ALIGNED_TYPE_MAP = { {"None", Alignment_t::None}, }; -static const std::map COMMUNICATION_TYPE_MAP = { - {"IB_NVLink_Hierarchical", hybrid_embedding::CommunicationType::IB_NVLink_Hier}, - 
{"IB_NVLink", hybrid_embedding::CommunicationType::IB_NVLink}, - {"NVLink_SingleNode", hybrid_embedding::CommunicationType::NVLink_SingleNode}}; - -static const std::map - HYBRID_EMBEDDING_TYPE_MAP = { - {"Distributed", hybrid_embedding::HybridEmbeddingType::Distributed}}; - inline bool has_key_(const nlohmann::json& j_in, const std::string& key_in) { if (j_in.find(key_in) == j_in.end()) { return false; diff --git a/HugeCTR/include/pybind/common_wrapper.hpp b/HugeCTR/include/pybind/common_wrapper.hpp index 9bca4c31ef..5885e7d579 100644 --- a/HugeCTR/include/pybind/common_wrapper.hpp +++ b/HugeCTR/include/pybind/common_wrapper.hpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -59,10 +58,6 @@ void CommonPybind(pybind11::module& m) { .value("Sum", HugeCTR::Check_t::Sum) .value("Non", HugeCTR::Check_t::None) .export_values(); - pybind11::enum_(m, "DataReaderSparse_t") - .value("Distributed", HugeCTR::DataReaderSparse_t::Distributed) - .value("Localized", HugeCTR::DataReaderSparse_t::Localized) - .export_values(); pybind11::enum_(m, "DataReaderType_t") .value("Norm", HugeCTR::DataReaderType_t::Norm) .value("Raw", HugeCTR::DataReaderType_t::Raw) @@ -90,9 +85,6 @@ void CommonPybind(pybind11::module& m) { HugeCTR::Embedding_t::DistributedSlotSparseEmbeddingHash) .value("LocalizedSlotSparseEmbeddingHash", HugeCTR::Embedding_t::LocalizedSlotSparseEmbeddingHash) - .value("LocalizedSlotSparseEmbeddingOneHot", - HugeCTR::Embedding_t::LocalizedSlotSparseEmbeddingOneHot) - .value("HybridSparseEmbedding", HugeCTR::Embedding_t::HybridSparseEmbedding) .export_values(); pybind11::enum_(m, "Initializer_t") .value("Default", HugeCTR::Initializer_t::Default) @@ -157,15 +149,6 @@ void CommonPybind(pybind11::module& m) { pybind11::arg("io_alignment") = 0, pybind11::arg("shuffle"), pybind11::arg("aligned_type") = Alignment_t::None, pybind11::arg("multi_hot_reader") = true, pybind11::arg("is_dense_float") = true); - pybind11::class_(m, "HybridEmbeddingParam") - .def(pybind11::init(), - pybind11::arg("max_num_frequent_categories"), - pybind11::arg("max_num_infrequent_samples"), pybind11::arg("p_dup_max"), - pybind11::arg("max_all_reduce_bandwidth"), pybind11::arg("max_all_to_all_bandwidth"), - pybind11::arg("efficiency_bandwidth_ratio"), pybind11::arg("communication_type"), - pybind11::arg("hybrid_embedding_type")); pybind11::enum_(m, "LrPolicy_t") .value("fixed", HugeCTR::LrPolicy_t::fixed) .export_values(); @@ -218,14 +201,6 @@ void CommonPybind(pybind11::module& m) { .value("OneShot", HugeCTR::AllReduceAlgo::ONESHOT) .value("NCCL", HugeCTR::AllReduceAlgo::NCCL) .export_values(); - pybind11::enum_(m, "HybridEmbeddingType") - .value("Distributed", HugeCTR::hybrid_embedding::HybridEmbeddingType::Distributed) - .export_values(); - pybind11::enum_(m, "CommunicationType") - .value("IB_NVLink_Hier", HugeCTR::hybrid_embedding::CommunicationType::IB_NVLink_Hier) - .value("IB_NVLink", HugeCTR::hybrid_embedding::CommunicationType::IB_NVLink) - .value("NVLink_SingleNode", HugeCTR::hybrid_embedding::CommunicationType::NVLink_SingleNode) - .export_values(); pybind11::enum_(m, "Distribution_t") .value("Uniform", HugeCTR::Distribution_t::Uniform) .value("PowerLaw", HugeCTR::Distribution_t::PowerLaw) diff --git a/HugeCTR/include/pybind/model.hpp b/HugeCTR/include/pybind/model.hpp index def5a93327..3f1c73e34b 100644 --- a/HugeCTR/include/pybind/model.hpp +++ b/HugeCTR/include/pybind/model.hpp @@ -115,13 +115,7 @@ std::set TRAINABLE_LAYERS = { std::map EMBEDDING_TYPE_TO_STRING = { 
{Embedding_t::DistributedSlotSparseEmbeddingHash, "DistributedSlotSparseEmbeddingHash"}, - {Embedding_t::LocalizedSlotSparseEmbeddingHash, "LocalizedSlotSparseEmbeddingHash"}, - {Embedding_t::LocalizedSlotSparseEmbeddingOneHot, "LocalizedSlotSparseEmbeddingOneHot"}, - {Embedding_t::HybridSparseEmbedding, "HybridSparseEmbedding"}}; - -std::map READER_SPARSE_TYPE_TO_STRING = { - {DataReaderSparse_t::Distributed, "DistributedSlot"}, - {DataReaderSparse_t::Localized, "LocalizedSlot"}}; + {Embedding_t::LocalizedSlotSparseEmbeddingHash, "LocalizedSlotSparseEmbeddingHash"}}; std::map INITIALIZER_TYPE_TO_STRING = { {Initializer_t::Uniform, "Uniform"}, @@ -132,14 +126,6 @@ std::map INITIALIZER_TYPE_TO_STRING = { std::map ALLREDUCE_ALGO_TO_STRING = { {AllReduceAlgo::ONESHOT, "OneShot"}, {AllReduceAlgo::NCCL, "NCCL"}}; -std::map HE_COMM_TYPE_TO_STRING = { - {hybrid_embedding::CommunicationType::IB_NVLink_Hier, "IB_NVLink_Hierarchical"}, - {hybrid_embedding::CommunicationType::IB_NVLink, "IB_NVLink"}, - {hybrid_embedding::CommunicationType::NVLink_SingleNode, "NVLink_SingleNode"}}; - -std::map HE_TYPE_TO_STRING = { - {hybrid_embedding::HybridEmbeddingType::Distributed, "Distributed"}}; - std::map FC_POSITION_TO_STRING = { {FcPosition_t::Head, "Head"}, {FcPosition_t::Body, "Body"}, {FcPosition_t::Tail, "Tail"}, {FcPosition_t::Isolated, "Isolated"}, @@ -206,13 +192,12 @@ struct SparseEmbedding { std::string bottom_name; std::vector slot_size_array; std::shared_ptr embedding_opt_params; - HybridEmbeddingParam hybrid_embedding_param; + SparseEmbedding(Embedding_t embedding_type, size_t workspace_size_per_gpu_in_mb, size_t embedding_vec_size, const std::string& combiner_str, std::string sparse_embedding_name, std::string bottom_name, std::vector& slot_size_array, - std::shared_ptr& embedding_opt_params, - const HybridEmbeddingParam& hybrid_embedding_param); + std::shared_ptr& embedding_opt_params); void initialize_max_vocabulary_size_per_gpu(); }; @@ -339,8 +324,7 @@ void add_input(Input& input, DataReaderParams& reader_params, std::vector>& train_tensor_entities_list, std::vector>& evaluate_tensor_entities_list, std::shared_ptr& train_data_reader, - std::shared_ptr& evaluate_data_reader, - std::shared_ptr& init_data_reader, size_t batch_size, + std::shared_ptr& evaluate_data_reader, size_t batch_size, size_t batch_size_eval, bool use_mixed_precision, bool repeat_dataset, bool train_intra_iteration_overlap, size_t num_iterations_statistics, const std::shared_ptr); @@ -352,6 +336,7 @@ void add_sparse_embedding(SparseEmbedding& sparse_embedding, std::vector>& evaluate_tensor_entities_list, std::vector>& embeddings, const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager, size_t batch_size, size_t batch_size_eval, OptParams& embedding_opt_params, std::shared_ptr& exchange_wgrad, bool use_cuda_graph, @@ -608,6 +593,7 @@ class Model final { std::shared_ptr evaluate_data_reader_; /**< data reader for evaluation. */ std::shared_ptr resource_manager_; /**< GPU resources include handles and streams etc.*/ + std::shared_ptr collective_manager_; std::shared_ptr embedding_para_io_; metrics::Metrics metrics_; /**< evaluation metrics. 
*/ @@ -648,33 +634,6 @@ class Model final { size_t number_of_networks() const; - struct GraphScheduler { - private: - volatile size_t* executed_iter; - size_t launched_iter; - - public: - GraphScheduler(std::shared_ptr resource_manager) : launched_iter(0) { - // set up trickling launch - CudaCPUDeviceContext ctx(resource_manager->get_local_gpu(0)->get_device_id()); - HCTR_LIB_THROW(cudaMallocHost((void**)&executed_iter, sizeof(size_t))); - *executed_iter = 0; - } - ~GraphScheduler() { cudaFreeHost(const_cast(executed_iter)); } - void trickling() { - // this function is called by the only thread, hence no need to specify the rank - while (launched_iter > *(executed_iter) + 1) { - usleep(10); - } - launched_iter++; - } - void record_execution(size_t local_rank, cudaStream_t stream) { - // Only rank 0 needs to do the work - if (local_rank == 0) inc_var(executed_iter, stream); - } - }; - std::unique_ptr graph_scheduler_; - struct Graph { // train and eval can be called directly by user bool is_first_train_batch_ = true; @@ -696,22 +655,10 @@ class Model final { bool is_scheduled_datareader() { return (reader_params_.data_reader_type == DataReaderType_t::RawAsync); } - bool is_scheduled_embedding() { - return (embeddings_.size() == 1 && - embeddings_[0]->get_embedding_type() == Embedding_t::HybridSparseEmbedding); - } - template - void create_train_pipeline(std::vector>& networks); - template - void create_evaluate_pipeline(std::vector>& networks); - template - void create_train_network_pipeline(std::vector>& networks); - template + void create_train_network_pipeline(std::vector>& networks); void create_eval_network_pipeline(std::vector>& networks); - template - void create_train_pipeline_with_ebc(std::vector>& networks); - template - void create_evaluate_pipeline_with_ebc(std::vector>& networks); + void create_train_pipeline_with_ebc(std::vector>& networks); + void create_evaluate_pipeline_with_ebc(std::vector>& networks); bool skip_prefetch_in_last_batch(bool is_train); long long read_a_batch(bool is_train); diff --git a/HugeCTR/include/pybind/model_wrapper.hpp b/HugeCTR/include/pybind/model_wrapper.hpp index e048f88de9..9b842300e9 100644 --- a/HugeCTR/include/pybind/model_wrapper.hpp +++ b/HugeCTR/include/pybind/model_wrapper.hpp @@ -70,17 +70,12 @@ void ModelPybind(pybind11::module &m) { pybind11::class_>( m, "SparseEmbedding") .def(pybind11::init &, std::shared_ptr &, - const HybridEmbeddingParam &>(), + std::string, std::vector &, std::shared_ptr &>(), pybind11::arg("embedding_type"), pybind11::arg("workspace_size_per_gpu_in_mb") = 0, pybind11::arg("embedding_vec_size"), pybind11::arg("combiner"), pybind11::arg("sparse_embedding_name"), pybind11::arg("bottom_name"), pybind11::arg("slot_size_array") = std::vector(), - pybind11::arg("optimizer") = std::shared_ptr(new OptParamsPy()), - pybind11::arg("hybrid_embedding_param") = - HybridEmbeddingParam{1, -1, 0.01, 1.3e11, 2.6e11, 1.0, - hybrid_embedding::CommunicationType::NVLink_SingleNode, - hybrid_embedding::HybridEmbeddingType::Distributed}); + pybind11::arg("optimizer") = std::shared_ptr(new OptParamsPy())); pybind11::class_(m, "DenseLayerComputeConfig") .def(pybind11::init(), pybind11::arg("async_wgrad") = false, pybind11::arg("fuse_wb") = false); diff --git a/HugeCTR/include/resource_manager.hpp b/HugeCTR/include/resource_manager.hpp index 491595ae43..cfa0090c66 100644 --- a/HugeCTR/include/resource_manager.hpp +++ b/HugeCTR/include/resource_manager.hpp @@ -32,9 +32,7 @@ namespace HugeCTR { */ class ResourceManager : public 
ResourceManagerBase { public: - static std::shared_ptr create( - const std::vector>& visible_devices, unsigned long long seed, - DeviceMap::Layout layout = DeviceMap::LOCAL_FIRST); + virtual ~ResourceManager() = default; virtual int get_num_process() const = 0; virtual int get_process_id() const = 0; virtual int get_master_process_id() const = 0; @@ -49,14 +47,6 @@ class ResourceManager : public ResourceManagerBase { virtual const std::shared_ptr& get_device_rmm_device_memory_resource(int local_gpu_id) const = 0; - -#ifdef ENABLE_MPI - virtual void init_ib_comm() = 0; - virtual IbComm* get_ib_comm() const = 0; - virtual void set_ready_to_transfer() = 0; -#endif - virtual void set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision) = 0; - virtual AllReduceInPlaceComm* get_ar_comm() const = 0; }; } // namespace HugeCTR diff --git a/HugeCTR/include/resource_manager_base.hpp b/HugeCTR/include/resource_manager_base.hpp index 735eb5e7be..754b1a67bd 100644 --- a/HugeCTR/include/resource_manager_base.hpp +++ b/HugeCTR/include/resource_manager_base.hpp @@ -27,6 +27,7 @@ namespace HugeCTR { */ class ResourceManagerBase { public: + virtual ~ResourceManagerBase() = default; virtual void set_local_gpu(std::shared_ptr gpu_resource, size_t local_gpu_id) = 0; virtual const std::shared_ptr& get_local_gpu(size_t local_gpu_id) const = 0; virtual const std::shared_ptr& get_local_gpu_from_device_id( diff --git a/HugeCTR/include/resource_managers/resource_manager_core.hpp b/HugeCTR/include/resource_managers/resource_manager_core.hpp index 2118f3c89a..438319e194 100644 --- a/HugeCTR/include/resource_managers/resource_manager_core.hpp +++ b/HugeCTR/include/resource_managers/resource_manager_core.hpp @@ -44,8 +44,11 @@ class ResourceManagerCore : public ResourceManager { public: ResourceManagerCore(int num_process, int process_id, DeviceMap&& device_map, unsigned long long seed); - ResourceManagerCore(const ResourceManagerCore&) = delete; - ResourceManagerCore& operator=(const ResourceManagerCore&) = delete; + static std::shared_ptr create( + const std::vector>& visible_devices, unsigned long long seed, + DeviceMap::Layout layout = DeviceMap::LOCAL_FIRST); + + HCTR_DISALLOW_COPY_AND_MOVE(ResourceManagerCore); ~ResourceManagerCore(); // from ResourceManagerBase @@ -111,25 +114,5 @@ class ResourceManagerCore : public ResourceManager { const std::shared_ptr& get_device_rmm_device_memory_resource( int local_gpu_id) const override; - -#ifdef ENABLE_MPI - void init_ib_comm() override { - HCTR_OWN_THROW(Error_t::IllegalCall, "Error: should not be reached"); - } - IbComm* get_ib_comm() const override { - HCTR_OWN_THROW(Error_t::IllegalCall, "Error: should not be reached"); - return nullptr; - } - void set_ready_to_transfer() override { - HCTR_OWN_THROW(Error_t::IllegalCall, "Error: should not be reached"); - } -#endif - void set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision) override { - HCTR_OWN_THROW(Error_t::IllegalCall, "Error: should not be reached"); - } - AllReduceInPlaceComm* get_ar_comm() const override { - HCTR_OWN_THROW(Error_t::IllegalCall, "Error: should not be reached"); - return nullptr; - } }; } // namespace HugeCTR diff --git a/HugeCTR/include/resource_managers/resource_manager_ext.hpp b/HugeCTR/include/resource_managers/resource_manager_ext.hpp deleted file mode 100644 index 1e68af8c5a..0000000000 --- a/HugeCTR/include/resource_managers/resource_manager_ext.hpp +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
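For illustration: with ResourceManagerExt removed below, callers are expected to construct the resource manager through the ResourceManagerCore::create() factory added above. A minimal sketch under stated assumptions only — the include path follows the header location, the factory still returns std::shared_ptr<ResourceManager> as the removed ResourceManager::create() did, and the device list and seed are placeholders:

// Minimal sketch, not part of this patch; include path, device layout and seed
// are assumptions.
#include <resource_managers/resource_manager_core.hpp>

#include <memory>
#include <vector>

std::shared_ptr<HugeCTR::ResourceManager> make_resource_manager() {
  // Single process with two visible GPUs; ResourceManager::create() no longer
  // exists, so the concrete ResourceManagerCore is created directly.
  std::vector<std::vector<int>> visible_devices{{0, 1}};
  return HugeCTR::ResourceManagerCore::create(visible_devices, /*seed=*/0ULL);
}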
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace HugeCTR { - -/** - * @brief GPU resources manager which holds all the resources required by training - * - * An extended GPU Resource manager - */ -class ResourceManagerExt : public ResourceManager { - std::shared_ptr core_; - -#ifdef ENABLE_MPI - std::unique_ptr ib_comm_ = NULL; -#endif - std::shared_ptr ar_comm_ = NULL; - - ResourceManagerExt(std::shared_ptr core) : core_(core) {} - - public: - static std::shared_ptr create( - const std::vector>& visible_devices, unsigned long long seed, - DeviceMap::Layout layout = DeviceMap::LOCAL_FIRST); - - ResourceManagerExt(const ResourceManagerExt&) = delete; - ResourceManagerExt& operator=(const ResourceManagerExt&) = delete; - - // from ResourceManagerBase - void set_local_gpu(std::shared_ptr gpu_resource, size_t local_gpu_id) override { - core_->set_local_gpu(gpu_resource, local_gpu_id); - } - const std::shared_ptr& get_local_gpu(size_t local_gpu_id) const override { - return core_->get_local_gpu(local_gpu_id); - } - const std::shared_ptr& get_local_gpu_from_device_id( - size_t device_id) const override { - return core_->get_local_gpu_from_device_id(device_id); - } - size_t get_local_gpu_count() const override { return core_->get_local_gpu_count(); } - size_t get_global_gpu_count() const override { return core_->get_global_gpu_count(); } - - // from ResourceManager - int get_num_process() const override { return core_->get_num_process(); } - int get_process_id() const override { return core_->get_process_id(); } - int get_master_process_id() const override { return core_->get_master_process_id(); } - bool is_master_process() const override { return core_->is_master_process(); } - - const std::shared_ptr& get_local_cpu() const override { - return core_->get_local_cpu(); - } - - const std::vector>& get_local_gpus() const override { - return core_->get_local_gpus(); - } - - const std::vector& get_local_gpu_device_id_list() const override { - return core_->get_local_gpu_device_id_list(); - } - - int get_process_id_from_gpu_global_id(size_t global_gpu_id) const override { - return core_->get_process_id_from_gpu_global_id(global_gpu_id); - } - - size_t get_gpu_local_id_from_global_id(size_t global_gpu_id) const override { - return core_->get_gpu_local_id_from_global_id(global_gpu_id); - } - - size_t get_gpu_global_id_from_local_id(size_t local_gpu_id) const override { - return core_->get_gpu_global_id_from_local_id(local_gpu_id); - } - - bool p2p_enabled(int src_dev, int dst_dev) const override { - return core_->p2p_enabled(src_dev, dst_dev); - } - bool all_p2p_enabled() const override { return core_->all_p2p_enabled(); } - - DeviceMap::Layout get_device_layout() const override { return core_->get_device_layout(); } - - const std::shared_ptr& get_device_rmm_device_memory_resource( - int local_gpu_id) const override { - return core_->get_device_rmm_device_memory_resource(local_gpu_id); - } - -#ifdef ENABLE_MPI - void init_ib_comm() override; - 
IbComm* get_ib_comm() const override { return ib_comm_.get(); } - void set_ready_to_transfer() override { - if (ib_comm_) ib_comm_->set_ready_to_transfer(); - } -#endif - void set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision) override; - AllReduceInPlaceComm* get_ar_comm() const override { return ar_comm_.get(); } -}; -} // namespace HugeCTR diff --git a/HugeCTR/include/scheduleable.hpp b/HugeCTR/include/scheduleable.hpp index 6633670afa..1ab96638bc 100644 --- a/HugeCTR/include/scheduleable.hpp +++ b/HugeCTR/include/scheduleable.hpp @@ -45,22 +45,4 @@ class SchedulableDataReader : public IDataReader { virtual std::vector get_label_tensor23s() const = 0; virtual std::vector get_dense_tensor23s() const = 0; }; - -class SchedulableEmbeding : public IEmbedding { - public: - virtual ~SchedulableEmbeding() = default; - - virtual void assign_input_tensors(bool is_train, size_t batch_size, size_t inflight_id, - bool cached) = 0; - virtual void index_calculation(bool is_train, int i) = 0; - virtual void freq_forward(bool is_train, int i, bool is_first_eval_batch = true) = 0; - virtual void freq_backward(int i) = 0; - virtual void freq_update_params(int i) = 0; - virtual void infreq_model_forward(int i) = 0; - virtual void infreq_network_forward(bool is_train, int i) = 0; - virtual void global_barrier(bool is_train, int i) = 0; - virtual void infreq_network_backward(int i) = 0; - virtual void infreq_model_backward(int i) = 0; -}; - } // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/src/collectives/collective.cpp b/HugeCTR/src/collectives/collective.cpp new file mode 100644 index 0000000000..a5869a84be --- /dev/null +++ b/HugeCTR/src/collectives/collective.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace HugeCTR { + +#ifdef ENABLE_MPI +void CollectiveManager::init_ib_comm() { + int num_process = core_->get_num_process(); + if (num_process > 1) { + int process_id = core_->get_process_id(); + ib_comm_ = std::make_unique(); + ib_comm_->init(num_process, core_->get_local_gpu_count(), process_id, + core_->get_local_gpu_device_id_list()); + } +} +#endif + +void CollectiveManager::set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision) { + int num_process = core_->get_num_process(); +#ifdef ENABLE_MPI + IbComm* ib_comm_ptr = nullptr; + if (algo == AllReduceAlgo::ONESHOT) { + init_ib_comm(); + ib_comm_ptr = ib_comm_.get(); + } + ar_comm_ = AllReduceInPlaceComm::create(num_process, algo, use_mixed_precision, + core_->get_local_gpus(), ib_comm_ptr); +#else + ar_comm_ = + AllReduceInPlaceComm::create(num_process, algo, use_mixed_precision, core_->get_local_gpus()); +#endif +} + +} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/async_reader/async_reader.cpp b/HugeCTR/src/data_readers/async_reader/async_reader.cpp deleted file mode 100644 index cc83394614..0000000000 --- a/HugeCTR/src/data_readers/async_reader/async_reader.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
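For illustration, a minimal sketch of how the communicator setup that previously lived in ResourceManagerExt might now be driven through CollectiveManager. Only set_ar_comm()/init_ib_comm() above and the AllReduceAlgo values exposed in common_wrapper.hpp are taken from this patch; the constructor and the include path are assumptions, since collective.hpp itself is not reproduced here:

// Hypothetical wiring; the constructor argument is inferred from the core_
// member used in set_ar_comm()/init_ib_comm() above and is not confirmed by
// this excerpt.
#include <collectives/collective.hpp>

#include <memory>

std::shared_ptr<HugeCTR::CollectiveManager> setup_collectives(
    const std::shared_ptr<HugeCTR::ResourceManager>& resource_manager) {
  auto collective_manager =
      std::make_shared<HugeCTR::CollectiveManager>(resource_manager);
  // set_ar_comm() builds the in-place all-reduce comm; with ENABLE_MPI and the
  // ONESHOT algorithm it first brings up IbComm via init_ib_comm().
  collective_manager->set_ar_comm(HugeCTR::AllReduceAlgo::ONESHOT,
                                  /*use_mixed_precision=*/true);
  // The resulting manager is what NetworkExchangeWgrad / GroupedExchangeWgrad
  // now receive alongside the ResourceManager (see the exchange_wgrad.hpp hunk
  // earlier in this patch).
  return collective_manager;
}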
- */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -AsyncReaderImpl::AsyncReaderImpl(std::string fname, size_t batch_size_bytes, - const ResourceManager* resource_manager, int num_threads, - int num_batches_per_thread, size_t io_block_size, int io_depth, - int io_alignment, bool shuffle, bool wait_for_gpu_idle) - : - - fname_(fname), - batch_size_bytes_(batch_size_bytes), - resource_manager_(resource_manager), - num_devices_(resource_manager_->get_local_gpu_count()), - num_threads_(num_threads), - num_batches_per_thread_(num_batches_per_thread), - io_block_size_(io_block_size), - io_depth_(io_depth), - io_alignment_(io_alignment), - wait_for_gpu_idle_(wait_for_gpu_idle), - queue_id_(0), - thread_batch_ids_(num_threads_), - thread_buffer_ids_(num_threads_), - gpu_thread_ids_(num_devices_), - local_readers_(num_threads_) { - total_file_size_ = std::filesystem::file_size(fname); - num_batches_ = (total_file_size_ + batch_size_bytes_ - 1) / batch_size_bytes; - batch_ids_.resize(num_batches_); - std::iota(batch_ids_.begin(), batch_ids_.end(), 0); - - if (shuffle) { - std::mt19937 gen(resource_manager_->get_local_cpu()->get_replica_uniform_seed()); - std::shuffle(batch_ids_.begin(), batch_ids_.end(), gen); - } - - // Don't allocate more buffers that number of batches in the file - buffers_.resize(std::min((size_t)num_threads_ * num_batches_per_thread, num_batches_)); - for (auto& buf : buffers_) { - buf = std::make_unique(); - buf->dev_data.resize(num_devices_); - for (int id = 0; id < num_devices_; id++) { - auto device_id = resource_manager_->get_local_gpu(id)->get_device_id(); - CudaDeviceContext ctx(device_id); - HCTR_LIB_THROW(cudaMalloc(&buf->dev_data[id], batch_size_bytes_)); - } - } - - streams_.resize(num_devices_); - for (int id = 0; id < num_devices_; id++) { - auto device_id = resource_manager_->get_local_gpu(id)->get_device_id(); - CudaDeviceContext ctx(device_id); - HCTR_LIB_THROW(cudaStreamCreateWithPriority(&streams_[id], cudaStreamNonBlocking, -100)); - } - HCTR_LIB_THROW(cudaEventCreateWithFlags(&event_success_, cudaEventDisableTiming)); - - // For correct perf benchmarking create the thread readers upfront - create_workers(); -} -// create_workers() will be called only once -void AsyncReaderImpl::create_workers() { - // Use round-robin distribution - for (size_t i = 0; i < num_batches_; i++) { - int thid = i % num_threads_; - thread_batch_ids_[thid].push_back(batch_ids_[i]); - } - - for (auto& id : gpu_thread_ids_) { - id.clear(); - } - for (auto& id : thread_buffer_ids_) { - id.clear(); - } - threads_.reserve(num_threads_); - - for (int thid = 0; thid < num_threads_; thid++) { - int raw_id = thid % num_devices_; - int device_id = resource_manager_->get_local_gpu(raw_id)->get_device_id(); - gpu_thread_ids_.at(raw_id).push_back(thid); - - std::vector thread_buffer_ptrs; - for (int i = 0; i < num_batches_per_thread_; i++) { - size_t buf_id = i * num_threads_ + thid; - if (buf_id < buffers_.size()) { - buffers_[buf_id]->raw_device_id = raw_id; - thread_buffer_ptrs.push_back(buffers_[buf_id].get()); - thread_buffer_ids_.at(thid).push_back(buf_id); - } - } - // Use omp parallel is fine as well? 
- threads_.emplace_back(std::thread([thid, raw_id, device_id, thread_buffer_ptrs, this]() { - CudaCPUDeviceContext ctx(device_id); - - local_readers_[thid] = std::make_unique( - fname_, resource_manager_, batch_size_bytes_, raw_id, streams_[raw_id], - thread_batch_ids_[thid], thread_buffer_ptrs, - ThreadAsyncReaderParameters{io_block_size_, io_alignment_, io_depth_, num_devices_, - wait_for_gpu_idle_, loop_}, - total_file_size_); - })); - } - for (auto& thread : threads_) { - thread.join(); - } - // this clear is important - threads_.clear(); -} - -bool AsyncReaderImpl::is_currently_loading() { return !threads_.empty(); } - -size_t AsyncReaderImpl::get_num_buffers() const { return buffers_.size(); } - -size_t AsyncReaderImpl::get_num_batches() const { return num_batches_; } - -void AsyncReaderImpl::load_async() { - if (is_currently_loading()) { - throw std::runtime_error("load_async() is called before the previous load_async finished!"); - } - - for (int thid = 0; thid < num_threads_; thid++) { - threads_.emplace_back(std::thread([thid, this]() { - int raw_id = thid % num_devices_; - int device_id = resource_manager_->get_local_gpu(raw_id)->get_device_id(); - CudaCPUDeviceContext ctx(device_id); - - local_readers_[thid]->load(); - })); - } -} - -BatchDesc AsyncReaderImpl::get_batch() { - if (!is_currently_loading()) { - throw std::runtime_error( - "Requested a batch from a file that is not being loaded. Please call load_async() first!"); - } - - for (size_t attempt = 0; attempt < buffers_.size(); attempt++) { - last_buffer_ = buffers_[queue_id_].get(); - - auto status = last_buffer_->status.load(); - while (status != BufferStatus::Finished) { - if (status == BufferStatus::ReadReady || status == BufferStatus::PermanentlyResident) { - return {last_buffer_->size, last_buffer_->dev_data, - status == BufferStatus::PermanentlyResident, static_cast(last_buffer_->id)}; - } - if (wait_for_gpu_idle_) { - last_buffer_->ready_to_upload_event.store(&event_success_); - } - - status = last_buffer_->status.load(); - } - queue_id_ = (queue_id_ + 1) % buffers_.size(); - } - - return {0, std::vector(0), false, 0}; -} - -void AsyncReaderImpl::wait_for_gpu_events(const std::vector events) { - if (!wait_for_gpu_idle_) { - return; - } - assert(events.size() == (size_t)num_devices_); - - for (int thid = 0; thid < num_threads_; thid++) { - int raw_id = thid % num_devices_; - wait_for_gpu_event(events[raw_id], raw_id); - } -} - -void AsyncReaderImpl::wait_for_gpu_event(cudaEvent_t* event, int raw_device_id) { - if (!wait_for_gpu_idle_) { - return; - } - - for (auto thid : gpu_thread_ids_.at(raw_device_id)) { - for (auto bufid : thread_buffer_ids_.at(thid)) { - if (buffers_[bufid]->status == BufferStatus::UploadInProcess) { - buffers_[bufid]->ready_to_upload_event.store(event); - } - } - } -} - -void AsyncReaderImpl::finalize_batch() { - // Don't update status of finished or resident buffers - BufferStatus expected = BufferStatus::ReadReady; - last_buffer_->status.compare_exchange_strong(expected, BufferStatus::IOReady); - if (loop_ && last_buffer_->id == (int64_t)num_batches_ - 1) { - queue_id_ = 0; - } else { - queue_id_ = (queue_id_ + 1) % buffers_.size(); - } -} - -void AsyncReaderImpl::finalize_batch(cudaEvent_t* event) { - last_buffer_->safe_to_upload_event.store(event); - finalize_batch(); -} - -int AsyncReaderImpl::get_last_batch_device() { - if (last_buffer_) { - return last_buffer_->raw_device_id; - } else { - return buffers_[queue_id_]->raw_device_id; - } -} - -void AsyncReaderImpl::reset() { - for (auto& 
reader : local_readers_) { - reader->reset(); - } - for (auto& thread : threads_) { - thread.join(); - } - threads_.clear(); - queue_id_ = 0; -} - -AsyncReaderImpl::~AsyncReaderImpl() { - reset(); - cudaEventDestroy(event_success_); -} - -} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/async_reader/async_reader_adapter.cpp b/HugeCTR/src/data_readers/async_reader/async_reader_adapter.cpp deleted file mode 100644 index 73ddb2b0e6..0000000000 --- a/HugeCTR/src/data_readers/async_reader/async_reader_adapter.cpp +++ /dev/null @@ -1,514 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -namespace HugeCTR { -template -AsyncReader::AsyncReader(std::string fname, size_t batch_size, size_t label_dim, - size_t dense_dim, std::vector& params, - bool mixed_precision, - const std::shared_ptr& resource_manager, - int num_threads, int num_batches_per_thread, - size_t io_block_size, int io_depth, int io_alignment, - bool shuffle, bool wait_for_gpu_idle, Alignment_t aligned) - : resource_manager_(resource_manager), - mixed_precision_(mixed_precision), - batch_size_(batch_size), - batch_size_per_dev_(batch_size_ / resource_manager->get_global_gpu_count()), - completion_events_(resource_manager->get_local_gpu_count()), - schedule_events_(resource_manager->get_local_gpu_count()), - split_schedule_events_(resource_manager->get_local_gpu_count()), - d2d_schedule_events_(resource_manager->get_local_gpu_count()), - s3w_streams_(resource_manager->get_local_gpu_count()), - d2d_streams_(resource_manager->get_local_gpu_count()), - cache_buffers_(false) { - assert(batch_size_ % resource_manager_->get_global_gpu_count() == 0); - assert(params.size() == 1); - static_assert(sizeof(LabelType) == sizeof(InputType)); - - int64_t dense_dim_align8 = dense_dim; - if (aligned == Alignment_t::Auto) dense_dim_align8 = (dense_dim + 7) / 8 * 8; - int64_t sparse_dim = params[0].slot_num; - sample_size_items_ = label_dim + dense_dim + sparse_dim; - size_t batch_size_bytes = sample_size_items_ * sizeof(InputType) * batch_size; - - label_dim_ = label_dim; - dense_dim_ = dense_dim_align8; - sparse_dim_ = sparse_dim; - reader_impl_ = std::make_unique( - fname, batch_size_bytes, resource_manager.get(), num_threads, num_batches_per_thread, - io_block_size, io_depth, io_alignment, shuffle, wait_for_gpu_idle); - - for (uint32_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - auto local_gpu = resource_manager_->get_local_gpu(i); - auto gpu_id = local_gpu->get_device_id(); - CudaDeviceContext ctx(gpu_id); - HCTR_LIB_THROW(cudaEventCreateWithFlags(&completion_events_[i], cudaEventDisableTiming)); - HCTR_LIB_THROW(cudaEventCreateWithFlags(&schedule_events_[i], cudaEventDisableTiming)); - HCTR_LIB_THROW(cudaEventCreateWithFlags(&split_schedule_events_[i], cudaEventDisableTiming)); - HCTR_LIB_THROW(cudaEventCreateWithFlags(&d2d_schedule_events_[i], 
cudaEventDisableTiming)); - - // set default stream - s3w_streams_[i] = local_gpu->get_stream(); - d2d_streams_[i] = local_gpu->get_stream(); - int64_t bytes = batch_size_per_dev_ * - (label_dim * sizeof(LabelType) + - dense_dim_align8 * (mixed_precision ? sizeof(__half) : sizeof(float))); - - core23::Tensor one_tensor(core23::TensorParams() - .device(core23::Device(core23::DeviceType::GPU, gpu_id)) - .data_type(core23::ScalarType::Char) - .shape({bytes})); - temp_tensors_.push_back(one_tensor); - - label_tensors_.emplace_back(core23::Tensor::bind( - one_tensor.data(), {batch_size_per_dev_, static_cast(label_dim)}, - core23::ToScalarType::value, core23::Device(core23::DeviceType::GPU, gpu_id))); - - dense_tensors_.emplace_back(core23::Tensor::bind( - one_tensor.data() + batch_size_per_dev_ * label_dim, - {batch_size_per_dev_, dense_dim_align8}, - mixed_precision_ ? core23::ScalarType::Half : core23::ScalarType::Float, - core23::Device(core23::DeviceType::GPU, gpu_id))); - } - - // zero-initialization - for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - const auto local_gpu = resource_manager_->get_local_gpu(i); - CudaDeviceContext ctx(local_gpu->get_device_id()); - core23::zeros_sync(dense_tensors_[i]); - } - - set_tensor_buffering(1); -} - -template -void AsyncReader::set_tensor_buffering(size_t num_batches_to_buffer) { - // If the number of buffers exceeds or is equal to number of batches in our dataset, then we - // may as well cache them so we only execute the 'split_3_way' kernel once. - cache_buffers_ = num_batches_to_buffer >= reader_impl_->get_num_batches(); - init_batch_tensors(num_batches_to_buffer); -} - -template -void AsyncReader::init_batch_tensors(size_t num_inflight) { - inflight_batch_tensors_.resize(num_inflight); - for (auto& batch_tensors : inflight_batch_tensors_) { - batch_tensors.tag = SIZE_MAX; // Invalid - - for (uint32_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - auto local_gpu = resource_manager_->get_local_gpu(i); - auto gpu_id = local_gpu->get_device_id(); - CudaDeviceContext ctx(gpu_id); - int64_t bytes = - batch_size_per_dev_ * (label_dim_ * sizeof(LabelType) + - dense_dim_ * (mixed_precision_ ? sizeof(__half) : sizeof(float))); - - core23::Tensor one_tensor(core23::TensorParams() - .device(core23::Device(core23::DeviceType::GPU, gpu_id)) - .data_type(core23::ScalarType::Char) - .shape({bytes})); - temp_tensors_.push_back(one_tensor); - batch_tensors.label_tensors.push_back(core23::Tensor::bind( - one_tensor.data(), {batch_size_per_dev_, static_cast(label_dim_)}, - core23::ToScalarType::value, core23::Device(core23::DeviceType::GPU, gpu_id))); - - batch_tensors.dense_tensors.emplace_back(core23::Tensor::bind( - one_tensor.data() + batch_size_per_dev_ * label_dim_, - {batch_size_per_dev_, dense_dim_}, - mixed_precision_ ? 
core23::ScalarType::Half : core23::ScalarType::Float, - core23::Device(core23::DeviceType::GPU, gpu_id))); - core23::zeros_sync(batch_tensors.dense_tensors.back()); - auto value_tensor = - core23::Tensor(core23::TensorParams() - .device(core23::Device(core23::DeviceType::GPU, gpu_id)) - .data_type(core23::ToScalarType::value) - .shape({batch_size_, sparse_dim_})); - // needs to allocate memory eagerly - value_tensor.data(); - auto dummy_row_offset_tensor = - core23::Tensor(core23::TensorParams() - .data_type(core23::ToScalarType::value) - .device(core23::Device(core23::DeviceType::GPU, gpu_id)) - .shape({4})); - std::shared_ptr dummy_nnz(new size_t(1)); - batch_tensors.sparse_tensors.emplace_back( - SparseTensor23(value_tensor, dummy_row_offset_tensor, dummy_nnz)); - } - } - current_sparse_tensors_ = inflight_batch_tensors_.at(0).sparse_tensors; -} - -template -long long AsyncReader::read_a_batch_to_device_delay_release() { - auto batch = reader_impl_->get_batch(); - if (batch.size_bytes == 0) { - reader_impl_->reset(); - reader_impl_->load_async(); - batch = reader_impl_->get_batch(); - } - - if (cache_buffers_) { - // TODO: replace with cache policy like LRU when number of batches exceeds what we can store - inflight_id_ = batch.id; - } else { - inflight_id_ = (inflight_id_ + 1) % inflight_batch_tensors_.size(); // FIFO - } - - BatchTensors& batch_tensors = inflight_batch_tensors_.at(inflight_id_); - - size_t current_batch_id = static_cast(batch.id); - current_batch_size_ = batch.size_bytes / (sample_size_items_ * sizeof(InputType)); - current_sparse_tensors_ = batch_tensors.sparse_tensors; - current_batch_cached_ = (current_batch_id == batch_tensors.tag) && cache_buffers_; - - int num_local_gpus = resource_manager_->get_local_gpu_count(); -#pragma omp parallel for num_threads(num_local_gpus) - for (int i = 0; i < num_local_gpus; i++) { - auto local_gpu = resource_manager_->get_local_gpu(i); - auto gpu_id = local_gpu->get_device_id(); - - CudaCPUDeviceContext ctx(gpu_id); - auto global_dev_id = resource_manager_->get_gpu_global_id_from_local_id(i); - - const cudaStream_t& stream = s3w_streams_[i]; - - // schedule at correct place in iteration - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, split_schedule_events_[i])); - - if (!current_batch_cached_) { // data can be cached for eval - - auto ptr_wrap = - std::make_shared(reinterpret_cast(batch.dev_data[i])); - // To save memory we're going to use the space in the Data for the unprocessed - // sparse features, and then run to_unique_categories essentially in place - // auto current_batch_size = batch.size_bytes / (sample_size_items_ * sizeof(dtype)); - // auto in_place_tensor = my_data.samples; - // in_place_tensor.reset_shape({current_batch_size, sparse_dim_}); - if (mixed_precision_) { - split_3_way<__half, SparseType>( - batch_tensors.label_tensors[i], batch_tensors.dense_tensors[i], - batch_tensors.sparse_tensors[i].get_value_tensor(), - core23::Tensor::bind(reinterpret_cast(ptr_wrap->get_ptr()), - {current_batch_size_, static_cast(sample_size_items_)}, - core23::ToScalarType::value, - core23::Device(core23::DeviceType::GPU, gpu_id)), - global_dev_id * batch_size_per_dev_, (global_dev_id + 1) * batch_size_per_dev_, stream); - } else { - split_3_way( - batch_tensors.label_tensors[i], batch_tensors.dense_tensors[i], - batch_tensors.sparse_tensors[i].get_value_tensor(), - core23::Tensor::bind(reinterpret_cast(ptr_wrap->get_ptr()), - {current_batch_size_, static_cast(sample_size_items_)}, - core23::ToScalarType::value, - 
core23::Device(core23::DeviceType::GPU, gpu_id)), - global_dev_id * batch_size_per_dev_, (global_dev_id + 1) * batch_size_per_dev_, stream); - } - } - - auto sparse_ready_event = local_gpu->get_event("sparse_tensors_ready"); - HCTR_LIB_THROW(cudaEventRecord(sparse_ready_event, stream)); - - auto d2d_stream = d2d_streams_[i]; - - // Need result from split-3-way - HCTR_LIB_THROW(cudaStreamWaitEvent(d2d_stream, sparse_ready_event)); - - // we are safe to overwrite - HCTR_LIB_THROW(cudaStreamWaitEvent(d2d_stream, d2d_schedule_events_[i])); - - // batch.dev_data can be reused - HCTR_LIB_THROW(cudaEventRecord(completion_events_[i], d2d_stream)); - - // isn't part of hybrid embedding - assign_dense_and_label_tensors(batch_tensors.label_tensors[i], batch_tensors.dense_tensors[i], - i, d2d_stream); - - auto tensors_ready_event = local_gpu->get_event("bottom_MLP_tensors_ready"); - HCTR_LIB_THROW(cudaEventRecord(tensors_ready_event, d2d_stream)); - } - - batch_tensors.tag = current_batch_id; - return current_batch_size_; -} - -template -void AsyncReader::set_schedule_streams(cudaStream_t s3w_stream, cudaStream_t d2d_stream, - int raw_device_id) { - s3w_streams_[raw_device_id] = s3w_stream; - d2d_streams_[raw_device_id] = d2d_stream; -} - -template -void AsyncReader::assign_dense_and_label_tensors(core23::Tensor& label_tensor, - core23::Tensor& dense_tensor, - int raw_device_id, - cudaStream_t stream) { - auto& dst_label_tensor = label_tensors_[raw_device_id]; - auto& dst_dense_tensor = dense_tensors_[raw_device_id]; - // TODO: allocate tensors together - if ((char*)dst_label_tensor.data() + dst_label_tensor.num_bytes() == - (char*)dst_dense_tensor.data()) { - HCTR_LIB_THROW(cudaMemcpyAsync(dst_label_tensor.data(), label_tensor.data(), - dst_label_tensor.num_bytes() + dense_tensor.num_bytes(), - cudaMemcpyDeviceToDevice, stream)); - } else { - HCTR_LIB_THROW(cudaMemcpyAsync(dst_label_tensor.data(), label_tensor.data(), - dst_label_tensor.num_bytes(), cudaMemcpyDeviceToDevice, stream)); - - HCTR_LIB_THROW(cudaMemcpyAsync(dst_dense_tensor.data(), dense_tensor.data(), - dst_dense_tensor.num_bytes(), cudaMemcpyDeviceToDevice, stream)); - } -} - -template -long long AsyncReader::get_full_batchsize() const { - return batch_size_; -} - -template -void AsyncReader::stream_wait_sparse_tensors(cudaStream_t stream, int raw_device_id, - bool from_graph) { - auto gpu = resource_manager_->get_local_gpu(raw_device_id); - const auto flags = from_graph ? cudaEventWaitExternal : 0; - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, gpu->get_event("sparse_tensors_ready"), flags)); -} - -template -void AsyncReader::stream_wait_dense_tensors(cudaStream_t stream, int raw_device_id, - bool from_graph) { - auto gpu = resource_manager_->get_local_gpu(raw_device_id); - const auto flags = from_graph ? 
cudaEventWaitExternal : 0; - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, gpu->get_event("bottom_MLP_tensors_ready"), flags)); -} - -template -bool AsyncReader::current_batch_incomplete() const { - return current_batch_size_ != batch_size_; -} - -template -void AsyncReader::ready_to_collect() { - auto raw_device_id = reader_impl_->get_last_batch_device(); - auto local_gpu = resource_manager_->get_local_gpu(raw_device_id); - CudaDeviceContext ctx(local_gpu->get_device_id()); - - reader_impl_->finalize_batch(&completion_events_[raw_device_id]); -} - -template -long long AsyncReader::read_a_batch_to_device() { - auto result = read_a_batch_to_device_delay_release(); - ready_to_collect(); - return result; -} - -template -void AsyncReader::schedule_split_3_way_here(cudaStream_t stream, int raw_device_id, - bool from_graph) { - unsigned int flags = from_graph ? cudaEventRecordExternal : 0; - HCTR_LIB_THROW(cudaEventRecordWithFlags(split_schedule_events_[raw_device_id], stream, flags)); -} - -template -void AsyncReader::schedule_d2d_here(cudaStream_t stream, int raw_device_id, - bool from_graph) { - unsigned int flags = from_graph ? cudaEventRecordExternal : 0; - HCTR_LIB_THROW(cudaEventRecordWithFlags(d2d_schedule_events_[raw_device_id], stream, flags)); -} - -template -void AsyncReader::schedule_here(cudaStream_t stream, int raw_device_id) { - HCTR_LIB_THROW(cudaEventRecord(schedule_events_[raw_device_id], stream)); - reader_impl_->wait_for_gpu_event(&schedule_events_[raw_device_id], raw_device_id); -} - -template -void AsyncReader::schedule_here_graph(cudaStream_t stream, int raw_device_id) { - HCTR_LIB_THROW( - cudaEventRecordWithFlags(schedule_events_[raw_device_id], stream, cudaEventRecordExternal)); -} - -template -void AsyncReader::update_schedule_graph(int raw_device_id) { - reader_impl_->wait_for_gpu_event(&schedule_events_[raw_device_id], raw_device_id); -} - -template -size_t AsyncReader::get_max_batches_inflight() const { - return reader_impl_->get_num_buffers(); -} - -template -bool AsyncReader::is_mixed_precision() { - return mixed_precision_; -} - -template -void AsyncReader::get_dimensions(size_t& label_dim, size_t& dense_dim, - size_t& sparse_dim, size_t& sample_size_items) { - label_dim = label_dim_; - dense_dim = dense_dim_; - sparse_dim = sparse_dim_; - sample_size_items = sample_size_items_; -} - -template -long long AsyncReader::get_current_batchsize_per_device(size_t local_id) { - long long batchsize_per_device = batch_size_ / resource_manager_->get_global_gpu_count(); - size_t global_id = resource_manager_->get_gpu_global_id_from_local_id(local_id); - long long remain_samples = current_batch_size_ - global_id * batchsize_per_device; - if (remain_samples >= batchsize_per_device) { - return batchsize_per_device; - } else if (remain_samples > 0) { - return remain_samples; - } else { - return 0; - } -} - -template -TensorScalarType AsyncReader::get_scalar_type() const { - return TensorScalarTypeFunc::get_type(); -}; -template -bool AsyncReader::is_started() const { - return reader_impl_->is_currently_loading(); -} -template -void AsyncReader::start() { - if (!this->is_started()) { - reader_impl_->load_async(); - } -} - -template -std::vector AsyncReader::get_label_tensor23s() const { - return label_tensors_; -} - -template -std::vector AsyncReader::get_dense_tensor23s() const { - return dense_tensors_; -} -// TODO remove after hybrid embedding deprecation -template -SparseTensors AsyncReader::get_value_tensors() const { - SparseTensors tmp_tensors; - // convert from 
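For reference, get_current_batchsize_per_device() above simply clamps each GPU's share of a possibly incomplete batch. A tiny host-side sketch of the same arithmetic, with made-up numbers:

#include <algorithm>
#include <cstdio>

long long batchsize_on_device(long long full_batch, long long current_batch,
                              long long num_gpus, long long global_id) {
  const long long per_device = full_batch / num_gpus;
  const long long remaining = current_batch - global_id * per_device;
  return std::clamp(remaining, 0LL, per_device);
}

int main() {
  // Full batch of 1024 on 8 GPUs (128 each), but the last batch has only 300 samples:
  for (long long id = 0; id < 8; ++id)
    printf("gpu %lld gets %lld samples\n", id, batchsize_on_device(1024, 300, 8, id));
  // prints 128, 128, 44, 0, 0, 0, 0, 0
  return 0;
}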
SparseTensor23 to SparseTensor - // offset is negligible - for (const auto& sparse23 : current_sparse_tensors_) { - core23::Tensor value_tensor = sparse23.get_value_tensor(); - core23::Tensor off_tensor = sparse23.get_rowoffset_tensor(); - auto shape = value_tensor.shape(); - std::vector dimensions(shape.data(), shape.data() + shape.dims()); - - auto value_buffer = PreallocatedBuffer2::create(value_tensor.data(), dimensions); - // dummy row_offset tensor - auto rowoffset_buffer = PreallocatedBuffer2::create(off_tensor.data(), {1}); - - std::shared_ptr> value_tensor2(new Tensor2); - std::shared_ptr> off_tensor2(new Tensor2); - // dummy nnz - SparseTensor current_sparse(dimensions, value_buffer, rowoffset_buffer, - sparse23.get_nnz_ptr(), 1); - tmp_tensors.push_back(current_sparse); - } - return tmp_tensors; -} -template -std::vector AsyncReader::get_value_tensor23s() const { - return current_sparse_tensors_; -} - -// TODO remove after hybrid embedding deprecation -template -std::vector>> -AsyncReader::get_value_tensor_buffers() const { - std::vector>> ret; - // std::vector> tensors; - for (const auto& batch_tensor : inflight_batch_tensors_) { - // std::vector gpu_tensors; - std::vector> gpu_tensors; - for (const auto& sparse23 : batch_tensor.sparse_tensors) { - // gpu_tensors.emplace_back(sparse_tensor); - core23::Tensor value_tensor = sparse23.get_value_tensor(); - core23::Tensor off_tensor = sparse23.get_rowoffset_tensor(); - auto shape = value_tensor.shape(); - std::vector dimensions(shape.data(), shape.data() + shape.dims()); - - auto value_buffer = PreallocatedBuffer2::create(value_tensor.data(), dimensions); - // dummy row_offset tensor - auto rowoffset_buffer = PreallocatedBuffer2::create(off_tensor.data(), {1}); - - std::shared_ptr> value_tensor2(new Tensor2); - std::shared_ptr> off_tensor2(new Tensor2); - // dummy nnz - SparseTensor current_sparse(dimensions, value_buffer, rowoffset_buffer, - sparse23.get_nnz_ptr(), 1); - gpu_tensors.push_back(current_sparse); - } - ret.emplace_back(gpu_tensors); - } - // return tensors; - return ret; -} -template -std::vector> AsyncReader::get_value_tensor_buffer23s() - const { - std::vector> ret; - for (const auto& batch_tensor : inflight_batch_tensors_) { - ret.push_back(batch_tensor.sparse_tensors); - } - return ret; -} - -#ifndef DISABLE_CUDF -template -void AsyncReader::create_drwg_parquet(std::string file_list, - bool strict_order_of_batches, - const std::vector slot_offset, - bool start_reading_from_beginning, - long long max_samples_per_group, - int label_dense_num, int label_dense_dim) {} -#endif -template -void AsyncReader::set_source(std::string file_list) {} - -template -AsyncReader::~AsyncReader() { - // Underlying reader mush be destroyed BEFORE the events - reader_impl_.reset(nullptr); - for (auto& e : completion_events_) { - cudaEventDestroy(e); - } - for (auto& e : schedule_events_) { - cudaEventDestroy(e); - } -} - -template class AsyncReader; -template class AsyncReader; -} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/async_reader/broadcast.cu b/HugeCTR/src/data_readers/async_reader/broadcast.cu deleted file mode 100644 index 71534f95cc..0000000000 --- a/HugeCTR/src/data_readers/async_reader/broadcast.cu +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -namespace HugeCTR { - -constexpr int copy_width = 4; - -namespace { - -inline __device__ float4 read4(const float* src, int n) { - if (n == copy_width) { - return *((float4*)src); - } else { - float4 res; - if (n > 0) res.x = src[0]; - if (n > 1) res.y = src[1]; - if (n > 2) res.z = src[2]; - return res; - } -} - -inline __device__ void write4(float* dst, int n, float4 val) { - if (n == copy_width) { - *((float4*)dst) = val; - } else { - if (n > 0) dst[0] = val.x; - if (n > 1) dst[1] = val.y; - if (n > 2) dst[2] = val.z; - } -} - -__global__ void broadcast_kernel(float** addrs, const bool* p2p_accessible, int batch_size_floats, - int num_dests, int src_id) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - int idx4 = idx * copy_width; - int num_elems = min(batch_size_floats - idx4, copy_width); - - float4 src_val = read4(addrs[src_id] + idx4, num_elems); - for (int i = 1; i < num_dests; i++) { - int dst_id = (src_id + i) % num_dests; - if (p2p_accessible[dst_id]) { - write4(addrs[dst_id] + idx4, num_elems, src_val); - } - } -} - -} // namespace - -void broadcast(float** dev_pointers, const bool* dev_p2p_accessible, int batch_size_floats, - int num_dests, int src_id, cudaStream_t stream) { - int block_size = 128; - int grid_size = (batch_size_floats + copy_width * block_size - 1) / block_size; - - constexpr bool use_kernel = false; - - for (int i = 1; i < num_dests; i++) { - int dst_id = (src_id + i) % num_dests; - if (!dev_p2p_accessible[dst_id] || (!use_kernel)) { - HCTR_LIB_THROW(cudaMemcpyAsync(dev_pointers[dst_id], dev_pointers[src_id], - batch_size_floats * sizeof(float), cudaMemcpyDeviceToDevice, - stream)); - } - } - - if (use_kernel) { - broadcast_kernel<<>>(dev_pointers, dev_p2p_accessible, - batch_size_floats, num_dests, src_id); - } -} - -} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/async_reader/split_label_dense_sparse.cu b/HugeCTR/src/data_readers/async_reader/split_label_dense_sparse.cu deleted file mode 100644 index e629e6b9c1..0000000000 --- a/HugeCTR/src/data_readers/async_reader/split_label_dense_sparse.cu +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
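Since use_kernel is hard-coded to false, the deleted broadcast() above effectively replicates the source GPU's staging buffer with plain cudaMemcpyAsync calls. A trimmed sketch of that fallback path follows; the pointer and stream setup are assumed to exist elsewhere, and this is an illustration rather than the original helper.

#include <cuda_runtime.h>
#include <vector>

void broadcast_to_peers(const std::vector<float*>& dev_ptrs,  // one pointer per GPU
                        size_t num_floats, int src_id, cudaStream_t stream) {
  const int num_dests = static_cast<int>(dev_ptrs.size());
  for (int i = 1; i < num_dests; ++i) {
    const int dst_id = (src_id + i) % num_dests;  // same rotation as the deleted code
    // Device-to-device copies work across GPUs when peer access (or the driver's
    // staging fallback) is available.
    cudaMemcpyAsync(dev_ptrs[dst_id], dev_ptrs[src_id], num_floats * sizeof(float),
                    cudaMemcpyDeviceToDevice, stream);
  }
}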
- */ - -#include -#include -#include - -namespace HugeCTR { - -// Sparse pointer should be casted to int* when calling this kernel -template -__global__ void split_kernel_3_way(int batch_size, float* label_ptr, int label_dim, - DenseType* dense_ptr, int dense_dim, int dense_dim_no_align, - SparseType* sparse_ptr, int sparse_dim, - const int* label_dense_sparse, int sample_size_int, - size_t local_idx_start, size_t local_idx_end) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - - if (idx < batch_size * sample_size_int) { - const int in_col = idx % sample_size_int; - const int in_row = idx / sample_size_int; - const int out_row = in_row; - if (in_col < label_dim) { - const int out_col = in_col; - int label = label_dense_sparse[idx]; - if (local_idx_start <= out_row && out_row < local_idx_end) { - label_ptr[(out_row - local_idx_start) * label_dim + out_col] = label; - } - } else if (in_col < label_dim + dense_dim_no_align) { - const int out_col = in_col - label_dim; - int dense = label_dense_sparse[idx]; - if (local_idx_start <= out_row && out_row < local_idx_end) { - dense_ptr[(out_row - local_idx_start) * dense_dim + out_col] = - logf(dense + 1.f); // TODO : FIXME move to data preprocessing - } - } else { - const int out_col = in_col - label_dim - dense_dim_no_align; - sparse_ptr[out_row * sparse_dim + out_col] = label_dense_sparse[idx]; - } - } - return; -} - -template -__global__ void split_kernel_3_way_read4_write4(int batch_size, float* label_ptr, int label_dim, - DenseType* dense_ptr, int dense_dim, - int dense_dim_no_align, int* sparse_ptr, - int sparse_dim, const int* label_dense_sparse, - int sample_size_int, size_t local_idx4_start, - size_t local_idx4_end) { - using DenseType4 = typename std::conditional<(sizeof(DenseType) == 4), int4, int2>::type; - extern __shared__ int label_dense_sparse_s[]; - constexpr int vec_size = sizeof(int4) / sizeof(int); - static_assert(samples_per_cta % vec_size == 0, - "Number of samples per block has to respect divisibility constraints"); - assert(blockDim.x >= 3 * warpSize); - - const int idx_l = threadIdx.x; - const int warp_id = threadIdx.x / warpSize; - const int lane_id = threadIdx.x % warpSize; - - const int my_cta_samples = min(samples_per_cta, batch_size - samples_per_cta * blockIdx.x); - if (my_cta_samples <= 0) { - return; - } - assert(my_cta_samples % vec_size == 0); - - int4* label_dense_sparse_s_align4 = reinterpret_cast(label_dense_sparse_s); - const int4* label_dense_sparse_align4 = reinterpret_cast(label_dense_sparse); - - float* label_s = - reinterpret_cast(label_dense_sparse_s + sample_size_int * samples_per_cta); - DenseType* dense_s = reinterpret_cast(label_s + label_dim * samples_per_cta); - SparseType* sparse_s = reinterpret_cast((int*)dense_s + dense_dim * samples_per_cta); - - // read with int4 - const int src_base = samples_per_cta * sample_size_int / vec_size * blockIdx.x; - for (int id = idx_l; id < my_cta_samples * sample_size_int / vec_size; id += blockDim.x) { - label_dense_sparse_s_align4[id] = label_dense_sparse_align4[src_base + id]; - } - - for (int id = idx_l; id < samples_per_cta * dense_dim; id += blockDim.x) { - dense_s[id] = 0; - } - - __syncthreads(); - - // transpose - for (int id = idx_l; id < samples_per_cta * sample_size_int; id += blockDim.x) { - const int in_col = id % sample_size_int; - const int in_row = id / sample_size_int; - const int out_row = in_row; - if (in_col < label_dim) { - const int out_col = in_col; - label_s[out_row * label_dim + out_col] = label_dense_sparse_s[id]; - } else if 
(in_col < label_dim + dense_dim_no_align) { - const int out_col = in_col - label_dim; - int dense = label_dense_sparse_s[id]; - dense_s[out_row * dense_dim + out_col] = - logf(dense + 1.f); // TODO : FIXME move to data preprocessing - } else { - const int out_col = in_col - label_dim - dense_dim_no_align; - sparse_s[out_row * sparse_dim + out_col] = label_dense_sparse_s[id]; - } - } - __syncthreads(); - - float4* label_s_align4 = reinterpret_cast(label_s); - DenseType4* dense_s_align4 = reinterpret_cast(dense_s); - int4* sparse_s_align4 = reinterpret_cast(sparse_s); - float4* label_align4 = reinterpret_cast(label_ptr); - DenseType4* dense_align4 = reinterpret_cast(dense_ptr); - int4* sparse_align4 = reinterpret_cast(sparse_ptr); - - const int label_size_int4_per_cta = label_dim * samples_per_cta / vec_size; - const int dense_size_int4_per_cta = dense_dim * samples_per_cta / vec_size; - const int sparse_size_int4_per_cta = sparse_dim * samples_per_cta / vec_size; - - if (warp_id == 0) { - for (int id = lane_id; id < label_dim * my_cta_samples / vec_size; id += warpSize) { - size_t local_idx4 = id + blockIdx.x * label_size_int4_per_cta; - if (label_dim * local_idx4_start <= local_idx4 && local_idx4 < label_dim * local_idx4_end) { - label_align4[local_idx4 - label_dim * local_idx4_start] = label_s_align4[id]; - } - } - } - if (warp_id == 1) { - for (int id = lane_id; id < dense_dim * my_cta_samples / vec_size; id += warpSize) { - size_t local_idx4 = id + blockIdx.x * dense_size_int4_per_cta; - if (dense_dim * local_idx4_start <= local_idx4 && local_idx4 < dense_dim * local_idx4_end) { - dense_align4[local_idx4 - dense_dim * local_idx4_start] = dense_s_align4[id]; - } - } - } - if (warp_id == 2) { - for (int id = lane_id; id < sparse_dim * my_cta_samples / vec_size; id += warpSize) { - sparse_align4[id + blockIdx.x * sparse_size_int4_per_cta] = sparse_s_align4[id]; - } - } -} - -template -void split_3_way(core23::Tensor& label_tensor_per_dev, core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, cudaStream_t stream) { - if (label_dense_sparse_buffer.dims() > 0) { - assert(label_tensor_per_dev.size(0) == dense_tensor_per_dev.size(0)); - assert(label_tensor_per_dev.size(0) == local_idx_end - local_idx_start); - - const int batch_size = label_dense_sparse_buffer.size(0); - const int label_dim = label_tensor_per_dev.size(1); - const int dense_dim = dense_tensor_per_dev.size(1); - const int sparse_dim = sparse_tensor.size(1); - const int sample_size_int = label_dense_sparse_buffer.size(1); - cudaPointerAttributes attributes_src, attributes_dst; - - int dense_dim_no_align = sample_size_int - label_dim - sparse_dim; - - constexpr int block_dim = 128; - constexpr int samples_per_cta = 24; - - int vec_width = sizeof(int4) / sizeof(int); - if (sizeof(SparseType) == 4 && batch_size % vec_width == 0 && - local_idx_start % vec_width == 0 && local_idx_end % vec_width == 0 && - samples_per_cta * sample_size_int * sizeof(int) <= 24 * 1024) { - const int grid_dim = (batch_size + samples_per_cta - 1) / samples_per_cta; - const int shmem = 2 * samples_per_cta * (label_dim + dense_dim + sparse_dim) * sizeof(int); - - split_kernel_3_way_read4_write4 - <<>>( - batch_size, label_tensor_per_dev.data(), label_dim, - dense_tensor_per_dev.data(), dense_dim, dense_dim_no_align, - sparse_tensor.data(), sparse_dim, label_dense_sparse_buffer.data(), - sample_size_int, local_idx_start / vec_width, local_idx_end / 
vec_width); - } else { - const int grid_dim = (label_dense_sparse_buffer.num_elements() - 1) / block_dim + 1; - split_kernel_3_way<<>>( - batch_size, label_tensor_per_dev.data(), label_dim, - dense_tensor_per_dev.data(), dense_dim, dense_dim_no_align, - sparse_tensor.data(), sparse_dim, label_dense_sparse_buffer.data(), - sample_size_int, local_idx_start, local_idx_end); - } - - HCTR_LIB_THROW(cudaPeekAtLastError()); - } -} - -template void split_3_way(core23::Tensor& label_tensor_per_dev, - core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, - const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, - cudaStream_t stream); -template void split_3_way<__half, uint32_t>(core23::Tensor& label_tensor_per_dev, - core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, - const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, - cudaStream_t stream); - -template void split_3_way(core23::Tensor& label_tensor_per_dev, - core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, - const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, - cudaStream_t stream); -template void split_3_way<__half, long long>(core23::Tensor& label_tensor_per_dev, - core23::Tensor& dense_tensor_per_dev, - core23::Tensor& sparse_tensor, - const core23::Tensor& label_dense_sparse_buffer, - size_t local_idx_start, size_t local_idx_end, - cudaStream_t stream); -} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/async_reader/thread_async_reader.cpp b/HugeCTR/src/data_readers/async_reader/thread_async_reader.cpp deleted file mode 100644 index 36c5665517..0000000000 --- a/HugeCTR/src/data_readers/async_reader/thread_async_reader.cpp +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
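As a plain host-side reference, the 3-way split performed by the kernels above can be written as a loop over samples: labels and dense features are kept only for the local device's slice of the batch, dense values get the log(x + 1) transform, and the sparse keys are kept for the full batch. This sketch ignores the alignment padding (it assumes dense_dim equals dense_dim_no_align) and is not part of the library.

#include <cmath>
#include <cstdint>
#include <vector>

void split_3_way_reference(const std::vector<int>& raw,  // batch_size x sample_size ints
                           int batch_size, int label_dim, int dense_dim, int sparse_dim,
                           size_t local_start, size_t local_end,  // this device's rows
                           std::vector<float>& label_out,         // local rows x label_dim
                           std::vector<float>& dense_out,         // local rows x dense_dim
                           std::vector<uint32_t>& sparse_out) {   // batch_size x sparse_dim
  const int sample_size = label_dim + dense_dim + sparse_dim;
  label_out.assign((local_end - local_start) * label_dim, 0.f);
  dense_out.assign((local_end - local_start) * dense_dim, 0.f);
  sparse_out.assign(static_cast<size_t>(batch_size) * sparse_dim, 0);

  for (int row = 0; row < batch_size; ++row) {
    const int* sample = raw.data() + static_cast<size_t>(row) * sample_size;
    const bool local = local_start <= static_cast<size_t>(row) &&
                       static_cast<size_t>(row) < local_end;
    for (int c = 0; c < label_dim; ++c)
      if (local) label_out[(row - local_start) * label_dim + c] = static_cast<float>(sample[c]);
    for (int c = 0; c < dense_dim; ++c)
      if (local)
        dense_out[(row - local_start) * dense_dim + c] =
            std::log(sample[label_dim + c] + 1.f);  // same transform as the kernel
    for (int c = 0; c < sparse_dim; ++c)
      sparse_out[static_cast<size_t>(row) * sparse_dim + c] =
          static_cast<uint32_t>(sample[label_dim + dense_dim + c]);
  }
}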
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -namespace HugeCTR { - -ThreadAsyncReader::ThreadAsyncReader(std::string fname, const ResourceManager* resource_mananager, - size_t batch_size_bytes, int device_id, cudaStream_t stream, - std::vector batch_ids, - std::vector dest_buffers, - ThreadAsyncReaderParameters params, size_t total_file_size) - : batch_size_bytes_(batch_size_bytes), - device_id_(device_id), - stream_(stream), - total_file_size_(total_file_size), - batch_ids_(batch_ids), - dest_buffers_(dest_buffers), - params_(params), - num_buffers_waiting_io_(0) { -#if (__cplusplus >= 201703L) - static_assert(std::atomic::is_always_lock_free && - std::atomic::is_always_lock_free, - "Compiler cannot use atomic enum class, need to change to int type"); -#endif - HCTR_CHECK_HINT(params_.io_block_size % params_.io_alignment == 0, - " params_.io_block_size % params_.io_alignment != 0"); - - num_dest_buffers_ = dest_buffers_.size(); - - fd_ = open(fname.c_str(), O_RDONLY | O_DIRECT); - if (fd_ == -1) { - int errnum = errno; - if (errnum == ENOENT) { - throw std::runtime_error("No such file: " + fname); - } else if (errnum == EINVAL) { - HCTR_LOG(WARNING, ROOT, - "Current filesystem does not support O_DIRECT open(), use " - "general open() instead\n"); - fd_ = open(fname.c_str(), O_RDONLY); - } - if (fd_ == -1) { - throw std::runtime_error("Open " + fname + " fails due to uncertain reason"); - } - }; - - max_num_blocks_per_batch_ = batch_size_bytes_ / params_.io_block_size + 2; - size_t pinned_size = 0; - for (auto buf : dest_buffers_) { - buf->raw_host_ptr = (char*)aligned_alloc(params_.io_alignment, - max_num_blocks_per_batch_ * params_.io_block_size); - HCTR_LIB_THROW( - cudaHostRegister(buf->raw_host_ptr, max_num_blocks_per_batch_ * params_.io_block_size, 0)); - assert((size_t)buf->raw_host_ptr % params_.io_alignment == 0); - - HCTR_LIB_THROW(cudaEventCreateWithFlags(&buf->event, cudaEventDisableTiming)); - - buf->io_reqs.resize(max_num_blocks_per_batch_); - for (auto& req : buf->io_reqs) { - req = new iocb; - } - pinned_size += max_num_blocks_per_batch_ * params_.io_block_size; - } - for (auto buf : dest_buffers_) { - buf->status.store(BufferStatus::IOReady); - } -} - -void ThreadAsyncReader::load() { - size_t num_batches = batch_ids_.size(); - size_t processed = 0; - std::vector id_per_host_buffer(num_dest_buffers_); - std::iota(id_per_host_buffer.begin(), id_per_host_buffer.end(), 0); - - status_.store(WorkerStatus::OK); - for (auto buf : dest_buffers_) { - buf->safe_to_upload_event.store(nullptr); - buf->ready_to_upload_event.store(nullptr); - buf->preload_done = false; - } - - ioctx_ = 0; - if (io_queue_init(params_.io_depth, &ioctx_) < 0) { - HCTR_OWN_THROW(Error_t::UnspecificError, "io_setup failed"); - } - - while (status_.load() != WorkerStatus::Terminate) { - // bool all_resident = true; - // for (auto buf : dest_buffers_) { - // if (buf->status != BufferStatus::PermanentlyResident) { - // all_resident = false; - // break; - // } - // } - // if (all_resident){ - // return; - // } - - for (int i = 0; i < num_dest_buffers_; i++) { - if (id_per_host_buffer[i] < num_batches) { - try_submit_io(batch_ids_[id_per_host_buffer[i]], i); - } - } - wait_io(); - for (int i = 0; i < num_dest_buffers_; i++) { - if (id_per_host_buffer[i] < num_batches) { - try_submit_p2p(dest_buffers_[i]); - } - } - for (int i = 0; i < num_dest_buffers_; i++) { - if (id_per_host_buffer[i] < num_batches) { - try_submit_upload(dest_buffers_[i]); - } - } - for 
(int i = 0; i < num_dest_buffers_; i++) { - if (id_per_host_buffer[i] < num_batches) { - if (check_completion(dest_buffers_[i])) { - processed++; - id_per_host_buffer[i] += num_dest_buffers_; - if (params_.loop && id_per_host_buffer[i] >= num_batches) { - id_per_host_buffer[i] = i; - } - } - } - } - usleep(10); - if (!params_.loop && processed >= num_batches) { - break; - } - } - - if (io_destroy(ioctx_) < 0) { - throw std::runtime_error("io_destroy failed"); - } - - HCTR_LIB_THROW(cudaStreamSynchronize(stream_)); - - if (status_.load() != WorkerStatus::Terminate) { - for (int i = 0; i < num_dest_buffers_; i++) { - BufferStatus expected = BufferStatus::IOReady; - while (!dest_buffers_[i]->status.compare_exchange_weak(expected, BufferStatus::Finished)) { - expected = BufferStatus::IOReady; - } - } - } -} - -void ThreadAsyncReader::try_submit_io(size_t batch_id, int io_id) { - auto& buffer = dest_buffers_[io_id]; - if (buffer->status.load() != BufferStatus::IOReady) { - return; - } - // Maybe we have already loaded this batch before?! - if (buffer->id == (int64_t)batch_id) { - buffer->status.store(BufferStatus::PermanentlyResident); - return; - } - - buffer->status.store(BufferStatus::IOInProcess); - - size_t req_beg_offset = batch_id * batch_size_bytes_; - size_t req_end_offset = std::min((batch_id + 1) * batch_size_bytes_, total_file_size_); - size_t raw_beg_offset = (req_beg_offset / params_.io_block_size) * params_.io_block_size; - size_t raw_end_offset = ((req_end_offset + params_.io_block_size - 1) / params_.io_block_size) * - params_.io_block_size; - size_t num_blocks = (raw_end_offset - raw_beg_offset) / params_.io_block_size; - assert(num_blocks <= (size_t)max_num_blocks_per_batch_); - - buffer->id = batch_id; - buffer->num_outstanding_reqs = num_blocks; - buffer->num_submitted_h2d_chunks = 0; - buffer->num_submitted_broadcasts = 0; - buffer->size = req_end_offset - req_beg_offset; - buffer->host_data = buffer->raw_host_ptr + (req_beg_offset - raw_beg_offset); - assert(buffer->size % sizeof(float) == 0); - - for (size_t block = 0; block < num_blocks; block++) { - auto req = buffer->io_reqs[block]; - - io_prep_pread(req, fd_, buffer->raw_host_ptr + params_.io_block_size * block, - params_.io_block_size, raw_beg_offset + params_.io_block_size * block); - req->data = (void*)buffer; - } - - int ret = io_submit(ioctx_, num_blocks, buffer->io_reqs.data()); - num_buffers_waiting_io_ += 1; - if (ret < 0) { - HCTR_OWN_THROW(Error_t::UnspecificError, "io_submit failed"); - } -} - -void ThreadAsyncReader::wait_io() { - timespec timeout = {0, 10'000l}; - - io_event events[max_num_blocks_per_batch_]; - int num_completed = - io_getevents(ioctx_, max_num_blocks_per_batch_, max_num_blocks_per_batch_, events, &timeout); - if (num_completed < 0) { - HCTR_OWN_THROW(Error_t::UnspecificError, "io_getevents failed"); - } - - for (int b = 0; b < num_completed; b++) { - auto req = events[b].obj; - if ((events[b].res < 0 || events[b].res2 != 0)) { - HCTR_OWN_THROW(Error_t::UnspecificError, "io_getevents returned failed event"); - } - auto buffer = (InternalBatchBuffer*)req->data; - buffer->num_outstanding_reqs--; - assert(buffer->num_outstanding_reqs >= 0); - if (buffer->num_outstanding_reqs == 0) { - num_buffers_waiting_io_ -= 1; - buffer->status.store(BufferStatus::UploadInProcess); - if (params_.wait_for_gpu_idle) { - buffer->ready_to_upload_event.store(nullptr); - } - } - } -} - -bool ThreadAsyncReader::wait_for_gpu_idle(InternalBatchBuffer* buffer) { - if (params_.wait_for_gpu_idle && 
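ThreadAsyncReader above is built on O_DIRECT file access plus Linux libaio (io_queue_init, io_prep_pread, io_submit, io_getevents). A minimal stand-alone example of that I/O pattern is sketched below; the alignment value and file name are assumptions, error handling is reduced to messages, and it links with -laio.

#include <fcntl.h>
#include <libaio.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>

int main() {
  const size_t kAlign = 4096;         // assumed logical block size for O_DIRECT
  const size_t kBytes = 16 * kAlign;  // O_DIRECT also wants an aligned length

  int fd = open("raw_batch.bin", O_RDONLY | O_DIRECT);  // O_DIRECT is a GNU extension
  if (fd < 0) { perror("open"); return 1; }

  void* buf = aligned_alloc(kAlign, kBytes);  // O_DIRECT needs an aligned buffer

  io_context_t ctx = 0;
  if (io_queue_init(8 /* queue depth */, &ctx) < 0) {
    fprintf(stderr, "io_queue_init failed\n"); return 1;
  }

  iocb cb;
  iocb* cbs[1] = {&cb};
  io_prep_pread(&cb, fd, buf, kBytes, /* aligned offset = */ 0);
  if (io_submit(ctx, 1, cbs) != 1) { fprintf(stderr, "io_submit failed\n"); return 1; }

  io_event ev;
  // Block until the one outstanding request completes (no timeout).
  if (io_getevents(ctx, 1, 1, &ev, nullptr) != 1) {
    fprintf(stderr, "io_getevents failed\n"); return 1;
  }
  printf("read %ld bytes\n", static_cast<long>(ev.res));

  io_destroy(ctx);
  free(buf);
  close(fd);
  return 0;
}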
buffer->preload_done) { - auto event_ptr = buffer->ready_to_upload_event.load(); - if (event_ptr == nullptr) { - return false; - } else { - buffer->ready_to_upload_event.store(nullptr); - HCTR_LIB_THROW(cudaStreamWaitEvent(stream_, *event_ptr)); - } - } - return true; -} - -void ThreadAsyncReader::try_submit_upload(InternalBatchBuffer* buffer) { - if (buffer->status.load() != BufferStatus::UploadInProcess || - buffer->num_submitted_h2d_chunks >= params_.num_h2d_chunks) { - return; - } - if (!wait_for_gpu_idle(buffer)) { - return; - } - - // H2D upload - // Wait until the buffers are consumed (one event after a barrier) - if (buffer->num_submitted_h2d_chunks == 0 && buffer->safe_to_upload_event != nullptr) { - HCTR_LIB_THROW(cudaStreamWaitEvent(stream_, *buffer->safe_to_upload_event)); - } - - size_t chunk_size = (buffer->size + params_.num_h2d_chunks - 1) / params_.num_h2d_chunks; - size_t beg_offset = chunk_size * buffer->num_submitted_h2d_chunks; - size_t end_offset = std::min(buffer->size, chunk_size * (buffer->num_submitted_h2d_chunks + 1)); - - HCTR_LIB_THROW(cudaMemcpyAsync(buffer->dev_data[device_id_] + beg_offset, - buffer->host_data + beg_offset, end_offset - beg_offset, - cudaMemcpyHostToDevice, stream_)); - buffer->num_submitted_h2d_chunks++; -} - -void ThreadAsyncReader::try_submit_p2p(InternalBatchBuffer* buffer) { - if (buffer->status.load() != BufferStatus::UploadInProcess || - buffer->num_submitted_h2d_chunks < params_.num_h2d_chunks) { - return; - } - if (!wait_for_gpu_idle(buffer)) { - return; - } - - // Broadcast to the other GPUs - if (buffer->num_submitted_broadcasts != (int)buffer->dev_data.size()) { - if (device_id_ != buffer->num_submitted_broadcasts) { - HCTR_LIB_THROW(cudaMemcpyAsync(buffer->dev_data[buffer->num_submitted_broadcasts], - buffer->dev_data[device_id_], buffer->size, cudaMemcpyDefault, - stream_)); - } - buffer->num_submitted_broadcasts++; - return; - } - - // Here we've submitted everything - // There is no real need to make eventRecord atomic (wrt stream) with the - // rest, - // we only care that eventRecord is AFTER the H2D and the broadcast - buffer->preload_done = true; - buffer->num_submitted_h2d_chunks = 0; - buffer->num_submitted_broadcasts = 0; - HCTR_LIB_THROW(cudaEventRecord(buffer->event, stream_)); - buffer->status.store(BufferStatus::UploadSubmitted); -} - -bool ThreadAsyncReader::check_completion(InternalBatchBuffer* buffer) { - if (buffer->status.load() != BufferStatus::UploadSubmitted) { - return false; - } - - auto res = cudaEventQuery(buffer->event); - if (res == cudaSuccess) { - buffer->status.store(BufferStatus::ReadReady); - return true; - } - if (res == cudaErrorNotReady) { - return false; - } - HCTR_LIB_THROW(res); - return false; -} - -void ThreadAsyncReader::reset() { - status_.store(WorkerStatus::Terminate); - for (auto buf : dest_buffers_) { - buf->status.store(BufferStatus::IOReady); - } -} - -ThreadAsyncReader::~ThreadAsyncReader() = default; - -} // namespace HugeCTR diff --git a/HugeCTR/src/data_readers/multi_hot/async_data_reader.cpp b/HugeCTR/src/data_readers/multi_hot/async_data_reader.cpp index 606fe785b1..2a1e90f1c2 100644 --- a/HugeCTR/src/data_readers/multi_hot/async_data_reader.cpp +++ b/HugeCTR/src/data_readers/multi_hot/async_data_reader.cpp @@ -17,9 +17,8 @@ #include #include #include -#include -#include #include +#include #include #include #include diff --git a/HugeCTR/src/embeddings/hybrid_embedding/calibration_data.cu b/HugeCTR/src/embeddings/hybrid_embedding/calibration_data.cu deleted file mode 100644 
index c2e7448433..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/calibration_data.cu +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -namespace calibration_data_kernels { - -template -__global__ void binary_threshold_search(const CountsT *__restrict__ counts, ThresholdT threshold, - IdxT *out_idx, IdxT n_elem) { - if (threadIdx.x == 0) { - IdxT start = 0; - IdxT end = n_elem; - while (start < end) { - IdxT mid = (start + end) / 2; - CountsT count = counts[mid]; - - if (count >= threshold) - start = mid + 1; - else - end = mid; - } - - *out_idx = start; - } -} - -template -__global__ void sum_counts(const CountsT *__restrict__ counts, CountsT *result, size_t n_elem) { - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - CountsT val = 0; - if (tid < n_elem) { - val = counts[tid]; - } - CountsT local_res = blockReduceSum(val); - if (threadIdx.x == 0) { - atomicAdd(result, local_res); - } -} - -} // namespace calibration_data_kernels - -/// -/// interpolate data_size using the two calibration data -/// calibrated_data_size, calibrated_times -/// return communication_times -/// -void CalibrationData::interpolate(const Tensor2 &calibrated_data_size, - const Tensor2 &calibrated_times, - const Tensor2 &data_size, - Tensor2 &communication_times) { - // TODO: implement this -} - -/// -/// Convenience function for interpolating all-to-all communication times from -/// calibrated data -/// -void CalibrationData::interpolate_all_reduce(const Tensor2 &data_size, - Tensor2 &communication_times) { - interpolate(all_reduce_data_size, all_reduce_times, data_size, communication_times); -} - -/// -/// Convenience function for interpolating all-to-all communication times from -/// calibrated data -/// -void CalibrationData::interpolate_all_to_all(const Tensor2 &data_size, - Tensor2 &communication_times) { - interpolate(all_to_all_data_size, all_to_all_times, data_size, communication_times); -} - -// Calculate threshold such that for the worst case distribution there will -// be one duplication per network on average -template -double ModelInitializationFunctors::calculate_threshold( - const CommunicationType communication_type, double p_dup_max, double all_to_all_bandwidth, - double all_reduce_bandwidth, double efficiency_bandwidth_ratio, size_t num_nodes, - size_t batch_size, size_t num_networks, size_t num_iterations, size_t num_tables) { - double count_threshold = 1.; - - // for NVLink capture effectively all duplications with number of categories - double M = (double)batch_size / (double)num_networks; - // double p_dup_max = 1.0 / 100.; // maximum 1 % of samples the category will be duplicated - switch (communication_type) { - case CommunicationType::IB_NVLink: - count_threshold = (double)num_iterations * (double)num_networks * 
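binary_threshold_search above runs a single-thread binary search over counts sorted in descending order and returns how many leading categories meet the threshold. The host-side equivalent with the standard library, on a toy array:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned long> counts_sorted = {900, 512, 300, 120, 42, 7, 7, 1};
  const double threshold = 100.0;

  // The predicate is true for a prefix of a descending array, so partition_point
  // finds the first entry below the threshold.
  auto it = std::partition_point(counts_sorted.begin(), counts_sorted.end(),
                                 [&](unsigned long c) { return c >= threshold; });
  size_t num_frequent = static_cast<size_t>(it - counts_sorted.begin());
  printf("%zu categories meet the threshold\n", num_frequent);  // prints 4
  return 0;
}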
all_to_all_bandwidth / - all_reduce_bandwidth * efficiency_bandwidth_ratio * (double)num_networks / - ((double)num_networks - 1.); - break; - case CommunicationType::IB_NVLink_Hier: - count_threshold = (double)num_iterations * (double)num_networks * all_to_all_bandwidth / - all_reduce_bandwidth * efficiency_bandwidth_ratio * (double)num_nodes / - ((double)num_nodes - 1.); - break; - case CommunicationType::NVLink_SingleNode: - // count threshold such that the probability of duplication is less than p_dup_max - // even if there are batch size number of categories that occur more often, - // there will be a duplication at most once every iteration per gpu - // - // p_duplication(category) \approx 1/2 M (M-1) \left( \frac{count}{batch_size x - // num_iterations} \right)^2 - count_threshold = - (double)batch_size * (double)num_iterations * sqrt(2.0 * p_dup_max / (M * (M - 1))); - break; - default: - HCTR_OWN_THROW(Error_t::WrongInput, - "Unknown communication type, expecting IB_NVLink or NVLink"); - } - - return count_threshold; -} - -/// -/// Calculate the number of frequent categories from data -/// -template -dtype ModelInitializationFunctors::calculate_num_frequent_categories( - const CommunicationType &communication_type, const size_t num_networks, - const CalibrationData &calibration, const Statistics &statistics, - const Data &data, dtype *d_num_frequent, cudaStream_t stream) { - dtype num_frequent; - dtype num_top_categories = (dtype)statistics.num_unique_categories; - - if (calibration.all_to_all_times.get_size_in_bytes() > 0) { - // calibration is given, perform fully optimized hybrid model - HCTR_OWN_THROW(Error_t::WrongInput, - "initialization hybrid model from communication calibration not available yet"); - } else { - size_t num_nodes = calibration.num_nodes; - size_t batch_size = data.batch_size; - size_t num_iterations = data.num_iterations; - size_t num_tables = data.table_sizes.size(); - - // Use threshold to determine number of frequent categories, - // calculates optimal number of frequent categories when the all-to-all - // and all-reduce are both bandwidth limited. 
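The NVLink_SingleNode branch above inverts the duplication estimate p_dup(c) ~= 1/2 * M * (M - 1) * (count / (batch_size * num_iterations))^2 at p_dup = p_dup_max, where M is the per-network batch size. For reference, that branch transcribed as a free function:

#include <cmath>

// Count threshold such that a category's duplication probability stays below
// p_dup_max, following the comment in the deleted NVLink_SingleNode case.
double nvlink_single_node_threshold(double p_dup_max, size_t batch_size,
                                    size_t num_networks, size_t num_iterations) {
  const double M = static_cast<double>(batch_size) / static_cast<double>(num_networks);
  return static_cast<double>(batch_size) * static_cast<double>(num_iterations) *
         std::sqrt(2.0 * p_dup_max / (M * (M - 1.0)));
}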
- double count_threshold = ModelInitializationFunctors::calculate_threshold( - communication_type, calibration.p_dup_max, calibration.max_all_to_all_bandwidth, - calibration.max_all_reduce_bandwidth, calibration.efficiency_bandwidth_ratio, num_nodes, - batch_size, num_networks, num_iterations, num_tables); - - calibration_data_kernels::binary_threshold_search<<<1, 1, 0, stream>>>( - statistics.counts_sorted.get_ptr(), count_threshold, d_num_frequent, - (dtype)num_top_categories); - - HCTR_LIB_THROW(cudaMemcpyAsync(&num_frequent, d_num_frequent, sizeof(dtype), - cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - } - if (num_frequent > 0) { - num_frequent = ((num_frequent - 1) / num_networks + 1) * num_networks; - } - if (num_frequent > num_top_categories) { - num_frequent -= num_networks; - } - return num_frequent; -} - -/// -/// Calculate the number of frequent categories from data -/// -template -double ModelInitializationFunctors::calculate_frequent_probability( - const Statistics &statistics, const dtype num_frequent, uint32_t *d_total_frequent_count, - cudaStream_t stream) { - uint32_t total_frequent_count; - - HCTR_LIB_THROW(cudaMemsetAsync(d_total_frequent_count, 0, sizeof(uint32_t), stream)); - calibration_data_kernels::sum_counts<<>>( - statistics.counts_sorted.get_ptr(), d_total_frequent_count, num_frequent); - HCTR_LIB_THROW(cudaMemcpyAsync(&total_frequent_count, d_total_frequent_count, sizeof(dtype), - cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - return (double)total_frequent_count / (double)statistics.num_samples; -} - -template class ModelInitializationFunctors; -template class ModelInitializationFunctors; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/communication.cu b/HugeCTR/src/embeddings/hybrid_embedding/communication.cu deleted file mode 100644 index 36175747e4..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/communication.cu +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include -#include - -namespace { - -template -ncclDataType_t get_nccl_type(); -template <> -ncclDataType_t get_nccl_type() { - return ncclInt32; -} -template <> -ncclDataType_t get_nccl_type() { - return ncclUint32; -} -template <> -ncclDataType_t get_nccl_type() { - return ncclUint64; -} -template <> -ncclDataType_t get_nccl_type() { - return ncclFloat32; -} -template <> -ncclDataType_t get_nccl_type<__half>() { - return ncclFloat16; -} - -} // namespace - -namespace HugeCTR { - -namespace hybrid_embedding { - -Communication::Communication(size_t width_data_field) : width_data_field_(width_data_field) {} - -/* - * All to All communications - */ -template -AllToAllVComm::AllToAllVComm(Tensor2 send_buffer, Tensor2 recv_buffer, - const uint32_t* send_offsets, const uint32_t* recv_offsets, - const GPUResource* gpu_resource, size_t width_data_field) - : Communication(width_data_field), - send_buffer_(send_buffer), - recv_buffer_(recv_buffer), - send_offsets_(send_offsets), - recv_offsets_(recv_offsets), - gpu_resource_(gpu_resource) {} - -template -void AllToAll_Multi_NCCL::communicate(cudaStream_t stream) { - auto& comm = this->gpu_resource_->get_nccl(); - auto type = get_nccl_type(); - - int num_global_gpus; - HCTR_LIB_THROW(ncclCommCount(comm, &num_global_gpus)); - - HCTR_LIB_THROW(ncclGroupStart()); - for (int i = 0; i < num_global_gpus; i++) { - HCTR_LIB_THROW( - ncclSend(this->send_buffer_.get_ptr() + this->send_offsets_[i] * this->width_data_field_, - (this->send_offsets_[i + 1] - this->send_offsets_[i]) * this->width_data_field_, - type, i, comm, stream)); - HCTR_LIB_THROW( - ncclRecv(this->recv_buffer_.get_ptr() + this->recv_offsets_[i] * this->width_data_field_, - (this->recv_offsets_[i + 1] - this->recv_offsets_[i]) * this->width_data_field_, - type, i, comm, stream)); - } - HCTR_LIB_THROW(ncclGroupEnd()); -} - -/* - * All Reduce communications - */ -template -AllReduceComm::AllReduceComm(AllReduceInPlaceComm* ar_comm, - AllReduceInPlaceComm::Handle ar_handle, - const GPUResource* gpu_resource) - : Communication(0), ar_comm_(ar_comm), ar_handle_(ar_handle), gpu_resource_(gpu_resource) {} - -template -void AllReduceComm::communicate(cudaStream_t stream) { - ar_comm_->all_reduce(ar_handle_, stream, gpu_resource_->get_local_id()); -} - -#ifdef ENABLE_MPI -template -HierAll2Allv_Multi_IB::HierAll2Allv_Multi_IB(uint32_t instance_id, - HierA2AvCollHandle coll_handle, - size_t** send_sizes, - const GPUResource* gpu_resource, - IbComm* ib_comm, cudaStream_t comm_stream) - : Communication(sizeof(commtype)), - instance_id_(instance_id), - coll_handle_(coll_handle), - send_sizes_(send_sizes), - gpu_resource_(gpu_resource), - ib_comm_(ib_comm), - comm_stream_(comm_stream) { - HCTR_LIB_THROW(cudaEventCreate(&comm_event_)); -} - -template -void HierAll2Allv_Multi_IB::update_sizes(cudaStream_t stream) { - ib_comm_->pre_intra_update_a2a_coll_sizes(coll_handle_, send_sizes_, stream, instance_id_); -} - -template -void HierAll2Allv_Multi_IB::communicate(cudaStream_t stream) { - ib_comm_->post_send_command_a2a(coll_handle_, stream, instance_id_); - HCTR_LIB_THROW(cudaEventRecord(comm_event_, comm_stream_)); - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, comm_event_)); - // ib_comm_->wait_global_recv_async(coll_handle_, instance_id_); -} - -template -void HierAll2Allv_Multi_IB::initiate_communication(cudaStream_t stream) { - ib_comm_->post_a2a_send_command(coll_handle_, stream, instance_id_); - HCTR_LIB_THROW(cudaEventRecord(comm_event_, 
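AllToAll_Multi_NCCL::communicate() above uses the standard NCCL all-to-all-v idiom: per-peer ncclSend/ncclRecv pairs fused inside a ncclGroupStart()/ncclGroupEnd() pair. A trimmed sketch with float payloads and no error handling; the offset arrays are assumed to hold num_ranks + 1 element offsets.

#include <nccl.h>

void all_to_all_v(const float* send_buf, float* recv_buf,
                  const size_t* send_offsets,  // num_ranks + 1 entries, in elements
                  const size_t* recv_offsets,  // num_ranks + 1 entries, in elements
                  ncclComm_t comm, cudaStream_t stream) {
  int num_ranks = 0;
  ncclCommCount(comm, &num_ranks);

  ncclGroupStart();
  for (int peer = 0; peer < num_ranks; ++peer) {
    ncclSend(send_buf + send_offsets[peer],
             send_offsets[peer + 1] - send_offsets[peer], ncclFloat32, peer, comm, stream);
    ncclRecv(recv_buf + recv_offsets[peer],
             recv_offsets[peer + 1] - recv_offsets[peer], ncclFloat32, peer, comm, stream);
  }
  ncclGroupEnd();
}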
comm_stream_)); - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, comm_event_)); -} - -template -void HierAll2Allv_Multi_IB::wait_completion(cudaStream_t stream) { - ib_comm_->blocking_wait(coll_handle_, stream, instance_id_); - HCTR_LIB_THROW(cudaEventRecord(comm_event_, comm_stream_)); - HCTR_LIB_THROW(cudaStreamWaitEvent(stream, comm_event_)); - // ib_comm_->wait_global_recv_async(coll_handle_, instance_id_); -} - -template -HierAll2Allv_Multi_IB::~HierAll2Allv_Multi_IB() { - cudaEventDestroy(comm_event_); -} -#endif - -template class AllToAllVComm; -template class AllToAllVComm<__half>; -template class AllReduceComm; -template class AllReduceComm<__half>; - -template class AllToAll_Multi_NCCL; -template class AllToAll_Multi_NCCL<__half>; -#ifdef ENABLE_MPI -template class HierAll2Allv_Multi_IB; -template class HierAll2Allv_Multi_IB<__half>; -#endif - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/data.cu b/HugeCTR/src/embeddings/hybrid_embedding/data.cu deleted file mode 100644 index 9509899d9f..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/data.cu +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -size_t EmbeddingTableFunctors::get_embedding_table_index( - const std::vector& table_sizes, dtype category) { - size_t embedding = 0; - dtype next_offset = (dtype)table_sizes[embedding]; - for (embedding = 0; embedding < table_sizes.size() - 1 && category >= next_offset; ++embedding) - next_offset += table_sizes[embedding + 1]; - return embedding; -} - -template -void EmbeddingTableFunctors::get_embedding_offsets(std::vector& embedding_offsets, - const std::vector& table_sizes) { - const size_t num_tables = table_sizes.size(); - embedding_offsets.resize(num_tables); - dtype embedding_offset = (dtype)0; - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - embedding_offsets[embedding] = embedding_offset; - embedding_offset += static_cast(table_sizes[embedding]); - } -} - -template -dtype EmbeddingTableFunctors::get_num_categories(const std::vector& table_sizes) { - dtype num_categories = (dtype)0; - for (size_t i = 0; i < table_sizes.size(); ++i) - num_categories += static_cast(table_sizes[i]); - return num_categories; -} - -template -__global__ void data_to_unique_categories_kernel(dtype* data, dtype* embedding_offsets, - int num_tables, int num_data, dtype* samples, - int num_valid_data, dtype pad_val) { - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < num_data; - idx += blockDim.x * gridDim.x) { - samples[idx] = - idx >= num_valid_data ? 
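The EmbeddingTableFunctors above make category ids globally unique by shifting each table's local ids with an exclusive prefix sum of the table sizes; the owning table can then be recovered with a binary search over those offsets. A small host-side illustration (toy sizes, not library code):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<size_t> table_sizes = {1000, 10, 500};

  // Exclusive prefix sum -> per-table offsets: {0, 1000, 1010}
  std::vector<size_t> offsets(table_sizes.size());
  std::exclusive_scan(table_sizes.begin(), table_sizes.end(), offsets.begin(), size_t{0});

  // Global id of "local id 3 in table 2" is offsets[2] + 3 = 1013.
  size_t category = offsets[2] + 3;

  // Reverse lookup: the last offset that is <= category.
  size_t table = std::upper_bound(offsets.begin(), offsets.end(), category) - offsets.begin() - 1;
  printf("category %zu lives in table %zu\n", category, table);  // table 2
  return 0;
}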
pad_val : data[idx] + embedding_offsets[idx % num_tables]; - } -} - -template -__global__ void data_to_unique_categories_align4_kernel(dtype* data, dtype* embedding_offsets, - int num_tables, int num_data, - dtype* samples, int num_valid_data, - dtype pad_val) { - auto data4 = reinterpret_cast(data); - auto samples4 = reinterpret_cast(samples); - for (int idx4 = threadIdx.x + blockIdx.x * blockDim.x; idx4 < num_data / 4; - idx4 += blockDim.x * gridDim.x) { - uint4 load_data = data4[idx4]; - uint4 load_embedding_offsets; - - int idx = idx4 * 4; - load_data.x += embedding_offsets[(idx) % num_tables]; - load_data.y += embedding_offsets[(idx + 1) % num_tables]; - load_data.z += embedding_offsets[(idx + 2) % num_tables]; - load_data.w += embedding_offsets[(idx + 3) % num_tables]; - - load_data.x = idx >= num_valid_data ? pad_val : load_data.x; - load_data.y = idx + 1 >= num_valid_data ? pad_val : load_data.y; - load_data.z = idx + 2 >= num_valid_data ? pad_val : load_data.z; - load_data.w = idx + 3 >= num_valid_data ? pad_val : load_data.w; - - samples4[idx4] = load_data; - } -} - -/// data_to_unique_categories converts the argument 'data' and stores -/// the result in member variable 'samples'. -/// Per network, the columns corresponding to embedding tables -/// are concatenated and categories get an unique index / label. -template -void Data::data_to_unique_categories(Tensor2 data, cudaStream_t stream) { - HCTR_LIB_THROW(cudaPeekAtLastError()); - /// === TODO: PERFORM ON GPU === - /// ============================ - // HCTR_LOG_S(WARNING, WORLD) << "data_to_unique_categories() needs to be placed on the GPU!" << - // std::endl; - // TODO : perform conversion by kernel (before start of iteration ? => see below) - // for batch_size = 55*1024 - // batch_size * 26 * 4 / 1600e9 = 3.67 microseconds, - // - // Remark: - // Doesn't need to be before start of kernel. - // Would be nice to have just before calculating indices, since - // those would be in L2 cache already. - size_t current_batch_size = data.get_dimensions()[0]; - size_t block_size = 256; - size_t grid_size = - std::min(static_cast(4096), - (table_sizes.size() * batch_size * num_iterations - 1) / block_size + 1); - size_t num_samples = table_sizes.size() * batch_size * num_iterations; - // Not all samples in a batch may be valid. I.e last iteration of evaluation may be incomplete. 
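A host-side reference of the conversion the kernels above implement: keys are stored sample-major and table-minor, each key is shifted by its table's offset, and positions past the last valid key are padded with the null category so that incomplete batches stay well defined. Names mirror the kernel arguments; this sketch is not part of the library.

#include <cstdint>
#include <vector>

std::vector<uint32_t> to_unique_categories(const std::vector<uint32_t>& data,
                                           const std::vector<uint32_t>& embedding_offsets,
                                           size_t num_valid_keys, uint32_t null_category) {
  const size_t num_tables = embedding_offsets.size();
  std::vector<uint32_t> samples(data.size());
  for (size_t idx = 0; idx < data.size(); ++idx) {
    // idx % num_tables selects the table because keys are interleaved per sample.
    samples[idx] = idx >= num_valid_keys ? null_category
                                         : data[idx] + embedding_offsets[idx % num_tables];
  }
  return samples;
}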
- size_t num_valid_samples = table_sizes.size() * current_batch_size; - assert(num_valid_samples > 0 && "Batch contained 0 valid samples"); - auto null_category = static_cast(num_categories); - if (num_samples % 4 == 0 && sizeof(dtype) == 4) { - data_to_unique_categories_align4_kernel<<>>( - data.get_ptr(), embedding_offsets.get_ptr(), table_sizes.size(), num_samples, - samples.get_ptr(), num_valid_samples, null_category); - } else { - data_to_unique_categories_kernel<<>>( - data.get_ptr(), embedding_offsets.get_ptr(), table_sizes.size(), (int)num_samples, - samples.get_ptr(), (int)num_valid_samples, null_category); - } - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template class Data; -template class Data; - -template struct EmbeddingTableFunctors; -template struct EmbeddingTableFunctors; -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/frequent_embedding.cu b/HugeCTR/src/embeddings/hybrid_embedding/frequent_embedding.cu deleted file mode 100644 index 75d9cbcf8d..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/frequent_embedding.cu +++ /dev/null @@ -1,487 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -namespace frequent_embedding_kernels { - -template -__global__ void reset_relevant_gradients(float* __restrict__ gradients, uint32_t embedding_vec_size, - FrequentEmbeddingCompressionView* indices, - uint32_t num_instances) { - const uint32_t num_network_cache_indices = indices->network_cache_indices_offsets[num_instances]; - for (uint32_t i = blockIdx.x; i < num_network_cache_indices; i += gridDim.x) - gradients[indices->network_cache_indices[i] * embedding_vec_size + threadIdx.x] = 0.0f; -} - -template -__global__ void frequent_local_reduce(const emtype* __restrict__ gradients_in, - float* __restrict__ gradients_out, - size_t local_samples_offset, - const dtype* __restrict__ category_location, - uint32_t embedding_vec_size, - FrequentEmbeddingCompressionView* indices) { - const uint32_t num_frequent_sample_indices = *indices->d_num_frequent_sample_indices; - - for (uint32_t i = blockIdx.x; i < num_frequent_sample_indices; i += gridDim.x) { - uint32_t local_sample_index = indices->frequent_sample_indices[i]; - dtype category = indices->samples[local_samples_offset + local_sample_index]; - dtype frequent_index = category_location[2 * category + 1]; - - atomicAdd(gradients_out + frequent_index * embedding_vec_size + threadIdx.x, - TypeConvertFunc::convert( - gradients_in[local_sample_index * embedding_vec_size + threadIdx.x])); - } -} - -template -__forceinline__ __device__ void update_model_direct_common( - const emtype* const* __restrict__ gradients_pointers, float* __restrict__ embedding_vectors, - 
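frequent_local_reduce above accumulates per-sample gradients into a float master buffer with atomicAdd, one block per touched sample and one thread per embedding dimension. A simplified, self-contained kernel in the same spirit; the precomputed sample-to-slot map stands in for the category_location lookup and is an assumption of this sketch.

#include <cstdint>
#include <cuda_fp16.h>

__global__ void local_reduce_sketch(const __half* __restrict__ grads_in,  // per-sample gradients
                                    float* __restrict__ grads_out,        // per-slot accumulators
                                    const uint32_t* __restrict__ sample_to_slot,
                                    uint32_t num_samples, uint32_t vec_size) {
  // Launch with blockDim.x == vec_size; each block strides over the samples.
  for (uint32_t s = blockIdx.x; s < num_samples; s += gridDim.x) {
    const uint32_t slot = sample_to_slot[s];
    // Several samples may map to the same slot, hence the atomic accumulation
    // into a float master buffer even when gradients arrive as fp16.
    atomicAdd(grads_out + slot * vec_size + threadIdx.x,
              __half2float(grads_in[s * vec_size + threadIdx.x]));
  }
}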
const uint32_t* __restrict__ model_cache_indices, - const uint32_t* __restrict__ model_cache_indices_offsets, uint32_t num_instances, - uint32_t model_id, uint32_t num_frequent_per_model, uint32_t embedding_vec_size, float lr) {} - -template -__global__ void update_model_direct(const emtype* const* __restrict__ gradients_pointers, - float* __restrict__ embedding_vectors, - FrequentEmbeddingCompressionView* indices, - uint32_t num_instances, uint32_t model_id, - uint32_t num_frequent_per_model, uint32_t embedding_vec_size, - const float* __restrict__ lr_ptr, const float scale) { - float lr = __ldg(lr_ptr) / scale; - const uint32_t offset = indices->model_cache_indices_offsets[model_id + 1]; - const uint32_t num_model_cache_indices = indices->model_cache_indices_offsets[num_instances]; - - for (uint32_t i = blockIdx.x; i < num_model_cache_indices; i += gridDim.x) { - int vid = (i + offset) % num_model_cache_indices; - - uint32_t frequent_index = indices->model_cache_indices[vid]; - uint32_t network_id; - for (network_id = 0; - network_id < num_instances && indices->model_cache_indices_offsets[network_id + 1] <= vid; - network_id++) - ; - - const emtype* gradients = gradients_pointers[network_id]; - - uint32_t cache_location = frequent_index * embedding_vec_size + threadIdx.x; - atomicAdd(embedding_vectors + cache_location, - -lr * TypeConvertFunc::convert(gradients[cache_location])); - } -} - -} // namespace frequent_embedding_kernels - -template -FrequentEmbeddingBase::FrequentEmbeddingBase() {} - -template -FrequentEmbeddingBase::~FrequentEmbeddingBase() {} - -template -void FrequentEmbeddingBase::set_current_indices( - FrequentEmbeddingCompression* indices) { - indices_ = indices; - data_ = indices->get_data(); - indices_view_ = indices->get_device_view(); -} - -template -FrequentEmbeddingData::FrequentEmbeddingData(const Model& model, - const GPUResource& gpu_resource, - BuffPtr& grouped_wgrad_buff, - uint32_t embedding_vec_size, - size_t max_num_frequent_categories) - : model_(model), - gpu_resource_(gpu_resource), - grouped_wgrad_buff_(grouped_wgrad_buff), - wgrad_core23_buffer_(nullptr), - embedding_vec_size_(embedding_vec_size), - max_num_frequent_categories_(max_num_frequent_categories) { - std::shared_ptr> buf = GeneralBuffer2::create(); - buf->reserve({max_num_frequent_categories, embedding_vec_size_}, &frequent_embedding_vectors_); - if (sizeof(emtype) != sizeof(float)) { - buf->reserve({max_num_frequent_categories, embedding_vec_size_}, &float_frequent_gradients_); - } - - auto& gradients = get_gradients(); - if (grouped_wgrad_buff == NULL) { - buf->reserve({max_num_frequent_categories, embedding_vec_size_}, &gradients); - } else { - core23::BufferParams buffer_params = {}; - // TODO this has to be consistent with add_dense_layer.cpp:2126 - buffer_params.channel = std::is_same_v ? 
"TRAIN_WGRAD" : "TRAIN_WGRAD_HALF"; - core23::Device device(core23::DeviceType::GPU, gpu_resource.get_device_id()); - core23::Shape shape{static_cast(max_num_frequent_categories), - static_cast(embedding_vec_size_)}; - - core23::TensorParams t_params = core23::TensorParams() - .data_type(core23::ToScalarType::value) - .shape(shape) - .device(device) - .buffer_params(buffer_params) - .alignment(256); // default 256 Byte - core23::Tensor grad_tensor(t_params); - wgrad_core23_buffer_ = std::make_shared(grad_tensor); - gradients = - Tensor2({max_num_frequent_categories, embedding_vec_size_}, wgrad_core23_buffer_); - } - - buf->allocate(); -} - -template -FrequentEmbeddingSingleNode::FrequentEmbeddingSingleNode( - const Model& model, const GPUResource& gpu_resource, BuffPtr& grouped_wgrad_buff, - uint32_t embedding_vec_size, size_t max_num_frequent_categories) - : frequent_data_(model, gpu_resource, grouped_wgrad_buff, embedding_vec_size, - max_num_frequent_categories) { - std::shared_ptr> buf = GeneralBuffer2::create(); - - buf->reserve({model.num_instances, 1}, &embedding_vectors_cache_pointers_); - buf->reserve({model.num_instances, 1}, &partial_gradients_pointers_); - if (sizeof(emtype) != sizeof(float)) { - buf->reserve({max_num_frequent_categories, embedding_vec_size}, - &frequent_embedding_vectors_cache_); - } - buf->allocate(); -} - -template -void FrequentEmbeddingMultiNode::init_ar_comm(AllReduceInPlaceComm* ar_comm, - AllReduceInPlaceComm::Handle& handle, - int local_id) { - auto& local_gpu = frequent_data_.gpu_resource_; - CudaDeviceContext context(local_gpu.get_device_id()); - - auto& gradients = frequent_data_.get_gradients(); - ar_comm->set_coll_buf(handle, gradients.get_ptr(), gradients.get_size_in_bytes(), local_id); - ar_comm_ = std::make_unique>(ar_comm, handle, &local_gpu); -} - -template -void FrequentEmbeddingData::initialize_embedding_vectors( - const std::vector& table_sizes, size_t grouped_wgrad_offset_in_bytes) { - CudaDeviceContext context(gpu_resource_.get_device_id()); - - const size_t num_tables = table_sizes.size(); - for (size_t model_id = 0; model_id < model_.num_instances; ++model_id) { - for (size_t embedding = 0; embedding < num_tables; embedding++) { - float up_bound = sqrt(1.f / table_sizes[embedding]); - size_t offset = - embedding_vec_size_ * - model_.h_frequent_model_table_offsets[model_id * (num_tables + 1) + embedding]; - size_t num_elements = - embedding_vec_size_ * - (model_.h_frequent_model_table_offsets[model_id * (num_tables + 1) + embedding + 1] - - model_.h_frequent_model_table_offsets[model_id * (num_tables + 1) + embedding]); - UniformGenerator::fill(frequent_embedding_vectors_.get_ptr() + offset, num_elements, - -up_bound, up_bound, gpu_resource_.get_sm_count(), - gpu_resource_.get_replica_uniform_curand_generator(), - gpu_resource_.get_stream()); - } - } - - if (wgrad_core23_buffer_) { - // update wgrad tensors - size_t grad_size = model_.num_frequent * embedding_vec_size_; - if (sizeof(float) != sizeof(emtype)) { - frequent_gradients_ = Tensor2({grad_size}, wgrad_core23_buffer_); - } else { - float_frequent_gradients_ = Tensor2({grad_size}, wgrad_core23_buffer_); - } - } else if (grouped_wgrad_buff_) { - // update wgrad tensors - size_t grad_size = model_.num_frequent * embedding_vec_size_; - if (sizeof(float) != sizeof(emtype)) { - auto buf = std::make_shared( - (char*)grouped_wgrad_buff_->as_tensor().get_ptr() + grouped_wgrad_offset_in_bytes); - frequent_gradients_ = Tensor2({grad_size}, buf); - } else { - auto buf = std::make_shared( - 
(char*)grouped_wgrad_buff_->as_tensor().get_ptr() + grouped_wgrad_offset_in_bytes); - float_frequent_gradients_ = Tensor2({grad_size}, buf); - } - } -} - -/* Single-node: refresh needed vectors in the cache of each network - * Note: each network pulls from the models */ -template -void FrequentEmbeddingSingleNode::forward_model(cudaStream_t stream) { - const uint32_t num_instances = frequent_data_.model_.num_instances; - const uint32_t model_id = frequent_data_.model_.global_instance_id; - - auto embedding_vectors_cache_pointers = embedding_vectors_cache_pointers_.get_ptr(); - auto frequent_embedding_vectors = frequent_data_.frequent_embedding_vectors_.get_ptr(); - auto indices = this->indices_view_; - auto embedding_vec_size = frequent_data_.embedding_vec_size_; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->model_cache_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - const uint32_t offset = indices->model_cache_indices_offsets[model_id + 1]; - const uint32_t num_model_cache_indices = - indices->model_cache_indices_offsets[num_instances]; - int vid = (i + offset) % num_model_cache_indices; - uint32_t frequent_index = indices->model_cache_indices[vid]; - - uint32_t network_id; - for (network_id = 0; network_id < num_instances && - indices->model_cache_indices_offsets[network_id + 1] <= vid; - network_id++) - ; - emtype* embedding_vectors_out = embedding_vectors_cache_pointers[network_id]; - - const float* src_ptr = frequent_embedding_vectors + frequent_index * embedding_vec_size; - emtype* dst_ptr = embedding_vectors_out + frequent_index * embedding_vec_size; - - return { - src_ptr, {dst_ptr}, {static_cast(src_ptr) != static_cast(dst_ptr)}}; - }); - - shuffle(copy_desc, stream, frequent_data_.model_.num_frequent / 4); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -/* Single-node: refresh all vectors in the cache of each network */ -template -void FrequentEmbeddingSingleNode::forward_model_eval(cudaStream_t stream) { - const uint32_t num_instances = frequent_data_.model_.num_instances; - const uint32_t model_id = frequent_data_.model_.global_instance_id; - - emtype** embedding_vectors_cache_pointers = embedding_vectors_cache_pointers_.get_ptr(); - const float* frequent_embedding_vectors = frequent_data_.frequent_embedding_vectors_.get_ptr(); - size_t embedding_vec_size = frequent_data_.embedding_vec_size_; - const uint32_t num_frequent = frequent_data_.model_.num_frequent; - const uint32_t num_frequent_per_model = num_frequent / num_instances; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, [=] __device__() { return num_frequent; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - // Shift pattern - uint32_t shifted_i = (i + (model_id + 1) * num_frequent_per_model) % num_frequent; - uint32_t network_id = shifted_i / num_frequent_per_model; - uint32_t frequent_index = - model_id * num_frequent_per_model + shifted_i % num_frequent_per_model; - - emtype* embedding_vectors_out = embedding_vectors_cache_pointers[network_id]; - - const float* src_ptr = frequent_embedding_vectors + frequent_index * embedding_vec_size; - emtype* dst_ptr = embedding_vectors_out + frequent_index * embedding_vec_size; - - return { - src_ptr, {dst_ptr}, {static_cast(src_ptr) != static_cast(dst_ptr)}}; - }); - - shuffle(copy_desc, stream, num_frequent); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -template -void FrequentEmbeddingData::forward_network( - 
const vectype* embedding_vectors, emtype* interaction_layer_input, - FrequentEmbeddingBase* base, cudaStream_t stream) { - uint32_t samples_per_instance = - base->data_->samples.get_num_elements() / this->model_.num_instances; - uint32_t global_sample_index_base = model_.global_instance_id * samples_per_instance; - - auto indices = base->indices_view_; - auto category_location = this->model_.category_location.get_ptr(); - auto embedding_vec_size = this->embedding_vec_size_; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() -> size_t { return *indices->d_num_frequent_sample_indices; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - auto index = indices->frequent_sample_indices[i]; - auto category = indices->samples[index + global_sample_index_base]; - auto frequent_index = category_location[2 * category + 1]; - - return { - embedding_vectors + frequent_index * embedding_vec_size, - {interaction_layer_input + indices->frequent_sample_indices[i] * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, samples_per_instance); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -/* Concatenate the embedding vectors into the buffer for top-mlp input */ -template -void FrequentEmbeddingSingleNode::forward_network(emtype* interaction_layer_input, - cudaStream_t stream) { - frequent_data_.forward_network(get_embedding_vectors_cache().get_ptr(), interaction_layer_input, - this, stream); -} - -template -void FrequentEmbeddingMultiNode::forward_network(emtype* interaction_layer_input, - cudaStream_t stream) { - frequent_data_.forward_network(frequent_data_.frequent_embedding_vectors_.get_ptr(), - interaction_layer_input, this, stream); -} - -/* Reduce gradients on each network */ -template -void FrequentEmbeddingData::local_reduce(const emtype* gradients, - FrequentEmbeddingBase* base, - cudaStream_t stream) { - const auto num_instances = model_.num_instances; - const auto network_id = model_.global_instance_id; - size_t local_samples_size = - ceildiv(base->data_->batch_size, num_instances) * base->data_->table_sizes.size(); - - int n_blocks = 16 * gpu_resource_.get_sm_count(); - auto embedding_vec_size = embedding_vec_size_; - - frequent_embedding_kernels::frequent_local_reduce<<>>( - gradients, float_frequent_gradients_.get_ptr(), network_id * local_samples_size, - model_.category_location.get_ptr(), embedding_vec_size, base->indices_view_); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - if (sizeof(emtype) != sizeof(float)) { - convert_array<<<1000, 128, 0, stream>>>(frequent_gradients_.get_ptr(), - float_frequent_gradients_.get_ptr(), - model_.num_frequent * embedding_vec_size); - HCTR_LIB_THROW(cudaPeekAtLastError()); - } -} - -template -void FrequentEmbeddingSingleNode::local_reduce(const emtype* gradients, - cudaStream_t stream) { - auto num_instances = frequent_data_.model_.num_instances; - int n_blocks = 16 * frequent_data_.gpu_resource_.get_sm_count(); - auto embedding_vec_size = frequent_data_.embedding_vec_size_; - - /* Set to zero the gradients of categories that appear in the batch */ - frequent_embedding_kernels::reset_relevant_gradients<<>>( - frequent_data_.float_frequent_gradients_.get_ptr(), embedding_vec_size, this->indices_view_, - num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - frequent_data_.local_reduce(gradients, this, stream); -} - -template -void FrequentEmbeddingMultiNode::local_reduce(const emtype* gradients, - cudaStream_t stream) { - /* Set to zero all the gradients */ - if 
(frequent_data_.model_.num_frequent > 0) { - HCTR_LIB_THROW(cudaMemsetAsync( - frequent_data_.float_frequent_gradients_.get_ptr(), 0, - frequent_data_.model_.num_frequent * frequent_data_.embedding_vec_size_ * sizeof(float), - stream)); - } - - frequent_data_.local_reduce(gradients, this, stream); -} - -template -void FrequentEmbeddingMultiNode::update_model(float* dev_lr, float scale, - cudaStream_t stream) { - sgd_global_update(frequent_data_.get_gradients().get_ptr(), - frequent_data_.frequent_embedding_vectors_.get_ptr(), - frequent_data_.model_.num_frequent, frequent_data_.embedding_vec_size_, dev_lr, - scale, stream); -} - -/* Update model for single-node: direct write in category "owner"'s table, lr is a device variable - */ -template -void FrequentEmbeddingSingleNode::update_model_direct(float* dev_lr, float scale, - cudaStream_t stream) { - const uint32_t& num_instances = frequent_data_.model_.num_instances; - const uint32_t& model_id = frequent_data_.model_.global_instance_id; - const uint32_t num_frequent_per_model = frequent_data_.model_.num_frequent / num_instances; - - int num_sm = frequent_data_.gpu_resource_.get_sm_count(); - int n_blocks = 8 * num_sm; // TODO: better heuristics - - /* Update models */ - frequent_embedding_kernels:: - update_model_direct<<>>( - partial_gradients_pointers_.get_ptr(), - frequent_data_.frequent_embedding_vectors_.get_ptr(), this->indices_view_, num_instances, - model_id, num_frequent_per_model, frequent_data_.embedding_vec_size_, dev_lr, scale); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void FrequentEmbeddingMultiNode::communicate(cudaStream_t stream) { - ar_comm_->communicate(stream); -} - -template class FrequentEmbeddingBase; -template class FrequentEmbeddingBase; - -template class FrequentEmbeddingData; -template class FrequentEmbeddingData; -template class FrequentEmbeddingData; -template class FrequentEmbeddingData; - -template class FrequentEmbeddingSingleNode; -template class FrequentEmbeddingSingleNode; -template class FrequentEmbeddingSingleNode; -template class FrequentEmbeddingSingleNode; - -template class FrequentEmbeddingMultiNode; -template class FrequentEmbeddingMultiNode; -template class FrequentEmbeddingMultiNode; -template class FrequentEmbeddingMultiNode; - -template void FrequentEmbeddingData::forward_network<__half>( - const __half*, __half*, FrequentEmbeddingBase*, cudaStream_t); -template void FrequentEmbeddingData::forward_network( - const float*, __half*, FrequentEmbeddingBase*, cudaStream_t); -template void FrequentEmbeddingData::forward_network( - const float*, float*, FrequentEmbeddingBase*, cudaStream_t); -template void FrequentEmbeddingData::forward_network<__half>( - const __half*, __half*, FrequentEmbeddingBase*, cudaStream_t); -template void FrequentEmbeddingData::forward_network( - const float*, __half*, FrequentEmbeddingBase*, cudaStream_t); -template void FrequentEmbeddingData::forward_network( - const float*, float*, FrequentEmbeddingBase*, cudaStream_t); -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/hybrid_indices.cu b/HugeCTR/src/embeddings/hybrid_embedding/hybrid_indices.cu deleted file mode 100644 index e52199c233..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/hybrid_indices.cu +++ /dev/null @@ -1,541 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace indices_kernels { - -template -__global__ void fused_cache_masks(const dtype* __restrict__ samples, - const dtype* __restrict__ category_location, - bool* __restrict__ model_cache_mask, - bool* __restrict__ network_cache_mask, uint32_t offset, - uint32_t samples_size, uint32_t local_samples_size, - uint32_t num_frequent, uint32_t num_frequent_per_model, - uint32_t model_id, uint32_t num_instances) { - uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; - - if (tid < samples_size) { - dtype category = __ldg(samples + tid); - dtype frequent_loc = __ldg(category_location + 2 * category); - dtype frequent_index = __ldg(category_location + (2 * category + 1)); - - if (frequent_loc == num_instances && frequent_index / num_frequent_per_model == model_id) - model_cache_mask[(tid / local_samples_size) * num_frequent_per_model + - frequent_index % num_frequent_per_model] = true; - } - - if (tid < local_samples_size) { - dtype category = __ldg(samples + offset + tid); - dtype frequent_loc = __ldg(category_location + 2 * category); - dtype frequent_index = __ldg(category_location + (2 * category + 1)); - - if (frequent_loc == num_instances) network_cache_mask[frequent_index] = true; - } -} - -__global__ void mask_indices_to_buffer_indices( - uint32_t* __restrict__ model_cache_indices, - const uint32_t* __restrict__ model_cache_indices_offsets, uint32_t num_instances, - uint32_t num_frequent_per_model, uint32_t model_id) { - const uint32_t num_selected = __ldg(model_cache_indices_offsets + num_instances); - - for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_selected; - i += blockDim.x * gridDim.x) - model_cache_indices[i] = - model_cache_indices[i] % num_frequent_per_model + num_frequent_per_model * model_id; -} - -template -__global__ void calculate_network_indices_mask(const dtype* __restrict__ local_samples, - const dtype* __restrict__ category_location, - bool* mask, uint32_t local_samples_size, - uint32_t num_instances) { - for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < local_samples_size; - i += gridDim.x * blockDim.x) { - dtype category = local_samples[i]; - uint32_t model_id = static_cast(category_location[2 * category]); - for (uint32_t section_id = 0; section_id < num_instances; section_id++) { - mask[local_samples_size * section_id + i] = (model_id == section_id); - } - } -} - -} // namespace indices_kernels - -namespace HugeCTR { -namespace hybrid_embedding { - -// =========================================================================================== -// Frequent Compression -// =========================================================================================== - -template -FrequentEmbeddingCompression::FrequentEmbeddingCompression( - size_t max_num_frequent_categories, const Data& data, const Model& model) - : data_(data), model_(model) { - const int num_tables = data_.table_sizes.size(); - - std::shared_ptr> buf = GeneralBuffer2::create(); - buf->reserve({max_num_frequent_categories, 1}, &model_cache_indices_); - buf->reserve({model.num_instances + 1, 1}, 
&model_cache_indices_offsets_); - buf->reserve({max_num_frequent_categories, 1}, &network_cache_indices_); - buf->reserve({model.num_instances + 1, 1}, &network_cache_indices_offsets_); - buf->reserve({2 * max_num_frequent_categories, 1}, &cache_masks_); - buf->reserve({ceildiv(data_.batch_size, model.num_instances) * num_tables, 1}, - &frequent_sample_indices_); - buf->reserve({1}, &d_num_frequent_sample_indices_); - - // Temporary storage - calculate_frequent_sample_indices_temp_storage_bytes((data_.batch_size / model.num_instances) * - num_tables); - calculate_model_cache_indices_temp_storage_bytes(max_num_frequent_categories); - calculate_network_cache_indices_temp_storage_bytes(max_num_frequent_categories); - buf->reserve({frequent_sample_indices_temp_storage_bytes_, 1}, - &frequent_sample_indices_temp_storage_); - buf->reserve({model_cache_indices_temp_storage_bytes_, 1}, &model_cache_indices_temp_storage_); - buf->reserve({network_cache_indices_temp_storage_bytes_, 1}, - &network_cache_indices_temp_storage_); - buf->allocate(); - - FrequentEmbeddingCompressionView view = {data_.samples.get_ptr(), - cache_masks_.get_ptr(), - model_cache_indices_.get_ptr(), - model_cache_indices_offsets_.get_ptr(), - network_cache_indices_.get_ptr(), - network_cache_indices_offsets_.get_ptr(), - d_num_frequent_sample_indices_.get_ptr(), - frequent_sample_indices_.get_ptr()}; - - HCTR_LIB_THROW(cudaMalloc(&device_indices_view_, sizeof(view))); - HCTR_LIB_THROW(cudaMemcpy(device_indices_view_, &view, sizeof(view), cudaMemcpyHostToDevice)); -} - -template -struct FrequentSampleIndicesSelectOp { - const dtype* samples; - const dtype* category_location; - uint32_t offset; - dtype num_instances; - __host__ __device__ __forceinline__ FrequentSampleIndicesSelectOp(const dtype* samples, - const dtype* category_location, - uint32_t offset, - dtype num_instances) - : samples(samples), - category_location(category_location), - offset(offset), - num_instances(num_instances) {} - __device__ __forceinline__ bool operator()(const uint32_t& idx) const { - dtype category = __ldg(samples + offset + idx); - dtype frequent_location = __ldg(category_location + 2 * category); - return frequent_location == num_instances; - } -}; - -template -void FrequentEmbeddingCompression::calculate_frequent_sample_indices_temp_storage_bytes( - const size_t local_samples_size) { - cub::CountingInputIterator counting(0); - FrequentSampleIndicesSelectOp select_op(nullptr, nullptr, 0, 0); - cub::DeviceSelect::If(nullptr, frequent_sample_indices_temp_storage_bytes_, counting, - (uint32_t*)nullptr, (uint32_t*)nullptr, local_samples_size, select_op, 0); -} - -template -void FrequentEmbeddingCompression::calculate_model_cache_indices_temp_storage_bytes( - const size_t num_frequent) { - size_t select_bytes = 0; - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged(nullptr, select_bytes, counting, (bool*)nullptr, (uint32_t*)nullptr, - (uint32_t*)nullptr, num_frequent, 0); - - constexpr uint32_t align = 256; - model_cache_indices_temp_storage_bytes_ = alignTo(num_frequent, align) + select_bytes; -} - -template -void FrequentEmbeddingCompression::calculate_network_cache_indices_temp_storage_bytes( - const size_t num_frequent) { - size_t select_bytes = (size_t)0; - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged(nullptr, select_bytes, counting, (bool*)nullptr, (uint32_t*)nullptr, - (uint32_t*)nullptr, num_frequent, 0); - - network_cache_indices_temp_storage_bytes_ = select_bytes; -} - -template -void 
FrequentEmbeddingCompression::calculate_frequent_sample_indices(cudaStream_t stream) { - const size_t num_networks = model_.num_instances; - size_t local_samples_size = (data_.batch_size / num_networks) * data_.table_sizes.size(); - - // Select indices of frequent categories appearing in the local MLP batch - cub::CountingInputIterator counting(0); - FrequentSampleIndicesSelectOp select_op( - data_.samples.get_ptr(), model_.category_location.get_ptr(), - model_.global_instance_id * local_samples_size, model_.num_instances); - cub::DeviceSelect::If( - reinterpret_cast(frequent_sample_indices_temp_storage_.get_ptr()), - frequent_sample_indices_temp_storage_bytes_, counting, frequent_sample_indices_.get_ptr(), - d_num_frequent_sample_indices_.get_ptr(), local_samples_size, select_op, stream); -} - -template -void FrequentEmbeddingCompression::calculate_model_cache_indices(size_t sm_count, - cudaStream_t stream) { - const size_t num_instances = model_.num_instances; - const size_t num_frequent = model_.num_frequent; - const size_t samples_size = data_.batch_size * data_.table_sizes.size(); - size_t local_samples_size = - ceildiv(data_.batch_size, num_instances) * data_.table_sizes.size(); - - // Note: we assume that the number of frequent categories is a - // multiple of the number of models! - const size_t num_frequent_per_model = num_frequent / num_instances; - - /** - * Explanation of the mask: - * The model owns num_frequent_per_model categories. For each network, - * we want to know the categories that appear in their local batch and - * belong to this model. The mask is the concatenation of num_network - * sections of size num_frequent_per_model. - * It has a size num_frequent but does not represent all the frequent - * categories, only num_networks repetitions of the same categories. 
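 *
 * Worked example: with num_networks = 2 and num_frequent_per_model = 4 (so num_frequent = 8),
 * the mask has 8 entries. Entries 0..3 flag which of this model's 4 frequent categories
 * appear in network 0's local batch, and entries 4..7 flag the same 4 categories for
 * network 1's local batch.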
- */ - - // Temporary storage - char* scratch_ptr = model_cache_indices_temp_storage_.get_ptr(); - void* d_temp_storage = reinterpret_cast(scratch_ptr); - size_t temp_storage_bytes = model_cache_indices_temp_storage_bytes_; - - const bool* d_model_cache_mask = cache_masks_.get_ptr() + num_frequent; - - /* Select categories according to the mask */ - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, counting, d_model_cache_mask, - model_cache_indices_.get_ptr(), - model_cache_indices_offsets_.get_ptr() + num_instances, num_frequent, - stream); - - /* Compute offsets */ - constexpr size_t TPB_offsets = 256; - size_t n_blocks = ceildiv(num_instances, TPB_offsets); - offsets_kernel<<>>(model_cache_indices_.get_ptr(), - model_cache_indices_offsets_.get_ptr(), - num_instances, num_frequent_per_model); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - /* Convert to buffer indices */ - - constexpr size_t TPB_convert = 256; - n_blocks = sm_count; - indices_kernels::mask_indices_to_buffer_indices<<>>( - model_cache_indices_.get_ptr(), model_cache_indices_offsets_.get_ptr(), num_instances, - num_frequent_per_model, model_.global_instance_id); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void FrequentEmbeddingCompression::calculate_cache_masks(cudaStream_t stream) { - const size_t num_instances = model_.num_instances; - const size_t num_frequent = model_.num_frequent; - size_t samples_size = data_.batch_size * data_.table_sizes.size(); - size_t local_samples_size = ceildiv(samples_size, num_instances); - const size_t num_frequent_per_model = num_frequent / num_instances; - - bool* d_network_cache_mask = cache_masks_.get_ptr(); - bool* d_model_cache_mask = cache_masks_.get_ptr() + num_frequent; - - /* Initialize the masks to false */ - HCTR_LIB_THROW(cudaMemsetAsync(cache_masks_.get_ptr(), 0, 2 * num_frequent, stream)); - - /* Compute the model cache mask */ - constexpr size_t TPB_mask = 256; - size_t n_blocks = ceildiv(samples_size, TPB_mask); - indices_kernels::fused_cache_masks<<>>( - data_.samples.get_ptr(), model_.category_location.get_ptr(), d_model_cache_mask, - d_network_cache_mask, model_.global_instance_id * local_samples_size, samples_size, - local_samples_size, num_frequent, num_frequent_per_model, model_.global_instance_id, - model_.num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void FrequentEmbeddingCompression::calculate_network_cache_indices(cudaStream_t stream) { - const size_t num_instances = model_.num_instances; - const size_t num_frequent = model_.num_frequent; - size_t local_samples_size = - ceildiv(data_.batch_size, num_instances) * data_.table_sizes.size(); - - // Note: we assume that the number of frequent categories is a - // multiple of the number of models! 
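// --------------------------------------------------------------------------------------------
// Illustrative sketch (not from the original sources; all names below are hypothetical): the
// two-phase cub::DeviceSelect idiom that the cache-index calculations here rely on. The first
// call, made with a null temp-storage pointer, only reports the scratch size (this is what the
// calculate_*_temp_storage_bytes helpers do at construction time); the second call compacts
// the positions whose mask entry is true, mirroring the Flagged calls on the cache masks.
// The sketch allocates its own scratch with cudaMalloc for brevity, whereas the original code
// pre-reserves it in a GeneralBuffer2.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

inline void select_flagged_positions(const bool* d_mask, uint32_t* d_selected,
                                     uint32_t* d_num_selected, uint32_t num_items,
                                     cudaStream_t stream) {
  cub::CountingInputIterator<uint32_t> counting(0);

  // Phase 1: size query only, no selection is performed.
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, counting, d_mask, d_selected,
                             d_num_selected, num_items, stream);

  // Phase 2: run the compaction with the allocated scratch buffer.
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, counting, d_mask, d_selected,
                             d_num_selected, num_items, stream);
  cudaFree(d_temp_storage);
}
// --------------------------------------------------------------------------------------------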
- const size_t num_frequent_per_model = num_frequent / num_instances; - - // Temporary storage - char* scratch_ptr = network_cache_indices_temp_storage_.get_ptr(); - void* d_temp_storage = reinterpret_cast(scratch_ptr); - size_t temp_storage_bytes = network_cache_indices_temp_storage_bytes_; - - const bool* d_network_cache_mask = cache_masks_.get_ptr(); - - /* Select categories according to the mask */ - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, counting, d_network_cache_mask, - network_cache_indices_.get_ptr(), - network_cache_indices_offsets_.get_ptr() + num_instances, num_frequent, - stream); - - /* Compute offsets */ - constexpr size_t TPB_offsets = 256; - size_t n_blocks = ceildiv(num_instances, TPB_offsets); - offsets_kernel<<>>(network_cache_indices_.get_ptr(), - network_cache_indices_offsets_.get_ptr(), - num_instances, num_frequent_per_model); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -// =========================================================================================== -// Inrequent Selection -// =========================================================================================== - -template -InfrequentEmbeddingSelection::InfrequentEmbeddingSelection(const Data& data, - const Model& model) - : data_(data), model_(model) { - const size_t num_tables = data_.table_sizes.size(); - - auto buf = GeneralBuffer2::create(); - - buf->reserve({data_.batch_size, num_tables}, &model_indices_); - buf->reserve({ceildiv(data_.batch_size, model.num_instances), num_tables}, - &network_indices_); - buf->reserve({ceildiv(data_.batch_size, model.num_instances), num_tables}, - &network_indices_src_model_id_); - - // buf->reserve({model.num_instances}, &model_indices_sizes_); - // buf->reserve({model.num_instances}, &model_indices_sizes_ptrs_); - // buf->reserve({model.num_instances}, &network_indices_sizes_); - // buf->reserve({model.num_instances}, &network_indices_sizes_ptrs_); - - // Temporary storage - calculate_model_indices_temp_storage_bytes(data_.batch_size, num_tables); - calculate_network_indices_temp_storage_bytes(data_.batch_size, num_tables, model.num_instances); - buf->reserve({model_indices_temp_storage_bytes_, 1}, &model_indices_temp_storage_); - buf->reserve({network_indices_temp_storage_bytes_, 1}, &network_indices_temp_storage_); - - buf->allocate(); - - auto managed_buf = GeneralBuffer2::create(); - managed_buf->reserve({model.num_instances + 1, 1}, &model_indices_offsets_); - managed_buf->reserve({model.num_instances + 1, 1}, &network_indices_offsets_); - managed_buf->allocate(); - // int current_device; - // HCTR_LIB_THROW(cudaGetDevice(¤t_device)); - // HCTR_LIB_THROW(cudaMemAdvise(managed_buf->get_ptr(), managed_buf->get_size_in_bytes(), - // cudaMemAdviseSetReadMostly, current_device)); - - InfrequentEmbeddingSelectionView view = {data_.samples.get_ptr(), - model_indices_.get_ptr(), - model_indices_offsets_.get_ptr(), - network_indices_.get_ptr(), - network_indices_offsets_.get_ptr(), - network_indices_src_model_id_.get_ptr()}; - - HCTR_LIB_THROW(cudaMalloc(&device_indices_view_, sizeof(view))); - HCTR_LIB_THROW(cudaMemcpy(device_indices_view_, &view, sizeof(view), cudaMemcpyHostToDevice)); -} - -template -struct ModelIndicesSelectOp { - const dtype* samples; - const dtype* category_location; - uint32_t my_model_id; - __host__ __device__ __forceinline__ ModelIndicesSelectOp(const dtype* samples, - const dtype* category_location, - uint32_t my_model_id) - : samples(samples), 
category_location(category_location), my_model_id(my_model_id) {} - __device__ __forceinline__ bool operator()(const uint32_t& idx) const { - dtype category = __ldg(samples + idx); - dtype model_id = __ldg(category_location + 2 * category); - return model_id == my_model_id; - } -}; - -template -void InfrequentEmbeddingSelection::calculate_model_indices_temp_storage_bytes( - size_t max_batch_size, size_t table_size) { - cub::CountingInputIterator counting(0); - ModelIndicesSelectOp select_op(nullptr, nullptr, 0); - cub::DeviceSelect::If(nullptr, model_indices_temp_storage_bytes_, counting, (uint32_t*)nullptr, - (uint32_t*)nullptr, max_batch_size * table_size, select_op, 0); -} - -template -void InfrequentEmbeddingSelection::calculate_network_indices_temp_storage_bytes( - size_t max_batch_size, size_t table_size, const uint32_t num_instances) { - uint32_t samples_size = max_batch_size * table_size; - uint32_t local_samples_size = ceildiv(samples_size, num_instances); - - // Calculate select bytes - size_t select_bytes = 0; - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged(nullptr, select_bytes, counting, (bool*)nullptr, (uint32_t*)nullptr, - (uint32_t*)nullptr, samples_size, 0); - - // Total size - constexpr uint32_t align = 256; - network_indices_temp_storage_bytes_ = - alignTo(sizeof(bool) * samples_size, align) + select_bytes; -} - -template -void InfrequentEmbeddingSelection::calculate_model_indices(cudaStream_t stream) { - const uint32_t& num_instances = model_.num_instances; - - size_t local_batch_size = ceildiv(data_.batch_size, num_instances); - - // Select indices of infrequent categories belonging to this model - cub::CountingInputIterator counting(0); - ModelIndicesSelectOp select_op(data_.samples.get_ptr(), model_.category_location.get_ptr(), - model_.global_instance_id); - cub::DeviceSelect::If(reinterpret_cast(model_indices_temp_storage_.get_ptr()), - model_indices_temp_storage_bytes_, counting, model_indices_.get_ptr(), - model_indices_offsets_.get_ptr() + num_instances, - data_.batch_size * data_.table_sizes.size(), select_op, stream); - - // Compute offsets - constexpr size_t TPB = 256; - const size_t n_blocks = ceildiv(num_instances, TPB); - offsets_kernel<<>>(model_indices_.get_ptr(), - model_indices_offsets_.get_ptr(), num_instances, - local_batch_size * data_.table_sizes.size()); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbeddingSelection::calculate_network_indices(size_t sm_count, - cudaStream_t stream) { - const uint32_t num_instances = model_.num_instances; - uint32_t samples_size = data_.batch_size * data_.table_sizes.size(); - uint32_t local_samples_size = ceildiv(samples_size, num_instances); - - // Temporary storage - constexpr uint32_t align = 256; - char* scratch_ptr = network_indices_temp_storage_.get_ptr(); - size_t scratch_offset = 0; - bool* d_mask = reinterpret_cast(scratch_ptr + scratch_offset); - scratch_offset += alignTo(sizeof(bool) * samples_size, align); - void* d_temp_storage = reinterpret_cast(scratch_ptr + scratch_offset); - size_t temp_storage_bytes = network_indices_temp_storage_bytes_ - scratch_offset; - - // Compute mask (for each source GPU, whether each element in the batch is located there) - constexpr uint32_t TPB_mask = 256; - uint32_t n_blocks_mask = ceildiv(local_samples_size, TPB_mask); - indices_kernels::calculate_network_indices_mask<<>>( - data_.samples.get_ptr() + model_.global_instance_id * local_samples_size, - model_.category_location.get_ptr(), d_mask, local_samples_size, 
num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - // Select indices according to the mask - cub::CountingInputIterator counting(0); - cub::DeviceSelect::Flagged( - d_temp_storage, temp_storage_bytes, counting, d_mask, network_indices_.get_ptr(), - network_indices_offsets_.get_ptr() + num_instances, samples_size, stream); - - // Compute offsets - constexpr uint32_t TPB_offsets = 256; - uint32_t n_blocks_offsets = ceildiv(num_instances, TPB_offsets); - offsets_kernel<<>>(network_indices_.get_ptr(), - network_indices_offsets_.get_ptr(), - num_instances, local_samples_size); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - // Re-map indices between 0 and local_samples_size - 1 - uint32_t TPB_remap = 256; - uint32_t n_blocks_remap = sm_count; - modulo_kernel<<>>( - network_indices_.get_ptr(), network_indices_offsets_.get_ptr() + num_instances, - local_samples_size); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - // Figure out the model id for each indices - model_id_kernel<<>>( - network_indices_offsets_.get_ptr(), network_indices_src_model_id_.get_ptr(), - network_indices_offsets_.get_ptr() + num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -// template -// void InfrequentEmbeddingSelection::calculate_model_indices_sizes_from_offsets( -// size_t embedding_vec_bytes, cudaStream_t stream) { -// constexpr size_t TPB = 256; -// const size_t n_blocks = ceildiv(model_.num_instances, TPB); -// offsets_to_sizes<<>>( -// model_indices_sizes_.get_ptr(), model_indices_offsets_.get_ptr(), -// embedding_vec_bytes, model_.num_instances); -// } - -// template -// void InfrequentEmbeddingSelection::calculate_network_indices_sizes_from_offsets( -// size_t embedding_vec_bytes, cudaStream_t stream) { -// constexpr size_t TPB = 256; -// const size_t n_blocks = ceildiv(model_.num_instances, TPB); -// offsets_to_sizes<<>>( -// network_indices_sizes_.get_ptr(), network_indices_offsets_.get_ptr(), -// embedding_vec_bytes, model_.num_instances); -// } - -template -void compute_indices(FrequentEmbeddingCompression& compression, - InfrequentEmbeddingSelection& selection, - CommunicationType communication_type, bool compute_network_cache_indices, - cudaStream_t stream, int sm_count) { - compression.calculate_frequent_sample_indices(stream); - selection.calculate_model_indices(stream); - - if (communication_type != CommunicationType::NVLink_SingleNode) { - selection.calculate_network_indices(sm_count, stream); - } else { - compression.calculate_cache_masks(stream); - if (compute_network_cache_indices) { - compression.calculate_network_cache_indices(stream); - } - compression.calculate_model_cache_indices(sm_count, stream); - } -} - -template void compute_indices(FrequentEmbeddingCompression& compression, - InfrequentEmbeddingSelection& selection, - CommunicationType communication_type, - bool compute_network_cache_indices, cudaStream_t stream, - int sm_count); - -template void compute_indices(FrequentEmbeddingCompression& compression, - InfrequentEmbeddingSelection& selection, - CommunicationType communication_type, - bool compute_network_cache_indices, cudaStream_t stream, - int sm_count); - -template class FrequentEmbeddingCompression; -template class FrequentEmbeddingCompression; -template class InfrequentEmbeddingSelection; -template class InfrequentEmbeddingSelection; - -} // namespace hybrid_embedding -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/indices_container.cu b/HugeCTR/src/embeddings/hybrid_embedding/indices_container.cu deleted file mode 100644 index 
d1641bd6b7..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/indices_container.cu +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace HugeCTR { -namespace hybrid_embedding { - -template -BatchIndices::BatchIndices(std::vector>& models, - std::vector> data_sources, - std::shared_ptr& resource_manager, - size_t batch_size, std::vector& slot_size_array, - size_t max_num_frequent_categories, - CommunicationType communication_type) - : num_slots_(slot_size_array.size()), - resource_manager_(resource_manager), - communication_type_(communication_type) { - for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); ++i) { - CudaDeviceContext ctx(resource_manager_->get_local_gpu(i)->get_device_id()); - data_.emplace_back(data_sources[i].get_value_tensor(), slot_size_array, batch_size, 1); - } - - for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - CudaDeviceContext ctx(resource_manager_->get_local_gpu(i)->get_device_id()); - - frequent_compression_.emplace_back(max_num_frequent_categories, data_[i], models[i]); - infrequent_selection_.emplace_back(data_[i], models[i]); - } -} - -template -void BatchIndices::compute(int raw_device_id, size_t batch_size, cudaStream_t stream) { - auto& local_gpu = resource_manager_->get_local_gpu(raw_device_id); - auto& my_data = data_[raw_device_id]; - - auto samples = my_data.samples; - samples.reset_shape({batch_size, num_slots_}); - - my_data.data_to_unique_categories(samples, stream); - - compute_indices(frequent_compression_[raw_device_id], infrequent_selection_[raw_device_id], - communication_type_, true, stream, local_gpu->get_sm_count()); -} - -template class BatchIndices; -template class BatchIndices; - -} // namespace hybrid_embedding -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/infrequent_embedding.cu b/HugeCTR/src/embeddings/hybrid_embedding/infrequent_embedding.cu deleted file mode 100644 index 9020bcd2e2..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/infrequent_embedding.cu +++ /dev/null @@ -1,670 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -namespace infrequent_embedding_kernels { - -template -__global__ void hier_update_model(InfrequentEmbeddingSelectionView* indices, - const dtype* __restrict__ category_location, - const emtype* __restrict__ gradients, - float* __restrict__ embedding_vectors, - uint32_t embedding_vec_size, uint32_t num_instances, - uint32_t local_samples_size, uint32_t local_comm_buff_size, - const float* __restrict__ lr_ptr, const float scale) { - float lr = __ldg(lr_ptr) / scale; - const uint32_t num_indices = indices->model_indices_offsets[num_instances]; - - // Load offset only when the network_id changes - uint32_t previous_network_id = 0; - uint32_t offset = 0; - - for (uint32_t i = blockIdx.x; i < num_indices; i += gridDim.x) { - uint32_t index = indices->model_indices[i]; - dtype category = indices->samples[index]; - dtype location = category_location[2 * category + 1]; - uint32_t network_id = index / local_samples_size; - if (network_id != previous_network_id) { - offset = indices->model_indices_offsets[network_id]; - previous_network_id = network_id; - } - atomicAdd( - embedding_vectors + location * embedding_vec_size + threadIdx.x, - -lr * TypeConvertFunc::convert( - gradients[embedding_vec_size * (network_id * local_comm_buff_size + i - offset) + - threadIdx.x])); - } -} - -template -__global__ void infrequent_update_model_direct( - const emtype* const* __restrict__ gradients_pointers, float* embedding_vectors, - InfrequentEmbeddingSelectionView* indices, const dtype* __restrict__ category_location, - uint32_t num_instances, uint32_t model_id, uint32_t embedding_vec_size, - uint32_t local_samples_size, const float* __restrict__ lr_ptr, const float scale) { - float lr = __ldg(lr_ptr) / scale; - // Shift pattern - const uint32_t offset = indices->model_indices_offsets[model_id + 1]; - const uint32_t num_model_indices = indices->model_indices_offsets[num_instances]; - - for (uint32_t i = blockIdx.x; i < num_model_indices; i += gridDim.x) { - uint32_t vid = (i + offset) % num_model_indices; - - uint32_t index = indices->model_indices[vid]; - uint32_t network_id = index / local_samples_size; - uint32_t local_index = index % local_samples_size; - dtype category = indices->samples[index]; - uint32_t location = category_location[2 * category + 1]; - - const emtype* gradients = gradients_pointers[network_id]; - - atomicAdd(embedding_vectors + location * embedding_vec_size + threadIdx.x, - -lr * TypeConvertFunc::convert( - gradients[local_index * embedding_vec_size + threadIdx.x])); - } -} - -// template -// __global__ void calculate_network_indices_mask(const dtype* __restrict__ local_samples, -// const dtype* __restrict__ category_location, -// bool* mask, uint32_t local_samples_size, -// uint32_t num_instances) { -// for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < local_samples_size; -// i += gridDim.x * blockDim.x) { -// dtype category = local_samples[i]; -// uint32_t model_id = static_cast(category_location[2 * category]); -// for (uint32_t section_id = 0; section_id < num_instances; section_id++) { -// mask[local_samples_size * section_id + i] = (model_id == section_id); -// } -// } -// } - -template -static __global__ void offsets_to_sizes(size_t* sizes, LambdaPtr get_offsets_ptr, - size_t element_size, uint32_t num_instances) { - uint32_t* offsets = 
get_offsets_ptr(); - for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < num_instances; - t += gridDim.x * blockDim.x) { - sizes[t] = (offsets[t + 1] - offsets[t]) * element_size; - } -} - -} // namespace infrequent_embedding_kernels - -template -InfrequentEmbeddingBase::InfrequentEmbeddingBase() {} - -template -InfrequentEmbeddingBase::~InfrequentEmbeddingBase() {} - -template -InfrequentEmbeddingBase::InfrequentEmbeddingBase(const InfrequentEmbeddingBase& other) { - HCTR_LIB_THROW(cudaMalloc(&indices_view_, sizeof(*indices_view_))); - - HCTR_LIB_THROW(cudaMemcpy(indices_view_, other.indices_view_, sizeof(*indices_view_), - cudaMemcpyDeviceToDevice)); -} - -template -void InfrequentEmbeddingBase::set_current_indices( - InfrequentEmbeddingSelection* indices) { - indices_ = indices; - data_ = indices->get_data(); - indices_view_ = indices->get_device_view(); -} - -template -InfrequentEmbedding_NVLink_SingleNode::InfrequentEmbedding_NVLink_SingleNode( - Model& model, GPUResource& gpu_resource, size_t embedding_vec_size) - : model_(model), gpu_resource_(gpu_resource), embedding_vec_size_(embedding_vec_size) { - auto buf = GeneralBuffer2::create(); - buf->reserve({ceildiv(model.num_categories, model.num_instances), embedding_vec_size_}, - &infrequent_embedding_vectors_); - buf->reserve({model.num_instances, 1}, &interaction_layer_input_pointers_train_); - buf->reserve({model.num_instances, 1}, &interaction_layer_input_pointers_eval_); - buf->reserve({model.num_instances, 1}, &gradients_pointers_); - buf->allocate(); -} - -template -void InfrequentEmbedding_NVLink_SingleNode::init_pointers( - int local_gpu_count, const cudaStream_t stream, - std::vector& interaction_layer_input_pointers_train, - std::vector& interaction_layer_input_pointers_eval, - std::vector& gradients_pointers) { - HCTR_LIB_THROW(cudaMemcpyAsync(interaction_layer_input_pointers_train_.get_ptr(), - interaction_layer_input_pointers_train.data(), - local_gpu_count * sizeof(emtype*), cudaMemcpyHostToDevice, - stream)); - HCTR_LIB_THROW(cudaMemcpyAsync(interaction_layer_input_pointers_eval_.get_ptr(), - interaction_layer_input_pointers_eval.data(), - local_gpu_count * sizeof(emtype*), cudaMemcpyHostToDevice, - stream)); - HCTR_LIB_THROW(cudaMemcpyAsync(gradients_pointers_.get_ptr(), gradients_pointers.data(), - local_gpu_count * sizeof(emtype*), cudaMemcpyHostToDevice, - stream)); -} - -/** Forward network for single GPU (no communications) */ -template -void InfrequentEmbedding_NVLink_SingleNode::forward_network_direct( - bool is_train, cudaStream_t stream) { - const uint32_t num_instances = model_.num_instances; - const uint32_t model_id = model_.global_instance_id; - uint32_t local_samples_size = - ceildiv(data_->batch_size, num_instances) * data_->table_sizes.size(); - - auto interaction_layer_input_pointers = is_train - ? 
interaction_layer_input_pointers_train_.get_ptr() - : interaction_layer_input_pointers_eval_.get_ptr(); - auto indices = this->indices_view_; - auto category_location = model_.category_location.get_ptr(); - auto model_table = infrequent_embedding_vectors_.get_ptr(); - auto embedding_vec_size = embedding_vec_size_; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->model_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - const uint32_t offset = indices->model_indices_offsets[model_id + 1]; - const uint32_t num_model_indices = indices->model_indices_offsets[num_instances]; - const uint32_t vid = (i + offset) % num_model_indices; - const uint32_t index = indices->model_indices[vid]; - - const dtype category = indices->samples[index]; - const dtype location = category_location[2 * category + 1]; - - const uint32_t network_id = index / local_samples_size; - const uint32_t local_index = index % local_samples_size; - - emtype* interaction_layer_input = interaction_layer_input_pointers[network_id]; - - return {model_table + location * embedding_vec_size, - {interaction_layer_input + local_index * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, local_samples_size / 10); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_NVLink_SingleNode::update_model_direct( - float* dev_lr, float scale, cudaStream_t stream) { - const uint32_t& num_instances = model_.num_instances; - uint32_t local_samples_size = - ceildiv(data_->batch_size, num_instances) * data_->table_sizes.size(); - - int num_sm = gpu_resource_.get_sm_count(); - int n_blocks = 8 * num_sm; // TODO: better heuristics - - /* Each model reads from the gradients of each network */ - infrequent_embedding_kernels:: - infrequent_update_model_direct<<>>( - gradients_pointers_.get_ptr(), infrequent_embedding_vectors_.get_ptr(), - this->indices_view_, model_.category_location.get_ptr(), model_.num_instances, - model_.global_instance_id, embedding_vec_size_, local_samples_size, dev_lr, scale); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -InfrequentEmbedding_IB_NVLINK::InfrequentEmbedding_IB_NVLINK( - Model& model, GPUResource& gpu_resource, size_t embedding_vec_size) - : model_(model), gpu_resource_(gpu_resource), embedding_vec_size_(embedding_vec_size) { - auto buf = GeneralBuffer2::create(); - - buf->reserve({ceildiv(model.num_categories, model.num_instances), embedding_vec_size_}, - &infrequent_embedding_vectors_); - buf->allocate(); - - auto managed_buf = GeneralBuffer2::create(); - managed_buf->reserve({model.num_instances + 1, 1}, &model_indices_offsets_); - managed_buf->reserve({model.num_instances + 1, 1}, &network_indices_offsets_); - managed_buf->allocate(); - // int current_device; - // HCTR_LIB_THROW(cudaGetDevice(¤t_device)); - // HCTR_LIB_THROW(cudaMemAdvise(managed_buf->get_ptr(), managed_buf->get_size_in_bytes(), - // cudaMemAdviseSetReadMostly, current_device)); -} - -template -void InfrequentEmbedding_IB_NVLINK::init_comms(size_t embedding_vec_size, - const GPUResource* gpu_resource, - GeneralBuffer2* i_buf, - size_t max_buf_size) { - infrequent_forward_comm_buffers_ = std::make_unique>(i_buf, max_buf_size); - infrequent_backward_comm_buffers_ = - std::make_unique>(i_buf, max_buf_size); - infrequent_forward_comms_ = std::make_unique>( - infrequent_forward_comm_buffers_->send_buffer, infrequent_forward_comm_buffers_->recv_buffer, - get_model_indices_offsets_ptr(), 
get_network_indices_offsets_ptr(), gpu_resource, - embedding_vec_size); - infrequent_backward_comms_ = std::make_unique>( - infrequent_backward_comm_buffers_->send_buffer, - infrequent_backward_comm_buffers_->recv_buffer, get_network_indices_offsets_ptr(), - get_model_indices_offsets_ptr(), gpu_resource, embedding_vec_size); -} - -template -void InfrequentEmbedding_IB_NVLINK::forward_model(emtype* message_buffer, - cudaStream_t stream) { - HCTR_LIB_THROW(cudaMemcpyAsync( - model_indices_offsets_.get_ptr(), this->indices_->model_indices_offsets_.get_ptr(), - model_indices_offsets_.get_size_in_bytes(), cudaMemcpyDeviceToDevice, stream)); - - HCTR_LIB_THROW(cudaMemcpyAsync( - network_indices_offsets_.get_ptr(), this->indices_->network_indices_offsets_.get_ptr(), - network_indices_offsets_.get_size_in_bytes(), cudaMemcpyDeviceToDevice, stream)); - - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - auto indices = this->indices_view_; - auto category_location = model_.category_location.get_ptr(); - auto infrequent_embedding_vectors = infrequent_embedding_vectors_.get_ptr(); - auto embedding_vec_size = embedding_vec_size_; - auto num_instances = model_.num_instances; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->model_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t index = indices->model_indices[i]; - dtype category = indices->samples[index]; - dtype location = category_location[2 * category + 1]; - - return {infrequent_embedding_vectors + location * embedding_vec_size, - {message_buffer + i * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLINK::forward_network(const emtype* message_buffer, - emtype* output_ptr, - cudaStream_t stream) { - auto indices = this->indices_view_; - auto embedding_vec_size = embedding_vec_size_; - auto num_instances = model_.num_instances; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->network_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t index = indices->network_indices[i]; - return {message_buffer + i * embedding_vec_size, - {output_ptr + index * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLINK::update_network(const emtype* gradients, - emtype* message_buffer, - cudaStream_t stream) { - auto indices = this->indices_view_; - auto embedding_vec_size = embedding_vec_size_; - auto num_instances = model_.num_instances; - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->network_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t index = indices->network_indices[i]; - - return {gradients + index * embedding_vec_size, - {message_buffer + i * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLINK::update_model(const emtype* message_buffer, - float* dev_lr, float scale, - cudaStream_t stream) { - auto indices 
= this->indices_view_; - const dtype* __restrict__ category_location = model_.category_location.get_ptr(); - auto num_instances = model_.num_instances; - - uint32_t n_blocks = gpu_resource_.get_sm_count(); - - sgd_atomic_update( - message_buffer, infrequent_embedding_vectors_.get_ptr(), - [indices, num_instances] __device__() { - return indices->model_indices_offsets[num_instances]; - }, - [indices, category_location] __device__(uint32_t i) { - uint32_t index = indices->model_indices[i]; - dtype category = indices->samples[index]; - return category_location[2 * category + 1]; - }, - n_blocks, embedding_vec_size_, dev_lr, scale, stream); -} - -template -InfrequentEmbedding_IB_NVLink_Hier::InfrequentEmbedding_IB_NVLink_Hier( - Model& model, GPUResource& gpu_resource, size_t embedding_vec_size) - : model_(model), gpu_resource_(gpu_resource), embedding_vec_size_(embedding_vec_size) { - auto buf = GeneralBuffer2::create(); - buf->reserve({ceildiv(model.num_categories, model.num_instances), embedding_vec_size_}, - &infrequent_embedding_vectors_); - buf->reserve({model_.num_instances}, &model_indices_sizes_); - buf->reserve({model_.num_instances}, &model_indices_sizes_ptrs_); - buf->reserve({model_.num_instances}, &network_indices_sizes_); - buf->reserve({model_.num_instances}, &network_indices_sizes_ptrs_); - buf->allocate(); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::init_comms( - int64_t max_num_infrequent_samples, size_t slot_num, size_t embedding_vec_size, - GeneralBuffer2* buf_ptr, size_t batch_size_true, size_t batch_size_false, - size_t local_gpu_count) { - double p_infrequent_samples = 1.0; - if (max_num_infrequent_samples >= 0) { - p_infrequent_samples = - (double)max_num_infrequent_samples / ((double)batch_size_true * slot_num); - } - auto align = [this](size_t val) { - auto alignment = model_.num_instances; - return ((val + alignment - 1) / alignment) * alignment; - }; - - max_num_infrequent_per_batch_ = - align(std::max(batch_size_true, batch_size_false) * slot_num * p_infrequent_samples); - - max_num_infrequent_per_train_batch_ = align(batch_size_true * slot_num * p_infrequent_samples); - - size_t max_buf_size = embedding_vec_size * max_num_infrequent_per_batch_; - size_t max_back_buf_size = embedding_vec_size * max_num_infrequent_per_train_batch_; - - HCTR_LOG_S(INFO, ROOT) << "Allocating A2A buffers for infrequent categories. 
For training : " - << max_num_infrequent_per_train_batch_ - << ", for evaluation: " << max_num_infrequent_per_batch_ << std::endl; - - infrequent_backward_comm_buffers_ = - std::make_unique>(buf_ptr, max_back_buf_size); - infrequent_forward_comm_buffers_ = - std::make_unique>(buf_ptr, max_buf_size); - // TODO: need to check the correctness - buf_ptr->reserve({local_gpu_count}, &infrequent_forward_comm_buffers_->send_buffer_ptrs); - buf_ptr->reserve({local_gpu_count}, &infrequent_backward_comm_buffers_->send_buffer_ptrs); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::fused_intra_forward_model( - emtype** message_buffer, cudaStream_t stream) { - auto indices = this->indices_view_; - auto category_location = model_.category_location.get_ptr(); - auto infrequent_embedding_vectors = infrequent_embedding_vectors_.get_ptr(); - size_t embedding_vec_size = embedding_vec_size_; - auto local_instance_id = model_.instance_id; - auto num_instances = model_.num_instances; - auto per_node_instances = num_instances / model_.h_num_instances_per_node.size(); - uint32_t local_samples_size = - ceildiv(data_->batch_size, num_instances) * data_->table_sizes.size(); - - uint32_t local_comm_buff_size = - ceildiv(max_num_infrequent_per_batch_, model_.num_instances); - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->model_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t num_selected = indices->model_indices_offsets[num_instances]; - uint32_t vid = - (i + indices->model_indices_offsets[(local_instance_id + 1) % per_node_instances]) % - num_selected; - uint32_t index = indices->model_indices[vid]; - uint32_t network_id = (index / local_samples_size); - dtype category = indices->samples[index]; - dtype location = category_location[2 * category + 1]; - uint32_t local_network_id = (network_id % per_node_instances); - emtype* output_ptr = - &message_buffer[local_network_id][(network_id - local_network_id + local_instance_id) * - local_comm_buff_size * embedding_vec_size]; - - return { - infrequent_embedding_vectors + location * embedding_vec_size, - {output_ptr + (vid - indices->model_indices_offsets[network_id]) * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::hier_forward_network( - const emtype* message_buffer, emtype* output_ptr, cudaStream_t stream) { - auto indices = this->indices_view_; - auto embedding_vec_size = embedding_vec_size_; - auto num_instances = model_.num_instances; - uint32_t local_samples_size = - ceildiv(data_->batch_size, model_.num_instances) * data_->table_sizes.size(); - uint32_t local_comm_buff_size = - ceildiv(max_num_infrequent_per_batch_, model_.num_instances); - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->network_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t index = indices->network_indices[i]; - uint32_t model_id = indices->network_indices_src_model_id[i]; - uint32_t offset = indices->network_indices_offsets[model_id]; - - return { - message_buffer + (model_id * local_comm_buff_size + i - offset) * embedding_vec_size, - {output_ptr + index * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / 
model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::fused_intra_update_network( - const emtype* gradients, emtype** message_buffer, cudaStream_t stream) { - auto indices = this->indices_view_; - size_t embedding_vec_size = embedding_vec_size_; - auto local_instance_id = model_.instance_id; - auto num_instances = model_.num_instances; - auto per_node_instances = num_instances / model_.h_num_instances_per_node.size(); - uint32_t local_comm_buff_size = - ceildiv(max_num_infrequent_per_train_batch_, model_.num_instances); - - auto copy_desc = CopyDescriptors::make_OneToOne( - embedding_vec_size, - [=] __device__() { return indices->network_indices_offsets[num_instances]; }, - [=] __device__(size_t i) -> CopyDescriptors::CopyDetails { - uint32_t num_selected = indices->network_indices_offsets[num_instances]; - uint32_t vid = - (i + indices->network_indices_offsets[(local_instance_id + 1) % per_node_instances]) % - num_selected; - uint32_t index = indices->network_indices[vid]; - - uint32_t model_id = indices->network_indices_src_model_id[vid]; - - uint32_t local_model_id = (model_id % per_node_instances); - emtype* output_ptr = - &message_buffer[local_model_id][(model_id - local_model_id + local_instance_id) * - local_comm_buff_size * embedding_vec_size]; - - return { - gradients + index * embedding_vec_size, - {output_ptr + (vid - indices->network_indices_offsets[model_id]) * embedding_vec_size}, - {true}}; - }); - - shuffle(copy_desc, stream, data_->samples.get_num_elements() / model_.num_instances / 8); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::hier_update_model( - const emtype* message_buffer, float* dev_lr, float scale, cudaStream_t stream) { - const uint32_t& num_instances = model_.num_instances; - uint32_t local_samples_size = - ceildiv(data_->batch_size, num_instances) * data_->table_sizes.size(); - uint32_t local_comm_buff_size = - ceildiv(max_num_infrequent_per_train_batch_, model_.num_instances); - - int num_sm = gpu_resource_.get_sm_count(); - int n_blocks = 16 * num_sm; // TODO: better heuristics - - infrequent_embedding_kernels::hier_update_model<<>>( - this->indices_view_, model_.category_location.get_ptr(), - // infrequent_backward_comm_buffers_.back().recv_buffer.get_ptr(), - message_buffer, infrequent_embedding_vectors_.get_ptr(), embedding_vec_size_, - model_.num_instances, local_samples_size, local_comm_buff_size, dev_lr, scale); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::calculate_model_indices_sizes_from_offsets( - cudaStream_t stream) { - auto indices = this->indices_view_; - constexpr size_t TPB = 256; - const size_t n_blocks = ceildiv(model_.num_instances, TPB); - infrequent_embedding_kernels::offsets_to_sizes<<>>( - model_indices_sizes_.get_ptr(), [=] __device__() { return indices->model_indices_offsets; }, - embedding_vec_size_ * sizeof(emtype), model_.num_instances); -} - -template -void InfrequentEmbedding_IB_NVLink_Hier< - dtype, emtype>::calculate_network_indices_sizes_from_offsets(cudaStream_t stream) { - auto indices = this->indices_view_; - constexpr size_t TPB = 256; - const size_t n_blocks = ceildiv(model_.num_instances, TPB); - infrequent_embedding_kernels::offsets_to_sizes<<>>( - network_indices_sizes_.get_ptr(), - [=] __device__() { return indices->network_indices_offsets; }, - embedding_vec_size_ * sizeof(emtype), model_.num_instances); -} - -template -void 
InfrequentEmbedding_NVLink_SingleNode::initialize_embedding_vectors( - const std::vector& table_sizes) { - CudaDeviceContext context(gpu_resource_.get_device_id()); - - const size_t num_tables = table_sizes.size(); - for (size_t i = 0; i < num_tables; i++) { - float up_bound = sqrt(1.f / table_sizes[i]); - - const size_t offset = embedding_vec_size_ * model_.h_infrequent_model_table_offsets[i]; - const size_t number_of_vectors = - model_.h_infrequent_model_table_offsets[i + 1] - model_.h_infrequent_model_table_offsets[i]; - UniformGenerator::fill( - infrequent_embedding_vectors_.get_ptr() + offset, embedding_vec_size_ * number_of_vectors, - -up_bound, up_bound, gpu_resource_.get_sm_count(), - gpu_resource_.get_replica_variant_curand_generator(), gpu_resource_.get_stream()); - } -} - -template -void InfrequentEmbedding_IB_NVLINK::initialize_embedding_vectors( - const std::vector& table_sizes) { - CudaDeviceContext context(gpu_resource_.get_device_id()); - - const size_t num_tables = table_sizes.size(); - for (size_t i = 0; i < num_tables; i++) { - float up_bound = sqrt(1.f / table_sizes[i]); - - const size_t offset = embedding_vec_size_ * model_.h_infrequent_model_table_offsets[i]; - const size_t number_of_vectors = - model_.h_infrequent_model_table_offsets[i + 1] - model_.h_infrequent_model_table_offsets[i]; - UniformGenerator::fill( - infrequent_embedding_vectors_.get_ptr() + offset, embedding_vec_size_ * number_of_vectors, - -up_bound, up_bound, gpu_resource_.get_sm_count(), - gpu_resource_.get_replica_variant_curand_generator(), gpu_resource_.get_stream()); - } -} - -template -void InfrequentEmbedding_IB_NVLink_Hier::initialize_embedding_vectors( - const std::vector& table_sizes) { - CudaDeviceContext context(gpu_resource_.get_device_id()); - - const size_t num_tables = table_sizes.size(); - for (size_t i = 0; i < num_tables; i++) { - float up_bound = sqrt(1.f / table_sizes[i]); - - const size_t offset = embedding_vec_size_ * model_.h_infrequent_model_table_offsets[i]; - const size_t number_of_vectors = - model_.h_infrequent_model_table_offsets[i + 1] - model_.h_infrequent_model_table_offsets[i]; - UniformGenerator::fill( - infrequent_embedding_vectors_.get_ptr() + offset, embedding_vec_size_ * number_of_vectors, - -up_bound, up_bound, gpu_resource_.get_sm_count(), - gpu_resource_.get_replica_variant_curand_generator(), gpu_resource_.get_stream()); - } -} - -template class InfrequentEmbeddingBase; -template class InfrequentEmbeddingBase; - -// NVLink_SingleNode -template class InfrequentEmbedding_NVLink_SingleNode; -template class InfrequentEmbedding_NVLink_SingleNode; -template class InfrequentEmbedding_NVLink_SingleNode; -template class InfrequentEmbedding_NVLink_SingleNode; - -// IB_NVLINK -template class InfrequentEmbedding_IB_NVLINK; -template class InfrequentEmbedding_IB_NVLINK; -template class InfrequentEmbedding_IB_NVLINK; -template class InfrequentEmbedding_IB_NVLINK; - -// IB_NVLink_Hier -template class InfrequentEmbedding_IB_NVLink_Hier; -template class InfrequentEmbedding_IB_NVLink_Hier; -template class InfrequentEmbedding_IB_NVLink_Hier; -template class InfrequentEmbedding_IB_NVLink_Hier; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/model.cu b/HugeCTR/src/embeddings/hybrid_embedding/model.cu deleted file mode 100644 index 2564a8aff6..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/model.cu +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace HugeCTR {
-
-namespace hybrid_embedding {
-
-template <typename dtype>
-Model<dtype>::Model(const Model<dtype> &model) {
-  node_id = model.node_id;
-  instance_id = model.instance_id;
-  global_instance_id = model.global_instance_id;
-  communication_type = model.communication_type;
-  d_num_frequent = model.d_num_frequent;
-  d_total_frequent_count = model.d_total_frequent_count;
-  num_frequent = model.num_frequent;
-  num_categories = model.num_categories;
-  num_instances = model.num_instances;
-  if (model.h_num_instances_per_node.size() > 0) {
-    h_num_instances_per_node.resize(model.h_num_instances_per_node.size());
-    for (size_t i = 0; i < model.h_num_instances_per_node.size(); ++i) {
-      h_num_instances_per_node[i] = model.h_num_instances_per_node[i];
-    }
-  }
-  num_instances_per_node = model.num_instances_per_node;
-  category_location = model.category_location;
-  frequent_categories = model.frequent_categories;
-  if (model.h_frequent_model_table_offsets.size() > 0) {
-    h_frequent_model_table_offsets = model.h_frequent_model_table_offsets;
-  }
-  if (model.h_infrequent_model_table_offsets.size() > 0) {
-    h_infrequent_model_table_offsets = model.h_infrequent_model_table_offsets;
-  }
-}
-
-template <typename dtype>
-void Model<dtype>::init_params_and_reserve(CommunicationType communication_type_in,
-                                           uint32_t global_instance_id_in,
-                                           const std::vector<uint32_t> &num_instances_per_node_in,
-                                           size_t num_categories_in,
-                                           std::shared_ptr<GeneralBuffer2<CudaAllocator>> buf) {
-  // initialize model parameters and reserve memory
-  communication_type = communication_type_in;
-  global_instance_id = global_instance_id_in;
-  h_num_instances_per_node = num_instances_per_node_in;
-  num_categories = num_categories_in;
-  num_instances = 0;
-  for (size_t i = 0; i < h_num_instances_per_node.size(); ++i)
-    num_instances += h_num_instances_per_node[i];
-
-  const size_t num_nodes = h_num_instances_per_node.size();
-  assert(num_nodes > 0);
-  uint32_t sum_instances = (uint32_t)0;
-  for (node_id = 0; node_id < num_nodes && global_instance_id >= sum_instances; ++node_id)
-    sum_instances += h_num_instances_per_node[node_id];
-  node_id--;
-
-  // instance id within node
-  instance_id = global_instance_id - (sum_instances - h_num_instances_per_node[node_id]);
-  buf->reserve({1, 1}, &d_num_frequent);
-  buf->reserve({1, 1}, &d_total_frequent_count);
-  buf->reserve({h_num_instances_per_node.size(), 1}, &num_instances_per_node);
-  size_t cate_len = (static_cast<size_t>(num_categories) + 1) << 1;
-  buf->reserve({cate_len, 1}, &category_location);  // +1 for NULL category
-}
-
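In the removed model, init_hybrid_model (below) splits the embedding categories into two groups: the most frequently occurring categories are replicated on every GPU, while the remaining infrequent categories are distributed round-robin across the model instances, and category_location records where each category lives. A minimal host-side sketch of that placement, assuming the unique category ids are already sorted by decreasing count (the names and the uint32_t types here are illustrative, not HugeCTR APIs; the per-instance/per-slot reordering of the frequent section performed by the statistics kernels is omitted):

#include <cstdint>
#include <vector>

// Sketch: build a category -> location table analogous to category_location below.
// Frequent category:   {num_instances, index into the replicated frequent section}
// Infrequent category: {owning instance (round-robin), local row on that instance}
std::vector<uint32_t> make_category_location(uint32_t num_categories,
                                             const std::vector<uint32_t>& categories_by_count,
                                             uint32_t num_frequent, uint32_t num_instances) {
  // Default value num_categories marks "no location assigned" (as in the fill kernel),
  // with one extra slot pair reserved for the NULL category.
  std::vector<uint32_t> category_location(2 * (num_categories + 1), num_categories);
  for (uint32_t i = 0; i < categories_by_count.size(); ++i) {
    const uint32_t category = categories_by_count[i];
    if (i < num_frequent) {
      category_location[2 * category] = num_instances;  // marker: replicated on all instances
      category_location[2 * category + 1] = i;          // position in the frequent section
    } else {
      const uint32_t j = i - num_frequent;                       // rank among infrequent categories
      category_location[2 * category] = j % num_instances;       // owner model instance
      category_location[2 * category + 1] = j / num_instances;   // row in that instance's table
    }
  }
  return category_location;
}

With such a table, one lookup tells whether a category is served from the locally replicated frequent section or must be fetched from the instance that owns it, which is what the frequent/infrequent forward and backward paths above rely on.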
-/// init_model calculates the optimal number of frequent categories
-/// given the calibration of the all-to-all and all-reduce.
-template <typename dtype>
-void Model<dtype>::init_hybrid_model(const CalibrationData &calibration,
-                                     Statistics<dtype> &statistics, const Data<dtype> &data,
-                                     Tensor2<dtype> &tmp_categories, cudaStream_t stream) {
-  dtype *frequent_categories_ptr = tmp_categories.get_ptr();  // tmp_categories.get_ptr();
-  // list the top categories sorted by count
-  const Tensor2<dtype> &samples = data.samples;
-  statistics.sort_categories_by_count(samples, stream);
-
-  /* Calculate table offsets, i.e cumulative sum of the table sizes */
-  std::vector<dtype> h_table_offsets(data.table_sizes.size() + 1);
-  h_table_offsets[0] = 0;
-  for (size_t i = 0; i < data.table_sizes.size(); i++) {
-    h_table_offsets[i + 1] = h_table_offsets[i] + (dtype)data.table_sizes[i];
-  }
-  upload_tensor(h_table_offsets, statistics.table_offsets, stream);
-
-  // from the sorted count, determine the number of frequent categories
-  //
-  // If the calibration data is present, this is used to calculate the number
-  // of frequent categories. Otherwise use the threshold required by the
-  // communication type.
-  num_frequent = ModelInitializationFunctors::calculate_num_frequent_categories(
-      communication_type, num_instances, calibration, statistics, data, d_num_frequent.get_ptr(),
-      stream);
-  std::shared_ptr<GeneralBuffer2<CudaAllocator>> buf = GeneralBuffer2<CudaAllocator>::create();
-  buf->reserve({(size_t)num_frequent, 1}, &this->frequent_categories);
-  buf->allocate();
-  frequent_probability = ModelInitializationFunctors::calculate_frequent_probability(
-      statistics, num_frequent, d_total_frequent_count.get_ptr(), stream);
-
-  dtype num_infrequent = num_categories - num_frequent;
-  dtype *infrequent_categories_ptr = frequent_categories_ptr + num_frequent;
-  /* The categories are organized:
-   *  - per instance (round-robin)
-   *  - then per slot
-   *  - and finally in decreasing order of frequency
-   */
-  statistics.calculate_frequent_and_infrequent_categories(
-      frequent_categories_ptr, infrequent_categories_ptr, category_location.get_ptr(),
-      num_frequent, num_infrequent, stream);
-  HCTR_LIB_THROW(cudaMemcpyAsync(this->frequent_categories.get_ptr(), frequent_categories_ptr,
-                                 num_frequent * sizeof(dtype), cudaMemcpyDeviceToDevice, stream));
-  /* Calculate frequent and infrequent table offsets */
-  statistics.calculate_frequent_model_table_offsets(h_frequent_model_table_offsets,
-                                                    frequent_categories_ptr, num_frequent, stream);
-  statistics.calculate_infrequent_model_table_offsets(h_infrequent_model_table_offsets,
-                                                      infrequent_categories_ptr, category_location,
-                                                      global_instance_id, num_infrequent, stream);
-  // statistics.revoke_temp_storage();
-  /* A synchronization is necessary to ensure that the host arrays have been copied */
-  HCTR_LIB_THROW(cudaStreamSynchronize(stream));
-}
-
-template class Model;
-template class Model;
-
-}  // namespace hybrid_embedding
-
-}  // namespace HugeCTR
diff --git a/HugeCTR/src/embeddings/hybrid_embedding/statistics.cu b/HugeCTR/src/embeddings/hybrid_embedding/statistics.cu
deleted file mode 100644
index 4116436f43..0000000000
--- a/HugeCTR/src/embeddings/hybrid_embedding/statistics.cu
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { -namespace hybrid_embedding { - -namespace statistics_kernels { - -/** Compute keys to sort the frequent embedding tables. - * The categories are organized: - * - per instance (round-robin) - * - then per slot - * - and finally in decreasing order of frequency - * - * The sort is stable, so the keys only need to be: instance_id * num_tables + table_id - */ -template -static __global__ void category_to_frequent_section(const dtype *__restrict__ categories_sorted, - uint32_t *keys, - const dtype *__restrict__ table_offsets, - size_t num_frequent, size_t num_tables, - size_t num_instances) { - size_t tid = static_cast(blockIdx.x) * static_cast(blockDim.x) + threadIdx.x; - if (tid < num_frequent) { - dtype category = categories_sorted[tid]; - - uint32_t table_id = 0; - for (table_id = 0; table_id < num_tables - 1 && category >= table_offsets[table_id + 1]; - ++table_id) { - } - - uint32_t instance_id = tid % num_instances; - - keys[tid] = instance_id * num_tables + table_id; - } -} - -template -static __global__ void fill(T *__restrict__ array, T val, IdxT n_elem) { - IdxT tid = static_cast(blockIdx.x) * static_cast(blockDim.x) + - static_cast(threadIdx.x); - if (tid < n_elem) array[tid] = val; -} - -template -static __global__ void calculate_category_location_frequent( - const dtype *__restrict__ frequent_categories, dtype *category_location, size_t num_frequent, - size_t num_instances) { - size_t tid = static_cast(blockIdx.x) * static_cast(blockDim.x) + threadIdx.x; - if (tid < num_frequent) { - dtype category = frequent_categories[tid]; - category_location[2 * (size_t)category] = num_instances; - category_location[2 * (size_t)category + 1] = tid; - } -} - -template -static __global__ void calculate_category_location_infrequent( - const dtype *__restrict__ infrequent_categories, dtype *category_location, - size_t num_infrequent, size_t num_models) { - size_t tid = static_cast(blockIdx.x) * static_cast(blockDim.x) + threadIdx.x; - if (tid < num_infrequent) { - dtype category = infrequent_categories[tid]; - category_location[2 * (size_t)category] = tid % num_models; - category_location[2 * (size_t)category + 1] = tid / num_models; - } -} - -template -static __global__ void calculate_infrequent_model_table_offsets( - const dtype *__restrict__ categories, const dtype *__restrict__ category_location, - const dtype *__restrict__ table_offsets, dtype *offsets, size_t n_tables, dtype n_elem, - dtype n_model_elem, uint32_t global_instance_id) { - const size_t table_id = threadIdx.x; - if (table_id > n_tables) { - return; - } - // Find first category id belonging to that table (not necessarily in this model!) 
- dtype category = table_offsets[table_id]; - - // Step 1: binary search of the category - dtype start = 0; - dtype end = n_elem; - while (start < end) { - dtype mid = start + (end - start) / 2; - dtype value = categories[mid]; - if (value < category) - start = mid + 1; - else { - end = mid; - } - } - - // Step 2: increment until the model id matches - while (start < n_elem && category_location[2 * (size_t)categories[start]] != global_instance_id) { - start++; - } - - // Step 3: lookup location and write the offset - if (start == n_elem) { - // If we are at the end of the array, write the number of elements belonging to this model - offsets[table_id] = n_model_elem; - } else { - // Else, write the location of the first category from this table belonging to this model - offsets[table_id] = category_location[2 * (size_t)categories[start] + 1]; - } -} - -template -static __global__ void calculate_frequent_model_table_offsets( - const dtype *__restrict__ categories, const dtype *__restrict__ table_offsets, dtype *offsets, - size_t n_divs, size_t n_tables, dtype n_elem) { - const size_t div_id = blockIdx.x; - const size_t table_id = threadIdx.x; - - const dtype n_elem_per_div = n_elem / n_divs; // Note: num_instances divides num_frequent - - // Find first category id belonging to that table - dtype category = table_offsets[table_id]; - - // Setup start and end to the bounds of this division - dtype start = div_id * n_elem_per_div; - dtype end = (div_id + 1) * n_elem_per_div; - - // Binary search - while (start < end) { - dtype mid = (start + end) / 2; - dtype value = categories[mid]; - - if (value < category) - start = mid + 1; - else - end = mid; - } - - // Write offset - offsets[div_id * (n_tables + 1) + table_id] = start; -} - -} // namespace statistics_kernels - -/// -/// Perform count of categories within the samples and sort the categories by count -/// -template -void Statistics::sort_categories_by_count(const Tensor2 &samples, - cudaStream_t stream) { - const dtype *d_samples = samples.get_ptr(); - size_t num_samples = samples.get_size_in_bytes() / sizeof(dtype); - dtype *d_categories = categories_sorted.get_ptr(); - uint32_t *d_counts = counts_sorted.get_ptr(); - sort_categories_by_count(d_samples, num_samples, d_categories, d_counts, num_unique_categories, - stream); // Kefengs' function - categories_sorted.reset_shape({num_unique_categories, 1}); - counts_sorted.reset_shape({num_unique_categories, 1}); -} - -template -struct InfrequentSelectOp { - const dtype *category_location; - const dtype num_categories; - __host__ __device__ __forceinline__ InfrequentSelectOp(const dtype *category_location, - const dtype num_categories) - : category_location(category_location), num_categories(num_categories) {} - __device__ __forceinline__ bool operator()(const dtype &category) const { - return category_location[2 * (size_t)category + 1] == num_categories; - } -}; - -template -void Statistics::reserve_temp_storage(std::shared_ptr> buf) { - size_t size_sort_keys_temp = 0; - sort_categories_by_count_temp_storages_.resize(7); - HCTR_LIB_THROW(cub::DeviceRadixSort::SortKeys((void *)nullptr, size_sort_keys_temp, - (dtype *)nullptr, (dtype *)nullptr, - (int)num_samples, 0, sizeof(dtype) * 8, 0)); - buf->reserve({size_sort_keys_temp, 1}, &sort_categories_by_count_temp_storages_[0]); - buf->reserve({num_samples * sizeof(dtype), 1}, &sort_categories_by_count_temp_storages_[1]); - size_t size_unique_categories_temp = 0; - HCTR_LIB_THROW(cub::DeviceRunLengthEncode::Encode( - (void *)nullptr, 
size_unique_categories_temp, (dtype *)nullptr, (dtype *)nullptr, - (uint32_t *)nullptr, (uint32_t *)nullptr, (int)num_samples, 0)); - - buf->reserve({size_unique_categories_temp, 1}, &sort_categories_by_count_temp_storages_[2]); - buf->reserve({num_samples * sizeof(dtype), 1}, &sort_categories_by_count_temp_storages_[3]); - buf->reserve({num_samples * sizeof(uint32_t), 1}, &sort_categories_by_count_temp_storages_[4]); - buf->reserve({sizeof(uint32_t), 1}, &sort_categories_by_count_temp_storages_[5]); - - size_t size_sort_pairs_temp = 0; - HCTR_LIB_THROW(cub::DeviceRadixSort::SortPairsDescending( - (void *)nullptr, size_sort_pairs_temp, (uint32_t *)nullptr, (uint32_t *)nullptr, - (dtype *)nullptr, (dtype *)nullptr, (int)num_samples, 0, sizeof(uint32_t) * 8, 0)); - buf->reserve({size_sort_pairs_temp, 1}, &sort_categories_by_count_temp_storages_[6]); - - /// TODO: reuse temp storage for operations that can't run concurrently! - - calculate_frequent_categories_temp_storages_.resize(3); - size_t size_sort_temp = 0; - int bit_width = 1; - for (uint32_t i = num_instances * num_tables - 1; i >>= 1;) bit_width++; - HCTR_LIB_THROW(cub::DeviceRadixSort::SortPairs( - (void *)nullptr, size_sort_temp, (uint32_t *)nullptr, (uint32_t *)nullptr, (dtype *)nullptr, - (dtype *)nullptr, (int)num_samples, 0, bit_width, 0)); - - buf->reserve({num_samples * sizeof(uint32_t), 1}, - &calculate_frequent_categories_temp_storages_[0]); - buf->reserve({num_samples * sizeof(uint32_t), 1}, - &calculate_frequent_categories_temp_storages_[1]); - buf->reserve({size_sort_temp, 1}, &calculate_frequent_categories_temp_storages_[2]); - - calculate_infrequent_categories_temp_storages_.resize(2); - size_t size_select_temp = 0; - cub::CountingInputIterator counting(0); - InfrequentSelectOp select_op(nullptr, 0); - if (static_cast(num_categories) < (1ul << 31)) { - HCTR_LIB_THROW(cub::DeviceSelect::If((void *)nullptr, size_select_temp, counting, - (dtype *)nullptr, (dtype *)nullptr, num_categories, - select_op, 0)); - } else { - HugeCTR::DeviceSelect::If((void *)nullptr, size_select_temp, counting, (dtype *)nullptr, - (dtype *)nullptr, static_cast(num_categories), select_op, 0); - } - buf->reserve({size_select_temp, 1}, &calculate_infrequent_categories_temp_storages_[0]); - buf->reserve({sizeof(dtype), 1}, &calculate_infrequent_categories_temp_storages_[1]); -}; - -template -void Statistics::sort_categories_by_count(const dtype *samples, size_t num_samples, - dtype *categories_sorted, uint32_t *counts_sorted, - uint32_t &num_unique_categories, - cudaStream_t stream) { - if (num_samples > 0x7fffffff) { - HCTR_LOG_S(ERROR, WORLD) << "Num samples: " << std::hex << num_samples << std::dec << std::endl; - HCTR_OWN_THROW(Error_t::WrongInput, "num_samples is too large, overflow for int type"); - } - void *p_sort_keys_temp = - reinterpret_cast(sort_categories_by_count_temp_storages_[0].get_ptr()); // void* - dtype *p_sort_keys_out = - reinterpret_cast(sort_categories_by_count_temp_storages_[1].get_ptr()); // dtype* - void *p_unique_categories_temp = - reinterpret_cast(sort_categories_by_count_temp_storages_[2].get_ptr()); // void* - dtype *p_unique_categories_out = - reinterpret_cast(sort_categories_by_count_temp_storages_[3].get_ptr()); // dtype* - uint32_t *p_unique_categories_counts = reinterpret_cast( - sort_categories_by_count_temp_storages_[4].get_ptr()); // uint32_t* - uint32_t *p_num_unique_categories = reinterpret_cast( - sort_categories_by_count_temp_storages_[5].get_ptr()); // uint32* - void *p_sort_pairs_temp = - 
reinterpret_cast(sort_categories_by_count_temp_storages_[6].get_ptr()); // void* - - size_t temp_size = sort_categories_by_count_temp_storages_[0].get_size_in_bytes(); - HCTR_LIB_THROW(cub::DeviceRadixSort::SortKeys(p_sort_keys_temp, temp_size, samples, - p_sort_keys_out, (int)num_samples, 0, - sizeof(dtype) * 8, stream)); - size_t sorted_len = (size_t)num_samples; - temp_size = sort_categories_by_count_temp_storages_[2].get_size_in_bytes(); - HCTR_LIB_THROW(cub::DeviceRunLengthEncode::Encode( - p_unique_categories_temp, temp_size, p_sort_keys_out, p_unique_categories_out, - p_unique_categories_counts, p_num_unique_categories, (int)num_samples, stream)); - HCTR_LIB_THROW(cudaMemcpyAsync((void *)&num_unique_categories, (void *)p_num_unique_categories, - sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - temp_size = sort_categories_by_count_temp_storages_[6].get_size_in_bytes(); - HCTR_LIB_THROW(cub::DeviceRadixSort::SortPairsDescending( - p_sort_pairs_temp, temp_size, p_unique_categories_counts, counts_sorted, - p_unique_categories_out, categories_sorted, (int)num_unique_categories, 0, - sizeof(uint32_t) * 8, stream)); -} - -template -void Statistics::calculate_frequent_and_infrequent_categories( - dtype *frequent_categories, dtype *infrequent_categories, dtype *category_location, - const size_t num_frequent, const size_t num_infrequent, cudaStream_t stream) { - // Fill with default value1 - constexpr size_t TPB_fill = 256; - const size_t total_num_categories = num_categories + 1; // Add NULL category - const size_t n_blocks_fill = ceildiv(2 * total_num_categories, TPB_fill); - statistics_kernels::fill<<>>( - category_location, (dtype)num_categories, 2 * total_num_categories); - HCTR_LIB_THROW(cudaPeekAtLastError()); - // Frequent category generation - if (num_frequent > 0) { - uint32_t *p_keys_in = reinterpret_cast( - calculate_frequent_categories_temp_storages_[0].get_ptr()); // uint32_t* - uint32_t *p_keys_out = reinterpret_cast( - calculate_frequent_categories_temp_storages_[1].get_ptr()); // uint32_t* - void *p_sort_temp = reinterpret_cast( - calculate_frequent_categories_temp_storages_[2].get_ptr()); // void* - size_t sort_temp_size = calculate_frequent_categories_temp_storages_[2].get_size_in_bytes(); - - // Generate keys - constexpr size_t TPB_keys = 256; - const size_t n_blocks_keys = ceildiv(num_frequent, TPB_keys); - statistics_kernels::category_to_frequent_section<<>>( - categories_sorted.get_ptr(), p_keys_in, table_offsets.get_ptr(), num_frequent, num_tables, - num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - // Sort - int bit_width = 1; - for (uint32_t i = num_instances * num_tables - 1; i >>= 1;) bit_width++; - HCTR_LIB_THROW(cub::DeviceRadixSort::SortPairs( - p_sort_temp, sort_temp_size, p_keys_in, p_keys_out, categories_sorted.get_ptr(), - frequent_categories, (int)num_frequent, 0, bit_width, stream)); - constexpr size_t TPB_loc = 256; - const size_t n_blocks_loc_freq = (size_t)ceildiv(num_frequent, TPB_loc); - statistics_kernels:: - calculate_category_location_frequent<<>>( - frequent_categories, category_location, num_frequent, num_instances); - } - // Infrequent category generation - if (num_infrequent > 0) { - // TODO: combine select and writing to category_location with a custom output iterator - void *p_select_temp = reinterpret_cast( - calculate_infrequent_categories_temp_storages_[0].get_ptr()); // void* - dtype *p_num_selected = reinterpret_cast( - 
calculate_infrequent_categories_temp_storages_[1].get_ptr()); // dtype* - size_t select_temp_size = calculate_infrequent_categories_temp_storages_[0].get_size_in_bytes(); - - cub::CountingInputIterator counting(0); - InfrequentSelectOp select_op(category_location, num_categories); - if (static_cast(num_categories) < (1ul << 31)) { - HCTR_LIB_THROW(cub::DeviceSelect::If(p_select_temp, select_temp_size, counting, - infrequent_categories, p_num_selected, num_categories, - select_op, stream)); - } else { - HugeCTR::DeviceSelect::If(p_select_temp, select_temp_size, counting, infrequent_categories, - p_num_selected, static_cast(num_categories), select_op, - stream); - } - - constexpr size_t TPB_loc = 256; - const size_t n_blocks_loc_infreq = (size_t)ceildiv(num_infrequent, TPB_loc); - statistics_kernels:: - calculate_category_location_infrequent<<>>( - infrequent_categories, category_location, num_infrequent, num_instances); - HCTR_LIB_THROW(cudaPeekAtLastError()); - } -} - -template -void Statistics::calculate_infrequent_model_table_offsets( - std::vector &h_infrequent_model_table_offsets, const dtype *infrequent_categories, - const Tensor2 &category_location, uint32_t global_instance_id, - const dtype num_infrequent, cudaStream_t stream) { - dtype num_model_infrequent = num_infrequent / num_instances + - (global_instance_id < num_infrequent % num_instances ? 1 : 0); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - statistics_kernels::calculate_infrequent_model_table_offsets<<<1, 64, 0, stream>>>( - infrequent_categories, category_location.get_ptr(), table_offsets.get_ptr(), - infrequent_model_table_offsets.get_ptr(), num_tables, num_infrequent, num_model_infrequent, - global_instance_id); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - h_infrequent_model_table_offsets.resize(num_tables + 1); - HCTR_LIB_THROW(cudaMemcpyAsync(h_infrequent_model_table_offsets.data(), - infrequent_model_table_offsets.get_ptr(), - (num_tables + 1) * sizeof(dtype), cudaMemcpyDeviceToHost, stream)); -} - -template -void Statistics::calculate_frequent_model_table_offsets( - std::vector &h_frequent_model_table_offsets, const dtype *frequent_categories, - const dtype num_frequent, cudaStream_t stream) { - statistics_kernels:: - calculate_frequent_model_table_offsets<<>>( - frequent_categories, table_offsets.get_ptr(), frequent_model_table_offsets.get_ptr(), - num_instances, num_tables, num_frequent); - HCTR_LIB_THROW(cudaPeekAtLastError()); - - h_frequent_model_table_offsets.resize(num_instances * (num_tables + 1)); - HCTR_LIB_THROW(cudaMemcpyAsync( - h_frequent_model_table_offsets.data(), frequent_model_table_offsets.get_ptr(), - num_instances * (num_tables + 1) * sizeof(dtype), cudaMemcpyDeviceToHost, stream)); -} - -template class Statistics; -template class Statistics; -template class Statistics; - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/hybrid_embedding/utils.cu b/HugeCTR/src/embeddings/hybrid_embedding/utils.cu deleted file mode 100644 index 5c450f1337..0000000000 --- a/HugeCTR/src/embeddings/hybrid_embedding/utils.cu +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -void download_tensor(std::vector& h_tensor, const Tensor2 tensor, - cudaStream_t stream) { - size_t tensor_size = tensor.get_num_elements(); - h_tensor.resize(tensor_size); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - HCTR_LIB_THROW(cudaMemcpyAsync(h_tensor.data(), tensor.get_ptr(), tensor.get_size_in_bytes(), - cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); -} - -template -void upload_tensor(const std::vector& h_tensor, Tensor2 tensor, cudaStream_t stream) { - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - assert(tensor.get_num_elements() >= h_tensor.size()); - HCTR_LIB_THROW(cudaMemcpyAsync(tensor.get_ptr(), h_tensor.data(), h_tensor.size() * sizeof(dtype), - cudaMemcpyHostToDevice, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); -} - -__global__ void offsets_kernel(const uint32_t* indices, uint32_t* indices_offsets, - uint32_t num_instances, uint32_t multiplier) { - uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x; - - if (tid < num_instances) { - uint32_t searched_value = multiplier * tid; - uint32_t num_selected = indices_offsets[num_instances]; - - // Binary search - uint32_t i = 0; - uint32_t j = num_selected; - while (i < j) { - uint32_t m = (i + j) / 2; - uint32_t value = __ldg(indices + m); - - if (value < searched_value) - i = m + 1; - else - j = m; - } - - // Write offset - indices_offsets[tid] = i; - } -} - -template -__global__ void modulo_kernel(dtype* buffer, const stype* d_num_elements, dtype divisor) { - const stype num_elements = __ldg(d_num_elements); - for (stype i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elements; - i += blockDim.x * gridDim.x) - buffer[i] %= divisor; -} - -__global__ void model_id_kernel(const uint32_t* indices_offsets, uint32_t* src_model_id, - const uint32_t* d_num_elements) { - // Find model id - uint32_t num_elements = __ldg(d_num_elements); - for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elements; - i += blockDim.x * gridDim.x) { - uint32_t model_id = 0; - uint32_t next_offset = indices_offsets[1]; - while (next_offset <= i) { - model_id++; - next_offset = indices_offsets[model_id + 1]; - } - src_model_id[i] = model_id; - } -} - -template void download_tensor(std::vector& h_tensor, - const Tensor2 tensor, cudaStream_t stream); -template void download_tensor(std::vector& h_tensor, - const Tensor2 tensor, cudaStream_t stream); -template void download_tensor(std::vector& h_tensor, - const Tensor2 tensor, cudaStream_t stream); -template void download_tensor<__half>(std::vector<__half>& h_tensor, const Tensor2<__half> tensor, - cudaStream_t stream); -template void download_tensor(std::vector& h_tensor, const Tensor2 tensor, - cudaStream_t stream); -template void upload_tensor(const std::vector& h_tensor, - Tensor2 tensor, cudaStream_t stream); -template void upload_tensor(const std::vector& h_tensor, - Tensor2 tensor, cudaStream_t stream); -template void upload_tensor(const std::vector& 
h_tensor, - Tensor2 tensor, cudaStream_t stream); - -template void upload_tensor<__half>(const std::vector<__half>& h_tensor, Tensor2<__half> tensor, - cudaStream_t stream); -template void upload_tensor(const std::vector& h_tensor, Tensor2 tensor, - cudaStream_t stream); - -template __global__ void modulo_kernel(uint32_t* buffer, const uint32_t* d_num_elements, - uint32_t divisor); -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/src/embeddings/hybrid_sparse_embedding.cu b/HugeCTR/src/embeddings/hybrid_sparse_embedding.cu deleted file mode 100644 index 3cc1f37c1f..0000000000 --- a/HugeCTR/src/embeddings/hybrid_sparse_embedding.cu +++ /dev/null @@ -1,820 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { -template -HybridSparseEmbedding::HybridSparseEmbedding( - const SparseTensors &train_input_tensors, - const SparseTensors &evaluate_input_tensors, - const HybridSparseEmbeddingParams &embedding_params, - const std::vector> &grouped_wgrad_buff, - const GpuLearningRateSchedulers lr_scheds, bool graph_mode, - const std::shared_ptr &resource_manager) - : embedding_params_(embedding_params), - resource_manager_(resource_manager), - grouped_wgrad_buff_(grouped_wgrad_buff), - grouped_all_reduce_(grouped_wgrad_buff[0] != NULL), - lr_scheds_(lr_scheds), - graph_mode_(graph_mode), - current_train_batch_size_(get_batch_size(true)), - current_eval_batch_size_(get_batch_size(false)) { - try { - // 0. Error check - if (embedding_params_.train_batch_size < 1 || embedding_params_.evaluate_batch_size < 1 || - embedding_params_.slot_num < 1 || embedding_params_.embedding_vec_size < 1) { - HCTR_OWN_THROW(Error_t::WrongInput, - "batchsize < 1 || slot_num < 1 || embedding_vec_size < 1"); - } - - if (embedding_params_.embedding_vec_size > 1024) { - HCTR_OWN_THROW(Error_t::WrongInput, - "the embedding_vec_size can not be more than 1024 in embedding layer"); - } - - size_t total_gpu_count = resource_manager_->get_global_gpu_count(); - size_t local_gpu_count = resource_manager_->get_local_gpu_count(); - - if (train_input_tensors.size() != local_gpu_count || - evaluate_input_tensors.size() != local_gpu_count) { - HCTR_OWN_THROW(Error_t::WrongInput, - "either train_input_tensors.size() or evaluate_input_tensors.size() isn't " - "local_gpu_count_"); - } - - HCTR_LOG_S(INFO, ROOT) << "Using Hybrid Embedding with train batch " << get_batch_size(true) - << " and eval batch " << get_batch_size(false) << std::endl; - - // 1. 
initialize optimizer - for (size_t id = 0; id < local_gpu_count; id++) { - OptParams opt_params; - opt_params.optimizer = embedding_params_.opt_params.optimizer; - opt_params.lr = embedding_params_.opt_params.lr; - opt_params.update_type = embedding_params_.opt_params.update_type; - opt_params.scaler = embedding_params_.opt_params.scaler; - opt_params_.emplace_back(opt_params); - } - // 2. reserve buffers for different tensors - data_statistics_.reserve(local_gpu_count); - model_.reserve(local_gpu_count); - calibration_.reserve(local_gpu_count); - statistics_.reserve(local_gpu_count); - train_output_tensors_.reserve(local_gpu_count); - evaluate_output_tensors_.reserve(local_gpu_count); - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - frequent_embeddings_single_node_.reserve(local_gpu_count); - } else { - frequent_embeddings_multi_node_.reserve(local_gpu_count); - } - - infrequent_embeddings_single_node_.reserve(local_gpu_count); - infrequent_embeddings_ib_nvlink_.reserve(local_gpu_count); - infrequent_embeddings_ib_nvlink_hier_.reserve(local_gpu_count); - - assert(bufs_.empty()); - CudaDeviceContext context; - // 2.1. construct data - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - - data_statistics_.emplace_back(embedding_params_.slot_size_array, get_batch_size(true), - embedding_params_.num_iterations_statistics); - } - - // 2.2 construct model - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - - std::vector num_instances_per_node(resource_manager_->get_num_process(), 0); - get_num_instances_per_node(num_instances_per_node); - model_.emplace_back(embedding_params_.communication_type, - resource_manager_->get_local_gpu(i)->get_global_id(), - num_instances_per_node, get_categories_num()); - } - - // 2.3 construct calibration - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - calibration_.emplace_back(resource_manager_->get_num_process(), embedding_params_.p_dup_max, - embedding_params_.max_all_reduce_bandwidth, - embedding_params_.max_all_to_all_bandwidth, - embedding_params_.efficiency_bandwidth_ratio); - } - - // 2.4 construct Statistics - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - const size_t num_samples_statistics = embedding_params_.num_iterations_statistics * - get_batch_size(true) * embedding_params_.slot_num; - statistics_.emplace_back((dtype)num_samples_statistics, embedding_params_.slot_num, - model_[i].num_instances, get_categories_num()); - } - - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - std::shared_ptr> buf = GeneralBuffer2::create(); - bufs_.emplace_back(buf); - // 2.5. 
reserve for train output/ evaluate output tensors - Tensor2 tensor; - buf->reserve({get_batch_size_per_gpu(true), get_slot_num(), get_embedding_vec_size()}, - &tensor); - train_output_tensors_.emplace_back(tensor); - buf->reserve({get_batch_size_per_gpu(false), get_slot_num(), get_embedding_vec_size()}, - &tensor); - evaluate_output_tensors_.emplace_back(tensor); - - // 2.6 construct frequent embedding - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - frequent_embeddings_single_node_.emplace_back( - model_[i], get_local_gpu(i), grouped_wgrad_buff_[i], get_embedding_vec_size(), - embedding_params_.max_num_frequent_categories); - } else { - frequent_embeddings_multi_node_.emplace_back( - model_[i], get_local_gpu(i), grouped_wgrad_buff_[i], get_embedding_vec_size(), - embedding_params_.max_num_frequent_categories); - } - - // 2.7 construct infrequent embedding - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - infrequent_embeddings_single_node_.emplace_back(model_[i], get_local_gpu(i), - get_embedding_vec_size()); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_.emplace_back(model_[i], get_local_gpu(i), - get_embedding_vec_size()); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_.emplace_back(model_[i], get_local_gpu(i), - get_embedding_vec_size()); - } - - // 2.8 construct communication - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - size_t max_buf_size = embedding_params_.embedding_vec_size * - std::max(get_batch_size(true), get_batch_size(false)) * - embedding_params_.slot_num; - infrequent_embeddings_ib_nvlink_.back().init_comms( - embedding_params_.embedding_vec_size, &get_local_gpu(i), buf.get(), max_buf_size); - } - - // Construct comm buffers - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[i].init_comms( - embedding_params_.max_num_infrequent_samples, embedding_params_.slot_num, - embedding_params_.embedding_vec_size, buf.get(), get_batch_size(true), - get_batch_size(false), local_gpu_count); - } - - // For global barrier in eval - { - Tensor2 tensor; - buf->reserve({1}, &tensor); - d_barrier_store_.push_back(tensor); - } - buf->allocate(); - } - - // Frequent AR comm init - if ((embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) || - (embedding_params_.communication_type == CommunicationType::IB_NVLink)) { - if (!grouped_all_reduce_) { - // Do your own all-reduce - auto ar_comm = resource_manager_->get_ar_comm(); - frequent_embedding_handle_ = ar_comm->register_coll(); - // Frequent all reduce comm - for (uint32_t i = 0; i < local_gpu_count; i++) { - frequent_embeddings_multi_node_[i].init_ar_comm(ar_comm, frequent_embedding_handle_, i); - } - ar_comm->register_coll_buf(frequent_embedding_handle_); - } - } - - // Init after buffer allocation - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { -#ifdef ENABLE_MPI - ib_comm_ = resource_manager_->get_ib_comm(); - if (!ib_comm_) { - resource_manager_->init_ib_comm(); - ib_comm_ = resource_manager_->get_ib_comm(); - } - comm_stream_.resize(local_gpu_count); - - std::vector h_model_indices_sizes_ptrs(local_gpu_count); - std::vector h_network_indices_sizes_ptrs(local_gpu_count); - std::vector h_fwd_send_buffer_ptrs(local_gpu_count); - std::vector 
h_bwd_send_buffer_ptrs(local_gpu_count); - for (uint32_t i = 0; i < local_gpu_count; i++) { - h_model_indices_sizes_ptrs[i] = - infrequent_embeddings_ib_nvlink_hier_[i].model_indices_sizes_.get_ptr(); - h_network_indices_sizes_ptrs[i] = - infrequent_embeddings_ib_nvlink_hier_[i].network_indices_sizes_.get_ptr(); - h_fwd_send_buffer_ptrs[i] = infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->send_buffer.get_ptr(); - h_bwd_send_buffer_ptrs[i] = infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->send_buffer.get_ptr(); - } - - // Forward coll init - auto infrequent_forward_coll_handle = ib_comm_->register_hier_a2a_v_coll(true); - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - - // download pointers - HCTR_LIB_THROW(cudaMemcpyAsync( - infrequent_embeddings_ib_nvlink_hier_[i].model_indices_sizes_ptrs_.get_ptr(), - h_model_indices_sizes_ptrs.data(), sizeof(size_t *) * local_gpu_count, - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - - HCTR_LIB_THROW(cudaMemcpyAsync( - infrequent_embeddings_ib_nvlink_hier_[i].network_indices_sizes_ptrs_.get_ptr(), - h_network_indices_sizes_ptrs.data(), sizeof(size_t *) * local_gpu_count, - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - - HCTR_LIB_THROW( - cudaMemcpyAsync(infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->send_buffer_ptrs.get_ptr(), - h_fwd_send_buffer_ptrs.data(), sizeof(emtype *) * local_gpu_count, - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - - HCTR_LIB_THROW( - cudaMemcpyAsync(infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->send_buffer_ptrs.get_ptr(), - h_bwd_send_buffer_ptrs.data(), sizeof(emtype *) * local_gpu_count, - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - - HCTR_LIB_THROW(cudaStreamSynchronize(get_local_gpu(i).get_stream())); - - // Initialize IB comm - HCTR_LIB_THROW(cudaStreamCreateWithPriority(&comm_stream_[i], cudaStreamNonBlocking, -100)); - ib_comm_->set_a2a_coll_stream(infrequent_forward_coll_handle, comm_stream_[i], i); - - ib_comm_->set_a2a_coll_buf( - infrequent_forward_coll_handle, - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->send_buffer.get_ptr(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->send_buffer.get_size_in_bytes(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->recv_buffer.get_ptr(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->recv_buffer.get_size_in_bytes(), - i); - - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_forward_comms_ = - std::make_unique>( - i, infrequent_forward_coll_handle, - infrequent_embeddings_ib_nvlink_hier_[i].model_indices_sizes_ptrs_.get_ptr(), - &get_local_gpu(i), ib_comm_, comm_stream_[i]); - } - ib_comm_->register_a2a_coll_buf(infrequent_forward_coll_handle); - - // Backward coll init - auto infrequent_backward_coll_handle = ib_comm_->register_hier_a2a_v_coll(true); - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - - ib_comm_->set_a2a_coll_stream(infrequent_backward_coll_handle, comm_stream_[i], i); - ib_comm_->set_a2a_coll_buf( - infrequent_backward_coll_handle, - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->send_buffer.get_ptr(), - 
infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->send_buffer.get_size_in_bytes(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->recv_buffer.get_ptr(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->recv_buffer.get_size_in_bytes(), - i); - - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_backward_comms_ = - std::make_unique>( - i, infrequent_backward_coll_handle, - infrequent_embeddings_ib_nvlink_hier_[i].network_indices_sizes_ptrs_.get_ptr(), - &get_local_gpu(i), ib_comm_, comm_stream_[i]); - } - ib_comm_->register_a2a_coll_buf(infrequent_backward_coll_handle); -#else - HCTR_OWN_THROW(Error_t::WrongInput, "MPI is not enabled but trying to use IB_NVLink_Hier"); -#endif - } - - // 2.9 Single-node: copy some pointers arrays to device - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - // Initialize GPU barrier - gpu_barrier_ = std::make_unique(resource_manager_->get_local_gpu_count(), - resource_manager_->get_local_gpu_device_id_list(), - graph_mode_); - - std::vector frequent_vectors_cache_pointers(local_gpu_count); - std::vector interaction_layer_input_pointers_train(local_gpu_count); - std::vector interaction_layer_input_pointers_eval(local_gpu_count); - std::vector gradients_pointers(local_gpu_count); - std::vector frequent_partial_gradients_pointers(local_gpu_count); - - for (uint32_t i = 0; i < local_gpu_count; i++) { - frequent_vectors_cache_pointers[i] = - frequent_embeddings_single_node_[i].get_embedding_vectors_cache().get_ptr(); - interaction_layer_input_pointers_train[i] = train_output_tensors_[i].get_ptr(); - gradients_pointers[i] = train_output_tensors_[i].get_ptr(); - interaction_layer_input_pointers_eval[i] = evaluate_output_tensors_[i].get_ptr(); - frequent_partial_gradients_pointers[i] = - frequent_embeddings_single_node_[i].frequent_data_.get_gradients().get_ptr(); - } - - for (uint32_t i = 0; i < local_gpu_count; i++) { - int cur_device = get_local_gpu(i).get_device_id(); - context.set_device(cur_device); - - HCTR_LIB_THROW(cudaMemcpyAsync( - frequent_embeddings_single_node_[i].embedding_vectors_cache_pointers_.get_ptr(), - frequent_vectors_cache_pointers.data(), local_gpu_count * sizeof(float *), - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - - infrequent_embeddings_single_node_[i].init_pointers( - local_gpu_count, get_local_gpu(i).get_stream(), interaction_layer_input_pointers_train, - interaction_layer_input_pointers_eval, gradients_pointers); - HCTR_LIB_THROW(cudaMemcpyAsync( - frequent_embeddings_single_node_[i].partial_gradients_pointers_.get_ptr(), - frequent_partial_gradients_pointers.data(), local_gpu_count * sizeof(emtype *), - cudaMemcpyHostToDevice, get_local_gpu(i).get_stream())); - } - } - - // Setup default indices - train_batch_indices_.emplace_back(model_, train_input_tensors, resource_manager_, - get_batch_size(true), embedding_params_.slot_size_array, - embedding_params_.max_num_frequent_categories, - embedding_params_.communication_type); - - eval_batch_indices_.emplace_back(model_, evaluate_input_tensors, resource_manager_, - get_batch_size(false), embedding_params_.slot_size_array, - embedding_params_.max_num_frequent_categories, - embedding_params_.communication_type); - - } catch (const std::runtime_error &rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -template -void HybridSparseEmbedding::init_model(const SparseTensors &data, - size_t &wgrad_offset_in_bytes) 
{ - size_t local_gpu_count = resource_manager_->get_local_gpu_count(); -#pragma omp parallel for num_threads(local_gpu_count) - for (size_t id = 0; id < local_gpu_count; ++id) { - int cur_device = get_local_gpu(id).get_device_id(); - CudaDeviceContext context(cur_device); - std::shared_ptr> buf = GeneralBuffer2::create(); - Tensor2 tmp_categories; - buf->reserve({(size_t)statistics_[id].num_categories, 1}, &tmp_categories); - buf->allocate(); - auto stream = get_local_gpu(id).get_stream(); - data_statistics_[id].data_to_unique_categories(data[id].get_value_tensor(), stream); - model_[id].init_hybrid_model(calibration_[id], statistics_[id], data_statistics_[id], - tmp_categories, stream); - get_frequent_embedding_data(id).initialize_embedding_vectors(data_statistics_[id].table_sizes, - wgrad_offset_in_bytes); - - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - infrequent_embeddings_single_node_[id].initialize_embedding_vectors( - data_statistics_[id].table_sizes); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_[id].initialize_embedding_vectors( - data_statistics_[id].table_sizes); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[id].initialize_embedding_vectors( - data_statistics_[id].table_sizes); - } - - if (embedding_params_.max_num_frequent_categories < (size_t)model_[id].num_frequent) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "Found too many frequent categories, please increase 'max_num_frequent_categories'"); - } - } - // free statistics_ memory - // statistics_.clear(); - data_statistics_.clear(); - - HCTR_LOG_S(INFO, ROOT) << "Initialized hybrid model with " << model_[0].num_frequent - << " frequent categories, probability of being frequent is " - << model_[0].frequent_probability << std::endl; - - size_t avg_train_infrequent = (1 - model_[0].frequent_probability) * - embedding_params_.slot_size_array.size() * get_batch_size(true); - size_t avg_evaluate_infrequent = (1 - model_[0].frequent_probability) * - embedding_params_.slot_size_array.size() * get_batch_size(false); - - HCTR_LOG_S(INFO, ROOT) << "Estimated number of infrequent categories per train batch: " - << avg_train_infrequent << ", eval batch: " << avg_evaluate_infrequent - << std::endl; - - if ((embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) || - (embedding_params_.communication_type == CommunicationType::IB_NVLink)) { - size_t wgrad_size = - model_[0].num_frequent * embedding_params_.embedding_vec_size * sizeof(emtype); - - if (!grouped_all_reduce_) { - // Manage your own all-reduce - auto ar_comm = resource_manager_->get_ar_comm(); - ar_comm->update_size(frequent_embedding_handle_, wgrad_size); - } else { - wgrad_offset_in_bytes += wgrad_size; - } - } -} - -template -void HybridSparseEmbedding::setup_buffered_indices(bool is_train, - AsyncReader *data_reader) { - if (is_train) { - // Double buffering for overlapping indices calculation between iterations - data_reader->set_tensor_buffering(2); - } else { - // If get_max_batches_inflight() is > than the number of eval batches in the dataset, - // this will cause the batch tensors to be cached. We need the tensors to be cached in order - // for the indices to be cached because the index calculation is done in place in these - // tensors. 
- // TODO: if OOM then eval_data_reader->set_tensor_buffering(2) - data_reader->set_tensor_buffering(data_reader->get_max_batches_inflight()); - } - - const auto data_tensors = data_reader->get_value_tensor_buffers(); - auto &batch_indices = is_train ? train_batch_indices_ : eval_batch_indices_; - batch_indices.clear(); // remove default - for (size_t i = 0; i < data_tensors.size(); ++i) { - batch_indices.emplace_back(model_, data_tensors.at(i), resource_manager_, - get_batch_size(is_train), embedding_params_.slot_size_array, - embedding_params_.max_num_frequent_categories, - embedding_params_.communication_type); - } -} - -template -void HybridSparseEmbedding::forward(bool is_train) { - size_t local_gpu_count = resource_manager_->get_local_gpu_count(); - -// Index calculations -#pragma omp parallel for num_threads(local_gpu_count) - for (size_t i = 0; i < local_gpu_count; i++) { - auto &gpu = get_local_gpu(i); - CudaDeviceContext context(gpu.get_device_id()); - - index_calculation(is_train, i); - infreq_model_forward(i); - freq_forward(is_train, i, true); - infreq_network_forward(is_train, i); - } -} - -template -void HybridSparseEmbedding::backward() { - size_t local_gpu_count = resource_manager_->get_local_gpu_count(); - -#pragma omp parallel for num_threads(local_gpu_count) - for (size_t i = 0; i < local_gpu_count; i++) { - auto cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - - freq_backward(i); - infreq_network_backward(i); - infreq_model_backward(i); - } -} - -template -void HybridSparseEmbedding::update_params() { - size_t local_gpu_count = resource_manager_->get_local_gpu_count(); - -#pragma omp parallel for num_threads(local_gpu_count) - for (size_t i = 0; i < local_gpu_count; i++) { - auto cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - - freq_update_params(i); - } -} - -template -void HybridSparseEmbedding::init_params() { - // TODO: create init_params() -} - -template -void HybridSparseEmbedding::load_parameters(std::string sparse_model) { - // TODO: create load_parameters() -} - -template -void HybridSparseEmbedding::dump_parameters(std::string sparse_model) const { - // TODO: create dump_parameters() -} - -template -void HybridSparseEmbedding::set_learning_rate(float lr) { - HCTR_OWN_THROW(Error_t::WrongInput, "HybridSparseEmbedding only supports GPU LR scheduler"); -} - -template -GpuLearningRateSchedulers HybridSparseEmbedding::get_learning_rate_schedulers() - const { - return lr_scheds_; -} - -template -size_t HybridSparseEmbedding::get_params_num() const { - return 0; -} - -template -size_t HybridSparseEmbedding::get_vocabulary_size() const { - // TODO: create get_vocabulary_size() - return 0; -} - -template -size_t HybridSparseEmbedding::get_max_vocabulary_size() const { - // TODO: create get_max_vocabulary_size() - return 0; -} - -template -std::vector HybridSparseEmbedding::get_train_output_tensors() const { - return tensors_to_bags(train_output_tensors_); -} - -template -std::vector HybridSparseEmbedding::get_evaluate_output_tensors() const { - return tensors_to_bags(evaluate_output_tensors_); -} - -template -void HybridSparseEmbedding::assign_input_tensors(bool is_train, size_t batch_size, - size_t inflight_id, bool cached) { - if (is_train) { - train_inflight_id_ = inflight_id; - current_train_batch_size_ = batch_size; - current_train_batch_cached_ = cached; - } else { - eval_inflight_id_ = inflight_id; - current_eval_batch_size_ = batch_size; - current_eval_batch_cached_ = cached; - 
} -} - -template -void HybridSparseEmbedding::index_calculation(bool is_train, int i) { - int cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - auto &gpu = get_local_gpu(i); - cudaStream_t stream = gpu.get_stream(); - - auto &batch_indices = is_train ? train_batch_indices_.at(train_inflight_id_) - : eval_batch_indices_.at(eval_inflight_id_); - - if (is_train) { - if (!current_train_batch_cached_) { - batch_indices.compute(i, current_train_batch_size_, stream); - } - } else { // eval - if (!current_eval_batch_cached_) { - batch_indices.compute(i, current_eval_batch_size_, stream); - } - } - - // We don't copy the sparse tensor since all the required data are already in the - // Data type and indices - get_frequent_embedding(i).set_current_indices(&batch_indices.get_frequent(i)); - get_infrequent_embedding(i).set_current_indices(&batch_indices.get_infrequent(i)); -} - -template -void HybridSparseEmbedding::freq_forward(bool is_train, int i, - bool is_first_eval_batch) { - int cur_device = get_local_gpu(i).get_device_id(); - auto &gpu = get_local_gpu(i); - CudaDeviceContext context(cur_device); - cudaStream_t stream = gpu.get_stream(); - - auto &output = (is_train) ? train_output_tensors_[i] : evaluate_output_tensors_[i]; - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - if (is_train) { - frequent_embeddings_single_node_[i].forward_model(stream); - } else { - if (is_first_eval_batch) { - frequent_embeddings_single_node_[i].forward_model_eval(stream); - } - } - gpu_barrier_->sync_all_gpus(stream, i); - - frequent_embeddings_single_node_[i].forward_network(output.get_ptr(), stream); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink || - embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - frequent_embeddings_multi_node_[i].forward_network(output.get_ptr(), stream); - } -} - -template -void HybridSparseEmbedding::freq_backward(int i) { - int cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - auto &gpu = get_local_gpu(i); - cudaStream_t stream = gpu.get_stream(); - - if (frequent_embeddings_single_node_.size()) { - frequent_embeddings_single_node_[i].local_reduce(train_output_tensors_[i].get_ptr(), stream); - } else { - frequent_embeddings_multi_node_[i].local_reduce(train_output_tensors_[i].get_ptr(), stream); - if (!grouped_all_reduce_) { - frequent_embeddings_multi_node_[i].communicate(stream); - } - } -} - -template -void HybridSparseEmbedding::freq_update_params(int i) { - int cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - float *dev_lr = lr_scheds_[i]->get_learning_rate(); - float scale = opt_params_[i].scaler; - auto &gpu = get_local_gpu(i); - cudaStream_t stream = gpu.get_stream(); - - if (embedding_params_.communication_type != CommunicationType::NVLink_SingleNode) { - frequent_embeddings_multi_node_[i].update_model(dev_lr, scale, stream); - } -} - -template -void HybridSparseEmbedding::infreq_model_forward(int i) { - int cur_device = get_local_gpu(i).get_device_id(); - auto &gpu = get_local_gpu(i); - CudaDeviceContext context(cur_device); - cudaStream_t stream = gpu.get_stream(); - - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_[i].forward_model( - infrequent_embeddings_ib_nvlink_[i].infrequent_forward_comm_buffers_->send_buffer.get_ptr(), - stream); - } else if (embedding_params_.communication_type == 
CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[i].calculate_model_indices_sizes_from_offsets(stream); - infrequent_embeddings_ib_nvlink_hier_[i].calculate_network_indices_sizes_from_offsets(stream); - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_forward_comms_->update_sizes(stream); - infrequent_embeddings_ib_nvlink_hier_[i].fused_intra_forward_model( - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->send_buffer_ptrs.get_ptr(), - stream); - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_forward_comms_->initiate_communication( - stream); - } -} - -template -void HybridSparseEmbedding::infreq_network_forward(bool is_train, int i) { - int cur_device = get_local_gpu(i).get_device_id(); - auto &gpu = get_local_gpu(i); - CudaDeviceContext context(cur_device); - cudaStream_t stream = gpu.get_stream(); - - auto &output = (is_train) ? train_output_tensors_[i] : evaluate_output_tensors_[i]; - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_[i].infrequent_forward_comms_->communicate(stream); - infrequent_embeddings_ib_nvlink_[i].forward_network( - infrequent_embeddings_ib_nvlink_[i].infrequent_forward_comm_buffers_->recv_buffer.get_ptr(), - output.get_ptr(), stream); - } else if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_forward_comms_->wait_completion(stream); - infrequent_embeddings_ib_nvlink_hier_[i].hier_forward_network( - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_forward_comm_buffers_->recv_buffer.get_ptr(), - output.get_ptr(), stream); - } else { - infrequent_embeddings_single_node_[i].forward_network_direct(is_train, stream); - } -} - -template -void HybridSparseEmbedding::global_barrier(bool is_train, int i) { - int cur_device = get_local_gpu(i).get_device_id(); - auto &gpu = get_local_gpu(i); - CudaDeviceContext context(cur_device); - cudaStream_t stream = gpu.get_stream(); - - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - if (!is_train) { - HCTR_LIB_THROW(ncclAllReduce((const void *)d_barrier_store_[i].get_ptr(), - d_barrier_store_[i].get_ptr(), sizeof(uint32_t), - NcclDataType::getType(), ncclSum, - get_local_gpu(i).get_nccl(), stream)); - } - } -} - -template -void HybridSparseEmbedding::infreq_network_backward(int i) { - int cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - auto &gpu = get_local_gpu(i); - cudaStream_t stream = gpu.get_stream(); - - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_[i].update_network( - train_output_tensors_[i].get_ptr(), - infrequent_embeddings_ib_nvlink_[i] - .infrequent_backward_comm_buffers_->send_buffer.get_ptr(), - stream); - } - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_backward_comms_->update_sizes(stream); - infrequent_embeddings_ib_nvlink_hier_[i].fused_intra_update_network( - train_output_tensors_[i].get_ptr(), - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->send_buffer_ptrs.get_ptr(), - stream); - } -} - -// Everything that involves network and can be better overlapped with compute -template -void HybridSparseEmbedding::infreq_model_backward(int i) { - int cur_device = get_local_gpu(i).get_device_id(); - CudaDeviceContext context(cur_device); - auto &gpu 
= get_local_gpu(i); - cudaStream_t stream = gpu.get_stream(); - float *dev_lr = lr_scheds_[i]->get_learning_rate(); - float scale = opt_params_[i].scaler; - - if (embedding_params_.communication_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink_[i].infrequent_backward_comms_->communicate(stream); - infrequent_embeddings_ib_nvlink_[i].update_model( - infrequent_embeddings_ib_nvlink_[i] - .infrequent_backward_comm_buffers_->recv_buffer.get_ptr(), - dev_lr, scale, stream); - } - - if (embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier_[i].infrequent_backward_comms_->communicate(stream); - - infrequent_embeddings_ib_nvlink_hier_[i].hier_update_model( - infrequent_embeddings_ib_nvlink_hier_[i] - .infrequent_backward_comm_buffers_->recv_buffer.get_ptr(), - dev_lr, scale, stream); - } - if (embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - // Synchronize all GPUs before pulling the reduced gradients - gpu_barrier_->sync_all_gpus(stream, i); - - float *dev_lr = lr_scheds_[i]->get_learning_rate(); - float scale = opt_params_[i].scaler; - frequent_embeddings_single_node_[i].update_model_direct(dev_lr, scale, stream); - - infrequent_embeddings_single_node_[i].update_model_direct(dev_lr, scale, stream); - } -} - -template class HybridSparseEmbedding; -template class HybridSparseEmbedding; -template class HybridSparseEmbedding; -template class HybridSparseEmbedding; - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu b/HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu deleted file mode 100644 index 6fce1afb63..0000000000 --- a/HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu +++ /dev/null @@ -1,1334 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#ifdef ENABLE_MPI -#include -#endif - -#include -#include - -namespace HugeCTR { -namespace localized_onehot_filter_keys_kernel { - -template -__global__ void select_value_by_slot_id_kernel(const TypeKey *value, size_t num, - TypeKey *filter_value, size_t slot_num_per_gpu, - size_t slot_num, size_t global_id, - size_t global_num) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int batch_size = tid / slot_num; - int slot_id = tid % slot_num; - if (slot_id % global_num == global_id) { - int res_slot_id = slot_id / global_num; - filter_value[batch_size * slot_num_per_gpu + res_slot_id] = __ldg(value + tid); - } - } -} -} // namespace localized_onehot_filter_keys_kernel - -template -void LocalizedSlotSparseEmbeddingOneHot::filter_keys_per_gpu( - bool is_train, size_t id, size_t global_id, size_t global_num) { - const SparseTensor &all_gather_key = embedding_data_.get_input_keys(is_train)[id]; - auto &local_gpu = embedding_data_.get_local_gpu(id); - Tensor2 value_tensor = embedding_data_.get_value_tensors(is_train)[id]; - std::shared_ptr nnz_ptr = embedding_data_.get_nnz_array(is_train)[id]; - - if (all_gather_key.get_dimensions().size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "localized embedding all gather key dimension != 2"); - } - - size_t batch_size = embedding_data_.embedding_params_.get_batch_size(is_train); - size_t slot_num_per_gpu = slot_num_per_gpu_[id]; - size_t slot_num = (all_gather_key.rowoffset_count() - 1) / batch_size; - - constexpr size_t block_size = 256; - size_t grid_size = (all_gather_key.nnz() - 1) / block_size + 1; - localized_onehot_filter_keys_kernel:: - select_value_by_slot_id_kernel<<>>( - all_gather_key.get_value_ptr(), all_gather_key.nnz(), value_tensor.get_ptr(), - slot_num_per_gpu, slot_num, global_id, global_num); - - *nnz_ptr = (all_gather_key.nnz() / slot_num) * slot_num_per_gpu; -} - -template -void LocalizedSlotSparseEmbeddingOneHot< - TypeHashKey, TypeEmbeddingComp>::data_to_unique_categories_per_gpu(bool is_train, size_t id) { - SparseTensor &all_gather_key = embedding_data_.get_input_keys(is_train)[id]; - auto &local_gpu = embedding_data_.get_local_gpu(id); - - if (all_gather_key.get_dimensions().size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "localized embedding all gather key dimension != 2"); - } - - size_t batch_size = embedding_data_.embedding_params_.get_batch_size(is_train); - size_t nnz = all_gather_key.nnz(); - size_t slot_num = (all_gather_key.rowoffset_count() - 1) / batch_size; - - data_to_unique_categories(all_gather_key.get_value_ptr(), - embedding_data_.embedding_offsets_[id].get_ptr(), slot_num, nnz, - local_gpu.get_stream()); -} - -namespace { - -template -__global__ void upload_value_tensor_kernel(value_type *value_buf, size_t *index_buf, - value_type *dst_tensor, int emb_vec_size, size_t len) { - size_t gid = blockIdx.x * blockDim.x + threadIdx.x; - if (gid < len) { - size_t src_offset = gid * emb_vec_size; - size_t dst_offset = index_buf[gid] * emb_vec_size; - for (int i = 0; i < emb_vec_size; i++) { - dst_tensor[dst_offset + i] = value_buf[src_offset + i]; - } - } -} - -} // namespace - -template -LocalizedSlotSparseEmbeddingOneHot:: - LocalizedSlotSparseEmbeddingOneHot( - const Tensors2 &train_row_offsets_tensors, - const Tensors2 &train_value_tensors, - const std::vector> &train_nnz_array, - const Tensors2 &evaluate_row_offsets_tensors, - const Tensors2 &evaluate_value_tensors, - const std::vector> &evaluate_nnz_array, - const 
SparseEmbeddingHashParams &embedding_params, - const std::shared_ptr &resource_manager) - : embedding_data_(train_row_offsets_tensors, train_value_tensors, train_nnz_array, - evaluate_row_offsets_tensors, evaluate_value_tensors, evaluate_nnz_array, - Embedding_t::LocalizedSlotSparseEmbeddingOneHot, embedding_params, - resource_manager), - slot_size_array_(embedding_params.slot_size_array) { - embedding_data_.embedding_params_.is_data_parallel = - false; // this ctor is only used for embedding plugin - try { - max_vocabulary_size_ = 0; - for (size_t slot_size : slot_size_array_) { - max_vocabulary_size_ += slot_size; - } - - max_vocabulary_size_per_gpu_ = - cal_max_voc_size_per_gpu(slot_size_array_, embedding_data_.get_resource_manager()); - - HCTR_LOG_S(INFO, ROOT) << "max_vocabulary_size_per_gpu_=" << max_vocabulary_size_per_gpu_ - << std::endl; - - CudaDeviceContext context; - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - size_t gid = embedding_data_.get_local_gpu(id).get_global_id(); - size_t slot_num_per_gpu = - embedding_data_.embedding_params_.slot_num / - embedding_data_.get_resource_manager().get_global_gpu_count() + - ((gid < embedding_data_.embedding_params_.slot_num % - embedding_data_.get_resource_manager().get_global_gpu_count()) - ? 1 - : 0); - slot_num_per_gpu_.push_back(slot_num_per_gpu); - - // new GeneralBuffer objects - const std::shared_ptr> &buf = embedding_data_.get_buffer(id); - - // new hash table value vectors - { - const std::shared_ptr> &block = buf->create_block(); - Tensors2 tensors; - for (size_t i = 0; i < slot_size_array_.size(); i++) { - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == gid) { - Tensor2 tensor; - block->reserve( - {slot_size_array_[i], embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - tensors.push_back(tensor); - } - } - value_table_tensors_.push_back(tensors); - hash_table_value_tensors_.push_back(block->as_tensor()); - } - - // list of top categories, from single iteration worth of data, so max size is same as - // hash_table_value_index_ array - { - HCTR_LOG_S(INFO, WORLD) << "Initializing size_top_categories_ and top_categories.." 
- << std::endl; - Tensor2 tensor; - buf->reserve({1, embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - size_top_categories_.push_back(0); - top_categories_.push_back(tensor); - // HCTR_LOG_S(INFO, WORLD) << "top_categories size : " << Base::get_universal_batch_size() * - // Base::get_max_feature_num() << std::endl; - } - - // new hash table value_index that get() from HashTable - { - Tensor2 tensor; - buf->reserve({1, embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - hash_value_index_tensors_.push_back(tensor); - } - - // new embedding features reduced by hash table values(results of forward) - { - Tensor2 tensor; - buf->reserve( - {embedding_data_.embedding_params_.get_universal_batch_size() * slot_num_per_gpu, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - embedding_feature_tensors_.push_back(tensor); - } - - // new wgrad used by backward - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true) * slot_num_per_gpu, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - wgrad_tensors_.push_back(tensor); - } - - // new optimizer params used by update_params - switch (embedding_data_.embedding_params_.opt_params.optimizer) { - case Optimizer_t::SGD: - break; - - default: - throw std::runtime_error( - std::string("[HCDEBUG][ERROR] Runtime error: Invalid optimizer type\n")); - } - - // the tenosrs for storing slot ids - // TODO: init to -1 ? - { - Tensor2 tensor; - buf->reserve({max_vocabulary_size_per_gpu_, 1}, &tensor); - hash_table_slot_id_tensors_.push_back(tensor); - } - - // temp tensors for all2all - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_universal_batch_size_per_gpu() * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - all2all_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_forward_temp_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_batch_size_per_gpu(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_all2all_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_batch_size_per_gpu(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_reorder_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_backward_temp_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({1, slot_num_per_gpu}, &tensor); - mapping_offsets_per_gpu_tensors_.push_back(tensor); - } - -// init GenenralBuffers to do real allocation -#ifndef NDEBUG - HCTR_LOG_S(DEBUG, WORLD) << " max_feature_num_:" - << embedding_data_.embedding_params_.max_feature_num << std::endl; -#endif - - } // end of for(int id = 0; id < embedding_data_.get_local_gpu_count(); id++) - -#pragma omp parallel num_threads(embedding_data_.get_resource_manager().get_local_gpu_count()) - { - 
size_t id = omp_get_thread_num(); - CudaDeviceContext context(embedding_data_.get_local_gpu(id).get_device_id()); - embedding_data_.get_buffer(id)->allocate(); - HCTR_LIB_THROW(cudaStreamSynchronize(embedding_data_.get_local_gpu(id).get_stream())); - } - - // get the mapping table between local value_index and input value_index - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - uint32_t slot_sizes_prefix_sum = 0; - uint32_t slot_sizes_prefix_sum_local = 0; - int slot_num = 0; - for (size_t i = 0; i < slot_size_array_.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - size_t slot_size = slot_size_array_[i]; - if (i % embedding_data_.get_resource_manager().get_global_gpu_count() == global_id) { - uint32_t mapping_offset = slot_sizes_prefix_sum - slot_sizes_prefix_sum_local; - HCTR_LIB_THROW(cudaMemcpy(&((mapping_offsets_per_gpu_tensors_[id].get_ptr())[slot_num]), - &mapping_offset, sizeof(uint32_t), cudaMemcpyHostToDevice)); - slot_sizes_prefix_sum_local += slot_size; - slot_num++; - } - slot_sizes_prefix_sum += slot_size; - } - } - - // Check whether the P2P access can be enabled - if (embedding_data_.get_resource_manager().get_local_gpu_count() > 1 && - !embedding_data_.get_resource_manager().all_p2p_enabled()) { - throw std::runtime_error( - std::string("[HCDEBUG][ERROR] Runtime error: Localized_slot_sparse_embedding_one_hot " - "cannot be used on machine without GPU peer2peer access support. \n")); - } -#ifdef ENABLE_MPI - { - const int num_processor{core23::MpiInitService::get().world_size()}; - if (num_processor > 1) { - throw std::runtime_error( - std::string("[HCDEBUG][ERROR] Runtime error: Localized_slot_sparse_embedding_one_hot " - "cannot support multi-node currently. 
\n")); - } - } -#endif - - std::shared_ptr> unified_buf = - GeneralBuffer2::create(); - unified_buf->reserve({embedding_data_.get_resource_manager().get_local_gpu_count()}, - &train_embedding_features_); - unified_buf->reserve({embedding_data_.get_resource_manager().get_local_gpu_count()}, - &evaluate_embedding_features_); - unified_buf->allocate(); - - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - train_embedding_features_.get_ptr()[id] = - embedding_data_.get_output_tensors(true)[id].get_ptr(); - evaluate_embedding_features_.get_ptr()[id] = - embedding_data_.get_output_tensors(false)[id].get_ptr(); - } - - } catch (const std::runtime_error &rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } - - return; -} - -template -LocalizedSlotSparseEmbeddingOneHot:: - LocalizedSlotSparseEmbeddingOneHot(const SparseTensors &train_keys, - const SparseTensors &evaluate_keys, - const SparseEmbeddingHashParams &embedding_params, - const std::shared_ptr &resource_manager) - : embedding_data_(Embedding_t::LocalizedSlotSparseEmbeddingOneHot, train_keys, evaluate_keys, - embedding_params, resource_manager), - slot_size_array_(embedding_params.slot_size_array) { - try { - max_vocabulary_size_ = 0; - for (size_t slot_size : slot_size_array_) { - max_vocabulary_size_ += slot_size; - } - - max_vocabulary_size_per_gpu_ = - cal_max_voc_size_per_gpu(slot_size_array_, embedding_data_.get_resource_manager()); - - HCTR_LOG_S(INFO, ROOT) << "max_vocabulary_size_per_gpu_=" << max_vocabulary_size_per_gpu_ - << std::endl; - - CudaDeviceContext context; - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - size_t gid = embedding_data_.get_local_gpu(id).get_global_id(); - size_t slot_num_per_gpu = - embedding_data_.embedding_params_.slot_num / - embedding_data_.get_resource_manager().get_global_gpu_count() + - ((gid < embedding_data_.embedding_params_.slot_num % - embedding_data_.get_resource_manager().get_global_gpu_count()) - ? 
1 - : 0); - slot_num_per_gpu_.push_back(slot_num_per_gpu); - - // new GeneralBuffer objects - const std::shared_ptr> &buf = embedding_data_.get_buffer(id); - - // new hash table value vectors - { - const std::shared_ptr> &block = buf->create_block(); - Tensors2 tensors; - for (size_t i = 0; i < slot_size_array_.size(); i++) { - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == gid) { - Tensor2 tensor; - block->reserve( - {slot_size_array_[i], embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - tensors.push_back(tensor); - } - } - value_table_tensors_.push_back(tensors); - hash_table_value_tensors_.push_back(block->as_tensor()); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true), - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - embedding_data_.train_value_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(false), - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - embedding_data_.evaluate_value_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true) * - embedding_data_.embedding_params_.slot_num + - 1}, - &tensor); - embedding_data_.train_row_offsets_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(false) * - embedding_data_.embedding_params_.slot_num + - 1}, - &tensor); - embedding_data_.evaluate_row_offsets_tensors_.push_back(tensor); - } - { embedding_data_.train_nnz_array_.push_back(std::make_shared(0)); } - { embedding_data_.evaluate_nnz_array_.push_back(std::make_shared(0)); } - - // list of top categories, from single iteration worth of data, so max size is same as - { - HCTR_LOG_S(INFO, WORLD) << "Initializing size_top_categories_ and top_categories.." - << std::endl; - Tensor2 tensor; - buf->reserve({1, embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - size_top_categories_.push_back(0); - top_categories_.push_back(tensor); - } - - // new hash table value_index that get() from HashTable - { - Tensor2 tensor; - buf->reserve({1, embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.max_feature_num}, - &tensor); - hash_value_index_tensors_.push_back(tensor); - } - - // new embedding features reduced by hash table values(results of forward) - { - Tensor2 tensor; - buf->reserve( - {embedding_data_.embedding_params_.get_universal_batch_size() * slot_num_per_gpu, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - embedding_feature_tensors_.push_back(tensor); - } - - // new wgrad used by backward - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true) * slot_num_per_gpu, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - wgrad_tensors_.push_back(tensor); - } - - // new optimizer params used by update_params - switch (embedding_data_.embedding_params_.opt_params.optimizer) { - case Optimizer_t::SGD: - break; - - default: - throw std::runtime_error( - std::string("[HCDEBUG][ERROR] Runtime error: Invalid optimizer type\n")); - } - - // the tenosrs for storing slot ids - // TODO: init to -1 ? 
- { - Tensor2 tensor; - buf->reserve({max_vocabulary_size_per_gpu_, 1}, &tensor); - hash_table_slot_id_tensors_.push_back(tensor); - } - - // temp tensors for all2all - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_universal_batch_size_per_gpu() * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - all2all_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_universal_batch_size() * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_forward_temp_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_batch_size_per_gpu(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_all2all_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.get_batch_size_per_gpu(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_reorder_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({embedding_data_.embedding_params_.get_batch_size(true) * - embedding_data_.embedding_params_.slot_num, - embedding_data_.embedding_params_.embedding_vec_size}, - &tensor); - utest_backward_temp_tensors_.push_back(tensor); - } - { - Tensor2 tensor; - buf->reserve({1, slot_num_per_gpu}, &tensor); - mapping_offsets_per_gpu_tensors_.push_back(tensor); - } - -// init GenenralBuffers to do real allocation -#ifndef NDEBUG - HCTR_LOG_S(DEBUG, WORLD) << " max_feature_num_:" - << embedding_data_.embedding_params_.max_feature_num << std::endl; -#endif - - } // end of for(int id = 0; id < embedding_data_.get_local_gpu_count(); id++) - -#pragma omp parallel for num_threads(embedding_data_.get_resource_manager().get_local_gpu_count()) - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); ++id) { - CudaDeviceContext context(embedding_data_.get_local_gpu(id).get_device_id()); - embedding_data_.get_buffer(id)->allocate(); - - // filling rowoffset and slot_size_array - cudaStream_t stream = embedding_data_.get_local_gpu(id).get_stream(); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - } - - { - std::vector embedding_offsets; - TypeHashKey slot_sizes_prefix_sum = 0; - for (size_t i = 0; i < slot_size_array_.size(); i++) { - embedding_offsets.push_back(slot_sizes_prefix_sum); - slot_sizes_prefix_sum += slot_size_array_[i]; - } - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); ++id) { - CudaDeviceContext context(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LIB_THROW( - cudaMemcpy(embedding_data_.embedding_offsets_[id].get_ptr(), embedding_offsets.data(), - embedding_offsets.size() * sizeof(TypeHashKey), cudaMemcpyHostToDevice)); - - size_t slot_num_per_gpu = slot_num_per_gpu_[id]; - { - std::vector rowoffset_host( - embedding_data_.embedding_params_.get_batch_size(true) * - embedding_data_.embedding_params_.slot_num + - 1); - std::iota(rowoffset_host.begin(), rowoffset_host.end(), 0); - HCTR_LIB_THROW(cudaMemcpy( - embedding_data_.train_row_offsets_tensors_[id].get_ptr(), rowoffset_host.data(), - rowoffset_host.size() * sizeof(TypeHashKey), cudaMemcpyHostToDevice)); - } - { - std::vector rowoffset_host( - embedding_data_.embedding_params_.get_batch_size(false) * - embedding_data_.embedding_params_.slot_num + - 1); - 
std::iota(rowoffset_host.begin(), rowoffset_host.end(), 0); - HCTR_LIB_THROW(cudaMemcpy( - embedding_data_.evaluate_row_offsets_tensors_[id].get_ptr(), rowoffset_host.data(), - rowoffset_host.size() * sizeof(TypeHashKey), cudaMemcpyHostToDevice)); - } - } - } - - // get the mapping table between local value_index and input value_index - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - uint32_t slot_sizes_prefix_sum = 0; - uint32_t slot_sizes_prefix_sum_local = 0; - int slot_num = 0; - for (size_t i = 0; i < slot_size_array_.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - size_t slot_size = slot_size_array_[i]; - if (i % embedding_data_.get_resource_manager().get_global_gpu_count() == global_id) { - uint32_t mapping_offset = slot_sizes_prefix_sum - slot_sizes_prefix_sum_local; - HCTR_LIB_THROW(cudaMemcpy(&((mapping_offsets_per_gpu_tensors_[id].get_ptr())[slot_num]), - &mapping_offset, sizeof(uint32_t), cudaMemcpyHostToDevice)); - slot_sizes_prefix_sum_local += slot_size; - slot_num++; - } - slot_sizes_prefix_sum += slot_size; - } - } - - // Check whether the P2P access can be enabled - if (embedding_data_.get_resource_manager().get_local_gpu_count() > 1 && - !embedding_data_.get_resource_manager().all_p2p_enabled()) { - throw std::runtime_error( - "[HCDEBUG][ERROR] Runtime error: Localized_slot_sparse_embedding_one_hot cannot be used " - "on machine without GPU peer2peer access support.\n"); - } -#ifdef ENABLE_MPI - { - const int num_processor{core23::MpiInitService::get().world_size()}; - if (num_processor > 1) { - throw std::runtime_error( - "[HCDEBUG][ERROR] Runtime error: Localized_slot_sparse_embedding_one_hot cannot " - "support multi-node currently.\n"); - } - } -#endif - - std::shared_ptr> unified_buf = - GeneralBuffer2::create(); - unified_buf->reserve({embedding_data_.get_resource_manager().get_local_gpu_count()}, - &train_embedding_features_); - unified_buf->reserve({embedding_data_.get_resource_manager().get_local_gpu_count()}, - &evaluate_embedding_features_); - unified_buf->allocate(); - - for (size_t id = 0; id < embedding_data_.get_resource_manager().get_local_gpu_count(); id++) { - train_embedding_features_.get_ptr()[id] = - embedding_data_.get_output_tensors(true)[id].get_ptr(); - evaluate_embedding_features_.get_ptr()[id] = - embedding_data_.get_output_tensors(false)[id].get_ptr(); - } - - } catch (const std::runtime_error &rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } - - return; -} - -template -void LocalizedSlotSparseEmbeddingOneHot::load_parameters( - std::string sparse_model) { - const std::string key_file(sparse_model + "/key"); - const std::string slot_file(sparse_model + "/slot_id"); - const std::string vec_file(sparse_model + "/emb_vector"); - - auto fs = FileSystemBuilder::build_unique_by_path(sparse_model); - - size_t key_file_size_in_byte = fs->get_file_size(key_file); - size_t slot_file_size_in_byte = fs->get_file_size(slot_file); - size_t vec_file_size_in_byte = fs->get_file_size(vec_file); - - size_t key_size = sizeof(long long); - size_t slot_size = sizeof(size_t); - size_t vec_size = sizeof(float) * embedding_data_.embedding_params_.embedding_vec_size; - size_t key_num = key_file_size_in_byte / key_size; - size_t slot_num = slot_file_size_in_byte / slot_size; - size_t vec_num = vec_file_size_in_byte / vec_size; - - if (key_num != vec_num || key_file_size_in_byte 
% key_size != 0 || - vec_file_size_in_byte % vec_size != 0 || key_num != slot_num || - slot_file_size_in_byte % slot_size != 0) { - HCTR_OWN_THROW(Error_t::WrongInput, "Error: file size is not correct"); - } - - auto blobs_buff = GeneralBuffer2::create(); - - Tensor2 keys; - blobs_buff->reserve({key_num}, &keys); - - Tensor2 slot_id; - blobs_buff->reserve({slot_num}, &slot_id); - - Tensor2 embeddings; - blobs_buff->reserve({vec_num, embedding_data_.embedding_params_.embedding_vec_size}, &embeddings); - - blobs_buff->allocate(); - - TypeHashKey *key_ptr = keys.get_ptr(); - size_t *slot_id_ptr = slot_id.get_ptr(); - float *embedding_ptr = embeddings.get_ptr(); - - if (std::is_same::value) { - fs->read(key_file, reinterpret_cast(key_ptr), key_file_size_in_byte, 0); - } else { - std::vector i64_key_vec(key_num, 0); - fs->read(key_file, reinterpret_cast(i64_key_vec.data()), key_file_size_in_byte, 0); - std::transform(i64_key_vec.begin(), i64_key_vec.end(), key_ptr, - [](long long key) { return static_cast(key); }); - } - fs->read(slot_file, reinterpret_cast(slot_id_ptr), slot_file_size_in_byte, 0); - fs->read(vec_file, reinterpret_cast(embedding_ptr), vec_file_size_in_byte, 0); - - load_parameters(keys, slot_id, embeddings, key_num, - embedding_data_.embedding_params_.embedding_vec_size, hash_table_value_tensors_, - slot_size_array_, mapping_offsets_per_gpu_tensors_); -} - -template -void LocalizedSlotSparseEmbeddingOneHot::load_parameters( - BufferBag &buf_bag, size_t num) { - const TensorBag2 &keys_bag = buf_bag.keys; - const TensorBag2 &slot_id_bag = buf_bag.slot_id; - const Tensor2 &embeddings = buf_bag.embedding; - Tensor2 keys = Tensor2::stretch_from(keys_bag); - Tensor2 slot_id = Tensor2::stretch_from(slot_id_bag); - - load_parameters(keys, slot_id, embeddings, num, - embedding_data_.embedding_params_.embedding_vec_size, hash_table_value_tensors_, - slot_size_array_, mapping_offsets_per_gpu_tensors_); -} - -template -void LocalizedSlotSparseEmbeddingOneHot::load_parameters( - const Tensor2 &keys, const Tensor2 &slot_id, - const Tensor2 &embeddings, size_t num, size_t embedding_vec_size, - Tensors2 &hash_table_value_tensors, const std::vector &slot_sizes, - const Tensors2 &mapping_offsets_per_gpu_tensors) { - if (num == 0) return; - - CudaDeviceContext context; - if (keys.get_dimensions()[0] < num || embeddings.get_dimensions()[0] < num) { - HCTR_OWN_THROW(Error_t::WrongInput, "The rows of keys and embeddings are not consistent."); - } - - const TypeHashKey *key_ptr = keys.get_ptr(); - const size_t *slot_id_ptr = slot_id.get_ptr(); - const float *embedding_ptr = embeddings.get_ptr(); - - // define size - size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); - size_t chunk_size = 1000; - size_t tile_size = 1; // must be 1, because we need to cal (key&local_gpu_count) to decide - // gpu_id for each - size_t hash_table_value_tile_size = tile_size * embedding_vec_size; - size_t hash_table_value_tile_size_in_B = hash_table_value_tile_size * sizeof(float); - size_t hash_table_value_chunk_size = hash_table_value_tile_size * chunk_size; - size_t hash_table_value_chunk_size_in_B = hash_table_value_chunk_size * sizeof(float); - size_t total_gpu_count = embedding_data_.get_resource_manager().get_global_gpu_count(); - - // CAUTION: can not decide how many values for each GPU, so need to allocate enough memory for - // each GPU allocate CPU/GPU memory for value/index chunk - std::unique_ptr h_hash_table_value_chunk_per_gpu(new float *[local_gpu_count]); - for (size_t 
id = 0; id < local_gpu_count; id++) { - HCTR_LIB_THROW( - cudaMallocHost(&h_hash_table_value_chunk_per_gpu[id], hash_table_value_chunk_size_in_B)); - } - std::unique_ptr d_hash_table_value_chunk_per_gpu(new float *[local_gpu_count]); - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - HCTR_LIB_THROW( - cudaMalloc(&d_hash_table_value_chunk_per_gpu[id], hash_table_value_chunk_size_in_B)); - } - std::unique_ptr h_hash_table_index_chunk_per_gpu(new size_t *[local_gpu_count]); - for (size_t id = 0; id < local_gpu_count; id++) { - HCTR_LIB_THROW( - cudaMallocHost(&h_hash_table_index_chunk_per_gpu[id], chunk_size * sizeof(size_t))); - } - std::unique_ptr d_hash_table_index_chunk_per_gpu(new size_t *[local_gpu_count]); - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - HCTR_LIB_THROW(cudaMalloc(&d_hash_table_index_chunk_per_gpu[id], chunk_size * sizeof(size_t))); - } - - std::unique_ptr tile_counter_in_chunk_per_gpu(new size_t[local_gpu_count]); - memset(tile_counter_in_chunk_per_gpu.get(), 0, sizeof(size_t) * local_gpu_count); - - // The vector that store the relationship between slot_id and slot order on the specific GPU - std::vector local_slot_id(slot_sizes.size()); - std::vector local_slot_num(local_gpu_count, 0); - for (size_t i = 0; i < slot_sizes.size(); i++) { - size_t gid = i % total_gpu_count; // global GPU ID - size_t id = embedding_data_.get_resource_manager().get_gpu_local_id_from_global_id( - gid); // local GPU ID (not gpudevice id) - int dst_rank = - embedding_data_.get_resource_manager().get_process_id_from_gpu_global_id(gid); // node id - if (embedding_data_.get_resource_manager().get_process_id() == dst_rank) { - local_slot_id[i] = local_slot_num[id]; - local_slot_num[id]++; - } - } - - // Host buffer to keep mapping_offset - std::vector h_mapping_offsets_per_gpu_tensors(local_gpu_count); - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - HCTR_LIB_THROW(cudaMallocHost(&h_mapping_offsets_per_gpu_tensors[id], - local_slot_num[id] * sizeof(uint32_t))); - // Copy the mapping offset from GPU to Host - HCTR_LIB_THROW(cudaMemcpyAsync(h_mapping_offsets_per_gpu_tensors[id], - mapping_offsets_per_gpu_tensors[id].get_ptr(), - local_slot_num[id] * sizeof(uint32_t), cudaMemcpyDeviceToHost, - embedding_data_.get_local_gpu(id).get_stream())); - } - - // sync wait - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - // do upload - const size_t loop_num = num / chunk_size; - HCTR_LOG_S(INFO, ROOT) << "Start to upload embedding table file to GPUs, total loop_num: " - << loop_num << std::endl; - for (size_t i = 0; i < loop_num; i++) { - float *value_dst_buf; - size_t *tensor_index_dst_buf; - for (size_t k = 0; k < chunk_size; k++) { // process a tile in each loop - size_t slot_id = slot_id_ptr[i * chunk_size + k]; - size_t gid = slot_id % total_gpu_count; // global GPU ID - size_t id = embedding_data_.get_resource_manager().get_gpu_local_id_from_global_id( - gid); // local GPU ID (not gpudevice id) - int dst_rank = - embedding_data_.get_resource_manager().get_process_id_from_gpu_global_id(gid); // node id - - if (embedding_data_.get_resource_manager().get_process_id() == dst_rank) { - TypeHashKey tile_key = key_ptr[i * chunk_size + k]; - size_t tensor_index = - tile_key - (h_mapping_offsets_per_gpu_tensors[id][local_slot_id[slot_id]]); - - // memcpy 
hash_table_value to corresponding GPU - value_dst_buf = h_hash_table_value_chunk_per_gpu[id] + - tile_counter_in_chunk_per_gpu[id] * hash_table_value_tile_size; - memcpy(value_dst_buf, embedding_ptr + (i * chunk_size + k) * embedding_vec_size, - hash_table_value_tile_size_in_B); - - tensor_index_dst_buf = - h_hash_table_index_chunk_per_gpu[id] + tile_counter_in_chunk_per_gpu[id]; - *tensor_index_dst_buf = tensor_index; - tile_counter_in_chunk_per_gpu[id] += 1; - } else { - continue; - } - } // end of for(int k = 0; k < (chunk_size * local_gpu_count); k++) - - // memcpy hash_table_slot_id and hash_table_value from CPU to GPU - for (size_t id = 0; id < local_gpu_count; id++) { - if (tile_counter_in_chunk_per_gpu[id] == 0) { - continue; - } - - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - // Copy value buffer and tensor_index buffer to GPU - size_t value_chunk_size = tile_counter_in_chunk_per_gpu[id] * hash_table_value_tile_size; - float *src_buf_value = h_hash_table_value_chunk_per_gpu[id]; - float *dst_buf_value = d_hash_table_value_chunk_per_gpu[id]; - HCTR_LIB_THROW(cudaMemcpyAsync(dst_buf_value, src_buf_value, value_chunk_size * sizeof(float), - cudaMemcpyHostToDevice, - embedding_data_.get_local_gpu(id).get_stream())); - size_t *src_buf_index = h_hash_table_index_chunk_per_gpu[id]; - size_t *dst_buf_index = d_hash_table_index_chunk_per_gpu[id]; - value_chunk_size = tile_counter_in_chunk_per_gpu[id]; - HCTR_LIB_THROW(cudaMemcpyAsync(dst_buf_index, src_buf_index, - value_chunk_size * sizeof(size_t), cudaMemcpyHostToDevice, - embedding_data_.get_local_gpu(id).get_stream())); - - // Call kernel to insert the value into embedding value tensor - const size_t grid_size = (tile_counter_in_chunk_per_gpu[id] - 1) / 256 + 1; - upload_value_tensor_kernel<<>>( - d_hash_table_value_chunk_per_gpu[id], d_hash_table_index_chunk_per_gpu[id], - hash_table_value_tensors[id].get_ptr(), hash_table_value_tile_size, - tile_counter_in_chunk_per_gpu[id]); - } - - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - // set counter value - for (size_t id = 0; id < local_gpu_count; id++) { - tile_counter_in_chunk_per_gpu[id] = 0; // reset chunk counter to zero - } - } // end of for(int i = 0; i < loop_num; i++) - - // process the remaining data(less than a chunk) - const size_t remain_loop_num = num - loop_num * chunk_size; - float *value_dst_buf; - size_t *tensor_index_dst_buf; - for (size_t i = 0; i < remain_loop_num; i++) { // process one tile in each loop - - size_t slot_id = slot_id_ptr[loop_num * chunk_size + i]; - size_t gid = slot_id % total_gpu_count; // global GPU ID - size_t id = embedding_data_.get_resource_manager().get_gpu_local_id_from_global_id( - gid); // local GPU ID (not gpudevice id) - int dst_rank = - embedding_data_.get_resource_manager().get_process_id_from_gpu_global_id(gid); // node id - - if (embedding_data_.get_resource_manager().get_process_id() == dst_rank) { - TypeHashKey tile_key = key_ptr[loop_num * chunk_size + i]; - size_t tensor_index = - tile_key - (h_mapping_offsets_per_gpu_tensors[id][local_slot_id[slot_id]]); - - // memcpy hash_table_value to corresponding GPU - value_dst_buf = h_hash_table_value_chunk_per_gpu[id] + - tile_counter_in_chunk_per_gpu[id] * hash_table_value_tile_size; - memcpy(value_dst_buf, embedding_ptr + (loop_num * chunk_size + i) * embedding_vec_size, - hash_table_value_tile_size_in_B); - - tensor_index_dst_buf = - h_hash_table_index_chunk_per_gpu[id] + tile_counter_in_chunk_per_gpu[id]; - *tensor_index_dst_buf = 
tensor_index; - tile_counter_in_chunk_per_gpu[id] += 1; - - } else { - continue; - } - } - - // memcpy hash_table_slot_id and hash_table_value from CPU to GPU and insert into embedding - // table - for (size_t id = 0; id < local_gpu_count; id++) { - if (tile_counter_in_chunk_per_gpu[id] == 0) { - continue; - } - - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - // Copy value buffer and tensor_index buffer to GPU - size_t value_chunk_size = tile_counter_in_chunk_per_gpu[id] * hash_table_value_tile_size; - float *src_buf_value = h_hash_table_value_chunk_per_gpu[id]; - float *dst_buf_value = d_hash_table_value_chunk_per_gpu[id]; - HCTR_LIB_THROW(cudaMemcpyAsync(dst_buf_value, src_buf_value, value_chunk_size * sizeof(float), - cudaMemcpyHostToDevice, - embedding_data_.get_local_gpu(id).get_stream())); - size_t *src_buf_index = h_hash_table_index_chunk_per_gpu[id]; - size_t *dst_buf_index = d_hash_table_index_chunk_per_gpu[id]; - value_chunk_size = tile_counter_in_chunk_per_gpu[id]; - HCTR_LIB_THROW(cudaMemcpyAsync(dst_buf_index, src_buf_index, value_chunk_size * sizeof(size_t), - cudaMemcpyHostToDevice, - embedding_data_.get_local_gpu(id).get_stream())); - - // Call kernel to insert the value into embedding value tensor - const size_t grid_size = (tile_counter_in_chunk_per_gpu[id] - 1) / 256 + 1; - upload_value_tensor_kernel<<>>( - d_hash_table_value_chunk_per_gpu[id], d_hash_table_index_chunk_per_gpu[id], - hash_table_value_tensors[id].get_ptr(), hash_table_value_tile_size, - tile_counter_in_chunk_per_gpu[id]); - } - - // sync wait - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - HCTR_LOG(INFO, ROOT, "Done\n"); - - // release resources - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - HCTR_LIB_THROW(cudaFree(d_hash_table_value_chunk_per_gpu[id])); - HCTR_LIB_THROW(cudaFree(d_hash_table_index_chunk_per_gpu[id])); - } - for (size_t id = 0; id < local_gpu_count; id++) { - HCTR_LIB_THROW(cudaFreeHost(h_hash_table_value_chunk_per_gpu[id])); - HCTR_LIB_THROW(cudaFreeHost(h_hash_table_index_chunk_per_gpu[id])); - HCTR_LIB_THROW(cudaFreeHost(h_mapping_offsets_per_gpu_tensors[id])); - } -} - -template -void LocalizedSlotSparseEmbeddingOneHot::dump_parameters( - std::string sparse_model) const { - dump_parameters(sparse_model, embedding_data_.embedding_params_.embedding_vec_size, - hash_table_value_tensors_, slot_size_array_); -} - -template -void LocalizedSlotSparseEmbeddingOneHot::dump_parameters( - BufferBag &buf_bag, size_t *num) const { - TensorBag2 keys_bag = buf_bag.keys; - TensorBag2 slot_id_bag = buf_bag.slot_id; - Tensor2 &embeddings = buf_bag.embedding; - Tensor2 keys = Tensor2::stretch_from(keys_bag); - Tensor2 slot_id = Tensor2::stretch_from(slot_id_bag); - - dump_parameters(keys, slot_id, embeddings, num, - embedding_data_.embedding_params_.embedding_vec_size, hash_table_value_tensors_, - slot_size_array_); -} - -template -void LocalizedSlotSparseEmbeddingOneHot::dump_parameters( - const std::string &sparse_model, size_t embedding_vec_size, - const Tensors2 &hash_table_value_tensors, const std::vector &slot_sizes) const { - CudaDeviceContext context; - size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); - - auto fs = FileSystemBuilder::build_unique_by_path(sparse_model); - bool is_local_path = IOUtils::is_local_path(sparse_model); - - const std::string key_file(sparse_model + "/key"); - const std::string slot_file(sparse_model + 
"/slot_id"); - const std::string vec_file(sparse_model + "/emb_vector"); - -#ifdef ENABLE_MPI - HCTR_CHECK_HINT(is_local_path, "Dumping to remote file system in MPI mode is not supported."); - fs->create_dir(sparse_model); - MPI_File key_fh, slot_fh, vec_fh; - HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, key_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, - MPI_INFO_NULL, &key_fh)); - HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, slot_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, - MPI_INFO_NULL, &slot_fh)); - HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, vec_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, - MPI_INFO_NULL, &vec_fh)); -#endif - - // memory allocation - std::unique_ptr count(new size_t[local_gpu_count]); - size_t total_count = 0; - - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - count[id] = 0; - for (size_t i = 0; i < slot_sizes.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == global_id) { - count[id] += slot_sizes[i]; - } - } - total_count += count[id]; - } - - std::vector offset_host(local_gpu_count, 0); - std::exclusive_scan(count.get(), count.get() + local_gpu_count, offset_host.begin(), 0); - - TypeHashKey *h_hash_table_key; - size_t *h_hash_table_slot_id; - float *h_hash_table_value; - HCTR_LIB_THROW(cudaMallocHost(&h_hash_table_key, total_count * sizeof(TypeHashKey))); - HCTR_LIB_THROW(cudaMallocHost(&h_hash_table_slot_id, total_count * sizeof(size_t))); - HCTR_LIB_THROW( - cudaMallocHost(&h_hash_table_value, total_count * embedding_vec_size * sizeof(float))); - - std::unique_ptr d_hash_table_key(new TypeHashKey *[local_gpu_count]); - std::unique_ptr d_hash_table_slot_id(new size_t *[local_gpu_count]); - - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LIB_THROW(cudaMalloc(&d_hash_table_key[id], count[id] * sizeof(TypeHashKey))); - HCTR_LIB_THROW(cudaMalloc(&d_hash_table_slot_id[id], count[id] * sizeof(size_t))); - } - - // Generate key and slot_id tensor, dump value tensor on GPU - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LOG_S(INFO, WORLD) << "Rank" << embedding_data_.get_resource_manager().get_process_id() - << ": Dump embedding table from GPU" << id << std::endl; - - // Loop for each slot - size_t buffer_offset = 0; - for (size_t i = 0; i < slot_sizes.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == global_id) { - // Generate key buffer - size_t key_offset = 0; - for (size_t j = 0; j < i; j++) { - key_offset += slot_sizes[j]; - } - functors_.memset_liner(d_hash_table_key[id] + buffer_offset, (TypeHashKey)key_offset, - (TypeHashKey)1, slot_sizes[i], - embedding_data_.get_local_gpu(id).get_stream()); - - // Generate slot_id - functors_.memset_const(d_hash_table_slot_id[id] + buffer_offset, i, slot_sizes[i], - embedding_data_.get_local_gpu(id).get_stream()); - - buffer_offset += slot_sizes[i]; - } - } - // Copy key buffer to host - HCTR_LIB_THROW(cudaMemcpyAsync(h_hash_table_key + offset_host[id], d_hash_table_key[id], - count[id] * sizeof(TypeHashKey), cudaMemcpyDeviceToHost, - 
embedding_data_.get_local_gpu(id).get_stream())); - // Copy value buffer to host - HCTR_LIB_THROW(cudaMemcpyAsync( - h_hash_table_value + offset_host[id] * embedding_vec_size, - hash_table_value_tensors[id].get_ptr(), count[id] * embedding_vec_size * sizeof(float), - cudaMemcpyDeviceToHost, embedding_data_.get_local_gpu(id).get_stream())); - // Copy slot_id to host - HCTR_LIB_THROW(cudaMemcpyAsync(h_hash_table_slot_id + offset_host[id], d_hash_table_slot_id[id], - count[id] * sizeof(size_t), cudaMemcpyDeviceToHost, - embedding_data_.get_local_gpu(id).get_stream())); - } - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - long long *h_key_ptr; - std::vector i64_key_vec; - if (std::is_same::value) { - h_key_ptr = reinterpret_cast(h_hash_table_key); - } else { - i64_key_vec.resize(total_count); - std::transform(h_hash_table_key, h_hash_table_key + total_count, i64_key_vec.begin(), - [](unsigned key) { return static_cast(key); }); - h_key_ptr = i64_key_vec.data(); - } - - const size_t key_size = sizeof(long long); - const size_t slot_size = sizeof(size_t); - const size_t vec_size = sizeof(float) * embedding_vec_size; - - // write sparse model to file - HCTR_LOG_S(INFO, WORLD) << "Rank" << embedding_data_.get_resource_manager().get_process_id() - << ": Write hash table pairs to file" << std::endl; -#ifdef ENABLE_MPI - MPI_Datatype TYPE_EMB_VECTOR; - HCTR_MPI_THROW(MPI_Type_contiguous(embedding_vec_size, MPI_FLOAT, &TYPE_EMB_VECTOR)); - HCTR_MPI_THROW(MPI_Type_commit(&TYPE_EMB_VECTOR)); - - int my_rank = embedding_data_.get_resource_manager().get_process_id(); - int n_ranks = embedding_data_.get_resource_manager().get_num_process(); - - std::vector offset_per_rank(n_ranks, 0); - HCTR_MPI_THROW(MPI_Allgather(&total_count, sizeof(size_t), MPI_CHAR, offset_per_rank.data(), - sizeof(size_t), MPI_CHAR, MPI_COMM_WORLD)); - std::exclusive_scan(offset_per_rank.begin(), offset_per_rank.end(), offset_per_rank.begin(), 0); - - size_t key_offset = offset_per_rank[my_rank] * key_size; - size_t slot_offset = offset_per_rank[my_rank] * slot_size; - size_t vec_offset = offset_per_rank[my_rank] * vec_size; - - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); - MPI_Status status; - HCTR_MPI_THROW( - MPI_File_write_at(key_fh, key_offset, h_key_ptr, total_count, MPI_LONG_LONG_INT, &status)); - HCTR_MPI_THROW(MPI_File_write_at(slot_fh, slot_offset, h_hash_table_slot_id, total_count, - MPI_SIZE_T, &status)); - HCTR_MPI_THROW(MPI_File_write_at(vec_fh, vec_offset, h_hash_table_value, total_count, - TYPE_EMB_VECTOR, &status)); - - HCTR_MPI_THROW(MPI_File_close(&key_fh)); - HCTR_MPI_THROW(MPI_File_close(&slot_fh)); - HCTR_MPI_THROW(MPI_File_close(&vec_fh)); - HCTR_MPI_THROW(MPI_Type_free(&TYPE_EMB_VECTOR)); -#else - fs->write(key_file, reinterpret_cast(h_key_ptr), total_count * key_size, true); - fs->write(slot_file, reinterpret_cast(h_hash_table_slot_id), total_count * slot_size, - true); - fs->write(vec_file, reinterpret_cast(h_hash_table_value), total_count * vec_size, true); -#endif - HCTR_LOG(INFO, ROOT, "Done\n"); - - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LIB_THROW(cudaFree(d_hash_table_key[id])); - HCTR_LIB_THROW(cudaFree(d_hash_table_slot_id[id])); - } - HCTR_LIB_THROW(cudaFreeHost(h_hash_table_key)); - HCTR_LIB_THROW(cudaFreeHost(h_hash_table_slot_id)); - HCTR_LIB_THROW(cudaFreeHost(h_hash_table_value)); -} - -template -void 
LocalizedSlotSparseEmbeddingOneHot::dump_parameters( - Tensor2 &keys, Tensor2 &slot_id, Tensor2 &embeddings, size_t *num, - size_t embedding_vec_size, const Tensors2 &hash_table_value_tensors, - const std::vector &slot_sizes) const { - TypeHashKey *key_ptr = keys.get_ptr(); - size_t *slot_id_ptr = slot_id.get_ptr(); - float *embedding_ptr = embeddings.get_ptr(); - - size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); - - // memory allocation - std::unique_ptr count(new size_t[local_gpu_count]); - size_t total_count = 0; - - CudaDeviceContext context; - for (size_t id = 0; id < local_gpu_count; id++) { - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - count[id] = 0; - for (size_t i = 0; i < slot_sizes.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == global_id) { - count[id] += slot_sizes[i]; - } - } - total_count += count[id]; - } - - std::vector offset_host(local_gpu_count, 0); - std::exclusive_scan(count.get(), count.get() + local_gpu_count, offset_host.begin(), 0); - *num = total_count; - - std::unique_ptr d_hash_table_key(new TypeHashKey *[local_gpu_count]); - std::unique_ptr d_hash_table_slot_id(new size_t *[local_gpu_count]); - - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LIB_THROW(cudaMalloc(&d_hash_table_key[id], count[id] * sizeof(TypeHashKey))); - HCTR_LIB_THROW(cudaMalloc(&d_hash_table_slot_id[id], count[id] * sizeof(size_t))); - } - - // Generate key and slot_id tensor, dump value tensor on GPU - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - // Loop for each slot - size_t buffer_offset = 0; - for (size_t i = 0; i < slot_sizes.size(); i++) { - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - if ((i % embedding_data_.get_resource_manager().get_global_gpu_count()) == global_id) { - // Generate key buffer - size_t key_offset = 0; - for (size_t j = 0; j < i; j++) { - key_offset += slot_sizes[j]; - } - functors_.memset_liner(d_hash_table_key[id] + buffer_offset, - static_cast(key_offset), static_cast(1), - slot_sizes[i], embedding_data_.get_local_gpu(id).get_stream()); - - // Generate slot_id - functors_.memset_const(d_hash_table_slot_id[id] + buffer_offset, i, slot_sizes[i], - embedding_data_.get_local_gpu(id).get_stream()); - - buffer_offset += slot_sizes[i]; - } - } - // Copy key buffer to host - HCTR_LIB_THROW(cudaMemcpyAsync(key_ptr + offset_host[id], d_hash_table_key[id], - count[id] * sizeof(TypeHashKey), cudaMemcpyDeviceToHost, - embedding_data_.get_local_gpu(id).get_stream())); - // Copy value buffer to host - HCTR_LIB_THROW(cudaMemcpyAsync( - embedding_ptr + offset_host[id] * embedding_vec_size, - hash_table_value_tensors[id].get_ptr(), count[id] * embedding_vec_size * sizeof(float), - cudaMemcpyDeviceToHost, embedding_data_.get_local_gpu(id).get_stream())); - // Copy slot_id to host - HCTR_LIB_THROW(cudaMemcpyAsync(slot_id_ptr + offset_host[id], d_hash_table_slot_id[id], - count[id] * sizeof(size_t), cudaMemcpyDeviceToHost, - embedding_data_.get_local_gpu(id).get_stream())); - } - functors_.sync_all_gpus(embedding_data_.get_resource_manager()); - - for (size_t id = 0; id < local_gpu_count; id++) { - if (count[id] == 0) continue; - 
context.set_device(embedding_data_.get_local_gpu(id).get_device_id()); - - HCTR_LIB_THROW(cudaFree(d_hash_table_key[id])); - HCTR_LIB_THROW(cudaFree(d_hash_table_slot_id[id])); - } -} - -template -void LocalizedSlotSparseEmbeddingOneHot::init_embedding( - const std::vector slot_sizes, size_t embedding_vec_size, - std::vector> &hash_table_value_tensors, - Tensors2 &hash_table_slot_id_tensors) { - size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); - size_t total_gpu_count = embedding_data_.get_resource_manager().get_global_gpu_count(); - -#ifndef NDEBUG - HCTR_LOG_S(DEBUG, ROOT) << "local_gpu_count=" << local_gpu_count - << ", total_gpu_count=" << total_gpu_count << std::endl; -#endif - -#pragma omp parallel num_threads(embedding_data_.get_resource_manager().get_local_gpu_count()) - { - size_t id = omp_get_thread_num(); - size_t device_id = embedding_data_.get_local_gpu(id).get_device_id(); - size_t global_id = embedding_data_.get_local_gpu(id).get_global_id(); - -#ifndef NDEBUG - HCTR_LOG_S(DEBUG, ROOT) << "id=" << id << ", device_id=" << device_id - << ", global_id=" << global_id << std::endl; -#endif - - functors_.init_embedding_per_gpu(global_id, total_gpu_count, slot_sizes, embedding_vec_size, - hash_table_value_tensors[id], hash_table_slot_id_tensors[id], - embedding_data_.get_local_gpu(id)); - - HCTR_LIB_THROW(cudaStreamSynchronize(embedding_data_.get_local_gpu(id).get_stream())); - HCTR_LOG_S(INFO, ROOT) << "gpu" << id << " init embedding done" << std::endl; - } - - return; -} - -template -void LocalizedSlotSparseEmbeddingOneHot::reset() { - CudaDeviceContext context; - for (size_t i = 0; i < embedding_data_.get_resource_manager().get_local_gpu_count(); i++) { - functors_.init_embedding_per_gpu( - embedding_data_.get_local_gpu(i).get_global_id(), - embedding_data_.get_resource_manager().get_global_gpu_count(), slot_size_array_, - embedding_data_.embedding_params_.embedding_vec_size, value_table_tensors_[i], - hash_table_slot_id_tensors_[i], embedding_data_.get_local_gpu(i)); - } - - for (size_t i = 0; i < embedding_data_.get_resource_manager().get_local_gpu_count(); i++) { - HCTR_LIB_THROW(cudaStreamSynchronize(embedding_data_.get_local_gpu(i).get_stream())); - } -} - -template class LocalizedSlotSparseEmbeddingOneHot; -template class LocalizedSlotSparseEmbeddingOneHot; -template class LocalizedSlotSparseEmbeddingOneHot; -template class LocalizedSlotSparseEmbeddingOneHot; - -} // namespace HugeCTR diff --git a/HugeCTR/src/embeddings/update_params_functor.cu b/HugeCTR/src/embeddings/update_params_functor.cu deleted file mode 100644 index b11aecd5f3..0000000000 --- a/HugeCTR/src/embeddings/update_params_functor.cu +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include // for implicitly including cub headers - -#include -#include -#include - -#define max_size_top_categories 16 -#define num_samples_per_block 128 -#define embedding_block_size 128 - -namespace HugeCTR { - -size_t get_max_size_top_categories() { return max_size_top_categories; } -size_t get_num_samples_per_block() { return num_samples_per_block; } -size_t get_embedding_block_size() { return embedding_block_size; } - -namespace { - -// TODO: it must be moved to SparseOptimizer -// The local memory version of the atomic update kernel - opt_sgd_atomic_kernel for one hot -// embedding. -// -// This function updates the embedding vectors of the top-n features in shared memory -// before writing the accumulated result to global memory. This reduces the number of -// global memory accesses, locks and collisions. -// -// num_samples_per_block number of samples are updated per block and they are iterated over, -// such that all threads update the embedding vector of a single feature simultaneously. -// -// shared ds_top_features_index : the row indices of the top-n - top_features_size - features -// shared ds_embedding : the embedding vector corresponding to the top features (rows) -template -__global__ void opt_sgd_cached_kernel(int nnz, int embedding_vec_size, float lr_scale, - const size_t *top_categories, - const size_t top_categories_size, - const size_t *hash_value_index, - const TypeEmbeddingComp *wgrad, float *hash_table_value) { - int bid = blockIdx.x; - int tid = threadIdx.x; - - // read a number of top_categories_size top categories indices from global memory - // note: max_size_top_n (16) less than warp size - __shared__ size_t ds_top_categories[max_size_top_categories]; - if (tid < top_categories_size) { - ds_top_categories[tid] = top_categories[tid]; - } - //__syncthreads(); - - // reads num_samples_per_block values indices from hash_value_index into shared memory - __shared__ size_t ds_category[num_samples_per_block]; // embedding indices for current block - for (int ds_offset = 0; ds_offset < num_samples_per_block; ds_offset += blockDim.x) { - int ds_index = ds_offset + tid; - int key_id = bid * num_samples_per_block + ds_index; - if (ds_index < num_samples_per_block && key_id < nnz) { - ds_category[ds_index] = hash_value_index[key_id]; - } - } - __syncthreads(); - - // map sample category indices to top_category indices - __shared__ int - ds_index_top_categories[num_samples_per_block]; // index to top category index array, - // max_size_top_categories if not present - { - for (int ci_offset = 0; ci_offset < num_samples_per_block; ci_offset += blockDim.x) { - int index_ds_category = ci_offset + tid; - if (index_ds_category < num_samples_per_block) { - // loop over top features - int i_top = max_size_top_categories; // one past end - if (index_ds_category + bid * num_samples_per_block < nnz) { - int category_embedding_index = ds_category[index_ds_category]; - for (int k = 0; k < top_categories_size; ++k) { - if (category_embedding_index == ds_top_categories[k]) i_top = k; - } - } - ds_index_top_categories[index_ds_category] = i_top; - } - } - } - __syncthreads(); - - // store the sum of deltaw in ds_embedding - // TODO: make this work for embedding size > 128 - __shared__ float ds_embedding[max_size_top_categories][embedding_block_size]; - // initialize the local embedding vectors - for (int i = 0; i < top_categories_size; ++i) { - if (tid < embedding_block_size) { - ds_embedding[i][tid] = 0.f; - } - } - __syncthreads(); - - unsigned int update_top_category = 0; // bit 
indicator sequence - - size_t key_id_local = 0; - for (size_t key_id = bid * num_samples_per_block; - key_id < nnz && key_id < (bid + 1) * num_samples_per_block; ++key_id) { - if (tid < embedding_vec_size) { - int index_top_category = ds_index_top_categories[key_id_local]; - size_t category_embedding_index = ds_category[key_id_local]; - if (index_top_category < max_size_top_categories) { - // write to shared memory - update_top_category = (update_top_category | (1 << index_top_category)); - // write results to embedding vector in shared memory - float deltaw = -lr_scale * TypeConvertFunc::convert( - wgrad[key_id * embedding_vec_size + tid]); - ds_embedding[index_top_category][tid] += deltaw; - } else { - // write to global memory using atomic - float deltaw = -lr_scale * TypeConvertFunc::convert( - wgrad[key_id * embedding_vec_size + tid]); - - // atomic update - size_t feature_index = category_embedding_index * embedding_vec_size + tid; - atomicAdd(&hash_table_value[feature_index], deltaw); - } - } - - key_id_local++; - } - __syncthreads(); - - // write the embedding vectors for top features which are in shared memory to global memory - // for (int i=0; i < max_size_top_categories; ++i) { // maybe this is actually more optimized - if (tid < embedding_vec_size) { - for (int i = 0; i < top_categories_size; ++i) { - // only those that were updated - if ((update_top_category & (1 << i)) > 0) { - size_t category_embedding_index = ds_top_categories[i]; - size_t embedding_element_index = category_embedding_index * embedding_vec_size + tid; - atomicAdd(&hash_table_value[embedding_element_index], ds_embedding[i][tid]); - } - } - } -} - -// only support LocalizedSlotSparseEmbeddingOneHot -template -__global__ void opt_sgd_atomic_kernel(int nnz, int embedding_vec_size, float lr_scale, - const size_t *hash_value_index, - const TypeEmbeddingComp *wgrad, float *hash_table_value) { - int bid = blockIdx.x; - int tid = threadIdx.x; - - if (tid < embedding_vec_size && bid < nnz) { - for (int key_id = bid; key_id < nnz; key_id += gridDim.x) { - // for one-hot, the max_feature_per_slot is 1, so sample_id is equal to key_id - float deltaw = -lr_scale * TypeConvertFunc::convert( - wgrad[key_id * embedding_vec_size + tid]); - - // atomic update - size_t value_index = hash_value_index[key_id]; - size_t feature_index = value_index * embedding_vec_size + tid; - atomicAdd(&hash_table_value[feature_index], deltaw); - } - } -} - -} // namespace - -template -void SparseEmbeddingFunctors::opt_sgd_atomic_cached( - size_t num_samples, size_t embedding_vec_size, const size_t *hash_value_index, float lr, - float scaler, const TypeEmbeddingComp *wgrad, float *hash_table_value, size_t *top_categories, - size_t &size_top_categories, cudaStream_t stream, bool force_stats) { - static bool perform_stats = true; - if (perform_stats || force_stats) { - uint32_t num_unique_categories; - /// TODO: refactor instead of using placeholder values for the other params - hybrid_embedding::Statistics statistics(num_samples, 1, 1, 1); - - statistics.sort_categories_by_count(hash_value_index, (uint32_t)num_samples, top_categories, - statistics.counts_sorted.get_ptr(), num_unique_categories, - stream); - size_top_categories = std::min((size_t)num_unique_categories, (size_t)max_size_top_categories); - - perform_stats = false; - } - - float lr_scale = lr / scaler; - // treats num_samples_per_block samples - size_t grid_size = max(1ul, (num_samples - 1) / num_samples_per_block + 1); - // each thread sets one embedding vector element - size_t 
block_size = embedding_vec_size; - HCTR_LIB_THROW(cudaPeekAtLastError()); - opt_sgd_cached_kernel<<>>( - num_samples, embedding_vec_size, lr_scale, top_categories, size_top_categories, - hash_value_index, wgrad, hash_table_value); - HCTR_LIB_THROW(cudaPeekAtLastError()); -} - -template -void SparseEmbeddingFunctors::update_params( - size_t embedding_vec_size, const OptParams &opt_params, size_t nnz, - const Tensor2 &hash_value_index, const Tensor2 &wgrad, - Tensor2 &hash_table_value, Tensor2 &top_categories, size_t &size_top_categories, - size_t sm_count, cudaStream_t stream, bool force_stats) { - try { - if (opt_params.optimizer == Optimizer_t::SGD && opt_params.hyperparams.sgd.atomic_update) { - float lr_scale = opt_params.lr / opt_params.scaler; - - opt_sgd_atomic_cached(nnz, embedding_vec_size, hash_value_index.get_ptr(), - opt_params.lr, opt_params.scaler, wgrad.get_ptr(), - hash_table_value.get_ptr(), top_categories.get_ptr(), - size_top_categories, stream, force_stats); - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "Error: Invalid opitimizer type"); - } - - } catch (const std::runtime_error &rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } - - return; -} - -template void SparseEmbeddingFunctors::opt_sgd_atomic_cached( - size_t num_samples, size_t embedding_vec_size, const size_t *hash_value_index, float lr, - float scaler, const float *wgrad, float *hash_table_value, size_t *top_categories, - size_t &size_top_categories, cudaStream_t stream, bool force_stats); - -template void SparseEmbeddingFunctors::opt_sgd_atomic_cached<__half>( - size_t num_samples, size_t embedding_vec_size, const size_t *hash_value_index, float lr, - float scaler, const __half *wgrad, float *hash_table_value, size_t *top_categories, - size_t &size_top_categories, cudaStream_t stream, bool force_stats); - -template void SparseEmbeddingFunctors::update_params( - size_t embedding_vec_size, const OptParams &opt_params, size_t nnz, - const Tensor2 &hash_value_index, const Tensor2 &wgrad, - Tensor2 &hash_table_value, Tensor2 &top_categories, size_t &size_top_categories, - size_t sm_count, cudaStream_t stream, bool force_stats); - -template void SparseEmbeddingFunctors::update_params<__half>( - size_t embedding_vec_size, const OptParams &opt_params, size_t nnz, - const Tensor2 &hash_value_index, const Tensor2<__half> &wgrad, - Tensor2 &hash_table_value, Tensor2 &top_categories, size_t &size_top_categories, - size_t sm_count, cudaStream_t stream, bool force_stats); - -} // namespace HugeCTR diff --git a/HugeCTR/src/exchange_wgrad.cpp b/HugeCTR/src/exchange_wgrad.cpp index 627fa27023..d1232652a2 100644 --- a/HugeCTR/src/exchange_wgrad.cpp +++ b/HugeCTR/src/exchange_wgrad.cpp @@ -21,17 +21,18 @@ namespace HugeCTR { template NetworkExchangeWgrad::NetworkExchangeWgrad( - const std::shared_ptr& resource_manager) - : resource_manager_(resource_manager), num_gpus_(resource_manager->get_local_gpu_count()) { + const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager) + : collective_manager_(collective_manager), num_gpus_(resource_manager->get_local_gpu_count()) { // TODO remove it after Hybrid embedding is deprecated null_wgrad_buffs_.resize(num_gpus_, nullptr); - auto ar_comm = resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); ar_handle_ = ar_comm->register_coll(); } template void NetworkExchangeWgrad::init_ar_comm(const std::vector& ptr, size_t sizes) { network_wgrad_size_ = sizes; - auto ar_comm = 
resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); for (size_t g = 0; g < num_gpus_; g++) { HCTR_CHECK_HINT(ptr[g], "buffer does not exist"); ar_comm->set_coll_buf(ar_handle_, ptr[g], network_wgrad_size_, g); @@ -46,24 +47,25 @@ void NetworkExchangeWgrad::update_embed_wgrad_size(size_t size) { template void NetworkExchangeWgrad::allreduce(size_t device_id, cudaStream_t stream) { - auto ar_comm = resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); ar_comm->all_reduce(ar_handle_, stream, device_id); } template GroupedExchangeWgrad::GroupedExchangeWgrad( - const std::shared_ptr& resource_manager) - : resource_manager_(resource_manager), num_gpus_(resource_manager->get_local_gpu_count()) { + const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager) + : collective_manager_(collective_manager), num_gpus_(resource_manager->get_local_gpu_count()) { // TODO remove it after Hybrid embedding is deprecated embed_wgrad_buffs_.resize(num_gpus_, nullptr); - auto ar_comm = resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); ar_handle_ = ar_comm->register_coll(); } template void GroupedExchangeWgrad::init_ar_comm(const std::vector& ptr, size_t sizes) { network_wgrad_size_ = sizes; - auto ar_comm = resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); for (size_t g = 0; g < num_gpus_; g++) { HCTR_CHECK_HINT(ptr[g], "buffer does not exist"); ar_comm->set_coll_buf(ar_handle_, ptr[g], network_wgrad_size_, g); @@ -79,7 +81,7 @@ void GroupedExchangeWgrad::update_embed_wgrad_size(size_t size) { template void GroupedExchangeWgrad::allreduce(size_t device_id, cudaStream_t stream) { - auto ar_comm = resource_manager_->get_ar_comm(); + auto ar_comm = collective_manager_->get_ar_comm(); ar_comm->all_reduce(ar_handle_, stream, device_id); } diff --git a/HugeCTR/src/pybind/add_dense_layer.cpp b/HugeCTR/src/pybind/add_dense_layer.cpp index c590ba57ab..0cfae5649f 100644 --- a/HugeCTR/src/pybind/add_dense_layer.cpp +++ b/HugeCTR/src/pybind/add_dense_layer.cpp @@ -97,8 +97,6 @@ void save_graph_to_json(nlohmann::json& layer_config_array, for (size_t i = 0; i < input_param.data_reader_sparse_param_array.size(); ++i) { nlohmann::json input_sparse_config; input_sparse_config["top"] = input_param.data_reader_sparse_param_array[i].top_name; - input_sparse_config["type"] = - READER_SPARSE_TYPE_TO_STRING[input_param.data_reader_sparse_param_array[i].type]; input_sparse_config["nnz_per_slot"] = input_param.data_reader_sparse_param_array[i].nnz_per_slot; input_sparse_config["is_fixed_length"] = @@ -131,26 +129,6 @@ void save_graph_to_json(nlohmann::json& layer_config_array, if (sparse_embedding_params[i].slot_size_array.size() > 0) { sparse_hparam_config["slot_size_array"] = sparse_embedding_params[i].slot_size_array; } - if (sparse_embedding_params[i].embedding_type == Embedding_t::HybridSparseEmbedding) { - sparse_hparam_config["max_num_frequent_categories"] = - sparse_embedding_params[i].hybrid_embedding_param.max_num_frequent_categories; - sparse_hparam_config["max_num_infrequent_samples"] = - sparse_embedding_params[i].hybrid_embedding_param.max_num_infrequent_samples; - sparse_hparam_config["p_dup_max"] = - sparse_embedding_params[i].hybrid_embedding_param.p_dup_max; - sparse_hparam_config["max_all_reduce_bandwidth"] = - sparse_embedding_params[i].hybrid_embedding_param.max_all_reduce_bandwidth; - sparse_hparam_config["max_all_to_all_bandwidth"] = - 
sparse_embedding_params[i].hybrid_embedding_param.max_all_to_all_bandwidth; - sparse_hparam_config["efficiency_bandwidth_ratio"] = - sparse_embedding_params[i].hybrid_embedding_param.efficiency_bandwidth_ratio; - sparse_hparam_config["communication_type"] = - HE_COMM_TYPE_TO_STRING[sparse_embedding_params[i] - .hybrid_embedding_param.communication_type]; - sparse_hparam_config["hybrid_embedding_type"] = - HE_TYPE_TO_STRING[sparse_embedding_params[i] - .hybrid_embedding_param.hybrid_embedding_type]; - } sparse_config["sparse_embedding_hparam"] = sparse_hparam_config; nlohmann::json optimizer_config; nlohmann::json optimizer_hparam_config; diff --git a/HugeCTR/src/pybind/add_input.cpp b/HugeCTR/src/pybind/add_input.cpp index 55f05cf0dc..facd22a60d 100644 --- a/HugeCTR/src/pybind/add_input.cpp +++ b/HugeCTR/src/pybind/add_input.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -107,8 +106,7 @@ void add_input(Input& input, DataReaderParams& reader_params, std::vector>& train_tensor_entries_list, std::vector>& evaluate_tensor_entries_list, std::shared_ptr& train_data_reader, - std::shared_ptr& evaluate_data_reader, - std::shared_ptr& init_data_reader, size_t batch_size, + std::shared_ptr& evaluate_data_reader, size_t batch_size, size_t batch_size_eval, bool use_mixed_precision, bool repeat_dataset, bool train_intra_iteration_overlap, size_t num_iterations_statistics, const std::shared_ptr resource_manager) { @@ -187,138 +185,8 @@ void add_input(Input& input, DataReaderParams& reader_params, eval_num_batches_per_thread, input.data_reader_sparse_param_array, total_label_dim, dense_dim, use_mixed_precision, false, schedule_h2d, is_float_dense)); - } else { // use original one-hot async reader - bool is_float_dense = reader_params.async_param.is_dense_float; - HCTR_CHECK_HINT(!is_float_dense, "One-hot RawAsync Reader only supports int32 dense type\n"); - if (!repeat_dataset) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "Epoch mode cannot be used with RawAsync reader, please set repeat_dataset as true"); - } - std::string proc_file("/proc/sys/fs/aio-max-nr"), max_nr_str; - std::ifstream tmp_fs(proc_file, std::ifstream::in); - if (!tmp_fs.good()) { - HCTR_OWN_THROW(Error_t::InvalidEnv, "Can't read /proc/sys/fs/aio-max-nr"); - } - int max_nr_requests_allowed_system = -1; - int actual_nr_requests = 2; - std::getline(tmp_fs, max_nr_str); - max_nr_requests_allowed_system = std::stoi(max_nr_str); - tmp_fs.close(); - // TODO currently label+dense have to be int - size_t bytes_per_batch = - ((total_label_dim + dense_dim) * sizeof(int) + total_max_sparse_dim * sizeof(TypeKey)) * - batch_size; - Alignment_t aligned_type = reader_params.async_param.aligned_type; - int num_threads = reader_params.async_param.num_threads; - int num_batches_per_thread = reader_params.async_param.num_batches_per_thread; - int max_num_requests_per_thread = reader_params.async_param.max_num_requests_per_thread; - int io_depth = reader_params.async_param.io_depth; - int io_alignment = reader_params.async_param.io_alignment; - bool shuffle = reader_params.async_param.shuffle; - - // Could be different if eval and train datasets are on different storage systems - int max_logical_sector_size = - std::max(get_logical_sector_size(source_data), get_logical_sector_size(eval_source)); - - if (max_logical_sector_size > io_alignment) { - HCTR_LOG_C(WARNING, WORLD, "Invalid io_alignment of ", io_alignment, ", using ", - max_logical_sector_size, '\n'); - io_alignment = max_logical_sector_size; - } - - int 
io_block_size = io_alignment; - // TODO train_reader + evaluate_reader + init_reader? - int max_nr_requests_user = max_num_requests_per_thread * num_threads; - int max_num_batches = num_batches_per_thread * num_threads; - - // note that nr_requests = max_num_batches * (bytes_per_batch / io_block_size + 2). Each - // batch has at least 2 io requests - if (max_nr_requests_user > max_nr_requests_allowed_system) { - HCTR_LOG( - WARNING, WORLD, - "Too many concurrent io requests, will automatically compute (overall #io requests " - "= num_batches_per_thread * num_threads * (bytes_per_batch / io_block_size+2).\n"); - max_nr_requests_user = - std::max(2, (max_nr_requests_allowed_system - 1) / max_num_batches) * max_num_batches; - } - if (max_nr_requests_user > max_nr_requests_allowed_system || - max_num_batches * 2 >= max_nr_requests_user) { - HCTR_DIE("Too many batches for each thread!\n"); - } - HCTR_LOG_S(INFO, ROOT) << "total_max_sparse_dim = " << total_max_sparse_dim << std::endl; - HCTR_LOG_S(INFO, ROOT) << "max_nr_requests_user = " << max_nr_requests_user << std::endl; - HCTR_LOG_S(INFO, ROOT) << "bytes_per_batch = " << bytes_per_batch << std::endl; - HCTR_LOG_S(INFO, ROOT) << "max_num_batches = " << max_num_batches << std::endl; - int next_nr_requests = 0; - for (int io_blk = io_alignment;; io_blk += io_alignment) { - actual_nr_requests = max_num_batches * (bytes_per_batch / io_blk + 2); - next_nr_requests = max_num_batches * (bytes_per_batch / (io_blk + 1) + 2); - // upper_bound - if ((actual_nr_requests <= max_nr_requests_user && actual_nr_requests > next_nr_requests) || - bytes_per_batch < io_blk) { - io_block_size = io_blk; - break; - } - } - // int num_blocks_per_batch = max_nr_requests_user / max_num_batches - 2; - - HCTR_CHECK_HINT(io_block_size % io_alignment == 0, - " params_.io_block_size \% params_.io_alignment != 0"); - - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: num_threads = " << num_threads << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: num_batches_per_thread = " << num_batches_per_thread - << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: total_io_nr_requests = " << actual_nr_requests - << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_block_size = " << io_block_size << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_depth = " << io_depth << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_alignment = " << io_alignment << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: shuffle = " << (shuffle ? 
"ON" : "OFF") << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: num_iterations_statistics = " - << num_iterations_statistics << std::endl; - - const bool wait_for_gpu_idle = train_intra_iteration_overlap; // scheduling H2D - train_data_reader.reset(new AsyncReader( - source_data, batch_size, total_label_dim, dense_dim, input.data_reader_sparse_param_array, - use_mixed_precision, resource_manager, num_threads, num_batches_per_thread, io_block_size, - io_depth, io_alignment, shuffle, wait_for_gpu_idle, aligned_type)); - - // If we want to cache eval, make sure we have enough buffers - auto eval_num_batches_per_thread = num_batches_per_thread; - int cache_eval_data = reader_params.cache_eval_data; - if (cache_eval_data > num_threads * num_batches_per_thread) { - eval_num_batches_per_thread = (cache_eval_data + num_threads - 1) / num_threads; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: eval reader increased batches per thread to " - << eval_num_batches_per_thread << " to accommodate for the caching" - << std::endl; - } - - // Small IO block may lead to too many AIO requests which hang, - // so use a larger one for eval and init which are typically larger than train - evaluate_data_reader.reset(new AsyncReader( - eval_source, batch_size_eval, total_label_dim, dense_dim, - input.data_reader_sparse_param_array, use_mixed_precision, resource_manager, num_threads, - eval_num_batches_per_thread, io_block_size * 8, io_depth, io_alignment, false, false, - aligned_type)); - - init_data_reader.reset(new AsyncReader( - source_data, num_iterations_statistics * batch_size, total_label_dim, dense_dim, - input.data_reader_sparse_param_array, use_mixed_precision, resource_manager, 1, 1, - io_block_size * 8, 4, io_alignment, false, false, aligned_type)); - - auto train_data_reader_as = - std::dynamic_pointer_cast>(train_data_reader); - auto evaluate_data_reader_as = - std::dynamic_pointer_cast>(evaluate_data_reader); - - if (input.data_reader_sparse_param_array.size() > 1) { - HCTR_OWN_THROW(Error_t::WrongInput, "Only one sparse input is supported."); - } - const auto& sparse_input = - sparse_input_map.find(input.data_reader_sparse_param_array[0].top_name); - sparse_input->second.train_sparse_tensors = train_data_reader_as->get_value_tensor23s(); - sparse_input->second.evaluate_sparse_tensors = evaluate_data_reader_as->get_value_tensor23s(); + } else { + HCTR_OWN_THROW(Error_t::WrongInput, "Only multi-hot async datareader is supported."); } auto schedulable_train_reader = @@ -506,13 +374,13 @@ template void add_input(Input&, DataReaderParams&, std::vector>&, std::vector>&, std::shared_ptr&, std::shared_ptr&, - std::shared_ptr&, size_t, size_t, bool, bool, bool, - size_t, const std::shared_ptr); + size_t, size_t, bool, bool, bool, size_t, + const std::shared_ptr); template void add_input(Input&, DataReaderParams&, std::map>&, std::vector>&, std::vector>&, std::shared_ptr&, std::shared_ptr&, - std::shared_ptr&, size_t, size_t, bool, bool, - bool, size_t, const std::shared_ptr); + size_t, size_t, bool, bool, bool, size_t, + const std::shared_ptr); } // namespace HugeCTR diff --git a/HugeCTR/src/pybind/add_sparse_embedding.cpp b/HugeCTR/src/pybind/add_sparse_embedding.cpp index 165aa227d2..4d9078ab41 100644 --- a/HugeCTR/src/pybind/add_sparse_embedding.cpp +++ b/HugeCTR/src/pybind/add_sparse_embedding.cpp @@ -16,9 +16,7 @@ #include #include -#include #include -#include #include #include #include @@ -142,35 +140,9 @@ SparseEmbedding get_sparse_embedding_from_json(const nlohmann::json& j_sparse_em } } } - 
HybridEmbeddingParam hybrid_embedding_param; - hybrid_embedding_param.max_num_frequent_categories = - get_value_from_json_soft(j_hparam, "max_num_frequent_categories", 1); - hybrid_embedding_param.max_num_infrequent_samples = - get_value_from_json_soft(j_hparam, "max_num_infrequent_samples", -1); - hybrid_embedding_param.p_dup_max = - get_value_from_json_soft(j_hparam, "p_dup_max", 1. / 100); - hybrid_embedding_param.max_all_reduce_bandwidth = - get_value_from_json_soft(j_hparam, "max_all_reduce_bandwidth", 1.3e11); - hybrid_embedding_param.max_all_to_all_bandwidth = - get_value_from_json_soft(j_hparam, "max_all_to_all_bandwidth", 1.9e11); - hybrid_embedding_param.efficiency_bandwidth_ratio = - get_value_from_json_soft(j_hparam, "efficiency_bandwidth_ratio", 1.0); - std::string communication_type_string = - get_value_from_json_soft(j_hparam, "communication_type", "IB_NVLink"); - std::string hybrid_embedding_type_string = - get_value_from_json_soft(j_hparam, "hybrid_embedding_type", "Distributed"); - if (!find_item_in_map(hybrid_embedding_param.communication_type, communication_type_string, - COMMUNICATION_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, "No such communication type: " + communication_type_string); - } - if (!find_item_in_map(hybrid_embedding_param.hybrid_embedding_type, hybrid_embedding_type_string, - HYBRID_EMBEDDING_TYPE_MAP)) { - HCTR_OWN_THROW(Error_t::WrongInput, - "No such hybrid embedding type: " + hybrid_embedding_type_string); - } - SparseEmbedding sparse_embedding = SparseEmbedding( - embedding_type, workspace_size_per_gpu_in_mb, embedding_vec_size, combiner_str, top_name, - bottom_name, slot_size_array, embedding_opt_params, hybrid_embedding_param); + SparseEmbedding sparse_embedding = + SparseEmbedding(embedding_type, workspace_size_per_gpu_in_mb, embedding_vec_size, + combiner_str, top_name, bottom_name, slot_size_array, embedding_opt_params); return sparse_embedding; } @@ -181,6 +153,7 @@ void add_sparse_embedding(SparseEmbedding& sparse_embedding, std::vector>& evaluate_tensor_entries_list, std::vector>& embeddings, const std::shared_ptr& resource_manager, + const std::shared_ptr& collective_manager, size_t batch_size, size_t batch_size_eval, OptParams& embedding_opt_params, std::shared_ptr& exchange_wgrad, bool use_cuda_graph, @@ -235,57 +208,6 @@ void add_sparse_embedding(SparseEmbedding& sparse_embedding, embedding_params, resource_manager)); break; } - case Embedding_t::LocalizedSlotSparseEmbeddingOneHot: { - const SparseEmbeddingHashParams embedding_params = {batch_size, - batch_size_eval, - 0, - sparse_embedding.slot_size_array, - embedding_vec_size, - sparse_input.max_feature_num_per_sample, - sparse_input.slot_num, - combiner, // combiner: 0-sum, 1-mean - embedding_opt_params}; - embeddings.emplace_back(new LocalizedSlotSparseEmbeddingOneHot( - core_helper::convert_sparse_tensors23_to_sparse_tensors( - sparse_input.train_sparse_tensors), - core_helper::convert_sparse_tensors23_to_sparse_tensors( - sparse_input.evaluate_sparse_tensors), - embedding_params, resource_manager)); - break; - } - case Embedding_t::HybridSparseEmbedding: { - auto& embed_wgrad_buff = - (grouped_all_reduce) - ? 
std::dynamic_pointer_cast>(exchange_wgrad) - ->get_embed_wgrad_buffs() - : std::dynamic_pointer_cast>(exchange_wgrad) - ->get_embed_wgrad_buffs(); - - const HybridSparseEmbeddingParams embedding_params = { - batch_size, - batch_size_eval, - num_iterations_statistics, // TBD - sparse_embedding.hybrid_embedding_param.max_num_frequent_categories * - std::max(batch_size, batch_size_eval), // TBD - sparse_embedding.hybrid_embedding_param.max_num_infrequent_samples, // TBD - sparse_embedding.hybrid_embedding_param.p_dup_max, - embedding_vec_size, - sparse_input.slot_num, - sparse_embedding.slot_size_array, - sparse_embedding.hybrid_embedding_param.communication_type, - sparse_embedding.hybrid_embedding_param.max_all_reduce_bandwidth, - sparse_embedding.hybrid_embedding_param.max_all_to_all_bandwidth, // TBD - sparse_embedding.hybrid_embedding_param.efficiency_bandwidth_ratio, - sparse_embedding.hybrid_embedding_param.hybrid_embedding_type, - embedding_opt_params}; - embeddings.emplace_back(new HybridSparseEmbedding( - core_helper::convert_sparse_tensors23_to_sparse_tensors( - sparse_input.train_sparse_tensors), - core_helper::convert_sparse_tensors23_to_sparse_tensors( - sparse_input.evaluate_sparse_tensors), - embedding_params, embed_wgrad_buff, gpu_lr_sches, use_cuda_graph, resource_manager)); - break; - } default: HCTR_OWN_THROW(Error_t::UnspecificError, "add_sparse_embedding with no specified embedding type."); @@ -306,25 +228,25 @@ void add_sparse_embedding(SparseEmbedding& sparse_embedding, template void add_sparse_embedding( SparseEmbedding&, std::map>&, std::vector>&, std::vector>&, - std::vector>&, const std::shared_ptr&, size_t, - size_t, OptParams&, std::shared_ptr&, bool, bool, size_t, - GpuLearningRateSchedulers&); + std::vector>&, const std::shared_ptr&, + const std::shared_ptr&, size_t, size_t, OptParams&, + std::shared_ptr&, bool, bool, size_t, GpuLearningRateSchedulers&); template void add_sparse_embedding( SparseEmbedding&, std::map>&, std::vector>&, std::vector>&, - std::vector>&, const std::shared_ptr&, size_t, - size_t, OptParams&, std::shared_ptr&, bool, bool, size_t, - GpuLearningRateSchedulers&); + std::vector>&, const std::shared_ptr&, + const std::shared_ptr&, size_t, size_t, OptParams&, + std::shared_ptr&, bool, bool, size_t, GpuLearningRateSchedulers&); template void add_sparse_embedding( SparseEmbedding&, std::map>&, std::vector>&, std::vector>&, - std::vector>&, const std::shared_ptr&, size_t, - size_t, OptParams&, std::shared_ptr&, bool, bool, size_t, - GpuLearningRateSchedulers&); + std::vector>&, const std::shared_ptr&, + const std::shared_ptr&, size_t, size_t, OptParams&, + std::shared_ptr&, bool, bool, size_t, GpuLearningRateSchedulers&); template void add_sparse_embedding( SparseEmbedding&, std::map>&, std::vector>&, std::vector>&, - std::vector>&, const std::shared_ptr&, size_t, - size_t, OptParams&, std::shared_ptr&, bool, bool, size_t, - GpuLearningRateSchedulers&); + std::vector>&, const std::shared_ptr&, + const std::shared_ptr&, size_t, size_t, OptParams&, + std::shared_ptr&, bool, bool, size_t, GpuLearningRateSchedulers&); } // namespace HugeCTR diff --git a/HugeCTR/src/pybind/model.cpp b/HugeCTR/src/pybind/model.cpp index 8a419ea282..ed0b08a30e 100644 --- a/HugeCTR/src/pybind/model.cpp +++ b/HugeCTR/src/pybind/model.cpp @@ -22,15 +22,13 @@ #include #include #include -#include #include -#include #include #include #include #include #include -#include +#include #include using namespace HugeCTR::MultiHot; @@ -38,58 +36,6 @@ namespace HugeCTR { namespace { 
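Across these hunks the all-reduce communicator moves off the resource manager and onto the new collective manager (collectives/collective.hpp and collective.cpp in the file list): add_sparse_embedding and the ExchangeWgrad classes gain a collective-manager argument, and the model now builds it next to ResourceManagerCore. A minimal sketch of the new wiring; the type names CollectiveManager and AllReduceAlgo and the __half/float template arguments are assumptions filled in where this diff's text lost its angle brackets:

#include <memory>

#include <collectives/collective.hpp>
#include <exchange_wgrad.hpp>
#include <resource_manager.hpp>

namespace HugeCTR {

// Sketch: build the wgrad exchange roughly as init_exchange_wgrad does after this patch.
std::shared_ptr<ExchangeWgrad> make_exchange_wgrad_sketch(
    AllReduceAlgo all_reduce_algo, bool use_mixed_precision,
    const std::shared_ptr<ResourceManager>& resource_manager) {
  // The AR communicator now lives on the collective manager, not the resource manager.
  auto collective_manager = std::make_shared<CollectiveManager>(resource_manager);
  collective_manager->set_ar_comm(all_reduce_algo, use_mixed_precision);

  // Both ExchangeWgrad flavors take the two managers and fetch ar_comm from the
  // collective manager in init_ar_comm()/allreduce(); the grouped_all_reduce path
  // would construct GroupedExchangeWgrad<T> the same way.
  if (use_mixed_precision) {
    return std::make_shared<NetworkExchangeWgrad<__half>>(resource_manager, collective_manager);
  }
  return std::make_shared<NetworkExchangeWgrad<float>>(resource_manager, collective_manager);
}

}  // namespace HugeCTR

Callers such as add_sparse_embedding then receive the same collective_manager pointer alongside the resource manager, matching the updated signatures in the hunks above.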
-/** - * check if device is available. - * lowest available CC is min_major.min_minor - * @param device_id gpu id - * @param min_major minimum compute compatibility required - * @param min_minor minimum compute compatibility required - */ -// #define DATA_READING_TEST 1 -static std::vector& split(const std::string& s, char delim, - std::vector& elems) { - std::istringstream is(s); - std::string item; - while (std::getline(is, item, delim)) { - elems.push_back(item); - } - return elems; -} - -static std::string join(std::vector& strs, std::string delim) { - std::string str; - const std::vector::iterator itlast = strs.end() - 1; - for (auto it = strs.begin(); it != strs.end(); it++) { - str += *it; - if (it != itlast) { - str += delim; - } - } - return str; -} - -static std::string get_tensor_shape(std::string tensor_name, - std::map> tensor_shape_info) { - std::string shape = ""; - if (tensor_shape_info.find(tensor_name) != tensor_shape_info.end()) { - shape += "("; - for (unsigned int i = 0; i < tensor_shape_info[tensor_name].size(); i++) { - shape += std::to_string(tensor_shape_info[tensor_name][i]); - shape += ","; - } - shape.back() = ')'; - } - return shape; -} -static std::string get_tensor_shape(std::string tensor_name, - std::map tensor_shape_info) { - std::stringstream ss; - if (tensor_shape_info.find(tensor_name) != tensor_shape_info.end()) { - ss << tensor_shape_info[tensor_name]; - } - return ss.str(); -} - static void check_device(int device_id, int min_major, int min_minor) { int device_count = 0; HCTR_LIB_THROW(cudaGetDeviceCount(&device_count)); @@ -115,25 +61,6 @@ static void check_device(int device_id, int min_major, int min_minor) { return; } -template -auto load_key_files(std::vector const& key_files) { - std::vector keys_vec; - for (auto const& key_file : key_files) { - auto key_file_size = std::filesystem::file_size(key_file); - auto num_new_keys = key_file_size / sizeof(TypeKey); - std::ifstream key_fs(key_file, std::ifstream::binary); - if (!key_fs.is_open()) { - HCTR_OWN_THROW(Error_t::WrongInput, "Cannot open the file: " + key_file); - } - auto num_exist_keys = keys_vec.size(); - keys_vec.resize(num_exist_keys + num_new_keys); - key_fs.read(reinterpret_cast(&keys_vec[num_exist_keys]), key_file_size); - } - std::sort(keys_vec.begin(), keys_vec.end()); - keys_vec.erase(std::unique(keys_vec.begin(), keys_vec.end()), keys_vec.end()); - return keys_vec; -} - } // end namespace DenseLayerComputeConfig::DenseLayerComputeConfig() : async_wgrad(false), fuse_wb(false){}; @@ -235,16 +162,14 @@ SparseEmbedding::SparseEmbedding(Embedding_t embedding_type, size_t workspace_si size_t embedding_vec_size, const std::string& combiner_str, std::string sparse_embedding_name, std::string bottom_name, std::vector& slot_size_array, - std::shared_ptr& embedding_opt_params, - const HybridEmbeddingParam& hybrid_embedding_param) + std::shared_ptr& embedding_opt_params) : embedding_type(embedding_type), workspace_size_per_gpu_in_mb(workspace_size_per_gpu_in_mb), embedding_vec_size(embedding_vec_size), sparse_embedding_name(sparse_embedding_name), bottom_name(bottom_name), slot_size_array(slot_size_array), - embedding_opt_params(embedding_opt_params), - hybrid_embedding_param(hybrid_embedding_param) { + embedding_opt_params(embedding_opt_params) { if (combiner_str == "sum") { combiner = 0; } else if (combiner_str == "mean") { @@ -364,21 +289,26 @@ void init_learning_rate_scheduler(std::shared_ptr& lr_sch } void init_exchange_wgrad(const std::shared_ptr& resource_manager, + const 
std::shared_ptr& collective_manager, std::shared_ptr& exchange_wgrad, const Solver& solver) { HCTR_LOG(INFO, ROOT, "Using All-reduce algorithm: %s\n", ALLREDUCE_ALGO_TO_STRING[solver.all_reduce_algo].c_str()); - resource_manager->set_ar_comm(solver.all_reduce_algo, solver.use_mixed_precision); + collective_manager->set_ar_comm(solver.all_reduce_algo, solver.use_mixed_precision); if (solver.grouped_all_reduce) { if (solver.use_mixed_precision) { - exchange_wgrad = std::make_shared>(resource_manager); + exchange_wgrad = + std::make_shared>(resource_manager, collective_manager); } else { - exchange_wgrad = std::make_shared>(resource_manager); + exchange_wgrad = + std::make_shared>(resource_manager, collective_manager); } } else { if (solver.use_mixed_precision) { - exchange_wgrad = std::make_shared>(resource_manager); + exchange_wgrad = + std::make_shared>(resource_manager, collective_manager); } else { - exchange_wgrad = std::make_shared>(resource_manager); + exchange_wgrad = + std::make_shared>(resource_manager, collective_manager); } } } @@ -408,13 +338,12 @@ Model::Model(const Solver& solver, const DataReaderParams& reader_params, } else { HCTR_LOG(INFO, ROOT, "Initialize model: %s\n", solver_.model_name.c_str()); } - resource_manager_ = ResourceManagerExt::create(solver.vvgpu, solver.seed, solver.device_layout); - + resource_manager_ = ResourceManagerCore::create(solver.vvgpu, solver.seed, solver.device_layout); + collective_manager_ = std::make_shared(resource_manager_); embedding_para_io_ = std::shared_ptr( new embedding::EmbeddingParameterIO(resource_manager_)); - init_exchange_wgrad(resource_manager_, exchange_wgrad_, solver_); + init_exchange_wgrad(resource_manager_, collective_manager_, exchange_wgrad_, solver_); - graph_scheduler_ = std::make_unique(resource_manager_); for (auto dev : resource_manager_->get_local_gpu_device_id_list()) { if (solver_.use_mixed_precision) { check_device(dev, 7, @@ -507,719 +436,6 @@ void Model::construct_from_json(const std::string& graph_config_file, bool inclu HCTR_LOG(INFO, ROOT, "Load the model graph from %s successfully\n", graph_config_file.c_str()); } -// deep copy -void Model::create_copy_ops_for_network_input(const std::string& dense_name, - const std::string& label_name, bool is_train) { - auto& copy_ops = is_train ? graph_.train_copy_ops_ : graph_.evaluate_copy_ops_; - auto& tensor_entries_list = - is_train ? 
train_tensor_entities_list_ : evaluate_tensor_entities_list_; - - int num_local_gpus = resource_manager_->get_local_gpu_count(); - // copy ops for dense & label - copy_ops.resize(2 * num_local_gpus); - - for (int id = 0; id < num_local_gpus; ++id) { - core23::Device device(core23::DeviceType::GPU, - resource_manager_->get_local_gpu(id)->get_device_id()); - for (auto& tensor_entry : tensor_entries_list[id]) { - if (tensor_entry.name == dense_name) { - copy_ops[id].reset( - new CopyOpImpl(resource_manager_->get_local_gpu(id), tensor_entry.tensor)); - tensor_entry.tensor = copy_ops[id]->get_tensorbag(); - } else if (tensor_entry.name == label_name) { - copy_ops[id + num_local_gpus].reset( - new CopyOpImpl(resource_manager_->get_local_gpu(id), tensor_entry.tensor)); - tensor_entry.tensor = copy_ops[id + num_local_gpus]->get_tensorbag(); - } else { - HCTR_OWN_THROW(Error_t::WrongInput, "wrong tensor entry name when creating copy_op."); - } - } - } -} - -void Model::add(Input& input) { - std::string label_name = input.labels_.begin()->first; - int label_dim = input.labels_.begin()->second; - - // If multiple labels, treat them as 1 big label and add a split layer (below) - if (input.labels_.size() > 1) { - label_name = "combined_multi_label"; - label_dim = std::accumulate(std::begin(input.labels_), std::end(input.labels_), 0, - [](const int previous, const std::pair& p) { - return previous + p.second; - }); - } - - input_params_.push_back(input); - activate_tensor(tensor_active_, label_name); - activate_tensor(tensor_active_, input.dense_name); - data_input_info_.push_back(label_name); - data_input_info_.push_back(input.dense_name); - tensor_shape_info_raw_.insert( - std::make_pair(label_name, std::vector{solver_.batchsize, label_dim})); - tensor_shape_info_raw_.insert( - std::make_pair(input.dense_name, std::vector{solver_.batchsize, input.dense_dim})); - if (solver_.use_embedding_collection) { - std::vector top_name_list; - std::vector nnz_per_slot; - bool is_fixed_length = true; - int num_slot = 0; - for (size_t i = 0; i < input.data_reader_sparse_param_array.size(); ++i) { - auto& p = input.data_reader_sparse_param_array[i]; - top_name_list.push_back(p.top_name); - if (p.slot_num != 1) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "To use embedding collection, slots_num should be set to 1 in each sparse_param. 
" - "Please refer to notebooks/embedding_collection.ipynb and separate your multi-slot " - "output into multiple single-slot output"); - } - nnz_per_slot.push_back(p.nnz_per_slot[0]); - if (!p.is_fixed_length) is_fixed_length = false; - num_slot += 1; - hotness_map_.insert({p.top_name, p.max_feature_num}); - } - std::string concat_top_name = join(top_name_list, ","); - DataReaderSparseParam concat_data_reader_sparse_param{concat_top_name, nnz_per_slot, - is_fixed_length, num_slot}; - input.data_reader_sparse_param_array = {concat_data_reader_sparse_param}; - } - std::vector sparse_names; - for (size_t i = 0; i < input.data_reader_sparse_param_array.size(); ++i) { - sparse_names.push_back(input.data_reader_sparse_param_array[i].top_name); - tensor_shape_info_raw_.insert(std::make_pair( - input.data_reader_sparse_param_array[i].top_name, - std::vector{solver_.batchsize, input.data_reader_sparse_param_array[i].slot_num})); - } - data_input_info_.push_back(join(sparse_names, ",")); - for (unsigned int i = 0; i < input.data_reader_sparse_param_array.size(); i++) { - activate_tensor(tensor_active_, input.data_reader_sparse_param_array[i].top_name); - } - if (solver_.i64_input_key) { - add_input(input, reader_params_, sparse_input_map_64_, train_tensor_entities_list_, - evaluate_tensor_entities_list_, train_data_reader_, evaluate_data_reader_, - init_data_reader_, solver_.batchsize, solver_.batchsize_eval, - solver_.use_mixed_precision, solver_.repeat_dataset, - solver_.train_intra_iteration_overlap, solver_.num_iterations_statistics, - resource_manager_); - } else { - add_input(input, reader_params_, sparse_input_map_32_, - train_tensor_entities_list_, evaluate_tensor_entities_list_, - train_data_reader_, evaluate_data_reader_, init_data_reader_, - solver_.batchsize, solver_.batchsize_eval, solver_.use_mixed_precision, - solver_.repeat_dataset, solver_.train_intra_iteration_overlap, - solver_.num_iterations_statistics, resource_manager_); - } - - if (solver_.use_embedding_collection and solver_.train_inter_iteration_overlap) { - create_copy_ops_for_network_input(input.dense_name, label_name, true); - } - if (solver_.use_embedding_collection and solver_.eval_inter_iteration_overlap) { - create_copy_ops_for_network_input(input.dense_name, label_name, false); - } - - // Add label weights to model - for (std::map::iterator iter = input.label_weights_.begin(); - iter != input.label_weights_.end(); ++iter) { - label_weights_.insert(std::make_pair(iter->first, iter->second)); - } - - // If multiple labels provided, add a Slice layer to handle breaking up the label - if (input.labels_.size() > 1) { - std::vector label_names; - std::vector> ranges; - int idx = 0; - - for (std::map::iterator iter = input.labels_.begin(); - iter != input.labels_.end(); ++iter) { - label_names.push_back(iter->first); - if (iter->second < 1) { - HCTR_OWN_THROW(Error_t::WrongInput, "Each label dimension must be at lesat 1."); - } - ranges.push_back(std::make_pair(idx, idx + iter->second)); - idx += iter->second; - } - std::vector bottom_name{"combined_multi_label"}; - DenseLayer label_slice_layer = DenseLayer(Layer_t::Slice, bottom_name, label_names); - label_slice_layer.ranges = ranges; - - add(label_slice_layer); - } -} - -void Model::add(SparseEmbedding& sparse_embedding) { - if (resource_manager_->get_num_process() == 1 && solver_.grouped_all_reduce && - sparse_embedding.embedding_type == Embedding_t::HybridSparseEmbedding) { - HCTR_DIE("Grouped all reduce for HybridEmbedding is not supported on single node\n"); - } - if 
((reader_params_.data_reader_type == DataReaderType_t::RawAsync && - sparse_embedding.embedding_type != Embedding_t::HybridSparseEmbedding) || - (reader_params_.data_reader_type != DataReaderType_t::RawAsync && - sparse_embedding.embedding_type == Embedding_t::HybridSparseEmbedding)) { - HCTR_OWN_THROW(Error_t::WrongInput, "Raw async reader and hybrid embedding must come together"); - } - OptParams embedding_opt_params; - if (!(sparse_embedding.embedding_opt_params)->initialized) { - sparse_embedding.embedding_opt_params = opt_params_py_; - sparse_embedding.initialize_max_vocabulary_size_per_gpu(); - } - sparse_embedding.max_vocabulary_size_global = - sparse_embedding.max_vocabulary_size_per_gpu * resource_manager_->get_global_gpu_count(); - sparse_embedding_params_.push_back(sparse_embedding); - deactivate_tensor(tensor_active_, sparse_embedding.bottom_name); - activate_tensor(tensor_active_, sparse_embedding.sparse_embedding_name); - int slot_num = tensor_shape_info_raw_[sparse_embedding.bottom_name][1]; - tensor_shape_info_raw_.insert( - std::make_pair(sparse_embedding.sparse_embedding_name, - std::vector{solver_.batchsize, slot_num, - static_cast(sparse_embedding.embedding_vec_size)})); - input_output_info_.push_back( - std::make_pair(sparse_embedding.bottom_name, sparse_embedding.sparse_embedding_name)); - layer_info_.push_back(EMBEDDING_TYPE_TO_STRING[sparse_embedding.embedding_type]); - - embedding_opt_params_list_.push_back(sparse_embedding.embedding_opt_params); - init_optimizer_params(embedding_opt_params, solver_, sparse_embedding.embedding_opt_params); - if (solver_.i64_input_key && !solver_.use_mixed_precision) { - add_sparse_embedding( - sparse_embedding, sparse_input_map_64_, train_tensor_entities_list_, - evaluate_tensor_entities_list_, embeddings_, resource_manager_, solver_.batchsize, - solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, solver_.use_cuda_graph, - solver_.grouped_all_reduce, solver_.num_iterations_statistics, gpu_lr_sches_); - } else if (solver_.i64_input_key && solver_.use_mixed_precision) { - add_sparse_embedding( - sparse_embedding, sparse_input_map_64_, train_tensor_entities_list_, - evaluate_tensor_entities_list_, embeddings_, resource_manager_, solver_.batchsize, - solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, solver_.use_cuda_graph, - solver_.grouped_all_reduce, solver_.num_iterations_statistics, gpu_lr_sches_); - } else if (!solver_.i64_input_key && !solver_.use_mixed_precision) { - add_sparse_embedding( - sparse_embedding, sparse_input_map_32_, train_tensor_entities_list_, - evaluate_tensor_entities_list_, embeddings_, resource_manager_, solver_.batchsize, - solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, solver_.use_cuda_graph, - solver_.grouped_all_reduce, solver_.num_iterations_statistics, gpu_lr_sches_); - } else { - add_sparse_embedding( - sparse_embedding, sparse_input_map_32_, train_tensor_entities_list_, - evaluate_tensor_entities_list_, embeddings_, resource_manager_, solver_.batchsize, - solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, solver_.use_cuda_graph, - solver_.grouped_all_reduce, solver_.num_iterations_statistics, gpu_lr_sches_); - } - embeddings_map_.insert( - std::make_pair(sparse_embedding.sparse_embedding_name, embeddings_.back())); - embedding_dependent_tensors_.insert(sparse_embedding.sparse_embedding_name); -} - -void Model::add(DenseLayer& dense_layer) { - for (auto& top_name : dense_layer.top_names) { - if (tensor_shape_info_raw_.find(top_name) != 
tensor_shape_info_raw_.end()) { - HCTR_OWN_THROW(Error_t::WrongInput, top_name + ", top tensor name already exists"); - } - } - for (auto& bottom_name : dense_layer.bottom_names) { - if (tensor_shape_info_raw_.find(bottom_name) == tensor_shape_info_raw_.end()) { - HCTR_OWN_THROW(Error_t::WrongInput, bottom_name + ", bottom tensor name does not exists"); - } - } - calculate_tensor_dimensions(tensor_shape_info_raw_, dense_layer); - dense_layer_params_raw_.push_back(dense_layer); -} - -template -void allocate_ebc_output_helper_for_feature_major( - std::shared_ptr resource_manager_, size_t batch_size_per_gpu, - const EmbeddingCollectionConfig& ebc_config, - const embedding::EmbeddingCollectionParam& ebc_param, - std::vector>& tensor_entries_list_, - std::vector& ebc_output) { - HCTR_CHECK(ebc_config.output_layout_ == embedding::EmbeddingLayout::FeatureMajor); - int num_local_gpus = resource_manager_->get_local_gpu_count(); - for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { - CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); - core23::Device device(core23::DeviceType::GPU, - resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); - auto buffer_channel = core23::GetRandomBufferChannel(); - core23::Tensor head_tensor; - core23::BufferParams buffer_param{.channel = buffer_channel}; - core23::TensorParams tensor_param = core23::TensorParams().buffer_params(buffer_param); - int64_t concat_dims = 0; - for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { - const embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; - std::string top_name = ebc_config.top_names_[lookup_id]; - int64_t emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) - ? lookup_param.max_hotness * lookup_param.ev_size - : lookup_param.ev_size; - - core23::Tensor tmp_tensor(tensor_param.shape({(int64_t)batch_size_per_gpu, 1ll, emb_out_dims}) - .device(device) - .data_type(core23::ToScalarType::value)); - concat_dims += emb_out_dims; - tensor_entries_list_[local_gpu_id].push_back({top_name, tmp_tensor}); - if (!lookup_id) { - head_tensor = tmp_tensor; - } - } - // allocate - void* starting_address = head_tensor.data(); - core23::Tensor continous_emb_output = core23::Tensor::bind( - starting_address, core23::Shape({static_cast(batch_size_per_gpu), concat_dims}), - core23::ToScalarType::value, device); - ebc_output.push_back(continous_emb_output); - } -} - -template -void allocate_ebc_output_helper_for_batch_major( - std::shared_ptr resource_manager_, size_t batch_size_per_gpu, - const EmbeddingCollectionConfig& ebc_config, - const embedding::EmbeddingCollectionParam& ebc_param, - std::vector>& tensor_entries_list_, - std::vector& ebc_output) { - int num_local_gpus = resource_manager_->get_local_gpu_count(); - for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { - CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); - - core23::Device device(core23::DeviceType::GPU, - resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); - core23::TensorParams tensor_param; - int64_t emb_out_dims = 0; - for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { - const embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; - - emb_out_dims += (lookup_param.combiner == embedding::Combiner::Concat) - ? 
lookup_param.max_hotness * lookup_param.ev_size - : lookup_param.ev_size; - } - - core23::Tensor continous_emb_output( - tensor_param.shape({(int64_t)batch_size_per_gpu, emb_out_dims}) - .device(device) - .data_type(core23::ToScalarType::value)); - continous_emb_output.data(); - ebc_output.push_back(continous_emb_output); - - tensor_entries_list_[local_gpu_id].push_back( - {ebc_config.batch_major_output_name_, continous_emb_output}); - } -} - -std::vector get_table_id_to_vocabulary_size( - const std::vector& table_params, bool need_vocabulary_size) { - // indices only need to initialize table offset - if (!need_vocabulary_size) { - return {}; - } - - // 2. init table_id_to_vocabulary_size and check if there is dynamic table - std::vector table_id_to_vocabulary_size; - std::transform(table_params.begin(), table_params.end(), - std::back_inserter(table_id_to_vocabulary_size), - [](const embedding::EmbeddingTableParam& table_param) { - return table_param.max_vocabulary_size; - }); - - std::for_each(table_id_to_vocabulary_size.begin(), table_id_to_vocabulary_size.end(), - [](int vocabulary_size) { - HCTR_CHECK_HINT(vocabulary_size > 0, "vocabuary_size should > 0."); - }); - return table_id_to_vocabulary_size; -} - -void Model::add(const EmbeddingCollectionConfig& user_ebc_config) { - auto ebc_config = split_column_wise_sharding_config(user_ebc_config); - TableNameToIDDict table_name_to_id_dict = - create_table_name_to_id_dict_from_ebc_config(ebc_config); - int global_ebc_id = static_cast(ebc_list_.size()); - for (auto& [name, id] : table_name_to_id_dict) { - HCTR_CHECK_HINT(ebc_name_to_global_id_dict_.find(name) == ebc_name_to_global_id_dict_.end(), - "Duplicate table name: ", name, "\n"); - ebc_name_to_global_id_dict_[name] = {global_ebc_id, id}; - } - int num_total_gpus = resource_manager_->get_global_gpu_count(); - int num_local_gpus = resource_manager_->get_local_gpu_count(); - - int num_lookup = ebc_config.lookup_configs_.size(); - core23::DataType key_type = - solver_.i64_input_key ? core23::ScalarType::Int64 : core23::ScalarType::UInt32; - core23::DataType index_type = - solver_.i64_input_key ? core23::ScalarType::UInt64 : core23::ScalarType::UInt32; - core23::DataType offset_type = - solver_.i64_input_key ? core23::ScalarType::Int64 : core23::ScalarType::UInt32; - core23::DataType emb_type = - solver_.use_mixed_precision ? core23::ScalarType::Half : core23::ScalarType::Float; - core23::DataType wgrad_type = - solver_.use_mixed_precision ? core23::ScalarType::Half : core23::ScalarType::Float; - embedding::EmbeddingLayout input_layout_ = - reader_params_.data_reader_type == DataReaderType_t::RawAsync - ? 
embedding::EmbeddingLayout::FeatureMajor - : embedding::EmbeddingLayout::BatchMajor; - - std::vector bottom_name_list; - for (auto& bottom_name : ebc_config.bottom_names_) { - bottom_name_list.push_back(bottom_name); - } - - std::string bottom_name = join(bottom_name_list, ","); - deactivate_tensor(tensor_active_, bottom_name); - - layer_info_.push_back("EmbeddingCollection" + std::to_string(ebc_list_.size())); - - auto lookup_params = create_lookup_params_from_ebc_config(table_name_to_id_dict, ebc_config); - for (int lookup_id = 0; lookup_id < num_lookup; ++lookup_id) { - auto b_name = ebc_config.bottom_names_[ebc_config.dr_lookup_ids_[lookup_id]]; - lookup_params[lookup_id].max_hotness = hotness_map_[b_name]; - } - - auto shard_matrix = create_shard_matrix_from_ebc_config(table_name_to_id_dict, ebc_config); - - auto grouped_emb_params = - create_grouped_embedding_param_from_ebc_config(table_name_to_id_dict, ebc_config); - - int num_table = ebc_config.emb_table_config_list_.size(); - auto emb_table_list = create_table_params_from_ebc_config(table_name_to_id_dict, ebc_config); - for (auto& p : emb_table_list) { - if (p.opt_param.optimizer == Optimizer_t::NOT_INITIALIZED) { - p.opt_param = opt_params_; - } - } - - embedding::AllreduceStrategy allreduce_strategy = ebc_config.allreduce_strategy_; - if (solver_.grouped_all_reduce) { - allreduce_strategy = embedding::AllreduceStrategy::GroupDense; - } - - auto compression_param = - create_compression_param_from_ebc_config(table_name_to_id_dict, ebc_config); - embedding::EmbeddingCollectionParam ebc_param{num_table, - num_lookup, - lookup_params, - shard_matrix, - grouped_emb_params, - solver_.batchsize, - key_type, - index_type, - offset_type, - emb_type, - wgrad_type, - input_layout_, - ebc_config.output_layout_, - ebc_config.sort_strategy_, - ebc_config.keys_preprocess_strategy_, - allreduce_strategy, - ebc_config.comm_strategy_, - compression_param}; - - embedding::EmbeddingCollectionParam eval_ebc_param{num_table, - num_lookup, - lookup_params, - shard_matrix, - grouped_emb_params, - solver_.batchsize_eval, - key_type, - index_type, - offset_type, - emb_type, - wgrad_type, - input_layout_, - ebc_config.output_layout_, - ebc_config.sort_strategy_, - ebc_config.keys_preprocess_strategy_, - ebc_config.allreduce_strategy_, - ebc_config.comm_strategy_, - compression_param}; - - std::vector> core_list; - - for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { - auto core_resource_manager = - std::make_shared(resource_manager_, local_gpu_id); - core_list.push_back(core_resource_manager); - } - ebc_list_.push_back(std::make_unique( - resource_manager_, core_list, ebc_param, eval_ebc_param, emb_table_list, exchange_wgrad_)); - embedding_para_io_->add_embedding_collection((ebc_list_[ebc_list_.size() - 1]).get()); - - auto prepare_ebc_input = [&](auto& sparse_input_map, bool is_longlong) { - core23::DataType SparseType = is_longlong ? 
core23::DataType(core23::ScalarType::Int64) - : core23::DataType(core23::ScalarType::UInt32); - auto tensor_as_type = [&](core23::Tensor input, core23::DataType expected_type) { - auto origin_type = input.data_type(); - HCTR_CHECK_HINT(origin_type.size() == expected_type.size(), - "Size not equal, cannot reinterpret type"); - return core23::Tensor::bind(input.data(), input.shape(), expected_type, input.device()); - }; - auto train_sparse_tensors = sparse_input_map[bottom_name].train_sparse_tensors; - auto evaluate_sparse_tensors = sparse_input_map[bottom_name].evaluate_sparse_tensors; - - for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { - CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); - core23::Device device{core23::DeviceType::GPU, - static_cast( - resource_manager_->get_local_gpu(local_gpu_id)->get_device_id())}; - auto train_key_tensor = - tensor_as_type(train_sparse_tensors[local_gpu_id].get_value_tensor(), SparseType); - train_ebc_key_list_.push_back(train_key_tensor); - - auto train_bucket_range_tensor = - tensor_as_type(train_sparse_tensors[local_gpu_id].get_rowoffset_tensor(), SparseType); - train_ebc_bucket_range_list_.push_back(train_bucket_range_tensor); - - train_ebc_num_keys_list_.push_back(train_sparse_tensors[local_gpu_id].get_nnz_ptr().get()); - - auto evaluate_key_tensor = - tensor_as_type(evaluate_sparse_tensors[local_gpu_id].get_value_tensor(), SparseType); - evaluate_ebc_key_list_.push_back(evaluate_key_tensor); - - auto evaluate_bucket_range_tensor = - tensor_as_type(evaluate_sparse_tensors[local_gpu_id].get_rowoffset_tensor(), SparseType); - evaluate_ebc_bucket_range_list_.push_back(evaluate_bucket_range_tensor); - - evaluate_ebc_num_keys_list_.push_back( - evaluate_sparse_tensors[local_gpu_id].get_nnz_ptr().get()); - } - }; - - if (reader_params_.data_reader_type != DataReaderType_t::RawAsync) { - if (solver_.i64_input_key) { - prepare_ebc_input(sparse_input_map_64_, true); - } else { - prepare_ebc_input(sparse_input_map_32_, false); - } - } - - // activate_ebc_output_tensor - size_t batch_size_per_gpu = solver_.batchsize / num_total_gpus; - size_t eval_batch_size_per_gpu = solver_.batchsize_eval / num_total_gpus; - if (ebc_param.output_layout_ == embedding::EmbeddingLayout::FeatureMajor) { - std::vector top_name_list; - for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { - embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; - int emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) - ? 
lookup_param.max_hotness * lookup_param.ev_size - : lookup_param.ev_size; - - std::string top_name = ebc_config.top_names_[lookup_id]; - top_name_list.push_back(top_name); - - activate_tensor(tensor_active_, top_name); - tensor_shape_info_raw_.insert({top_name, {solver_.batchsize, 1, emb_out_dims}}); - embedding_dependent_tensors_.insert(top_name); - } - input_output_info_.push_back(std::make_pair(bottom_name, join(top_name_list, ","))); - if (solver_.use_mixed_precision) { - allocate_ebc_output_helper_for_feature_major<__half>( - resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, - train_ebc_outptut_); - allocate_ebc_output_helper_for_feature_major<__half>( - resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, - evaluate_tensor_entities_list_, evaluate_ebc_outptut_); - } else { - allocate_ebc_output_helper_for_feature_major( - resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, - train_ebc_outptut_); - allocate_ebc_output_helper_for_feature_major( - resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, - evaluate_tensor_entities_list_, evaluate_ebc_outptut_); - } - } else { - int concate_out_dims = 0; - for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { - embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; - - int emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) - ? lookup_param.max_hotness * lookup_param.ev_size - : lookup_param.ev_size; - concate_out_dims += emb_out_dims; - } - - activate_tensor(tensor_active_, ebc_config.batch_major_output_name_); - tensor_shape_info_raw_.insert( - {ebc_config.batch_major_output_name_, {solver_.batchsize, concate_out_dims}}); - input_output_info_.push_back(std::make_pair(bottom_name, ebc_config.batch_major_output_name_)); - embedding_dependent_tensors_.insert(ebc_config.batch_major_output_name_); - - // allocate output buffer - if (solver_.use_mixed_precision) { - allocate_ebc_output_helper_for_batch_major<__half>( - resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, - train_ebc_outptut_); - allocate_ebc_output_helper_for_batch_major<__half>( - resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, - evaluate_tensor_entities_list_, evaluate_ebc_outptut_); - } else { - allocate_ebc_output_helper_for_batch_major( - resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, - train_ebc_outptut_); - allocate_ebc_output_helper_for_batch_major( - resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, - evaluate_tensor_entities_list_, evaluate_ebc_outptut_); - } - } - - train_ddl_output_.clear(); - cache_train_ddl_output_.clear(); - evaluate_ddl_output_.clear(); - cache_evaluate_ddl_output_.clear(); - for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { - train_ddl_output_.push_back( - allocate_output_for_data_distributor(core_list[local_gpu_id], ebc_param)); - if (solver_.train_inter_iteration_overlap) { - cache_train_ddl_output_.push_back( - allocate_output_for_data_distributor(core_list[local_gpu_id], ebc_param)); - } - evaluate_ddl_output_.push_back( - allocate_output_for_data_distributor(core_list[local_gpu_id], eval_ebc_param)); - if (solver_.eval_inter_iteration_overlap) { - cache_evaluate_ddl_output_.push_back( - allocate_output_for_data_distributor(core_list[local_gpu_id], eval_ebc_param)); - } - } - - // create data distributors - train_data_distributor_ = 
std::make_shared(core_list, ebc_param, emb_table_list, - ebc_config.dr_lookup_ids_); - eval_data_distributor_ = std::make_shared( - core_list, eval_ebc_param, emb_table_list, ebc_config.dr_lookup_ids_); -} - -void Model::pre_add_dense_layer(DenseLayer& dense_layer) { - embedding_dependent_ = false; - for (auto& bottom_name : dense_layer.bottom_names) { - deactivate_tensor(tensor_active_, bottom_name); - if (embedding_dependent_tensors_.find(bottom_name) != embedding_dependent_tensors_.end()) { - embedding_dependent_ = true; - } - } - for (auto& top_name : dense_layer.top_names) { - activate_tensor(tensor_active_, top_name); - if (embedding_dependent_) { - embedding_dependent_tensors_.insert(top_name); - } - } - std::string input_names = join(dense_layer.bottom_names, ","); - std::string output_names = join(dense_layer.top_names, ","); - input_output_info_.push_back(std::make_pair(input_names, output_names)); - if (solver_.use_mixed_precision) { - layer_info_.push_back(LAYER_TYPE_TO_STRING_MP[dense_layer.layer_type]); - } else { - layer_info_.push_back(LAYER_TYPE_TO_STRING[dense_layer.layer_type]); - } -} - -void Model::graph_analysis() { - HCTR_LOG(INFO, ROOT, "Graph analysis to resolve tensor dependency\n"); - std::map tensor_usage; - std::map tensor_slice_layer; - std::map tensor_slice_index; - for (auto& dense_layer : dense_layer_params_raw_) { - for (auto& bottom_name : dense_layer.bottom_names) { - analyze_tensor(tensor_usage, bottom_name); - } - } - for (auto iter = tensor_usage.begin(); iter != tensor_usage.end(); iter++) { - if (iter->second > 5) { - HCTR_OWN_THROW(Error_t::WrongInput, "The graph should not include more than 5-way branches"); - } - if (iter->second > 1) { - std::vector bottom_names{iter->first}; - std::vector top_names; - std::vector> ranges; - for (unsigned int i = 0; i < iter->second; i++) { - top_names.push_back(iter->first + "_slice" + std::to_string(i)); - auto dims = tensor_shape_info_raw_[iter->first].size(); - ranges.emplace_back(std::make_pair(0, tensor_shape_info_raw_[iter->first][dims - 1])); - } - DenseLayer slice_layer(Layer_t::Slice, bottom_names, top_names); - slice_layer.ranges = ranges; - tensor_slice_layer.insert(std::pair(iter->first, slice_layer)); - tensor_slice_index.insert(std::pair(iter->first, 0)); - HCTR_LOG(INFO, ROOT, "Add Slice layer for tensor: %s, creating %d copies\n", - iter->first.c_str(), iter->second); - } - } - for (auto& dense_layer : dense_layer_params_raw_) { - bool flag = true; - for (auto& bottom_name : dense_layer.bottom_names) { - if (tensor_usage[bottom_name] > 1) { - flag = false; - break; - } - } - if (flag) { - dense_layer_params_.push_back(dense_layer); - } else { - DenseLayer new_dense_layer = dense_layer; - for (unsigned int i = 0; i < new_dense_layer.bottom_names.size(); i++) { - std::string old_bottom_name = new_dense_layer.bottom_names[i]; - if (tensor_slice_index.find(old_bottom_name) != tensor_slice_index.end()) { - auto iter = tensor_slice_layer.find(old_bottom_name); - if (tensor_slice_index[old_bottom_name] == 0) { - dense_layer_params_.push_back(iter->second); - } - std::string new_bottom_name = iter->second.top_names[tensor_slice_index[old_bottom_name]]; - tensor_slice_index[old_bottom_name] += 1; - new_dense_layer.bottom_names[i] = new_bottom_name; - } - } - dense_layer_params_.push_back(new_dense_layer); - } - } - add_dense_layers(dense_layer_params_); -} - -void Model::compile() { - if (!graph_finalized_) { - graph_analysis(); - graph_finalized_ = true; - } - if (data_input_info_.size() < 3 || 
layer_info_.size() < 2) { - HCTR_OWN_THROW(Error_t::IllegalCall, "The model should include input and at least two layers"); - } - HCTR_PRINT(INFO, - "===================================================Model " - "Compile===================================================\n"); - build_networks(); - - // TODO: this is a WAR; need to find a way to remove the preallocation - for (int local_gpu_id = 0; local_gpu_id < resource_manager_->get_local_gpu_count(); - ++local_gpu_id) { - auto device_id = resource_manager_->get_local_gpu(local_gpu_id)->get_device_id(); - core23::Device device(core23::DeviceType::GPU, device_id); - bool success = core23::AllocateBuffers(device); - if (!success) { - HCTR_LOG_S(DEBUG, ROOT) << "Nothing to preallocate" << std::endl; - } - } - core23::Device device_h(core23::DeviceType::CPU); - bool success = core23::AllocateBuffers(device_h); - if (!success) { - HCTR_LOG_S(DEBUG, ROOT) << "Nothing to preallocate" << std::endl; - } - initialize(); - create_metrics(); - create_pipelines(); -} - -void Model::compile(std::vector& label_names, std::vector& label_weights) { - update_label_weights(label_names, label_weights); - compile(); -} - -void Model::update_label_weights(std::vector& label_names, - std::vector& label_weights) { - // Add implementation and support in next merge request - if (label_names.size() != label_weights.size()) { - HCTR_OWN_THROW(Error_t::WrongInput, "Must have the same number of label names and weights"); - } - std::map::iterator loss_lookup; - for (size_t i = 0; i < label_names.size(); ++i) { - loss_lookup = label_weights_.find(label_names[i]); - if (loss_lookup == label_weights_.end()) { - HCTR_OWN_THROW(Error_t::WrongInput, "Label name not found: " + label_names[i]); - } - loss_lookup->second = label_weights[i]; - } -} - void Model::load_dense_optimizer_states(const std::string& dense_opt_states_file) { if (!buff_allocated_) { HCTR_OWN_THROW(Error_t::IllegalCall, @@ -1448,76 +664,6 @@ void Model::embedding_dump(const std::string& path, const std::vector layer_type{layer_info_[i]}; - std::vector input_names; - std::vector output_names; - split(input_output_info_[i].first, ',', input_names); - split(input_output_info_[i].second, ',', output_names); - size_t lines = - input_names.size() > output_names.size() ? 
input_names.size() : output_names.size(); - layer_type.insert(layer_type.end(), lines - 1, ""); - if (lines > input_names.size()) { - input_names.insert(input_names.end(), lines - input_names.size(), ""); - } - if (lines > output_names.size()) { - output_names.insert(output_names.end(), lines - output_names.size(), ""); - } - for (size_t j = 0; j < lines; j++) { - log << std::left << std::setw(40) << std::setfill(' ') << layer_type[j] << std::left - << std::setw(30) << std::setfill(' ') << input_names[j] << std::left << std::setw(30) - << std::setfill(' ') << output_names[j] << std::left << std::setw(30) << std::setfill(' ') - << get_tensor_shape(output_names[j], tensor_shape_info_) << std::endl; - } - log << "----------------------------------------------" - "-----------------------------------" - "---------------------------------" - << std::endl; - } -} - void Model::set_source(std::string source, std::string eval_source) { if (solver_.repeat_dataset) { HCTR_OWN_THROW(Error_t::IllegalCall, @@ -1863,13 +1009,6 @@ void Model::fit(int num_epochs, int max_iter, int display, int eval_interval, in } // end if else high_level_eval_ = false; } -void Model::exchange_wgrad(size_t device_id) { - auto& gpu_resource = resource_manager_->get_local_gpu(device_id); - CudaCPUDeviceContext context(gpu_resource->get_device_id()); - if (resource_manager_->get_global_gpu_count() > 1) { - exchange_wgrad_->allreduce(device_id, gpu_resource->get_stream()); - } -} bool Model::skip_prefetch_in_last_batch(bool is_train) { bool inter_overlap = @@ -1924,9 +1063,6 @@ bool Model::train() { // a file list source, set "num_workers" to a dvisior // of the number of data files in the file list. We // will look into some alternatives in the long term. - if (is_scheduled_datareader() and is_scheduled_embedding()) { - graph_scheduler_->trickling(); - } const char* const skip_h2d_env = std::getenv("SKIP_H2D"); bool skip_h2d = (skip_h2d_env != nullptr && 1 == std::atoi(skip_h2d_env)); @@ -1956,11 +1092,6 @@ bool Model::train() { return true; } - if (is_scheduled_datareader() && is_scheduled_embedding()) { - train_pipeline(current_batchsize); - return true; - } - auto network_update = [&](int id) { networks_[id]->update_params(); }; for (auto& one_embedding : embeddings_) { @@ -2036,26 +1167,22 @@ bool Model::eval() { return true; } - if (is_scheduled_datareader() && is_scheduled_embedding()) { - evaluate_pipeline(current_batchsize); - } else { - for (size_t i = 0; i < embeddings_.size(); ++i) { - auto& one_embedding = embeddings_.at(i); - one_embedding->forward(false); - } + for (size_t i = 0; i < embeddings_.size(); ++i) { + auto& one_embedding = embeddings_.at(i); + one_embedding->forward(false); + } #pragma omp parallel num_threads(number_of_networks()) - { - size_t id = omp_get_thread_num(); - auto gpu = resource_manager_->get_local_gpu(id); + { + size_t id = omp_get_thread_num(); + auto gpu = resource_manager_->get_local_gpu(id); - // doesn't do anything if eval_overlap disabled - graph_.evaluate_pipeline_[id].run(); - } + // doesn't do anything if eval_overlap disabled + graph_.evaluate_pipeline_[id].run(); + } - for (auto& metric : metrics_) { - metric->global_reduce(number_of_networks()); - } + for (auto& metric : metrics_) { + metric->global_reduce(number_of_networks()); } #endif @@ -2376,254 +1503,6 @@ void Model::check_out_tensor(Tensor_t tensor_type, int index, float* global_resu } } -void Model::create_networks() { - for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - 
networks_.emplace_back(new Network(resource_manager_->get_local_cpu(), - resource_manager_->get_local_gpu(i), - solver_.use_mixed_precision)); - } - train_tensor_entities_list_.resize(resource_manager_->get_local_gpu_count()); - evaluate_tensor_entities_list_.resize(resource_manager_->get_local_gpu_count()); -} - -void Model::build_networks() { - for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { - networks_[i]->create_and_set_optimizer(opt_params_); - } - auto aligned_size = 16 * resource_manager_->get_local_gpu_count(); - core23::BufferParams bp{.channel = solver_.use_mixed_precision ? GetWgradHalfBufferChannel() - : GetWgradBufferChannel()}; - for (int g = 0; g < resource_manager_->get_local_gpu_count(); g++) { - auto device_id = resource_manager_->get_local_gpu(g)->get_device_id(); - core23::Device device(core23::DeviceType::GPU, device_id); - auto wgrad_buffer = core23::GetBuffer(bp, device); - auto wgrad_size = wgrad_buffer->reserved_size(); - size_t padded_bytes = wgrad_size % aligned_size; - padded_bytes += aligned_size - padded_bytes; - // alignment requirements from grouped allreduce. - wgrad_tensor_successor_.emplace_back(core23::TensorParams() - .device(device) - .shape({static_cast(padded_bytes)}) - .data_type(core23::ScalarType::Char) - .buffer_params(bp)); - } - buff_allocated_ = true; -} - -void Model::initialize() { -#ifndef DATA_READING_TEST - -#pragma omp parallel num_threads(number_of_networks()) - { - size_t id = omp_get_thread_num(); - networks_[id]->initialize(); - if (solver_.use_algorithm_search) { - networks_[id]->search_algorithm(); - } - HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(id)->get_stream())); - } - - int num_gpus = resource_manager_->get_local_gpu_count(); - std::vector wgrad_buffer_ptrs; - size_t wgrad_buffer_size{}; - core23::BufferParams bp{.channel = solver_.use_mixed_precision ? 
GetWgradHalfBufferChannel() - : GetWgradBufferChannel()}; - for (int g = 0; g < num_gpus; g++) { - auto device_id = resource_manager_->get_local_gpu(g)->get_device_id(); - core23::Device device(core23::DeviceType::GPU, device_id); - auto wgrad_buffer = core23::GetBuffer(bp, device); - auto [ptr_, size_] = wgrad_buffer->decay(); - wgrad_buffer_size = size_; - HCTR_CHECK_HINT(size_ && ptr_, "wgrad is null or it's a confederal buffer"); - wgrad_buffer_ptrs.push_back(ptr_); - } - exchange_wgrad_->init_ar_comm(wgrad_buffer_ptrs, wgrad_buffer_size); -#endif - init_params_for_dense_(); - if (solver_.perf_logging) { - for (size_t i = 0; i < dense_layer_params_.size(); i++) { - bool is_trainable = - TRAINABLE_LAYERS.find(dense_layer_params_[i].layer_type) != TRAINABLE_LAYERS.end(); - if (is_trainable) { - std::string output_names = join(dense_layer_params_[i].top_names, "-"); - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "weights_initialization", output_names); - } - } - } - init_params_for_sparse_(); -} // namespace HugeCTR -void Model::create_metrics() { - int num_total_gpus = resource_manager_->get_global_gpu_count(); - int label_dim = input_params_[0].labels_.begin()->second; - if (input_params_[0].labels_.size() > 1) { - auto labs = input_params_[0].labels_; - label_dim = std::accumulate(std::begin(labs), std::end(labs), 0, - [](const int previous, const std::pair& p) { - return previous + p.second; - }); - } - - auto num_metrics = [&]() { return networks_[0]->get_raw_metrics_all().size(); }; - for (const auto& metric : solver_.metrics_spec) { - // Only AUC is currently supported for models with more than one loss layer - if ((metric.first != metrics::Type::AUC) && num_metrics() > 1) { - HCTR_OWN_THROW(Error_t::WrongInput, - "Metrics besides AUC are not supported for multi-task models."); - } - - metrics_.emplace_back(std::move(metrics::Metric::Create( - metric.first, solver_.use_mixed_precision, solver_.batchsize_eval / num_total_gpus, - solver_.max_eval_batches, label_dim, resource_manager_))); - } -} - -void Model::create_pipelines() { - // TODO: currently it is only for HE - if (embeddings_.size() == 1) { - auto lr_scheds = embeddings_[0]->get_learning_rate_schedulers(); - for (size_t i = 0; i < lr_scheds.size(); i++) { - networks_[i]->set_learning_rate_scheduler(lr_scheds[i]); - } - } - - if (is_scheduled_datareader() && is_scheduled_embedding()) { - // will create pipeline for sparse embedding and dense network - create_train_pipeline(networks_); - create_evaluate_pipeline(networks_); - } else { - if (solver_.use_embedding_collection) { - create_train_pipeline_with_ebc(networks_); - create_evaluate_pipeline_with_ebc(networks_); - } else { - // will create pipeline for dense network. 
- create_train_network_pipeline(networks_); - create_eval_network_pipeline(networks_); - } - } - - size_t embed_wgrad_size = 0; - if (!reader_params_.async_param.multi_hot_reader) { - auto train_data_reader_ar_i64 = dynamic_cast*>(train_data_reader_.get()); - auto eval_data_reader_ar_i64 = - dynamic_cast*>(evaluate_data_reader_.get()); - auto init_data_reader_ar_i64 = dynamic_cast*>(init_data_reader_.get()); - - auto train_data_reader_ar_i32 = - dynamic_cast*>(train_data_reader_.get()); - auto eval_data_reader_ar_i32 = - dynamic_cast*>(evaluate_data_reader_.get()); - auto init_data_reader_ar_i32 = - dynamic_cast*>(init_data_reader_.get()); - - // FIXME: - // If doing async indices, the Hybrid Sparse Embedding needs access to the sparse tensor buffers - // since we need to initialize the Frequent & Infrequent indices with those exact buffers. - // Otherwise we allocate two copies (one in AsyncReader and the other in HSE) which will cause - // us to OOM. We need to refactor the Frequent/Infrequent Embedding and IndicesView classes to - // not require the sparse tensor buffers on construction. - for (size_t i = 0; i < sparse_embedding_params_.size(); i++) { - if (sparse_embedding_params_[i].embedding_type == Embedding_t::HybridSparseEmbedding) { - if (solver_.use_mixed_precision && solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - if (solver_.train_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(true, train_data_reader_ar_i64); - } - if (solver_.eval_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(false, eval_data_reader_ar_i64); - } - } else if (solver_.use_mixed_precision && !solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - if (solver_.train_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(true, train_data_reader_ar_i32); - } - if (solver_.eval_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(false, eval_data_reader_ar_i32); - } - } else if (!solver_.use_mixed_precision && solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - if (solver_.train_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(true, train_data_reader_ar_i64); - } - if (solver_.eval_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(false, eval_data_reader_ar_i64); - } - } else { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - if (solver_.train_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(true, train_data_reader_ar_i32); - } - if (solver_.eval_inter_iteration_overlap) { - hybrid_embedding->setup_buffered_indices(false, eval_data_reader_ar_i32); - } - } - } - } - - // start to touch dataset, so we can record run_start - if (solver_.perf_logging) { - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "init_stop"); - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "run_start"); - } - - if (init_data_reader_ar_i32) { - init_data_reader_ar_i32->start(); - init_data_reader_ar_i32->read_a_batch_to_device(); - } - if (init_data_reader_ar_i64) { - init_data_reader_ar_i64->start(); - init_data_reader_ar_i64->read_a_batch_to_device(); - } - - for (size_t i = 0; i < sparse_embedding_params_.size(); i++) { - if (sparse_embedding_params_[i].embedding_type == Embedding_t::HybridSparseEmbedding) { - if (solver_.use_mixed_precision && solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - 
hybrid_embedding->init_model(init_data_reader_ar_i64->get_value_tensors(), - embed_wgrad_size); - } else if (solver_.use_mixed_precision && !solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - hybrid_embedding->init_model(init_data_reader_ar_i32->get_value_tensors(), - embed_wgrad_size); - } else if (!solver_.use_mixed_precision && solver_.i64_input_key) { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - hybrid_embedding->init_model(init_data_reader_ar_i64->get_value_tensors(), - embed_wgrad_size); - } else { - auto hybrid_embedding = - dynamic_cast*>(embeddings_[i].get()); - hybrid_embedding->init_model(init_data_reader_ar_i32->get_value_tensors(), - embed_wgrad_size); - } - } - } - } else { - if (solver_.perf_logging) { - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "init_stop"); - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "run_start"); - } - } - - if (solver_.perf_logging) { - for (size_t i = 0; i < sparse_embedding_params_.size(); i++) { - HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "weights_initialization", - sparse_embedding_params_[i].sparse_embedding_name); - } - } - -#ifdef ENABLE_MPI - if (resource_manager_->get_num_process() > 1) { - resource_manager_->set_ready_to_transfer(); - } -#endif -} - size_t Model::number_of_networks() const { return networks_.size(); } } // namespace HugeCTR diff --git a/HugeCTR/src/pybind/model_compile.cpp b/HugeCTR/src/pybind/model_compile.cpp new file mode 100644 index 0000000000..99600f3240 --- /dev/null +++ b/HugeCTR/src/pybind/model_compile.cpp @@ -0,0 +1,977 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace HugeCTR::MultiHot; + +namespace HugeCTR { +namespace { + +static std::string join(std::vector& strs, std::string delim) { + std::string str; + const std::vector::iterator itlast = strs.end() - 1; + for (auto it = strs.begin(); it != strs.end(); it++) { + str += *it; + if (it != itlast) { + str += delim; + } + } + return str; +} + +static std::vector& split(const std::string& s, char delim, + std::vector& elems) { + std::istringstream is(s); + std::string item; + while (std::getline(is, item, delim)) { + elems.push_back(item); + } + return elems; +} + +static std::string get_tensor_shape(std::string tensor_name, + std::map> tensor_shape_info) { + std::string shape = ""; + if (tensor_shape_info.find(tensor_name) != tensor_shape_info.end()) { + shape += "("; + for (unsigned int i = 0; i < tensor_shape_info[tensor_name].size(); i++) { + shape += std::to_string(tensor_shape_info[tensor_name][i]); + shape += ","; + } + shape.back() = ')'; + } + return shape; +} + +static std::string get_tensor_shape(std::string tensor_name, + std::map tensor_shape_info) { + std::stringstream ss; + if (tensor_shape_info.find(tensor_name) != tensor_shape_info.end()) { + ss << tensor_shape_info[tensor_name]; + } + return ss.str(); +} + +} // namespace + +void Model::add(Input& input) { + std::string label_name = input.labels_.begin()->first; + int label_dim = input.labels_.begin()->second; + + // If multiple labels, treat them as 1 big label and add a split layer (below) + if (input.labels_.size() > 1) { + label_name = "combined_multi_label"; + label_dim = std::accumulate(std::begin(input.labels_), std::end(input.labels_), 0, + [](const int previous, const std::pair& p) { + return previous + p.second; + }); + } + + input_params_.push_back(input); + activate_tensor(tensor_active_, label_name); + activate_tensor(tensor_active_, input.dense_name); + data_input_info_.push_back(label_name); + data_input_info_.push_back(input.dense_name); + tensor_shape_info_raw_.insert( + std::make_pair(label_name, std::vector{solver_.batchsize, label_dim})); + tensor_shape_info_raw_.insert( + std::make_pair(input.dense_name, std::vector{solver_.batchsize, input.dense_dim})); + if (solver_.use_embedding_collection) { + std::vector top_name_list; + std::vector nnz_per_slot; + bool is_fixed_length = true; + int num_slot = 0; + for (size_t i = 0; i < input.data_reader_sparse_param_array.size(); ++i) { + auto& p = input.data_reader_sparse_param_array[i]; + top_name_list.push_back(p.top_name); + if (p.slot_num != 1) { + HCTR_OWN_THROW( + Error_t::WrongInput, + "To use embedding collection, slots_num should be set to 1 in each sparse_param. 
" + "Please refer to notebooks/embedding_collection.ipynb and separate your multi-slot " + "output into multiple single-slot output"); + } + nnz_per_slot.push_back(p.nnz_per_slot[0]); + if (!p.is_fixed_length) is_fixed_length = false; + num_slot += 1; + hotness_map_.insert({p.top_name, p.max_feature_num}); + } + std::string concat_top_name = join(top_name_list, ","); + DataReaderSparseParam concat_data_reader_sparse_param{concat_top_name, nnz_per_slot, + is_fixed_length, num_slot}; + input.data_reader_sparse_param_array = {concat_data_reader_sparse_param}; + } + std::vector sparse_names; + for (size_t i = 0; i < input.data_reader_sparse_param_array.size(); ++i) { + sparse_names.push_back(input.data_reader_sparse_param_array[i].top_name); + tensor_shape_info_raw_.insert(std::make_pair( + input.data_reader_sparse_param_array[i].top_name, + std::vector{solver_.batchsize, input.data_reader_sparse_param_array[i].slot_num})); + } + data_input_info_.push_back(join(sparse_names, ",")); + for (unsigned int i = 0; i < input.data_reader_sparse_param_array.size(); i++) { + activate_tensor(tensor_active_, input.data_reader_sparse_param_array[i].top_name); + } + if (solver_.i64_input_key) { + add_input(input, reader_params_, sparse_input_map_64_, train_tensor_entities_list_, + evaluate_tensor_entities_list_, train_data_reader_, evaluate_data_reader_, + solver_.batchsize, solver_.batchsize_eval, solver_.use_mixed_precision, + solver_.repeat_dataset, solver_.train_intra_iteration_overlap, + solver_.num_iterations_statistics, resource_manager_); + } else { + add_input(input, reader_params_, sparse_input_map_32_, + train_tensor_entities_list_, evaluate_tensor_entities_list_, + train_data_reader_, evaluate_data_reader_, solver_.batchsize, + solver_.batchsize_eval, solver_.use_mixed_precision, + solver_.repeat_dataset, solver_.train_intra_iteration_overlap, + solver_.num_iterations_statistics, resource_manager_); + } + + if (solver_.use_embedding_collection and solver_.train_inter_iteration_overlap) { + create_copy_ops_for_network_input(input.dense_name, label_name, true); + } + if (solver_.use_embedding_collection and solver_.eval_inter_iteration_overlap) { + create_copy_ops_for_network_input(input.dense_name, label_name, false); + } + + // Add label weights to model + for (std::map::iterator iter = input.label_weights_.begin(); + iter != input.label_weights_.end(); ++iter) { + label_weights_.insert(std::make_pair(iter->first, iter->second)); + } + + // If multiple labels provided, add a Slice layer to handle breaking up the label + if (input.labels_.size() > 1) { + std::vector label_names; + std::vector> ranges; + int idx = 0; + + for (std::map::iterator iter = input.labels_.begin(); + iter != input.labels_.end(); ++iter) { + label_names.push_back(iter->first); + if (iter->second < 1) { + HCTR_OWN_THROW(Error_t::WrongInput, "Each label dimension must be at lesat 1."); + } + ranges.push_back(std::make_pair(idx, idx + iter->second)); + idx += iter->second; + } + std::vector bottom_name{"combined_multi_label"}; + DenseLayer label_slice_layer = DenseLayer(Layer_t::Slice, bottom_name, label_names); + label_slice_layer.ranges = ranges; + + add(label_slice_layer); + } +} + +void Model::add(SparseEmbedding& sparse_embedding) { + OptParams embedding_opt_params; + if (!(sparse_embedding.embedding_opt_params)->initialized) { + sparse_embedding.embedding_opt_params = opt_params_py_; + sparse_embedding.initialize_max_vocabulary_size_per_gpu(); + } + sparse_embedding.max_vocabulary_size_global = + 
sparse_embedding.max_vocabulary_size_per_gpu * resource_manager_->get_global_gpu_count(); + sparse_embedding_params_.push_back(sparse_embedding); + deactivate_tensor(tensor_active_, sparse_embedding.bottom_name); + activate_tensor(tensor_active_, sparse_embedding.sparse_embedding_name); + int slot_num = tensor_shape_info_raw_[sparse_embedding.bottom_name][1]; + tensor_shape_info_raw_.insert( + std::make_pair(sparse_embedding.sparse_embedding_name, + std::vector{solver_.batchsize, slot_num, + static_cast(sparse_embedding.embedding_vec_size)})); + input_output_info_.push_back( + std::make_pair(sparse_embedding.bottom_name, sparse_embedding.sparse_embedding_name)); + layer_info_.push_back(EMBEDDING_TYPE_TO_STRING[sparse_embedding.embedding_type]); + + embedding_opt_params_list_.push_back(sparse_embedding.embedding_opt_params); + init_optimizer_params(embedding_opt_params, solver_, sparse_embedding.embedding_opt_params); + if (solver_.i64_input_key && !solver_.use_mixed_precision) { + add_sparse_embedding( + sparse_embedding, sparse_input_map_64_, train_tensor_entities_list_, + evaluate_tensor_entities_list_, embeddings_, resource_manager_, collective_manager_, + solver_.batchsize, solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, + solver_.use_cuda_graph, solver_.grouped_all_reduce, solver_.num_iterations_statistics, + gpu_lr_sches_); + } else if (solver_.i64_input_key && solver_.use_mixed_precision) { + add_sparse_embedding( + sparse_embedding, sparse_input_map_64_, train_tensor_entities_list_, + evaluate_tensor_entities_list_, embeddings_, resource_manager_, collective_manager_, + solver_.batchsize, solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, + solver_.use_cuda_graph, solver_.grouped_all_reduce, solver_.num_iterations_statistics, + gpu_lr_sches_); + } else if (!solver_.i64_input_key && !solver_.use_mixed_precision) { + add_sparse_embedding( + sparse_embedding, sparse_input_map_32_, train_tensor_entities_list_, + evaluate_tensor_entities_list_, embeddings_, resource_manager_, collective_manager_, + solver_.batchsize, solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, + solver_.use_cuda_graph, solver_.grouped_all_reduce, solver_.num_iterations_statistics, + gpu_lr_sches_); + } else { + add_sparse_embedding( + sparse_embedding, sparse_input_map_32_, train_tensor_entities_list_, + evaluate_tensor_entities_list_, embeddings_, resource_manager_, collective_manager_, + solver_.batchsize, solver_.batchsize_eval, embedding_opt_params, exchange_wgrad_, + solver_.use_cuda_graph, solver_.grouped_all_reduce, solver_.num_iterations_statistics, + gpu_lr_sches_); + } + embeddings_map_.insert( + std::make_pair(sparse_embedding.sparse_embedding_name, embeddings_.back())); + embedding_dependent_tensors_.insert(sparse_embedding.sparse_embedding_name); +} + +void Model::add(DenseLayer& dense_layer) { + for (auto& top_name : dense_layer.top_names) { + if (tensor_shape_info_raw_.find(top_name) != tensor_shape_info_raw_.end()) { + HCTR_OWN_THROW(Error_t::WrongInput, top_name + ", top tensor name already exists"); + } + } + for (auto& bottom_name : dense_layer.bottom_names) { + if (tensor_shape_info_raw_.find(bottom_name) == tensor_shape_info_raw_.end()) { + HCTR_OWN_THROW(Error_t::WrongInput, bottom_name + ", bottom tensor name does not exists"); + } + } + calculate_tensor_dimensions(tensor_shape_info_raw_, dense_layer); + dense_layer_params_raw_.push_back(dense_layer); +} + +template +void allocate_ebc_output_helper_for_feature_major( + std::shared_ptr 
resource_manager_, size_t batch_size_per_gpu, + const EmbeddingCollectionConfig& ebc_config, + const embedding::EmbeddingCollectionParam& ebc_param, + std::vector>& tensor_entries_list_, + std::vector& ebc_output) { + HCTR_CHECK(ebc_config.output_layout_ == embedding::EmbeddingLayout::FeatureMajor); + int num_local_gpus = resource_manager_->get_local_gpu_count(); + for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { + CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); + core23::Device device(core23::DeviceType::GPU, + resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); + auto buffer_channel = core23::GetRandomBufferChannel(); + core23::Tensor head_tensor; + core23::BufferParams buffer_param{.channel = buffer_channel}; + core23::TensorParams tensor_param = core23::TensorParams().buffer_params(buffer_param); + int64_t concat_dims = 0; + for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { + const embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; + std::string top_name = ebc_config.top_names_[lookup_id]; + int64_t emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) + ? lookup_param.max_hotness * lookup_param.ev_size + : lookup_param.ev_size; + + core23::Tensor tmp_tensor(tensor_param.shape({(int64_t)batch_size_per_gpu, 1ll, emb_out_dims}) + .device(device) + .data_type(core23::ToScalarType::value)); + concat_dims += emb_out_dims; + tensor_entries_list_[local_gpu_id].push_back({top_name, tmp_tensor}); + if (!lookup_id) { + head_tensor = tmp_tensor; + } + } + // allocate + void* starting_address = head_tensor.data(); + core23::Tensor continous_emb_output = core23::Tensor::bind( + starting_address, core23::Shape({static_cast(batch_size_per_gpu), concat_dims}), + core23::ToScalarType::value, device); + ebc_output.push_back(continous_emb_output); + } +} + +template +void allocate_ebc_output_helper_for_batch_major( + std::shared_ptr resource_manager_, size_t batch_size_per_gpu, + const EmbeddingCollectionConfig& ebc_config, + const embedding::EmbeddingCollectionParam& ebc_param, + std::vector>& tensor_entries_list_, + std::vector& ebc_output) { + int num_local_gpus = resource_manager_->get_local_gpu_count(); + for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { + CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); + + core23::Device device(core23::DeviceType::GPU, + resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); + core23::TensorParams tensor_param; + int64_t emb_out_dims = 0; + for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { + const embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; + + emb_out_dims += (lookup_param.combiner == embedding::Combiner::Concat) + ? 
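+          // Editor note (descriptive comment): for Combiner::Concat every one of the max_hotness
+          // slots keeps its own ev_size-wide embedding vector, so the lookup contributes
+          // max_hotness * ev_size output columns; all other combiners contribute a single
+          // ev_size-wide vector.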
lookup_param.max_hotness * lookup_param.ev_size + : lookup_param.ev_size; + } + + core23::Tensor continous_emb_output( + tensor_param.shape({(int64_t)batch_size_per_gpu, emb_out_dims}) + .device(device) + .data_type(core23::ToScalarType::value)); + continous_emb_output.data(); + ebc_output.push_back(continous_emb_output); + + tensor_entries_list_[local_gpu_id].push_back( + {ebc_config.batch_major_output_name_, continous_emb_output}); + } +} + +void Model::add(const EmbeddingCollectionConfig& user_ebc_config) { + auto ebc_config = split_column_wise_sharding_config(user_ebc_config); + TableNameToIDDict table_name_to_id_dict = + create_table_name_to_id_dict_from_ebc_config(ebc_config); + int global_ebc_id = static_cast(ebc_list_.size()); + for (auto& [name, id] : table_name_to_id_dict) { + HCTR_CHECK_HINT(ebc_name_to_global_id_dict_.find(name) == ebc_name_to_global_id_dict_.end(), + "Duplicate table name: ", name, "\n"); + ebc_name_to_global_id_dict_[name] = {global_ebc_id, id}; + } + int num_total_gpus = resource_manager_->get_global_gpu_count(); + int num_local_gpus = resource_manager_->get_local_gpu_count(); + + int num_lookup = ebc_config.lookup_configs_.size(); + core23::DataType key_type = + solver_.i64_input_key ? core23::ScalarType::Int64 : core23::ScalarType::UInt32; + core23::DataType index_type = + solver_.i64_input_key ? core23::ScalarType::UInt64 : core23::ScalarType::UInt32; + core23::DataType offset_type = + solver_.i64_input_key ? core23::ScalarType::Int64 : core23::ScalarType::UInt32; + core23::DataType emb_type = + solver_.use_mixed_precision ? core23::ScalarType::Half : core23::ScalarType::Float; + core23::DataType wgrad_type = + solver_.use_mixed_precision ? core23::ScalarType::Half : core23::ScalarType::Float; + embedding::EmbeddingLayout input_layout_ = + reader_params_.data_reader_type == DataReaderType_t::RawAsync + ? 
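+          // Editor note (descriptive comment): the input layout follows the data reader type,
+          // RawAsync is treated as feature-major, every other reader type as batch-major.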
embedding::EmbeddingLayout::FeatureMajor + : embedding::EmbeddingLayout::BatchMajor; + + std::vector bottom_name_list; + for (auto& bottom_name : ebc_config.bottom_names_) { + bottom_name_list.push_back(bottom_name); + } + + std::string bottom_name = join(bottom_name_list, ","); + deactivate_tensor(tensor_active_, bottom_name); + + layer_info_.push_back("EmbeddingCollection" + std::to_string(ebc_list_.size())); + + auto lookup_params = create_lookup_params_from_ebc_config(table_name_to_id_dict, ebc_config); + for (int lookup_id = 0; lookup_id < num_lookup; ++lookup_id) { + auto b_name = ebc_config.bottom_names_[ebc_config.dr_lookup_ids_[lookup_id]]; + lookup_params[lookup_id].max_hotness = hotness_map_[b_name]; + } + + auto shard_matrix = create_shard_matrix_from_ebc_config(table_name_to_id_dict, ebc_config); + + auto grouped_emb_params = + create_grouped_embedding_param_from_ebc_config(table_name_to_id_dict, ebc_config); + + int num_table = ebc_config.emb_table_config_list_.size(); + auto emb_table_list = create_table_params_from_ebc_config(table_name_to_id_dict, ebc_config); + for (auto& p : emb_table_list) { + if (p.opt_param.optimizer == Optimizer_t::NOT_INITIALIZED) { + p.opt_param = opt_params_; + } + } + + embedding::AllreduceStrategy allreduce_strategy = ebc_config.allreduce_strategy_; + if (solver_.grouped_all_reduce) { + allreduce_strategy = embedding::AllreduceStrategy::GroupDense; + } + + auto compression_param = + create_compression_param_from_ebc_config(table_name_to_id_dict, ebc_config); + embedding::EmbeddingCollectionParam ebc_param{num_table, + num_lookup, + lookup_params, + shard_matrix, + grouped_emb_params, + solver_.batchsize, + key_type, + index_type, + offset_type, + emb_type, + wgrad_type, + input_layout_, + ebc_config.output_layout_, + ebc_config.sort_strategy_, + ebc_config.keys_preprocess_strategy_, + allreduce_strategy, + ebc_config.comm_strategy_, + compression_param}; + + embedding::EmbeddingCollectionParam eval_ebc_param{num_table, + num_lookup, + lookup_params, + shard_matrix, + grouped_emb_params, + solver_.batchsize_eval, + key_type, + index_type, + offset_type, + emb_type, + wgrad_type, + input_layout_, + ebc_config.output_layout_, + ebc_config.sort_strategy_, + ebc_config.keys_preprocess_strategy_, + ebc_config.allreduce_strategy_, + ebc_config.comm_strategy_, + compression_param}; + + std::vector> core_list; + + for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { + auto core_resource_manager = + std::make_shared(resource_manager_, local_gpu_id); + core_list.push_back(core_resource_manager); + } + ebc_list_.push_back(std::make_unique( + resource_manager_, core_list, ebc_param, eval_ebc_param, emb_table_list, exchange_wgrad_)); + embedding_para_io_->add_embedding_collection((ebc_list_[ebc_list_.size() - 1]).get()); + + auto prepare_ebc_input = [&](auto& sparse_input_map, bool is_longlong) { + core23::DataType SparseType = is_longlong ? 
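+        // Editor note (descriptive comment): the embedding collection expects Int64 or UInt32
+        // keys; the reader tensors below are re-bound to that type in place (no copy), which the
+        // helper only allows when the byte widths of the two types match.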
core23::DataType(core23::ScalarType::Int64) + : core23::DataType(core23::ScalarType::UInt32); + auto tensor_as_type = [&](core23::Tensor input, core23::DataType expected_type) { + auto origin_type = input.data_type(); + HCTR_CHECK_HINT(origin_type.size() == expected_type.size(), + "Size not equal, cannot reinterpret type"); + return core23::Tensor::bind(input.data(), input.shape(), expected_type, input.device()); + }; + auto train_sparse_tensors = sparse_input_map[bottom_name].train_sparse_tensors; + auto evaluate_sparse_tensors = sparse_input_map[bottom_name].evaluate_sparse_tensors; + + for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { + CudaDeviceContext context(resource_manager_->get_local_gpu(local_gpu_id)->get_device_id()); + core23::Device device{core23::DeviceType::GPU, + static_cast( + resource_manager_->get_local_gpu(local_gpu_id)->get_device_id())}; + auto train_key_tensor = + tensor_as_type(train_sparse_tensors[local_gpu_id].get_value_tensor(), SparseType); + train_ebc_key_list_.push_back(train_key_tensor); + + auto train_bucket_range_tensor = + tensor_as_type(train_sparse_tensors[local_gpu_id].get_rowoffset_tensor(), SparseType); + train_ebc_bucket_range_list_.push_back(train_bucket_range_tensor); + + train_ebc_num_keys_list_.push_back(train_sparse_tensors[local_gpu_id].get_nnz_ptr().get()); + + auto evaluate_key_tensor = + tensor_as_type(evaluate_sparse_tensors[local_gpu_id].get_value_tensor(), SparseType); + evaluate_ebc_key_list_.push_back(evaluate_key_tensor); + + auto evaluate_bucket_range_tensor = + tensor_as_type(evaluate_sparse_tensors[local_gpu_id].get_rowoffset_tensor(), SparseType); + evaluate_ebc_bucket_range_list_.push_back(evaluate_bucket_range_tensor); + + evaluate_ebc_num_keys_list_.push_back( + evaluate_sparse_tensors[local_gpu_id].get_nnz_ptr().get()); + } + }; + + if (reader_params_.data_reader_type != DataReaderType_t::RawAsync) { + if (solver_.i64_input_key) { + prepare_ebc_input(sparse_input_map_64_, true); + } else { + prepare_ebc_input(sparse_input_map_32_, false); + } + } + + // activate_ebc_output_tensor + size_t batch_size_per_gpu = solver_.batchsize / num_total_gpus; + size_t eval_batch_size_per_gpu = solver_.batchsize_eval / num_total_gpus; + if (ebc_param.output_layout_ == embedding::EmbeddingLayout::FeatureMajor) { + std::vector top_name_list; + for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { + embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; + int emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) + ? 
lookup_param.max_hotness * lookup_param.ev_size + : lookup_param.ev_size; + + std::string top_name = ebc_config.top_names_[lookup_id]; + top_name_list.push_back(top_name); + + activate_tensor(tensor_active_, top_name); + tensor_shape_info_raw_.insert({top_name, {solver_.batchsize, 1, emb_out_dims}}); + embedding_dependent_tensors_.insert(top_name); + } + input_output_info_.push_back(std::make_pair(bottom_name, join(top_name_list, ","))); + if (solver_.use_mixed_precision) { + allocate_ebc_output_helper_for_feature_major<__half>( + resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, + train_ebc_outptut_); + allocate_ebc_output_helper_for_feature_major<__half>( + resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, + evaluate_tensor_entities_list_, evaluate_ebc_outptut_); + } else { + allocate_ebc_output_helper_for_feature_major( + resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, + train_ebc_outptut_); + allocate_ebc_output_helper_for_feature_major( + resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, + evaluate_tensor_entities_list_, evaluate_ebc_outptut_); + } + } else { + int concate_out_dims = 0; + for (int lookup_id = 0; lookup_id < ebc_param.num_lookup; ++lookup_id) { + embedding::LookupParam& lookup_param = ebc_param.lookup_params[lookup_id]; + + int emb_out_dims = (lookup_param.combiner == embedding::Combiner::Concat) + ? lookup_param.max_hotness * lookup_param.ev_size + : lookup_param.ev_size; + concate_out_dims += emb_out_dims; + } + + activate_tensor(tensor_active_, ebc_config.batch_major_output_name_); + tensor_shape_info_raw_.insert( + {ebc_config.batch_major_output_name_, {solver_.batchsize, concate_out_dims}}); + input_output_info_.push_back(std::make_pair(bottom_name, ebc_config.batch_major_output_name_)); + embedding_dependent_tensors_.insert(ebc_config.batch_major_output_name_); + + // allocate output buffer + if (solver_.use_mixed_precision) { + allocate_ebc_output_helper_for_batch_major<__half>( + resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, + train_ebc_outptut_); + allocate_ebc_output_helper_for_batch_major<__half>( + resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, + evaluate_tensor_entities_list_, evaluate_ebc_outptut_); + } else { + allocate_ebc_output_helper_for_batch_major( + resource_manager_, batch_size_per_gpu, ebc_config, ebc_param, train_tensor_entities_list_, + train_ebc_outptut_); + allocate_ebc_output_helper_for_batch_major( + resource_manager_, eval_batch_size_per_gpu, ebc_config, ebc_param, + evaluate_tensor_entities_list_, evaluate_ebc_outptut_); + } + } + + train_ddl_output_.clear(); + cache_train_ddl_output_.clear(); + evaluate_ddl_output_.clear(); + cache_evaluate_ddl_output_.clear(); + for (int local_gpu_id = 0; local_gpu_id < num_local_gpus; ++local_gpu_id) { + train_ddl_output_.push_back( + allocate_output_for_data_distributor(core_list[local_gpu_id], ebc_param)); + if (solver_.train_inter_iteration_overlap) { + cache_train_ddl_output_.push_back( + allocate_output_for_data_distributor(core_list[local_gpu_id], ebc_param)); + } + evaluate_ddl_output_.push_back( + allocate_output_for_data_distributor(core_list[local_gpu_id], eval_ebc_param)); + if (solver_.eval_inter_iteration_overlap) { + cache_evaluate_ddl_output_.push_back( + allocate_output_for_data_distributor(core_list[local_gpu_id], eval_ebc_param)); + } + } + + // create data distributors + train_data_distributor_ = 
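+  // Editor note (descriptive comment): one data distributor per batch size, the train instance
+  // is parameterized by ebc_param and the eval instance by eval_ebc_param.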
std::make_shared(core_list, ebc_param, emb_table_list, + ebc_config.dr_lookup_ids_); + eval_data_distributor_ = std::make_shared( + core_list, eval_ebc_param, emb_table_list, ebc_config.dr_lookup_ids_); +} + +void Model::pre_add_dense_layer(DenseLayer& dense_layer) { + embedding_dependent_ = false; + for (auto& bottom_name : dense_layer.bottom_names) { + deactivate_tensor(tensor_active_, bottom_name); + if (embedding_dependent_tensors_.find(bottom_name) != embedding_dependent_tensors_.end()) { + embedding_dependent_ = true; + } + } + for (auto& top_name : dense_layer.top_names) { + activate_tensor(tensor_active_, top_name); + if (embedding_dependent_) { + embedding_dependent_tensors_.insert(top_name); + } + } + std::string input_names = join(dense_layer.bottom_names, ","); + std::string output_names = join(dense_layer.top_names, ","); + input_output_info_.push_back(std::make_pair(input_names, output_names)); + if (solver_.use_mixed_precision) { + layer_info_.push_back(LAYER_TYPE_TO_STRING_MP[dense_layer.layer_type]); + } else { + layer_info_.push_back(LAYER_TYPE_TO_STRING[dense_layer.layer_type]); + } +} + +void Model::graph_analysis() { + HCTR_LOG(INFO, ROOT, "Graph analysis to resolve tensor dependency\n"); + std::map tensor_usage; + std::map tensor_slice_layer; + std::map tensor_slice_index; + for (auto& dense_layer : dense_layer_params_raw_) { + for (auto& bottom_name : dense_layer.bottom_names) { + analyze_tensor(tensor_usage, bottom_name); + } + } + for (auto iter = tensor_usage.begin(); iter != tensor_usage.end(); iter++) { + if (iter->second > 5) { + HCTR_OWN_THROW(Error_t::WrongInput, "The graph should not include more than 5-way branches"); + } + if (iter->second > 1) { + std::vector bottom_names{iter->first}; + std::vector top_names; + std::vector> ranges; + for (unsigned int i = 0; i < iter->second; i++) { + top_names.push_back(iter->first + "_slice" + std::to_string(i)); + auto dims = tensor_shape_info_raw_[iter->first].size(); + ranges.emplace_back(std::make_pair(0, tensor_shape_info_raw_[iter->first][dims - 1])); + } + DenseLayer slice_layer(Layer_t::Slice, bottom_names, top_names); + slice_layer.ranges = ranges; + tensor_slice_layer.insert(std::pair(iter->first, slice_layer)); + tensor_slice_index.insert(std::pair(iter->first, 0)); + HCTR_LOG(INFO, ROOT, "Add Slice layer for tensor: %s, creating %d copies\n", + iter->first.c_str(), iter->second); + } + } + for (auto& dense_layer : dense_layer_params_raw_) { + bool flag = true; + for (auto& bottom_name : dense_layer.bottom_names) { + if (tensor_usage[bottom_name] > 1) { + flag = false; + break; + } + } + if (flag) { + dense_layer_params_.push_back(dense_layer); + } else { + DenseLayer new_dense_layer = dense_layer; + for (unsigned int i = 0; i < new_dense_layer.bottom_names.size(); i++) { + std::string old_bottom_name = new_dense_layer.bottom_names[i]; + if (tensor_slice_index.find(old_bottom_name) != tensor_slice_index.end()) { + auto iter = tensor_slice_layer.find(old_bottom_name); + if (tensor_slice_index[old_bottom_name] == 0) { + dense_layer_params_.push_back(iter->second); + } + std::string new_bottom_name = iter->second.top_names[tensor_slice_index[old_bottom_name]]; + tensor_slice_index[old_bottom_name] += 1; + new_dense_layer.bottom_names[i] = new_bottom_name; + } + } + dense_layer_params_.push_back(new_dense_layer); + } + } + add_dense_layers(dense_layer_params_); +} + +// deep copy +void Model::create_copy_ops_for_network_input(const std::string& dense_name, + const std::string& label_name, bool is_train) { + 
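+  // Editor note (descriptive comment): wraps the per-GPU dense and label input tensors in copy
+  // ops and swaps the original tensor entries for the copy-op outputs, so the network graph
+  // consumes the staged copies. Only requested from Model::add(Input&) when the embedding
+  // collection is used with inter-iteration overlap.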
auto& copy_ops = is_train ? graph_.train_copy_ops_ : graph_.evaluate_copy_ops_; + auto& tensor_entries_list = + is_train ? train_tensor_entities_list_ : evaluate_tensor_entities_list_; + + int num_local_gpus = resource_manager_->get_local_gpu_count(); + // copy ops for dense & label + copy_ops.resize(2 * num_local_gpus); + + for (int id = 0; id < num_local_gpus; ++id) { + core23::Device device(core23::DeviceType::GPU, + resource_manager_->get_local_gpu(id)->get_device_id()); + for (auto& tensor_entry : tensor_entries_list[id]) { + if (tensor_entry.name == dense_name) { + copy_ops[id].reset( + new CopyOpImpl(resource_manager_->get_local_gpu(id), tensor_entry.tensor)); + tensor_entry.tensor = copy_ops[id]->get_tensorbag(); + } else if (tensor_entry.name == label_name) { + copy_ops[id + num_local_gpus].reset( + new CopyOpImpl(resource_manager_->get_local_gpu(id), tensor_entry.tensor)); + tensor_entry.tensor = copy_ops[id + num_local_gpus]->get_tensorbag(); + } else { + HCTR_OWN_THROW(Error_t::WrongInput, "wrong tensor entry name when creating copy_op."); + } + } + } +} + +void Model::compile() { + if (!graph_finalized_) { + graph_analysis(); + graph_finalized_ = true; + } + if (data_input_info_.size() < 3 || layer_info_.size() < 2) { + HCTR_OWN_THROW(Error_t::IllegalCall, "The model should include input and at least two layers"); + } + HCTR_PRINT(INFO, + "===================================================Model " + "Compile===================================================\n"); + build_networks(); + + // TODO: this is a WAR; need to find a way to remove the preallocation + for (int local_gpu_id = 0; local_gpu_id < resource_manager_->get_local_gpu_count(); + ++local_gpu_id) { + auto device_id = resource_manager_->get_local_gpu(local_gpu_id)->get_device_id(); + core23::Device device(core23::DeviceType::GPU, device_id); + bool success = core23::AllocateBuffers(device); + if (!success) { + HCTR_LOG_S(DEBUG, ROOT) << "Nothing to preallocate" << std::endl; + } + } + core23::Device device_h(core23::DeviceType::CPU); + bool success = core23::AllocateBuffers(device_h); + if (!success) { + HCTR_LOG_S(DEBUG, ROOT) << "Nothing to preallocate" << std::endl; + } + initialize(); + create_metrics(); + create_pipelines(); +} + +void Model::update_label_weights(std::vector& label_names, + std::vector& label_weights) { + // Add implementation and support in next merge request + if (label_names.size() != label_weights.size()) { + HCTR_OWN_THROW(Error_t::WrongInput, "Must have the same number of label names and weights"); + } + std::map::iterator loss_lookup; + for (size_t i = 0; i < label_names.size(); ++i) { + loss_lookup = label_weights_.find(label_names[i]); + if (loss_lookup == label_weights_.end()) { + HCTR_OWN_THROW(Error_t::WrongInput, "Label name not found: " + label_names[i]); + } + loss_lookup->second = label_weights[i]; + } +} + +void Model::compile(std::vector& label_names, std::vector& label_weights) { + update_label_weights(label_names, label_weights); + compile(); +} + +void Model::summary() { + if (!graph_finalized_) { + graph_analysis(); + graph_finalized_ = true; + } + if (data_input_info_.size() < 3 || layer_info_.size() < 2) { + HCTR_OWN_THROW(Error_t::IllegalCall, + "The model should include input and at " + "least two layers"); + } + for (auto tensor_entry : train_tensor_entities_list_[0]) { + tensor_shape_info_.insert(std::make_pair(tensor_entry.name, tensor_entry.tensor.shape())); + } + HCTR_PRINT(INFO, + "============================================" + "=======Model " + 
"Summary=====================================" + "==============\n"); + auto log = HCTR_LOG_S(INFO, ROOT); + log << "Model structure on each GPU" << std::endl; + log << std::left << std::setw(40) << std::setfill(' ') << "Label" << std::left << std::setw(30) + << std::setfill(' ') << "Dense" << std::left << std::setw(30) << std::setfill(' ') << "Sparse" + << std::endl; + log << std::left << std::setw(40) << std::setfill(' ') << data_input_info_[0] << std::left + << std::setw(30) << std::setfill(' ') << data_input_info_[1] << " " << std::left + << std::setw(30) << std::setfill(' ') << data_input_info_[2] << std::endl; + log << std::left << std::setw(40) << std::setfill(' ') + << get_tensor_shape(data_input_info_[0], tensor_shape_info_) << std::left << std::setw(40) + << std::setfill(' ') << get_tensor_shape(data_input_info_[1], tensor_shape_info_) + << std::endl; + log << "————————————————————————————————————————————————" + "—————————————————————————————————" + "—————————————————————————————————" + << std::endl; + log << std::left << std::setw(40) << std::setfill(' ') << "Layer Type" << std::left + << std::setw(30) << std::setfill(' ') << "Input Name" << std::left << std::setw(30) + << std::setfill(' ') << "Output Name" << std::left << std::setw(30) << std::setfill(' ') + << "Output Shape" << std::endl; + log << "————————————————————————————————————————————————" + "—————————————————————————————————" + "—————————————————————————————————" + << std::endl; + for (size_t i = 0; i < layer_info_.size(); ++i) { + std::vector layer_type{layer_info_[i]}; + std::vector input_names; + std::vector output_names; + split(input_output_info_[i].first, ',', input_names); + split(input_output_info_[i].second, ',', output_names); + size_t lines = + input_names.size() > output_names.size() ? input_names.size() : output_names.size(); + layer_type.insert(layer_type.end(), lines - 1, ""); + if (lines > input_names.size()) { + input_names.insert(input_names.end(), lines - input_names.size(), ""); + } + if (lines > output_names.size()) { + output_names.insert(output_names.end(), lines - output_names.size(), ""); + } + for (size_t j = 0; j < lines; j++) { + log << std::left << std::setw(40) << std::setfill(' ') << layer_type[j] << std::left + << std::setw(30) << std::setfill(' ') << input_names[j] << std::left << std::setw(30) + << std::setfill(' ') << output_names[j] << std::left << std::setw(30) << std::setfill(' ') + << get_tensor_shape(output_names[j], tensor_shape_info_) << std::endl; + } + log << "----------------------------------------------" + "-----------------------------------" + "---------------------------------" + << std::endl; + } +} + +void Model::create_networks() { + for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { + networks_.emplace_back(new Network(resource_manager_->get_local_cpu(), + resource_manager_->get_local_gpu(i), + solver_.use_mixed_precision)); + } + train_tensor_entities_list_.resize(resource_manager_->get_local_gpu_count()); + evaluate_tensor_entities_list_.resize(resource_manager_->get_local_gpu_count()); +} + +void Model::build_networks() { + for (size_t i = 0; i < resource_manager_->get_local_gpu_count(); i++) { + networks_[i]->create_and_set_optimizer(opt_params_); + } + auto aligned_size = 16 * resource_manager_->get_local_gpu_count(); + core23::BufferParams bp{.channel = solver_.use_mixed_precision ? 
GetWgradHalfBufferChannel() + : GetWgradBufferChannel()}; + for (int g = 0; g < resource_manager_->get_local_gpu_count(); g++) { + auto device_id = resource_manager_->get_local_gpu(g)->get_device_id(); + core23::Device device(core23::DeviceType::GPU, device_id); + auto wgrad_buffer = core23::GetBuffer(bp, device); + auto wgrad_size = wgrad_buffer->reserved_size(); + size_t padded_bytes = wgrad_size % aligned_size; + padded_bytes += aligned_size - padded_bytes; + // alignment requirements from grouped allreduce. + wgrad_tensor_successor_.emplace_back(core23::TensorParams() + .device(device) + .shape({static_cast(padded_bytes)}) + .data_type(core23::ScalarType::Char) + .buffer_params(bp)); + } + buff_allocated_ = true; +} + +void Model::initialize() { +#ifndef DATA_READING_TEST + +#pragma omp parallel num_threads(number_of_networks()) + { + size_t id = omp_get_thread_num(); + networks_[id]->initialize(); + if (solver_.use_algorithm_search) { + networks_[id]->search_algorithm(); + } + HCTR_LIB_THROW(cudaStreamSynchronize(resource_manager_->get_local_gpu(id)->get_stream())); + } + + int num_gpus = resource_manager_->get_local_gpu_count(); + std::vector wgrad_buffer_ptrs; + size_t wgrad_buffer_size{}; + core23::BufferParams bp{.channel = solver_.use_mixed_precision ? GetWgradHalfBufferChannel() + : GetWgradBufferChannel()}; + for (int g = 0; g < num_gpus; g++) { + auto device_id = resource_manager_->get_local_gpu(g)->get_device_id(); + core23::Device device(core23::DeviceType::GPU, device_id); + auto wgrad_buffer = core23::GetBuffer(bp, device); + auto [ptr_, size_] = wgrad_buffer->decay(); + wgrad_buffer_size = size_; + HCTR_CHECK_HINT(size_ && ptr_, "wgrad is null or it's a confederal buffer"); + wgrad_buffer_ptrs.push_back(ptr_); + } + exchange_wgrad_->init_ar_comm(wgrad_buffer_ptrs, wgrad_buffer_size); +#endif + init_params_for_dense_(); + if (solver_.perf_logging) { + for (size_t i = 0; i < dense_layer_params_.size(); i++) { + bool is_trainable = + TRAINABLE_LAYERS.find(dense_layer_params_[i].layer_type) != TRAINABLE_LAYERS.end(); + if (is_trainable) { + std::string output_names = join(dense_layer_params_[i].top_names, "-"); + HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "weights_initialization", output_names); + } + } + } + init_params_for_sparse_(); +} +void Model::create_metrics() { + int num_total_gpus = resource_manager_->get_global_gpu_count(); + int label_dim = input_params_[0].labels_.begin()->second; + if (input_params_[0].labels_.size() > 1) { + auto labs = input_params_[0].labels_; + label_dim = std::accumulate(std::begin(labs), std::end(labs), 0, + [](const int previous, const std::pair& p) { + return previous + p.second; + }); + } + + auto num_metrics = [&]() { return networks_[0]->get_raw_metrics_all().size(); }; + for (const auto& metric : solver_.metrics_spec) { + // Only AUC is currently supported for models with more than one loss layer + if ((metric.first != metrics::Type::AUC) && num_metrics() > 1) { + HCTR_OWN_THROW(Error_t::WrongInput, + "Metrics besides AUC are not supported for multi-task models."); + } + + metrics_.emplace_back(std::move(metrics::Metric::Create( + metric.first, solver_.use_mixed_precision, solver_.batchsize_eval / num_total_gpus, + solver_.max_eval_batches, label_dim, resource_manager_))); + } +} + +void Model::create_pipelines() { + // TODO: currently it is only for HE + if (embeddings_.size() == 1) { + auto lr_scheds = embeddings_[0]->get_learning_rate_schedulers(); + for (size_t i = 0; i < lr_scheds.size(); i++) { + 
networks_[i]->set_learning_rate_scheduler(lr_scheds[i]); + } + } + + if (solver_.use_embedding_collection) { + create_train_pipeline_with_ebc(networks_); + create_evaluate_pipeline_with_ebc(networks_); + } else { + // will create pipeline for dense network. + create_train_network_pipeline(networks_); + create_eval_network_pipeline(networks_); + } + + if (solver_.perf_logging) { + HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "init_stop"); + HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "run_start"); + } + + if (solver_.perf_logging) { + for (size_t i = 0; i < sparse_embedding_params_.size(); i++) { + HCTR_LOG_ARGS(timer_log.elapsedMilliseconds(), "weights_initialization", + sparse_embedding_params_[i].sparse_embedding_name); + } + } + +#ifdef ENABLE_MPI + if (resource_manager_->get_num_process() > 1) { + collective_manager_->set_ready_to_transfer(); + } +#endif +} + +} // namespace HugeCTR \ No newline at end of file diff --git a/HugeCTR/src/pybind/model_pipeline.cpp b/HugeCTR/src/pybind/model_pipeline.cpp index a8f9b53fd0..10406befc5 100644 --- a/HugeCTR/src/pybind/model_pipeline.cpp +++ b/HugeCTR/src/pybind/model_pipeline.cpp @@ -18,19 +18,24 @@ #include #include #include -#include -#include #include #include #include #include -#include +#include #include namespace HugeCTR { -template -void Model::create_train_network_pipeline(std::vector>& networks) { +void Model::exchange_wgrad(size_t device_id) { + auto& gpu_resource = resource_manager_->get_local_gpu(device_id); + CudaCPUDeviceContext context(gpu_resource->get_device_id()); + if (resource_manager_->get_global_gpu_count() > 1) { + exchange_wgrad_->allreduce(device_id, gpu_resource->get_stream()); + } +} + +void Model::create_train_network_pipeline(std::vector>& networks) { graph_.train_pipeline_.resize(resource_manager_->get_local_gpu_count()); auto scheduled_reader = dynamic_cast(train_data_reader_.get()); @@ -64,8 +69,7 @@ void Model::create_train_network_pipeline(std::vector -void Model::create_eval_network_pipeline(std::vector>& networks) { +void Model::create_eval_network_pipeline(std::vector>& networks) { graph_.evaluate_pipeline_.resize(resource_manager_->get_local_gpu_count()); for (int local_id = 0; local_id < static_cast(resource_manager_->get_local_gpu_count()); @@ -93,466 +97,9 @@ void Model::create_eval_network_pipeline(std::vector -void Model::create_train_pipeline(std::vector>& networks) { - auto scheduled_reader = dynamic_cast(train_data_reader_.get()); - auto scheduled_embedding = dynamic_cast(embeddings_[0].get()); - bool is_train = true; - bool use_graph = solver_.use_cuda_graph; - - if (solver_.train_inter_iteration_overlap) { - graph_.train_pipeline_.resize(2 * resource_manager_->get_local_gpu_count()); - } else { - graph_.train_pipeline_.resize(resource_manager_->get_local_gpu_count()); - } - -#pragma omp parallel for num_threads(resource_manager_->get_local_gpu_count()) - for (int local_id = 0; local_id < static_cast(resource_manager_->get_local_gpu_count()); - local_id++) { - auto gpu_resource = resource_manager_->get_local_gpu(local_id); - CudaCPUDeviceContext context(gpu_resource->get_device_id()); - - // create scheduleable - auto iteration_start = std::make_shared([=] {}); - - auto schedule_reader = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - if (use_graph && !scheduled_reader->current_batch_incomplete()) { - scheduled_reader->schedule_here_graph(stream, local_id); - } else { - scheduled_reader->schedule_here(stream, local_id); - } - graph_scheduler_->record_execution(local_id, 
stream); - }); - - auto EMB_input_ready_wait = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->stream_wait_sparse_tensors( - stream, local_id, use_graph && !scheduled_reader->current_batch_incomplete()); - }); - - auto BNET_input_ready_wait = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->stream_wait_dense_tensors( - stream, local_id, use_graph && !scheduled_reader->current_batch_incomplete()); - }); - - auto schedule_split_3way = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->schedule_split_3_way_here( - stream, local_id, use_graph && !scheduled_reader->current_batch_incomplete()); - }); - - auto schedule_d2d = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->schedule_d2d_here( - stream, local_id, use_graph && !scheduled_reader->current_batch_incomplete()); - }); - - auto embedding_index_calculation = std::make_shared( - [=] { scheduled_embedding->index_calculation(is_train, local_id); }); - - auto cross_iteration_sync = std::make_shared([] {}); - - auto embedding_freq_forward = std::make_shared( - [=] { scheduled_embedding->freq_forward(is_train, local_id); }); - - auto embedding_freq_backward = std::make_shared( - [=] { scheduled_embedding->freq_backward(local_id); }); - - auto embedding_freq_update_params = std::make_shared( - [=] { scheduled_embedding->freq_update_params(local_id); }); - - auto embedding_infreq_model_forward = std::make_shared( - [=] { scheduled_embedding->infreq_model_forward(local_id); }); - - auto embedding_infreq_network_forward = std::make_shared( - [=] { scheduled_embedding->infreq_network_forward(is_train, local_id); }); - - auto embedding_infreq_network_backward = std::make_shared( - [=] { scheduled_embedding->infreq_network_backward(local_id); }); - - auto embedding_infreq_model_backward = std::make_shared( - [=] { scheduled_embedding->infreq_model_backward(local_id); }); - - auto network_init = std::make_shared([=] { - if (networks[local_id]->use_mixed_precision_ && - networks[local_id]->optimizer_->get_optimizer_type() != Optimizer_t::SGD) { - networks[local_id]->conv_weight_(networks[local_id]->train_weight_tensor_half_, - networks[local_id]->train_weight_tensor_); - } - }); - - auto bottom_network_fprop = std::make_shared([=] { - networks[local_id]->prop_layers(networks[local_id]->bottom_layers_, true, is_train); - }); - - auto top_network_fprop = std::make_shared( - [=] { networks[local_id]->prop_layers(networks[local_id]->top_layers_, true, is_train); }); - - auto init_wgrad = std::make_shared([=] { - networks[local_id]->train_losses_.begin()->second->regularizer_initialize_wgrad(is_train); - }); - - auto lr_sched_update = std::make_shared( - [=]() { networks[local_id]->lr_sched_->update(); }); - - auto cal_loss = std::make_shared([=] { - float rterm = networks[local_id]->train_losses_.begin()->second->regularizer_compute_rterm(); - long long current_batchsize_per_device = - scheduled_reader->get_current_batchsize_per_device(local_id); - - networks[local_id]->train_losses_.begin()->second->compute( - is_train, current_batchsize_per_device, rterm); - }); - - auto top_network_bprop = std::make_shared( - [=] { networks[local_id]->prop_layers(networks[local_id]->top_layers_, false, is_train); }); - - auto bottom_network_bprop = std::make_shared([=] { - networks[local_id]->prop_layers(networks[local_id]->bottom_layers_, false, is_train); - }); - - auto network_exchange_wgrad = - std::make_shared([=] { 
this->exchange_wgrad(local_id); }); - - auto update_params = - std::make_shared([=] { networks[local_id]->update_params(); }); - - auto iteration_end = std::make_shared([] {}); - - std::vector> scheduleable_list = { - iteration_start, - EMB_input_ready_wait, - embedding_index_calculation, - BNET_input_ready_wait, - cross_iteration_sync, - embedding_infreq_model_forward, - embedding_infreq_network_forward, - embedding_freq_forward, - network_init, - bottom_network_fprop, - init_wgrad, - schedule_reader, - top_network_fprop, - lr_sched_update, - cal_loss, - top_network_bprop, - embedding_freq_backward, - bottom_network_bprop, - embedding_infreq_network_backward, - embedding_infreq_model_backward, - schedule_split_3way, - network_exchange_wgrad, - schedule_d2d, - embedding_freq_update_params, - update_params, - iteration_end, - }; - - if (solver_.train_intra_iteration_overlap) { - std::string infreq_stream = "side_stream"; - std::string freq_stream = "freq_stream"; - std::string network_side_stream = "network_side_stream"; - - auto done_iteration_start = iteration_start->record_done(); - auto done_cross_iteration_sync = cross_iteration_sync->record_done(); - auto done_embedding_infreq_model_forward = embedding_infreq_model_forward->record_done(); - auto done_embedding_infreq_network_forward = embedding_infreq_network_forward->record_done(); - auto done_embedding_freq_forward = embedding_freq_forward->record_done(); - auto done_bottom_network_fprop = bottom_network_fprop->record_done(); - auto done_top_network_fprop = top_network_fprop->record_done(); - auto done_init_wgrad = init_wgrad->record_done(); - auto done_lr_sched_update = lr_sched_update->record_done(); - auto done_top_network_bprop = top_network_bprop->record_done(); - auto done_embedding_freq_backward = embedding_freq_backward->record_done(); - auto done_bottom_network_bprop = bottom_network_bprop->record_done(); - auto done_network_exchange_wgrad = network_exchange_wgrad->record_done(); - auto done_embedding_infreq_network_backward = - embedding_infreq_network_backward->record_done(); - auto done_freq_update_params = embedding_freq_update_params->record_done(); - - EMB_input_ready_wait->set_stream(infreq_stream); - EMB_input_ready_wait->wait_event({done_iteration_start}); - embedding_index_calculation->set_stream(infreq_stream); - cross_iteration_sync->set_stream(infreq_stream); - - embedding_infreq_model_forward->set_stream(infreq_stream); - embedding_infreq_network_forward->set_stream(infreq_stream); - - const bool overlap_infreq_freq = - (sparse_embedding_params_[0].hybrid_embedding_param.communication_type != - hybrid_embedding::CommunicationType::NVLink_SingleNode); - - if (overlap_infreq_freq) { - embedding_freq_forward->set_stream(freq_stream); - embedding_freq_forward->wait_event( - {done_cross_iteration_sync, done_embedding_infreq_model_forward}); - } else { - embedding_freq_forward->set_stream(infreq_stream); - } - - bottom_network_fprop->wait_event({done_embedding_infreq_model_forward}); - schedule_reader->wait_event({ - done_embedding_infreq_network_forward, - done_embedding_freq_forward, - }); - - init_wgrad->set_stream(network_side_stream); - init_wgrad->wait_event({done_bottom_network_fprop}); - - lr_sched_update->set_stream(network_side_stream); - lr_sched_update->wait_event({done_top_network_fprop}); - top_network_bprop->wait_event({ - done_init_wgrad, - done_lr_sched_update, - }); - - embedding_freq_backward->set_stream(infreq_stream); - embedding_freq_backward->wait_event({done_top_network_bprop}); - - 
network_exchange_wgrad->wait_event({ - done_embedding_freq_backward, - done_bottom_network_bprop, - }); - - embedding_infreq_network_backward->set_stream(infreq_stream); - embedding_infreq_network_backward->wait_event({done_top_network_bprop}); - embedding_infreq_model_backward->set_stream(infreq_stream); - - embedding_freq_update_params->set_stream(infreq_stream); - embedding_freq_update_params->wait_event({done_network_exchange_wgrad}); - iteration_end->wait_event({ - done_embedding_infreq_network_backward, - done_freq_update_params, - }); - } - - auto graph = std::make_shared(scheduleable_list); - graph_.train_pipeline_[local_id] = Pipeline{"train", gpu_resource, {graph}}; - if (solver_.train_inter_iteration_overlap) { - cudaStream_t s3w_stream = gpu_resource->get_stream("s3w"); - cudaStream_t d2d_stream = gpu_resource->get_stream("s3w"); - scheduled_reader->set_schedule_streams(s3w_stream, d2d_stream, local_id); - - auto done_iteration_end = iteration_end->record_done(use_graph); - cross_iteration_sync->wait_event({done_iteration_end}, use_graph); - - auto graph2 = std::make_shared(scheduleable_list); - graph_.train_pipeline_[local_id + resource_manager_->get_local_gpu_count()] = - Pipeline{"train2", gpu_resource, {graph2}}; - } else { - cudaStream_t s3w_stream = gpu_resource->get_stream("train"); - cudaStream_t d2d_stream = gpu_resource->get_stream("train"); - scheduled_reader->set_schedule_streams(s3w_stream, d2d_stream, local_id); - } - } -} - -void Model::train_pipeline(size_t current_batch_size) { - auto scheduled_reader = dynamic_cast(train_data_reader_.get()); - auto scheduled_embedding = dynamic_cast(embeddings_[0].get()); - - const auto inflight_id = scheduled_reader->get_current_inflight_id(); - const bool cached = scheduled_reader->is_batch_cached(); - - const bool use_graph = solver_.use_cuda_graph && !scheduled_reader->current_batch_incomplete(); - - scheduled_embedding->assign_input_tensors(true, current_batch_size, inflight_id, cached); - -#pragma omp parallel num_threads(resource_manager_->get_local_gpu_count()) - { - int id = omp_get_thread_num(); - auto device_id = resource_manager_->get_local_gpu(id)->get_device_id(); - CudaCPUDeviceContext context(device_id); - - const auto graph_id = solver_.train_inter_iteration_overlap - ? 
(inflight_id * resource_manager_->get_local_gpu_count() + id) - : id; - HCTR_CHECK_HINT(graph_id < graph_.train_pipeline_.size(), "graph_id out of range"); - - if (use_graph) { - graph_.train_pipeline_[graph_id].run_graph(); - if (scheduled_reader) { - scheduled_reader->update_schedule_graph(id); - } - } else { - graph_.train_pipeline_[graph_id].run(); - } - cudaStream_t graph_stream = resource_manager_->get_local_gpu(id)->get_stream( - graph_.train_pipeline_[graph_id].get_stream_name()); - - auto train_sync_back_event = - resource_manager_->get_local_gpu(id)->get_event("train_sync_back_event"); - HCTR_LIB_THROW(cudaEventRecord(train_sync_back_event, graph_stream)); - HCTR_LIB_THROW(cudaStreamWaitEvent(resource_manager_->get_local_gpu(id)->get_stream(), - train_sync_back_event)); - } -} - -template -void Model::create_evaluate_pipeline(std::vector>& networks) { - auto scheduled_reader = dynamic_cast(evaluate_data_reader_.get()); - auto scheduled_embedding = dynamic_cast(embeddings_[0].get()); - bool is_train = false; - - graph_.evaluate_pipeline_.resize(resource_manager_->get_local_gpu_count()); - - for (int local_id = 0; local_id < resource_manager_->get_local_gpu_count(); local_id++) { - auto gpu_resource = resource_manager_->get_local_gpu(local_id); - CudaCPUDeviceContext ctx(gpu_resource->get_device_id()); - - // create scheduleable - auto iteration_strat = std::make_shared([] {}); - - auto EMB_input_ready_wait = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->stream_wait_sparse_tensors(stream, local_id, false); - }); - - auto BNET_input_ready_wait = std::make_shared([=] { - auto stream = gpu_resource->get_stream(); - scheduled_reader->stream_wait_dense_tensors(stream, local_id, false); - }); - - auto embedding_index_calculation = std::make_shared( - [=] { scheduled_embedding->index_calculation(is_train, local_id); }); - - auto embedding_freq_forward = std::make_shared([=] { - scheduled_embedding->freq_forward(is_train, local_id, this->graph_.is_first_eval_batch_); - }); - - auto embedding_infreq_model_forward = std::make_shared( - [=] { scheduled_embedding->infreq_model_forward(local_id); }); - - auto embedding_infreq_network_forward = std::make_shared( - [=] { scheduled_embedding->infreq_network_forward(is_train, local_id); }); - - auto embedding_global_barrier = std::make_shared( - [=] { scheduled_embedding->global_barrier(is_train, local_id); }); - - auto network_init = std::make_shared([=] { - if (networks[local_id]->use_mixed_precision_ && - networks[local_id]->optimizer_->get_optimizer_type() != Optimizer_t::SGD) { - networks[local_id]->conv_weight_(networks[local_id]->train_weight_tensor_half_, - networks[local_id]->train_weight_tensor_); - } - }); - - auto network_eval = std::make_shared([=] { - long long current_batchsize_per_device = - scheduled_reader->get_current_batchsize_per_device(local_id); - - networks[local_id]->eval(current_batchsize_per_device); - }); - - auto cal_metrics = std::make_shared([=] { - for (auto& metric : metrics_) { - auto metric_map = networks[local_id]->get_raw_metrics_all().begin()->second; - metric->local_reduce(local_id, metric_map); - } - }); - - std::vector> scheduleable_list = { - iteration_strat, - BNET_input_ready_wait, - EMB_input_ready_wait, - embedding_index_calculation, - embedding_infreq_model_forward, - embedding_infreq_network_forward, - embedding_freq_forward, - embedding_global_barrier, - network_init, - network_eval, - cal_metrics, - }; - - const bool overlap_infreq_freq = - 
(sparse_embedding_params_[0].hybrid_embedding_param.communication_type != - hybrid_embedding::CommunicationType::NVLink_SingleNode) && - solver_.eval_intra_iteration_overlap; - std::string eval_embedding = "eval_embedding"; - std::string eval_freq = "eval_freq"; - - if (solver_.eval_inter_iteration_overlap) { - // s3w_stream should be the same with embedding stream - cudaStream_t s3w_stream = gpu_resource->get_stream(eval_embedding); - cudaStream_t d2d_stream = gpu_resource->get_stream("default"); - scheduled_reader->set_schedule_streams(s3w_stream, d2d_stream, local_id); - - auto done_embedding_infreq_model_forward = embedding_infreq_model_forward->record_done(); - auto done_embedding_infreq_network_forward = embedding_infreq_network_forward->record_done(); - auto done_embedding_freq_forward = embedding_freq_forward->record_done(); - auto done_network_eval = network_eval->record_done(); - - EMB_input_ready_wait->set_absolute_stream(eval_embedding); - embedding_index_calculation->set_absolute_stream(eval_embedding); - embedding_infreq_model_forward->set_absolute_stream(eval_embedding); - embedding_infreq_network_forward->set_absolute_stream(eval_embedding); - embedding_infreq_network_forward->wait_event({done_network_eval}); - - if (overlap_infreq_freq) { - embedding_freq_forward->set_stream(eval_freq); - embedding_freq_forward->wait_event( - {done_embedding_infreq_model_forward, done_network_eval}); - } else { - embedding_freq_forward->set_absolute_stream(eval_embedding); - } - embedding_global_barrier->set_absolute_stream(eval_embedding); - - network_init->wait_event( - {done_embedding_infreq_network_forward, done_embedding_freq_forward}); - } else if (overlap_infreq_freq) { - auto done_embedding_infreq_model_forward = embedding_infreq_model_forward->record_done(); - auto done_embedding_freq_forward = embedding_freq_forward->record_done(); - - embedding_freq_forward->set_stream(eval_freq); - embedding_freq_forward->wait_event({done_embedding_infreq_model_forward}); - network_init->wait_event({done_embedding_freq_forward}); - } - - auto graph = std::make_shared(scheduleable_list); - graph_.evaluate_pipeline_[local_id] = Pipeline{"default", gpu_resource, {graph}}; - } -} - -void Model::evaluate_pipeline(size_t current_batch_size) { - auto scheduled_reader = dynamic_cast(evaluate_data_reader_.get()); - auto scheduled_embedding = dynamic_cast(embeddings_[0].get()); - - const auto inflight_id = scheduled_reader->get_current_inflight_id(); - const bool cached = scheduled_reader->is_batch_cached(); - - scheduled_embedding->assign_input_tensors(false, current_batch_size, inflight_id, cached); - -#pragma omp parallel num_threads(number_of_networks()) - { - size_t id = omp_get_thread_num(); - auto gpu = resource_manager_->get_local_gpu(id); - CudaCPUDeviceContext ctx(gpu->get_device_id()); - - if (graph_.is_first_eval_batch_) { - auto eval_start_event = gpu->get_event("eval_start_event"); - HCTR_LIB_THROW(cudaEventRecord(eval_start_event, gpu->get_stream())); - - cudaStream_t evaluate_stream = - gpu->get_stream(graph_.evaluate_pipeline_[id].get_stream_name()); - HCTR_LIB_THROW(cudaStreamWaitEvent(evaluate_stream, eval_start_event)); - cudaStream_t eval_embedding_stream = gpu->get_stream("eval_embedding"); - HCTR_LIB_THROW(cudaStreamWaitEvent(eval_embedding_stream, eval_start_event)); - } - - graph_.evaluate_pipeline_[id].run(); - } - - for (auto& metric : metrics_) { - metric->global_reduce(number_of_networks()); - } -} bool is_first_data_distributor = true; -template -void 
Model::create_train_pipeline_with_ebc(std::vector>& networks) { +void Model::create_train_pipeline_with_ebc(std::vector>& networks) { bool is_train = true; bool use_graph = solver_.use_cuda_graph; @@ -925,8 +472,7 @@ void Model::train_pipeline_with_ebc() { } } -template -void Model::create_evaluate_pipeline_with_ebc(std::vector>& networks) { +void Model::create_evaluate_pipeline_with_ebc(std::vector>& networks) { bool is_train = false; // bool use_graph = solver_.use_cuda_graph; graph_.evaluate_pipeline_.resize(resource_manager_->get_local_gpu_count()); @@ -1112,13 +658,4 @@ void Model::evaluate_pipeline_with_ebc() { metric->global_reduce(number_of_networks()); } } - -template void Model::create_train_pipeline(std::vector>&); -template void Model::create_evaluate_pipeline(std::vector>&); -template void Model::create_train_network_pipeline(std::vector>&); -template void Model::create_eval_network_pipeline(std::vector>&); -template void Model::create_train_pipeline_with_ebc( - std::vector>& networks); -template void Model::create_evaluate_pipeline_with_ebc(std::vector>&); - } // namespace HugeCTR diff --git a/HugeCTR/src/resource_manager.cpp b/HugeCTR/src/resource_manager.cpp index 3fd624fc0c..9a63cca1f0 100644 --- a/HugeCTR/src/resource_manager.cpp +++ b/HugeCTR/src/resource_manager.cpp @@ -23,7 +23,7 @@ namespace HugeCTR { -std::shared_ptr ResourceManager::create( +std::shared_ptr ResourceManagerCore::create( const std::vector>& visible_devices, unsigned long long seed, DeviceMap::Layout layout) { const int size{core23::MpiInitService::get().world_size()}; diff --git a/HugeCTR/src/resource_managers/resource_manager_ext.cpp b/HugeCTR/src/resource_managers/resource_manager_ext.cpp deleted file mode 100644 index fe0e4fa59a..0000000000 --- a/HugeCTR/src/resource_managers/resource_manager_ext.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -namespace HugeCTR { - -std::shared_ptr ResourceManagerExt::create( - const std::vector>& visible_devices, unsigned long long seed, - DeviceMap::Layout layout) { - const int size{core23::MpiInitService::get().world_size()}; - const int rank{core23::MpiInitService::get().world_rank()}; - - DeviceMap device_map(visible_devices, rank, layout); - - std::random_device rd; - if (seed == 0) { - seed = rd(); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Bcast(&seed, 1, MPI_UNSIGNED_LONG_LONG, 0, MPI_COMM_WORLD)); -#endif - - HCTR_LOG(INFO, ROOT, "Global seed is %llu\n", seed); - - std::shared_ptr core( - new ResourceManagerCore(size, rank, std::move(device_map), seed)); - - return std::shared_ptr(new ResourceManagerExt(core)); -} - -#ifdef ENABLE_MPI -void ResourceManagerExt::init_ib_comm() { - int num_process = get_num_process(); - if (num_process > 1) { - int process_id = get_process_id(); - ib_comm_ = std::make_unique(); - ib_comm_->init(num_process, get_local_gpu_count(), process_id, get_local_gpu_device_id_list()); - } -} -#endif - -void ResourceManagerExt::set_ar_comm(AllReduceAlgo algo, bool use_mixed_precision) { - int num_process = get_num_process(); -#ifdef ENABLE_MPI - IbComm* ib_comm_ptr = nullptr; - if (algo == AllReduceAlgo::ONESHOT) { - init_ib_comm(); - ib_comm_ptr = ib_comm_.get(); - } - ar_comm_ = AllReduceInPlaceComm::create(num_process, algo, use_mixed_precision, get_local_gpus(), - ib_comm_ptr); -#else - ar_comm_ = AllReduceInPlaceComm::create(num_process, algo, use_mixed_precision, get_local_gpus()); -#endif -} - -} // namespace HugeCTR diff --git a/ci/integration_test/dlrm/benchmark_14node.sub b/ci/integration_test/dlrm/benchmark_14node.sub deleted file mode 100644 index 6a65b12391..0000000000 --- a/ci/integration_test/dlrm/benchmark_14node.sub +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \ - cd /raid/datasets/criteo/mlperf/40m.limit_preshuffled/ && \ - exec numactl --membind=1,3,5,7 python3 /workdir/samples/dlrm/dgx_a100_14x8x640.py" diff --git a/ci/integration_test/dlrm/benchmark_1node.sub b/ci/integration_test/dlrm/benchmark_1node.sub deleted file mode 100644 index 2f23af3667..0000000000 --- a/ci/integration_test/dlrm/benchmark_1node.sub +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" --network sharp bash -cx " \ - cd /raid/datasets/criteo/mlperf/40m.limit_preshuffled/ && \ - numactl --membind=1,3,5,7 python3 /workdir/samples/dlrm/dgx_a100.py" diff --git a/ci/integration_test/dlrm/dlrm.sub b/ci/integration_test/dlrm/dlrm.sub deleted file mode 100644 index 649bad7f87..0000000000 --- a/ci/integration_test/dlrm/dlrm.sub +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /dataset/criteo_kaggle/dcn_parquet && \ - python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dlrm_1gpu.json && \ - python3 
/workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dlrm_8gpu.json && \ - python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dlrm_fp16_1gpu.json && \ - python3 /workdir/test/pybind_test/single_node_test.py --json-file=/workdir/test/scripts/dlrm_fp16_8gpu.json" \ No newline at end of file diff --git a/ci/integration_test/dlrm/ib_nvlink_1node.sub b/ci/integration_test/dlrm/ib_nvlink_1node.sub deleted file mode 100644 index 1d689b7119..0000000000 --- a/ci/integration_test/dlrm/ib_nvlink_1node.sub +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - numactl --membind=1,3,5,7 python3 /workdir/samples/dlrm/dgx_a100_ib_nvlink.py" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - python3 /workdir/test/embedding_collection_test/dgx_a100_one_hot.py --batchsize 55296 --batchsize_eval=276480 --use_mixed_precision" diff --git a/ci/integration_test/dlrm/ib_nvlink_8node.sub b/ci/integration_test/dlrm/ib_nvlink_8node.sub deleted file mode 100644 index 4f43d21816..0000000000 --- a/ci/integration_test/dlrm/ib_nvlink_8node.sub +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -set -e - -srun --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - exec numactl --membind=1,3,5,7 python3 /workdir/samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py" - -srun --mpi=pmix --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - python3 /workdir/test/embedding_collection_test/dgx_a100_one_hot.py --batchsize 71680 --batchsize_eval=1792000 --use_mixed_precision" diff --git a/ci/integration_test/mlperf_generalization/overlapped_pipeline.sub b/ci/integration_test/mlperf_generalization/overlapped_pipeline.sub deleted file mode 100644 index 6c1d114b24..0000000000 --- a/ci/integration_test/mlperf_generalization/overlapped_pipeline.sub +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \ - cd /etc/workspace/new_criteo_kaggle && - python3 /workdir/test/pybind_test/dgx_a100_48slots.py " \ No newline at end of file diff --git a/ci/selene/ci.yml b/ci/selene/ci.yml index 0351ef3b60..0811166322 100644 --- a/ci/selene/ci.yml +++ b/ci/selene/ci.yml @@ -133,18 +133,6 @@ dcn_8gpu: WALLTIME: "01:00:00" TEST_CMD: ./ci/integration_test/dcn/dcn_8gpu.sub -dlrm_benchmark_1node: - extends: .selene_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_multi_node - variables: - GPFSFOLDER: $LOGDIR/dlrm_benchmark_1node - CONT: $TRAIN_IMAGE_MULTINODE_VERSIONED - MOUNTS: /raid:/raid - WALLTIME: "00:15:00" - TEST_CMD: ./ci/integration_test/dlrm/benchmark_1node.sub - dlrm_dcnv2_benchmark_1node: extends: .selene_test_job needs: @@ -180,18 +168,6 @@ deepfm: WALLTIME: "00:15:00" TEST_CMD: ./ci/integration_test/deepfm/deepfm.sub -dlrm: - extends: .selene_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: 
build_train_single_node - variables: - GPFSFOLDER: $LOGDIR/dlrm - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: ${DATASET}:${DATASET_MOUNT} - WALLTIME: "00:45:00" - TEST_CMD: ./ci/integration_test/dlrm/dlrm.sub - mmoe: extends: .selene_test_job needs: @@ -204,18 +180,6 @@ mmoe: WALLTIME: "00:15:00" TEST_CMD: ./ci/integration_test/mmoe/mmoe.sub -mlperf_generalization: - extends: .selene_test_job - needs: - - pipeline: $PARENT_PIPELINE_ID - job: build_train_single_node - variables: - GPFSFOLDER: $LOGDIR/mlperf_generalization - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT} - WALLTIME: "00:15:00" - TEST_CMD: ./ci/integration_test/mlperf_generalization/overlapped_pipeline.sub - inference_hps: extends: .selene_test_job needs: @@ -462,18 +426,6 @@ hps_plugin_benchmark_check: WALLTIME: "00:15:00" TEST_CMD: ./ci/post_test/check_hps_plugin_benchmark.sub -dlrm_1node_check: - # Push logs to gitlab - extends: .selene_post_test_job - needs: - - dlrm_benchmark_1node - variables: - GPFSFOLDER: $LOGDIR/dlrm_1node_check - CONT: $TRAIN_IMAGE_VERSIONED - MOUNTS: $LOGDIR/dlrm_benchmark_1node:/logs - WALLTIME: "00:15:00" - TEST_CMD: ./ci/post_test/check_dlrm_1node.sub - dlrm_dcnv2_1node_check: # Push logs to gitlab extends: .selene_post_test_job diff --git a/ci/template.yml b/ci/template.yml index f7fff05ddc..1e4c71b2d6 100644 --- a/ci/template.yml +++ b/ci/template.yml @@ -461,7 +461,7 @@ stages: variables: GIT_CLONE_PATH: ${GIT_CLONE_PATH_SELENE} CONT: ${UNIFIED_CTR_LATEST} - MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT},${DATASET_CRITEO_SELENE}:${OLD_CRITEO_MOUNT},/raid:/raid,${CI_PROJECT_DIR}:/hugectr + MOUNTS: ${DATASET_NEW_CRITEO_SELENE}:${NEW_CRITEO_MOUNT},${DATASET_CRITEO_SELENE}:${CRITEO_MOUNT},/raid:/raid,${CI_PROJECT_DIR}:/hugectr SLURM_ACCOUNT: coreai_devtech_all OLD_SLURM_ACCOUNT: "devtech" GPFSFOLDER: "/lustre/fsw/$OLD_SLURM_ACCOUNT" diff --git a/ci/utest/utest.sub b/ci/utest/utest.sub index 420c82479d..d49ce258dd 100644 --- a/ci/utest/utest.sub +++ b/ci/utest/utest.sub @@ -2,7 +2,6 @@ srun --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "\ cd /workdir/build/bin && \ - ./async_reader && \ ./checker_test && \ ./data_reader_test && \ ./device_map_test && \ diff --git a/samples/dlrm/README.md b/samples/dlrm/README.md index f6f55e9318..deba5b181e 100644 --- a/samples/dlrm/README.md +++ b/samples/dlrm/README.md @@ -1,13 +1,12 @@ # DLRM CTR SAMPLE # -> **Deprecation Warning**: DLRM samples are based on the [one-hot RawAsync DataReader](https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#raw) and HybridEmbedding, both of which will be deprecated in a future release. Please check out the [multi-hot RawAsync DataReader]((https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#raw)) and [embedding collection](https://nvidia-merlin.github.io/HugeCTR/main/api/hugectr_layer_book.html#embedding-collection) for alternatives. +> **Deprecation Warning**: DLRM samples are based on the [one-hot RawAsync DataReader](https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#raw) and HybridEmbedding, both of which were deprecated. Please check out the [multi-hot RawAsync DataReader]((https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#raw)) and [embedding collection](https://nvidia-merlin.github.io/HugeCTR/main/api/hugectr_layer_book.html#embedding-collection) for alternatives. 
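+
+For readers migrating off the deprecated one-hot path, the fragment below is a minimal, hedged sketch of the multi-hot alternative: the positional `AsyncParam` arguments are copied from the removed one-hot sample, and only the last two keyword flags are changed. The exact values your HugeCTR version and dataset require may differ; see `train.py` in this directory for a complete configuration.
+
+```python
+import hugectr
+
+# Hedged sketch: positional values copied from the removed one-hot dgx_a100.py
+# sample; only the two keyword flags differ, selecting the multi-hot reader.
+async_param = hugectr.AsyncParam(
+    32,                      # num_reading_threads
+    4,                       # num_batches_per_threads
+    72,                      # max_nr_per_threads
+    2,                       # io_depth
+    512,                     # io_alignment
+    True,
+    hugectr.Alignment_t.Auto,
+    multi_hot_reader=True,   # use the multi-hot RawAsync reader
+    is_dense_float=True,     # assumes dense features are stored as float
+)
+```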
-The purpose of this sample is to demonstrate how to build and train a [DLRM model](https://ai.facebook.com/blog/dlrm-an-advanced-open-source-deep-learning-recommendation-model/) with HugeCTR. +The purpose of this sample is to demonstrate how to build and train a [DLRM DCNv2 model](https://arxiv.org/abs/2008.13535) with HugeCTR. ## Table of Contents -* [Set Up the HugeCTR Docker Environmen](#set-up-the-hugectr-docker-environment) +* [Set Up the HugeCTR Docker Environment](#set-up-the-hugectr-docker-environment) * [MLPerf DLRM](#mlperf-dlrm) -* [Kaggle DLRM](#kaggle-dlrm) ## Set Up the HugeCTR Docker Environment ## You can set up the HugeCTR Docker environment by doing one of the following: @@ -34,38 +33,121 @@ $ export PYTHONPATH=/usr/local/hugectr/lib:$PYTHONPATH ## MLPerf DLRM Ensure that you've met the following requirements: -- MLPerf v1.0: DGX A100 14 nodes +- MLPerf v3.1: DGX H100 1 node, 8 nodes or 16 nodes +- Install requirements: pip install -r requirements.txt -### Preprocess the Terabyte Click Logs ## -The [Terabyte Click Logs](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) provided by CriteoLabs is used in this sample. The row count of each embedding table is limited to 40 million. The data is processed the same way as dlrm. For more information, see [Benchmarking](https://github.com/facebookresearch/dlrm#benchmarking). Each sample has 40 32-bit integers. The first integer is a label, the next 13 integers are dense features, and the last 26 integers are category features. +### Dataset downloading and preprocessing ## +Input preprocessing steps below are based on the instructions from the official reference implementation repository, see [Running the MLPerf DLRM v2 benchmark](https://github.com/mlcommons/training/tree/master/recommendation_v2/torchrec_dlrm#running-the-mlperf-dlrm-v2-benchmark). Besides, there is a final step to convert the reference implementation dataset to the raw format in order to make it consumable by HugeCTR training script. For completeness, all the steps are detailed below. -1. Download the terabyte datasets from the [Terabyte Click Logs](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) into the `"${project_home}/samples/dlrm/"` folder. +This process can take up to several days and needs 7 TB of fast storage space. The preprocessing steps do not require a GPU machine. -2. Unzip the datasets and name them in the following manner: `day_0`, `day_1`, ..., `day_23`. +**1.1** Download the dataset from https://ailab.criteo.com/ressources/criteo-1tb-click-logs-dataset-for-mlperf/. -3. Preprocess the datasets using the following command: - ```bash - # Usage: dlrm_raw input_dir output_dir --train {days for training} --test {days for testing} - $ dlrm_raw ./ ./ \ - --train 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 \ - --test 23 - ``` - This operation will generate `train.bin(671.2GB)` and `test.bin(14.3GB)`. +**1.2** Clone the reference implementation repository. +``` +git clone https://github.com/mlcommons/training.git +cd training/recommendation_v2/torchrec_dlrm +``` -### Run the Terabyte Click Logs with MLPerf v1.0 ## +**1.3** Build and run the reference docker image. +``` +docker build -t dlrmv2_reference . +docker run -it --rm --network=host --ipc=host -v /data:/data dlrmv2_reference +``` -Run the single node DGX-100 Python script using the following command: - ```shell - $ python3 dgx_a100.py - ``` +**1.4** Run preprocessing steps to get data in NumPy format. 
-Run the 14-node DGX-100 Python script using the following command: - ```shell - $ numactl --interleave=all python3 dgx_a100_14x8x640.py - ``` +``` +./scripts/process_Criteo_1TB_Click_Logs_dataset.sh \ + /data/criteo_1tb/raw_input_dataset_dir \ + /data/criteo_1tb/temp_intermediate_files_dir \ + /data/criteo_1tb/numpy_contiguous_shuffled_output_dataset_dir +``` +As a result, files named: `day_*_labels.npy`, `day_*_dense.npy` and `day_0_sparse.npy` will be created (3 per each of 24 days in the original input dataset, 72 files in total). Once completed, the output data can be verified with md5sums provided in [md5sums_preprocessed_criteo_click_logs_dataset.txt](https://github.com/mlcommons/training/blob/master/recommendation_v2/torchrec_dlrm/md5sums_preprocessed_criteo_click_logs_dataset.txt) file. + +**1.5** Create a synthetic multi-hot Criteo dataset. + +This step produces multi-hot dataset from the original (one-hot) dataset. + +``` +python scripts/materialize_synthetic_multihot_dataset.py \ + --in_memory_binary_criteo_path /data/criteo_1tb/numpy_contiguous_shuffled_output_dataset_dir \ + --output_path /data/criteo_1tb_sparse_multi_hot \ + --num_embeddings_per_feature 40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36 \ + --multi_hot_sizes 3,2,1,2,6,1,1,1,1,7,3,8,1,6,9,5,1,1,1,12,100,27,10,3,1,1 \ + --multi_hot_distribution_type uniform +``` + +As a result, `day_*_sparse_multi_hot.npz` files will be created (24 files in total). Once done, the output data can be validated with md5sums provided in [md5sums_MLPerf_v2_synthetic_multi_hot_sparse_dataset.txt](https://github.com/mlcommons/training/blob/master/recommendation_v2/torchrec_dlrm/md5sums_MLPerf_v2_synthetic_multi_hot_sparse_dataset.txt) file. + +**1.6** Convert NumPy dataset to raw format. + +Because HugeCTR uses, among others, [raw format](https://nvidia-merlin.github.io/HugeCTR/main/api/python_interface.html#raw) for input data, we need to convert NumPy files created in the preceding steps to this format. To this end, use `preprocessing/convert_to_raw.py` script that comes with the container created in section [Build the container and push to a docker registry](#build-the-container-and-push-to-a-docker-registry) below. + +``` +docker run -it --rm --network=host --ipc=host -v /data:/data nvcr.io/nvidia/merlin/merlin-hugectr:23.12 +``` +In that container, run: +``` +python preprocessing/convert_to_raw.py \ + --input_dir_labels_and_dense /data/criteo_1tb/numpy_contiguous_shuffled_output_dataset_dir \ + --input_dir_sparse_multihot /data/criteo_1tb_sparse_multi_hot \ + --output_dir /data/criteo_1tb_multihot_raw \ + --stages train val +``` +As a result, `train_data.bin` and `val_data.bin` will be created. Once done, the output files can be verified with the md5sums provided in `preprocessing/md5sums_raw_dataset.txt` file. + +### Specify the preprocessed data paths in the training script. + +You may need to manually change the location of the datasets in the `train.py` file. +The `source` parameter should specify the absolute path to the `train_data.bin` file and the `eval_source` +parameter should point to the `val_data.bin` file from `/data/criteo_1tb_multihot_raw` folder obtained in the previous step. + +However, for launching with nvidia-docker, you just need to make sure to set `DATADIR` as the path to the directory containing those two files. 
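+
+To make this concrete, the fragment below is a hedged sketch (not an excerpt from `train.py`) of how the converted files are typically handed to the RawAsync reader. Only `source` and `eval_source` matter for this section; the sample counts are taken from the removed one-hot samples, the embedding cardinalities from step 1.5, and everything else is a placeholder.
+
+```python
+import hugectr
+
+# Hedged sketch: only the two paths are the point here; the remaining values
+# are placeholders or copied from elsewhere in this README.
+reader = hugectr.DataReaderParams(
+    data_reader_type=hugectr.DataReaderType_t.RawAsync,
+    source=["/data/criteo_1tb_multihot_raw/train_data.bin"],
+    eval_source="/data/criteo_1tb_multihot_raw/val_data.bin",
+    check_type=hugectr.Check_t.Non,
+    num_samples=4195197692,     # Criteo 1TB days 0-22, as in the removed samples
+    eval_num_samples=89137319,  # as in the removed samples
+    slot_size_array=[
+        40000000, 39060, 17295, 7424, 20265, 3, 7122, 1543, 63,
+        40000000, 3067956, 405282, 10, 2209, 11938, 155, 4, 976, 14,
+        40000000, 40000000, 40000000, 590152, 12973, 108, 36,
+    ],                          # embedding cardinalities from step 1.5
+    # An AsyncParam configured for the multi-hot reader (multi_hot_reader=True,
+    # see the sketch near the top of this README) must also be supplied.
+)
+```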
+ +### Steps to launch training on a single node + +#### NVIDIA DGX H100 (single-node) + +Launch configuration and system-specific hyperparameters for the NVIDIA DGX H100 +single-node submission are in the `config_DGXH100_1x8x6912.sh` script and in the `train.py` config file. + +To launch the training on a single node with a Slurm cluster run: +``` +source config_DGXH100_1x8x6912.sh +CONT=/mlperf-nvidia:recommendation_hugectr LOGDIR= sbatch -N 1 run.sub +``` + +Note that this benchmark has high I/O bandwidth requirements. To achieve optimal performance in the case of single-node training job at least 13.4 GB/s and 41.4 GB/s read bandwidth is required during training and evaluation stage, respectively. -**IMPORTANT NOTES**: -- To run the 14-node DGX-100 training script on Selene, you need to submit the job on the Selene login node properly. -- In v2.2.1, there is a CUDA Graph error that occurs when running this sample on DGX2. To run it on DGX2, specify `"use_cuda_graph = False` within `CreateSolver` in the Python script. For detailed information about this error, see [Known Issues](https://github.com/NVIDIA-Merlin/HugeCTR/blob/master/release_notes.md#known-issues). -- `cache_eval_data` is only supported on DGX A100. If you're running DGX2, disable it. +#### Alternative launch with docker + +When generating results for the official v3.0 submission with one node, the +benchmark was launched onto a cluster managed by a Slurm scheduler. The +instructions in [NVIDIA DGX H100 (single node)](#nvidia-dgx-h100-single-node) explain +how that is done. + +However, to make it easier to run this benchmark on a wider set of machine +environments, we are providing here an alternate set of launch instructions +that can be run using `nvidia-docker`. Note that performance or functionality may +vary from the tested Slurm instructions. + +``` +source config_DGXH100_1x8x6912.sh +CONT=mlperf-nvidia:recommendation_hugectr DATADIR= LOGDIR= ./run_with_docker.sh +``` + +### Steps to launch training on multiple nodes + +#### NVIDIA DGX H100 (multi-node) + +Launch configuration and system-specific hyperparameters for the NVIDIA DGX H100 +multi-node submission are in the `config_DGXH100_8x8x2112.sh` or `config_DGXH100_16x8x1056.sh` scripts +and in the `train.py` config file. 
+ +To launch the training for a selected config with a Slurm cluster run: +``` +source config_DGXH100_8x8x2112.sh +CONT=/mlperf-nvidia:recommendation_hugectr LOGDIR= sbatch -N $DGXNNODES run.sub +``` diff --git a/samples/dlrm/config_DGXH100_16x8x1056.sh b/samples/dlrm/config_DGXH100_16x8x1056.sh new file mode 100644 index 0000000000..ad0b29b05d --- /dev/null +++ b/samples/dlrm/config_DGXH100_16x8x1056.sh @@ -0,0 +1,32 @@ +## DL params +export RUN_SCRIPT="train.py" +export BATCHSIZE=135168 +export BATCHSIZE_EVAL=2097152 +export LEARNING_RATE=0.0034 +export USE_MIXED_PRECISION=true +export SCALER=20480 +export SHARDING_PLAN=hier_auto +export MEM_COMM_BW_RATIO=67 +export GEN_LOSS_SUMMARY=true +export MINIMUM_TRAINING_TIME=10 +export DP_SHARDING_THRESHOLD=0.0125 + +## System run params +export DGXNNODES=16 +export DGXNGPU=8 +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) +WALLTIME_MINUTES=15 + +## Set clocks and walltime for maxQ and minEDP runs +if [[ "${SET_MAXQ_CLK:-0}" == "1" ]]; then + export MAXQ_CLK=1350 + WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 2) # 50% longer walltime +elif [[ "${SET_MINEDP_CLK:-0}" == "1" ]]; then + export MINEDP_CLK=1410 + WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 3) # 33% longer walltime +fi +export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) + +## network flags +export SBATCH_NETWORK=sharp +export NCCL_COLLNET_ENABLE=1 diff --git a/samples/dlrm/config_DGXH100_1x8x6912.sh b/samples/dlrm/config_DGXH100_1x8x6912.sh new file mode 100644 index 0000000000..0d72dfcc76 --- /dev/null +++ b/samples/dlrm/config_DGXH100_1x8x6912.sh @@ -0,0 +1,28 @@ +## DL params +export RUN_SCRIPT="train.py" +export BATCHSIZE=55296 +export BATCHSIZE_EVAL=262144 +export LEARNING_RATE=0.004 +export USE_MIXED_PRECISION=true +export SCALER=16348 +export SHARDING_PLAN=auto +export MEM_COMM_BW_RATIO=7 +export GEN_LOSS_SUMMARY=true +export MINIMUM_TRAINING_TIME=10 +export DP_SHARDING_THRESHOLD=0.008 + +## System run params +export DGXNNODES=1 +export DGXNGPU=8 +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) +WALLTIME_MINUTES=15 + +## Set clocks and walltime for maxQ and minEDP runs +if [[ "${SET_MAXQ_CLK:-0}" == "1" ]]; then + export MAXQ_CLK=1320 + WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 2) # 50% longer walltime +elif [[ "${SET_MINEDP_CLK:-0}" == "1" ]]; then + export MINEDP_CLK=1665 + WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 3) # 33% longer walltime +fi +export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) diff --git a/samples/dlrm/config_DGXH100_8x8x2112.sh b/samples/dlrm/config_DGXH100_8x8x2112.sh new file mode 100644 index 0000000000..8db218a4c4 --- /dev/null +++ b/samples/dlrm/config_DGXH100_8x8x2112.sh @@ -0,0 +1,32 @@ +## DL params +export RUN_SCRIPT="train.py" +export BATCHSIZE=135168 +export BATCHSIZE_EVAL=1048576 +export LEARNING_RATE=0.0034 +export USE_MIXED_PRECISION=true +export SCALER=20480 +export SHARDING_PLAN=hier_auto +export MEM_COMM_BW_RATIO=67 +export GEN_LOSS_SUMMARY=true +export MINIMUM_TRAINING_TIME=10 +export DP_SHARDING_THRESHOLD=0.0125 + +## System run params +export DGXNNODES=8 +export DGXNGPU=8 +export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) +WALLTIME_MINUTES=15 + +## Set clocks and walltime for maxQ and minEDP runs +if [[ "${SET_MAXQ_CLK:-0}" == "1" ]]; then + export MAXQ_CLK=1275 + 
WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 2) # 50% longer walltime +elif [[ "${SET_MINEDP_CLK:-0}" == "1" ]]; then + export MINEDP_CLK=1530 + WALLTIME_MINUTES=$(expr ${WALLTIME_MINUTES} + ${WALLTIME_MINUTES} / 3) # 33% longer walltime +fi +export WALLTIME=$((${NEXP:-1} * ${WALLTIME_MINUTES} + 5 )) + +## network flags +export SBATCH_NETWORK=sharp +export NCCL_COLLNET_ENABLE=1 diff --git a/samples/dlrm/dgx_a100.py b/samples/dlrm/dgx_a100.py deleted file mode 100644 index 28e37eb8af..0000000000 --- a/samples/dlrm/dgx_a100.py +++ /dev/null @@ -1,229 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -# 1. Create Solver, DataReaderParams and Optimizer -solver = hugectr.CreateSolver( - max_eval_batches=51, - batchsize_eval=1769472, - batchsize=55296, - vvgpu=[[0, 1, 2, 3, 4, 5, 6, 7]], - repeat_dataset=True, - lr=24.0, - warmup_steps=2750, - decay_start=49315, - decay_steps=27772, - decay_power=2.0, - end_lr=0.0, - use_mixed_precision=True, - scaler=1024, - use_cuda_graph=True, - gen_loss_summary=False, - train_intra_iteration_overlap=True, - train_inter_iteration_overlap=True, - eval_intra_iteration_overlap=True, # doesn't do anything - eval_inter_iteration_overlap=True, - all_reduce_algo=hugectr.AllReduceAlgo.OneShot, - grouped_all_reduce=False, - num_iterations_statistics=20, - metrics_spec={hugectr.MetricsType.AUC: 0.8025}, - perf_logging=True, - drop_incomplete_batch=False, -) - -batchsize = 55296 -num_reading_threads = 32 -num_batches_per_threads = 4 -expected_io_block_size = batchsize * 10 -io_depth = 2 -io_alignment = 512 -bytes_size_per_batches = (26 + 1 + 13) * 4 * batchsize -max_nr_per_threads = num_batches_per_threads * ( - bytes_size_per_batches // expected_io_block_size + 2 -) - -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.RawAsync, - source=["/raid/datasets/criteo/mlperf/40m.limit_preshuffled/train_data.bin"], - eval_source="/raid/datasets/criteo/mlperf/40m.limit_preshuffled/test_data.bin", - check_type=hugectr.Check_t.Non, - num_samples=4195197692, - eval_num_samples=89137319, - cache_eval_data=51, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - # max_nr_per_threads = num_batches_per_threads * (bytes_size_per_batches / io_block_size + 2) - # max_nr_per_threads = 4 * (55296 * 160 / 552960 + 2 ) = 4 * 18 = 72 - async_param=hugectr.AsyncParam( - num_reading_threads, - num_batches_per_threads, - max_nr_per_threads, - io_depth, - io_alignment, - True, - hugectr.Alignment_t.Auto, - multi_hot_reader=False, - is_dense_float=False, - ), -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, update_type=hugectr.Update_t.Local, atomic_update=True -) -# 2. 
Initialize the Model instance -model = hugectr.Model(solver, reader, optimizer) -# 3. Construct the Model graph -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 1, True, 26)], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.HybridSparseEmbedding, - workspace_size_per_gpu_in_mb=15000, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - embedding_vec_size=128, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - hybrid_embedding_param=hugectr.HybridEmbeddingParam( - 2, - -1, - 0.03, - 1.3e11, - 2.6e11, - 1.0, - hugectr.CommunicationType.NVLink_SingleNode, - hugectr.HybridEmbeddingType.Distributed, - ), - ) -) - -compute_config = hugectr.DenseLayerComputeConfig( - async_wgrad=True, -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["dense"], - top_names=["mlp1"], - num_outputs=[512, 256, 128], - compute_config=compute_config, - act_type=hugectr.Activation_t.Relu, - use_bias=True, - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["mlp1", "sparse_embedding1"], - top_names=["interaction1", "interaction_grad"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["interaction1", "interaction_grad"], - top_names=["mlp2"], - num_outputs=[1024, 1024, 512, 256, 1], - compute_config=compute_config, - use_bias=True, - activations=[ - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Non, - ], - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["mlp2", "label"], - top_names=["loss"], - ) -) -# 4. Dump the Model graph to JSON -model.graph_to_json(graph_config_file="dlrm.json") -# 5. Compile & Fit -model.compile() -model.summary() -model.fit( - max_iter=75868, display=1000, eval_interval=3793, snapshot=2000000, snapshot_prefix="dlrm" -) diff --git a/samples/dlrm/dgx_a100_14x8x640.py b/samples/dlrm/dgx_a100_14x8x640.py deleted file mode 100644 index 0e207dab6d..0000000000 --- a/samples/dlrm/dgx_a100_14x8x640.py +++ /dev/null @@ -1,248 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -# 1. 
Create Solver, DataReaderParams and Optimizer -solver = hugectr.CreateSolver( - max_eval_batches=50, - batchsize_eval=1792000, - batchsize=71680, - vvgpu=[ - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - ], - repeat_dataset=True, - lr=26.0, - warmup_steps=2500, - decay_start=46821, - decay_steps=15406, - decay_power=2.0, - end_lr=0.0, - use_mixed_precision=True, - scaler=1024, - use_cuda_graph=True, - gen_loss_summary=False, - train_intra_iteration_overlap=True, - train_inter_iteration_overlap=True, - eval_intra_iteration_overlap=True, - eval_inter_iteration_overlap=True, - all_reduce_algo=hugectr.AllReduceAlgo.OneShot, - grouped_all_reduce=True, - num_iterations_statistics=20, - metrics_spec={hugectr.MetricsType.AUC: 0.8025}, - perf_logging=True, - drop_incomplete_batch=False, -) - -batchsize = 71680 -num_reading_threads = 32 -num_batches_per_threads = 4 -expected_io_block_size = batchsize * 10 -io_depth = 2 -io_alignment = 512 -bytes_size_per_batches = (26 + 1 + 13) * 4 * batchsize -max_nr_per_threads = num_batches_per_threads * ( - bytes_size_per_batches // expected_io_block_size + 2 -) - -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.RawAsync, - source=["/raid/datasets/criteo/mlperf/40m.limit_preshuffled/train_data.bin"], - eval_source="/raid/datasets/criteo/mlperf/40m.limit_preshuffled/test_data.bin", - check_type=hugectr.Check_t.Non, - num_samples=4195197692, - eval_num_samples=89137319, - cache_eval_data=50, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - async_param=hugectr.AsyncParam( - num_reading_threads, - num_batches_per_threads, - max_nr_per_threads, - io_depth, - io_alignment, - True, - hugectr.Alignment_t.Auto, - multi_hot_reader=False, - is_dense_float=False, - ), -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, update_type=hugectr.Update_t.Local, atomic_update=True -) -# 2. Initialize the Model instance -model = hugectr.Model(solver, reader, optimizer) -# 3. 
Construct the Model graph -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 1, True, 26)], - ) -) - -# Use mean num of infrequent plus 10-sigma guardband -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.HybridSparseEmbedding, - workspace_size_per_gpu_in_mb=1500, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - embedding_vec_size=128, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - hybrid_embedding_param=hugectr.HybridEmbeddingParam( - 2, - 16640 + 1290, - 0.01, - 1.3e11, - 23.75e9, - 0.5, - hugectr.CommunicationType.IB_NVLink_Hier, - hugectr.HybridEmbeddingType.Distributed, - ), - ) -) -compute_config_bottom = hugectr.DenseLayerComputeConfig( - async_wgrad=True, - fuse_wb=False, -) - -compute_config_top = hugectr.DenseLayerComputeConfig( - async_wgrad=True, - fuse_wb=True, -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["dense"], - top_names=["mlp1"], - num_outputs=[512, 256, 128], - compute_config=compute_config_bottom, - act_type=hugectr.Activation_t.Relu, - use_bias=True, - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["mlp1", "sparse_embedding1"], - top_names=["interaction1", "interaction_grad"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["interaction1", "interaction_grad"], - top_names=["mlp2"], - num_outputs=[1024, 1024, 512, 256, 1], - compute_config=compute_config_top, - use_bias=True, - activations=[ - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Non, - ], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["mlp2", "label"], - top_names=["loss"], - ) -) -# 4. Dump the Model graph to JSON -model.graph_to_json(graph_config_file="dlrm.json") -# 5. Compile & Fit -model.compile() -model.summary() -model.fit( - max_iter=58527, display=1000, eval_interval=2926, snapshot=2000000, snapshot_prefix="dlrm" -) diff --git a/samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py b/samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py deleted file mode 100755 index da09611afb..0000000000 --- a/samples/dlrm/dgx_a100_8x8x1120_ib_nvlink.py +++ /dev/null @@ -1,244 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -# 1. 
Create Solver, DataReaderParams and Optimizer -solver = hugectr.CreateSolver( - max_eval_batches=125, - batchsize_eval=716800, - batchsize=71680, - vvgpu=[ - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - [0, 1, 2, 3, 4, 5, 6, 7], - ], - repeat_dataset=True, - lr=26.0, - warmup_steps=2500, - decay_start=46821, - decay_steps=15406, - decay_power=2.0, - end_lr=0.0, - use_mixed_precision=True, - scaler=1024, - use_cuda_graph=False, - gen_loss_summary=False, - train_intra_iteration_overlap=True, - train_inter_iteration_overlap=True, - eval_intra_iteration_overlap=False, - eval_inter_iteration_overlap=False, - all_reduce_algo=hugectr.AllReduceAlgo.NCCL, - grouped_all_reduce=True, - num_iterations_statistics=20, - metrics_spec={hugectr.MetricsType.AUC: 0.8025}, - perf_logging=True, - drop_incomplete_batch=False, -) - -batchsize = 71680 -num_reading_threads = 32 -num_batches_per_threads = 4 -expected_io_block_size = batchsize * 10 -io_depth = 2 -io_alignment = 512 -bytes_size_per_batches = (26 + 1 + 13) * 4 * batchsize -max_nr_per_threads = num_batches_per_threads * ( - bytes_size_per_batches // expected_io_block_size + 2 -) - -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.RawAsync, - source=["/data/train_data.bin"], - eval_source="/data/test_data.bin", - check_type=hugectr.Check_t.Non, - num_samples=4195197692, - eval_num_samples=89137319, - cache_eval_data=125, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - async_param=hugectr.AsyncParam( - num_reading_threads, - num_batches_per_threads, - max_nr_per_threads, - io_depth, - io_alignment, - True, - hugectr.Alignment_t.Auto, - multi_hot_reader=False, - is_dense_float=False, - ), -) - -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, update_type=hugectr.Update_t.Local, atomic_update=True -) -# 2. Initialize the Model instance -model = hugectr.Model(solver, reader, optimizer) -# 3. 
Construct the Model graph -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 1, True, 26)], - ) -) - -# Use mean num of infrequent plus 10-sigma guardband -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.HybridSparseEmbedding, - workspace_size_per_gpu_in_mb=1500, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - embedding_vec_size=128, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - hybrid_embedding_param=hugectr.HybridEmbeddingParam( - 2, - 16640 + 1290, - 0.01, - 130e9, - 25e9, - 1, - hugectr.CommunicationType.IB_NVLink, - hugectr.HybridEmbeddingType.Distributed, - ), - ) -) - -compute_config_bottom = hugectr.DenseLayerComputeConfig( - async_wgrad=True, - fuse_wb=False, -) - -compute_config_top = hugectr.DenseLayerComputeConfig( - async_wgrad=True, - fuse_wb=True, -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["dense"], - top_names=["mlp1"], - num_outputs=[512, 256, 128], - compute_config=compute_config_bottom, - act_type=hugectr.Activation_t.Relu, - use_bias=True, - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["mlp1", "sparse_embedding1"], - top_names=["interaction1", "interaction_grad"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["interaction1", "interaction_grad"], - top_names=["mlp2"], - num_outputs=[1024, 1024, 512, 256, 1], - compute_config=compute_config_top, - use_bias=True, - activations=[ - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Non, - ], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["mlp2", "label"], - top_names=["loss"], - ) -) -# 4. Dump the Model graph to JSON -model.graph_to_json(graph_config_file="dlrm.json") -# 5. Compile & Fit -model.compile() -model.summary() -model.fit( - max_iter=58527, display=1000, eval_interval=2926, snapshot=2000000, snapshot_prefix="dlrm" -) diff --git a/samples/dlrm/dgx_a100_ib_nvlink.py b/samples/dlrm/dgx_a100_ib_nvlink.py deleted file mode 100755 index 3e34d649c6..0000000000 --- a/samples/dlrm/dgx_a100_ib_nvlink.py +++ /dev/null @@ -1,228 +0,0 @@ -""" - Copyright (c) 2023, NVIDIA CORPORATION. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import hugectr -from mpi4py import MPI - -# 1. 
Create Solver, DataReaderParams and Optimizer -solver = hugectr.CreateSolver( - max_eval_batches=323, - batchsize_eval=276480, - batchsize=55296, - vvgpu=[[0, 1, 2, 3, 4, 5, 6, 7]], - repeat_dataset=True, - lr=24.0, - warmup_steps=2750, - decay_start=49315, - decay_steps=27772, - decay_power=2.0, - end_lr=0.0, - use_mixed_precision=True, - scaler=1024, - use_cuda_graph=False, - gen_loss_summary=False, - train_intra_iteration_overlap=True, - train_inter_iteration_overlap=True, - eval_intra_iteration_overlap=False, - eval_inter_iteration_overlap=False, - all_reduce_algo=hugectr.AllReduceAlgo.NCCL, - grouped_all_reduce=False, - num_iterations_statistics=20, - metrics_spec={hugectr.MetricsType.AUC: 0.8025}, - perf_logging=True, - drop_incomplete_batch=False, -) - -batchsize = 55296 -num_reading_threads = 32 -num_batches_per_threads = 4 -expected_io_block_size = batchsize * 10 -io_depth = 2 -io_alignment = 512 -bytes_size_per_batches = (26 + 1 + 13) * 4 * batchsize -max_nr_per_threads = num_batches_per_threads * ( - bytes_size_per_batches // expected_io_block_size + 2 -) - -reader = hugectr.DataReaderParams( - data_reader_type=hugectr.DataReaderType_t.RawAsync, - source=["/data/train_data.bin"], - eval_source="/data/test_data.bin", - check_type=hugectr.Check_t.Non, - num_samples=4195197692, - eval_num_samples=89137319, - cache_eval_data=323, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - async_param=hugectr.AsyncParam( - num_reading_threads, - num_batches_per_threads, - max_nr_per_threads, - io_depth, - io_alignment, - True, - hugectr.Alignment_t.Auto, - multi_hot_reader=False, - is_dense_float=False, - ), -) -optimizer = hugectr.CreateOptimizer( - optimizer_type=hugectr.Optimizer_t.SGD, update_type=hugectr.Update_t.Local, atomic_update=True -) -# 2. Initialize the Model instance -model = hugectr.Model(solver, reader, optimizer) -# 3. 
Construct the Model graph -model.add( - hugectr.Input( - label_dim=1, - label_name="label", - dense_dim=13, - dense_name="dense", - data_reader_sparse_param_array=[hugectr.DataReaderSparseParam("data1", 1, True, 26)], - ) -) -model.add( - hugectr.SparseEmbedding( - embedding_type=hugectr.Embedding_t.HybridSparseEmbedding, - workspace_size_per_gpu_in_mb=15000, - slot_size_array=[ - 39884406, - 39043, - 17289, - 7420, - 20263, - 3, - 7120, - 1543, - 63, - 38532951, - 2953546, - 403346, - 10, - 2208, - 11938, - 155, - 4, - 976, - 14, - 39979771, - 25641295, - 39664984, - 585935, - 12972, - 108, - 36, - ], - embedding_vec_size=128, - combiner="sum", - sparse_embedding_name="sparse_embedding1", - bottom_name="data1", - optimizer=optimizer, - hybrid_embedding_param=hugectr.HybridEmbeddingParam( - 2, - -1, - 0.03, - 130e9, - 260e9, - 0.25, - hugectr.CommunicationType.IB_NVLink, - hugectr.HybridEmbeddingType.Distributed, - ), - ) -) - -compute_config = hugectr.DenseLayerComputeConfig( - async_wgrad=True, -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["dense"], - top_names=["mlp1"], - num_outputs=[512, 256, 128], - compute_config=compute_config, - act_type=hugectr.Activation_t.Relu, - use_bias=True, - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.Interaction, - bottom_names=["mlp1", "sparse_embedding1"], - top_names=["interaction1", "interaction_grad"], - ) -) -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.MLP, - bottom_names=["interaction1", "interaction_grad"], - top_names=["mlp2"], - num_outputs=[1024, 1024, 512, 256, 1], - compute_config=compute_config, - use_bias=True, - activations=[ - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Relu, - hugectr.Activation_t.Non, - ], - ) -) - -model.add( - hugectr.DenseLayer( - layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, - bottom_names=["mlp2", "label"], - top_names=["loss"], - ) -) -# 4. Dump the Model graph to JSON -model.graph_to_json(graph_config_file="dlrm.json") -# 5. Compile & Fit -model.compile() -model.summary() -model.fit( - max_iter=75868, display=1000, eval_interval=3793, snapshot=2000000, snapshot_prefix="dlrm" -) diff --git a/samples/dlrm/mlperf_logger/__init__.py b/samples/dlrm/mlperf_logger/__init__.py new file mode 100644 index 0000000000..54e9dec7d6 --- /dev/null +++ b/samples/dlrm/mlperf_logger/__init__.py @@ -0,0 +1,3 @@ +from .callbacks import LoggingCallback +from .param_info import param_info +from .utils import * diff --git a/samples/dlrm/mlperf_logger/callbacks.py b/samples/dlrm/mlperf_logger/callbacks.py new file mode 100644 index 0000000000..1359b44f80 --- /dev/null +++ b/samples/dlrm/mlperf_logger/callbacks.py @@ -0,0 +1,118 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
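# Editor's note: illustrative sketch, not part of the original patch. It shows how the
# LoggingCallback defined below is typically constructed (train.py in this sample does
# essentially the same); the helper name and the literal values are assumptions, and the
# step that registers the callback with the HugeCTR Model is not shown in this file.
def _example_logging_callback():
    from mlperf_common.frameworks.hugectr import HCTRCommunicationHandler
    from mlperf_common.logging import MLLoggerWrapper

    mllogger = MLLoggerWrapper(HCTRCommunicationHandler(), value=None)
    return LoggingCallback(
        mllogger,               # MLPerf logging wrapper shared with param_info()
        auc_threshold=0.80275,  # stop criterion checked in on_eval_end()
        max_iter=58527,         # iteration count treated as one training epoch
        batch_size=71680,       # global training batch size
    )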
+ +from time import perf_counter +from typing import Dict + +import mlperf_logging.mllog.constants as mlperf_constants +from mlperf_common.logging import MLLoggerWrapper + +import hugectr + + +class LoggingCallback(hugectr.TrainingCallback): + def __init__( + self, + mllogger: MLLoggerWrapper, + auc_threshold: float, + max_iter: int, + batch_size: int, + ): + self.mllogger = mllogger + self.auc_threshold = auc_threshold + self.iter_per_epoch = max_iter + self.batch_size = batch_size + self._success = False + self._start_time = -1.0 + self._total_time = -1.0 + self._throughput = -1.0 + self._hit_auc_iter = max_iter + self.minimum_training_time = 0 + super().__init__() + + def _compute_stats(self, current_iter: int): + self._total_time = perf_counter() - self._start_time + self._throughput = (current_iter + 1) * self.batch_size / self._total_time + + def on_training_start(self): + self._start_time = perf_counter() + self.mllogger.log_init_stop_run_start() + self.mllogger.start( + key=mlperf_constants.EPOCH_START, + metadata={mlperf_constants.EPOCH_NUM: 0}, + ) + + def on_training_end(self, current_iter: int): + epoch_num = current_iter / self.iter_per_epoch + self.mllogger.end( + key=mlperf_constants.EPOCH_STOP, + metadata={mlperf_constants.EPOCH_NUM: epoch_num}, + ) + if not self._success: + self.mllogger.log_run_stop(status=mlperf_constants.ABORTED, epoch_num=epoch_num) + self._compute_stats(current_iter) + if self.minimum_training_time > 0: + output_max_iter = current_iter + 1 + else: + output_max_iter = self.iter_per_epoch + if self.mllogger.comm_handler.global_rank() == 0: + if self._success: + print( + f"Hit target accuracy AUC {self.auc_threshold:.5f} at " + f"{self._hit_auc_iter} / {output_max_iter} iterations with batchsize {self.batch_size} " + f"in {self._total_time:.2f}s. Average speed is {self._throughput:.2f} records/s." + ) + else: + print( + f"Finish {current_iter + 1} iterations with " + f"batchsize: {self.batch_size} in {self._total_time:.2f}s." + ) + self.mllogger.event( + key="tracked_stats", + metadata={"step": current_iter / self.iter_per_epoch}, + value={"throughput": self._throughput}, + ) + + def on_eval_start(self, current_iter: int) -> bool: + self.mllogger.start( + key=mlperf_constants.EVAL_START, + metadata={mlperf_constants.EPOCH_NUM: current_iter / self.iter_per_epoch}, + ) + return False + + def on_eval_end(self, current_iter: int, eval_results: Dict[str, float]) -> bool: + epoch_num = current_iter / self.iter_per_epoch + auc = eval_results["AUC"] + self.mllogger.event( + key=mlperf_constants.EVAL_ACCURACY, + value=auc, + metadata={mlperf_constants.EPOCH_NUM: epoch_num}, + ) + self.mllogger.end( + key=mlperf_constants.EVAL_STOP, + metadata={mlperf_constants.EPOCH_NUM: epoch_num}, + ) + if not self._success: + self._success = auc >= self.auc_threshold + if self._success: + self.mllogger.log_run_stop(status=mlperf_constants.SUCCESS, epoch_num=epoch_num) + self._hit_auc_iter = current_iter + self._total_time = perf_counter() - self._start_time + if self.minimum_training_time > 0: + if self._total_time < self.minimum_training_time * 60: + return False + else: + return True + else: + return self._success diff --git a/samples/dlrm/mlperf_logger/param_info.py b/samples/dlrm/mlperf_logger/param_info.py new file mode 100644 index 0000000000..bcb7bd6dd0 --- /dev/null +++ b/samples/dlrm/mlperf_logger/param_info.py @@ -0,0 +1,70 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import Namespace + +import mlperf_logging.mllog.constants as mllog_constants +from mlperf_common.logging import MLLoggerWrapper + +# Parameters not supported in HugeCTR: +ADAGRAD_LR_DECAY = 0 +WEIGHT_DECAY = 0 +GRADIENT_ACC_STEPS = 1 + + +def param_info(mllogger: MLLoggerWrapper, args: Namespace): + mllogger.event( + key=mllog_constants.GLOBAL_BATCH_SIZE, + value=args.batchsize, + ) + mllogger.event( + key=mllog_constants.OPT_NAME, + value=args.optimizer, + ) + mllogger.event( + key=mllog_constants.OPT_BASE_LR, + value=args.lr, + ) + mllogger.event( + key=mllog_constants.OPT_ADAGRAD_LR_DECAY, + value=ADAGRAD_LR_DECAY, + ) + mllogger.event( + key=mllog_constants.OPT_WEIGHT_DECAY, + value=WEIGHT_DECAY, + ) + mllogger.event( + key=mllog_constants.OPT_ADAGRAD_INITIAL_ACCUMULATOR_VALUE, + value=args.init_accu, + ) + mllogger.event( + key=mllog_constants.OPT_ADAGRAD_EPSILON, + value=args.eps, + ) + mllogger.event( + key=mllog_constants.OPT_LR_WARMUP_STEPS, + value=args.warmup_steps, + ) + mllogger.event( + key=mllog_constants.OPT_LR_DECAY_START_STEP, + value=args.decay_start, + ) + mllogger.event( + key=mllog_constants.OPT_LR_DECAY_STEPS, + value=args.decay_steps, + ) + mllogger.event( + key=mllog_constants.GRADIENT_ACCUMULATION_STEPS, + value=GRADIENT_ACC_STEPS, + ) diff --git a/tools/io_benchmark/CMakeLists.txt b/samples/dlrm/mlperf_logger/utils.py similarity index 56% rename from tools/io_benchmark/CMakeLists.txt rename to samples/dlrm/mlperf_logger/utils.py index 85f979e4fd..a81e599dd0 100644 --- a/tools/io_benchmark/CMakeLists.txt +++ b/samples/dlrm/mlperf_logger/utils.py @@ -1,25 +1,20 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # -# Copyright (c) 2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# +# +# http://www.apache.org/licenses/LICENSE-2.0 +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# -cmake_minimum_required(VERSION 3.20) -find_package(CUDAToolkit) +import os -file(GLOB data_reader_bench_src - main.cpp -) -add_executable(io_bench ${data_reader_bench_src}) -target_link_libraries(io_bench PUBLIC CUDA::nvml huge_ctr_shared) -target_compile_features(io_bench PUBLIC cxx_std_17 cuda_std_17) +def get_row_count(data_path: str, num_columns: int, bytes_per_value: int) -> int: + """Get number of rows for a dataset in raw format.""" + return os.path.getsize(data_path) // (num_columns * bytes_per_value) diff --git a/samples/dlrm/preprocessing/convert_to_raw.py b/samples/dlrm/preprocessing/convert_to_raw.py new file mode 100644 index 0000000000..c49fb0cdeb --- /dev/null +++ b/samples/dlrm/preprocessing/convert_to_raw.py @@ -0,0 +1,251 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import os +import time + +import numpy as np + +""" +Script to convert the reference TorchRec NumPy dataset to a binary raw format for HugeCTR training. + +The script requires a machine with about 200GB RAM as it reads all three +day_*_labels.npy, day_*_dense.npy and day_*_sparse_multi_hot.npz files into memory. +It should complete in about 5h hours (depending on I/O bandwidth). 
+ +For the MLPerf Training v3.0 the expected md5sum of the output files are: + +| file | md5sum | +|:---------------|:---------------------------------| +| test_data.bin | cf636876d8baf0776287be23b31c2f14 | +| train_data.bin | 4d48daf07cc244f6fa933b832d7fe5a3 | +| val_data.bin | c7ca591ad3fd2b09b75d99fa4fc210e2 | +""" + +INPUT_LABELS_FILE = "day_{day}_labels.npy" +INPUT_DENSE_FILE = "day_{day}_dense.npy" +INPUT_SPARSE_FILE = "day_{day}_sparse_multi_hot.npz" +OUTPUT_FILE = "{stage}_data.bin" +NUM_DAYS = 24 +NUM_SPARSE = 26 +TRAIN, VAL, TEST = "train", "val", "test" +STAGES = (TRAIN, VAL, TEST) +LAST_DAY_TEST_VAL_SPLIT_POINT = 89_137_319 + + +class DataConverter: + def __init__( + self, + input_dir_labels_and_dense: str, + input_dir_sparse_multihot: str, + output_dir: str, + stage: str, + buffer_size: int, + chunk_size: int, + logger: logging.Logger, + logging_interval: int, + ): + self.input_dir_labels_and_dense = input_dir_labels_and_dense + self.input_dir_sparse_multihot = input_dir_sparse_multihot + self.output_file = os.path.join(output_dir, OUTPUT_FILE.format(stage=stage)) + self.logger = logger + self.logging_interval = logging_interval + self.stage = stage + self.buffer_size = buffer_size + self.chunk_size = chunk_size + self.days = self._get_days_for_stage() + self.slice_ = self._get_slice_for_stage() + + def _get_days_for_stage(self): + if self.stage == TRAIN: + return list(range(NUM_DAYS - 1)) + else: + return [NUM_DAYS - 1] + + def _get_slice_for_stage(self): + slice_ = None + if self.stage == VAL: + slice_ = slice(None, LAST_DAY_TEST_VAL_SPLIT_POINT) + elif self.stage == TEST: + slice_ = slice(LAST_DAY_TEST_VAL_SPLIT_POINT, None) + self.logger.debug(f"stage = {self.stage}, slice_ = {slice_}") + return slice_ + + def _read_metadata(self, f): + np.lib.format.read_magic(f) + shape, fortran_order, dtype = np.lib.format.read_array_header_1_0(f) + assert not fortran_order, "C-like index order expected" + self.logger.debug(f"Data shape = {shape}") + return shape, dtype + + def _load_data_for_day(self, day): + labels_file = INPUT_LABELS_FILE.format(day=day) + dense_file = INPUT_DENSE_FILE.format(day=day) + sparse_file = INPUT_SPARSE_FILE.format(day=day) + + self.logger.debug(f"Loading {labels_file}...") + with open(os.path.join(self.input_dir_labels_and_dense, labels_file), "rb") as f: + _, dtype = self._read_metadata(f) + label = np.fromfile(f, dtype=dtype) + self.logger.debug("Loading done") + + self.logger.debug(f"Loading {dense_file}...") + with open(os.path.join(self.input_dir_labels_and_dense, dense_file), "rb") as f: + shape, dtype = self._read_metadata(f) + dense = np.fromfile(f, dtype=dtype).reshape(shape, order="C") + self.logger.debug("Loading done") + + self.logger.debug(f"Loading {sparse_file}...") + sparse_dict = np.load(os.path.join(self.input_dir_sparse_multihot, sparse_file)) + sparse_list = [sparse_dict[str(i)] for i in range(NUM_SPARSE)] + self.logger.debug("Loading done") + + if self.slice_ is not None: + self.logger.debug("Slicing data...") + label = label[self.slice_] + dense = dense[self.slice_] + sparse_list = [sparse[self.slice_] for sparse in sparse_list] + self.logger.debug("Slicing done") + + return label, dense, sparse_list + + def save(self): + self.logger.info(f"Writing data to {self.output_file}...") + samples_total = 0 + start_time = time.perf_counter() + with open(self.output_file, "wb", buffering=self.buffer_size) as out: + write = out.write + for day in self.days: + self.logger.info(f"Processing data for day = {day}...") + label, dense, sparse_list = 
self._load_data_for_day(day) + # We concatenate sparse features as it saves time on writing + # data below. It is done in chunks to save memory. + start = 0 + end = self.chunk_size + while start < len(label): + self.logger.debug("Concatenating sparse features...") + sparse = np.concatenate( + [sparse_feat[start:end] for sparse_feat in sparse_list], axis=1 + ) + self.logger.debug("Concatenating done") + for samples_total, (label_row, dense_row, sparse_row) in enumerate( + zip(label[start:end], dense[start:end], sparse), samples_total + 1 + ): + write(label_row.tobytes()) + write(dense_row.tobytes()) + write(sparse_row.tobytes()) + if samples_total % self.logging_interval == 0: + self.logger.info(f"Number of samples done: {samples_total:,}") + start = end + end += self.chunk_size + end_time = time.perf_counter() + self.logger.info(f"Creating {self.output_file} done.") + self.logger.info( + f"Total number of samples done for stage = {self.stage}: {samples_total:,}" + ) + self.logger.info(f"Throughput: {samples_total / (end_time - start_time):.2f} [samples/sec]") + + +def get_logger(level): + logger = logging.getLogger(__name__) + logger.setLevel(level) + s_handler = logging.StreamHandler() + log_format = logging.Formatter("[%(asctime)s][%(levelname)s]: %(message)s") + s_handler.setFormatter(log_format) + logger.addHandler(s_handler) + return logger + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="NumPy to Raw format conversion script.") + parser.add_argument( + "--input_dir_labels_and_dense", + type=str, + required=True, + help="Input directory with labels and dense data", + ) + parser.add_argument( + "--input_dir_sparse_multihot", + type=str, + required=True, + help="Input directory with sparse multi-hot data", + ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Output directory for the raw binary dataset", + ) + parser.add_argument( + "--stages", + type=str, + choices=STAGES, + default=STAGES, + nargs="+", + help="Stages to process", + ) + parser.add_argument( + "--buffer_size", + type=int, + default=2_147_483_647, + help="Buffer size for writing data", + ) + parser.add_argument( + "--chunk_size", + type=int, + default=2_000_000, + help="Chunk size for concatenating sparse features before saving", + ) + parser.add_argument( + "--logging_level", + type=int, + default=logging.INFO, + help="Logging level", + ) + parser.add_argument( + "--logging_interval", + type=int, + default=10_000_000, + help="Logging interval for the number of samples done", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + logger = get_logger(level=args.logging_level) + logger.info("NumPy to Raw format conversion script") + logger.info(f"args are = {vars(args)}") + + os.makedirs(args.output_dir, exist_ok=True) + for stage in args.stages: + converter = DataConverter( + input_dir_labels_and_dense=args.input_dir_labels_and_dense, + input_dir_sparse_multihot=args.input_dir_sparse_multihot, + output_dir=args.output_dir, + stage=stage, + buffer_size=args.buffer_size, + chunk_size=args.chunk_size, + logger=logger, + logging_interval=args.logging_interval, + ) + converter.save() + + logger.info("Done.") + + +if __name__ == "__main__": + main() diff --git a/samples/dlrm/preprocessing/md5sums_raw_dataset.txt b/samples/dlrm/preprocessing/md5sums_raw_dataset.txt new file mode 100644 index 0000000000..9317cae385 --- /dev/null +++ b/samples/dlrm/preprocessing/md5sums_raw_dataset.txt @@ -0,0 +1,3 @@ +cf636876d8baf0776287be23b31c2f14 
test_data.bin +4d48daf07cc244f6fa933b832d7fe5a3 train_data.bin +c7ca591ad3fd2b09b75d99fa4fc210e2 val_data.bin diff --git a/samples/dlrm/requirements.txt b/samples/dlrm/requirements.txt new file mode 100644 index 0000000000..370ae43460 --- /dev/null +++ b/samples/dlrm/requirements.txt @@ -0,0 +1,3 @@ +git+https://github.com/mlcommons/logging.git@3.1.0-rc1 +git+https://github.com/NVIDIA/mlperf-common.git +mpi4py==3.1.5 diff --git a/samples/dlrm/run.sub b/samples/dlrm/run.sub new file mode 100755 index 0000000000..c13fe88b99 --- /dev/null +++ b/samples/dlrm/run.sub @@ -0,0 +1,140 @@ +#!/bin/bash + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#SBATCH --job-name mlperf-dlrm:hugectr +#SBATCH -t 00:30:00 + +set -euxo pipefail + +# Vars without defaults +: "${DGXSYSTEM:?DGXSYSTEM not set}" +: "${CONT:?CONT not set}" +: "${DATADIR:?DATADIR not set}" + +# Vars with defaults +: "${MLPERF_RULESET:=3.1.0}" +: "${MLPERF_CLUSTER_NAME:='unknown'}" +: "${NEXP:=10}" +: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}" +: "${CLEAR_CACHES:=1}" +: "${CHECK_COMPLIANCE:=1}" +: "${API_LOG_DIR:=./api_logs}" # apiLog.sh output dir +: "${ABSLOGDIR:=${PWD}/results}" +: "${POWERCMDDIR:=' '}" +: "${DATADIR_VAL:=${DATADIR}}" +: "${MOUNTS:=${DATADIR}:/data,${DATADIR_VAL}:/data_val}" +: "${LOGDIR:=./results}" + +export MODEL_NAME="recommendation" +export MODEL_FRAMEWORK="pytorch" +LOG_BASE="${DATESTAMP}" +SPREFIX="${MODEL_NAME}_${MODEL_FRAMEWORK}_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}" + + +readonly _logfile_base="${LOGDIR}/${DATESTAMP}" +readonly _cont_name="${MODEL_NAME}_${SLURM_JOB_ID}" +_cont_mounts=${MOUNTS} + +if [ "${API_LOGGING:-}" -eq 1 ]; then + API_LOG_DIR=${API_LOG_DIR}/${MODEL_FRAMEWORK}/${MODEL_NAME}/${DGXSYSTEM} + mkdir -p ${API_LOG_DIR} + _cont_mounts="${_cont_mounts},${API_LOG_DIR}:/logs" + + # Create JSON file for cuDNN + JSON_MODEL_NAME="MLPERF_${MODEL_NAME}_${MODEL_FRAMEWORK}_train" + JSON_README_LINK="${README_PREFIX}/${MODEL_NAME}/${MODEL_FRAMEWORK}/README.md" + JSON_FMT='{model_name: $mn, readme_link: $rl, configs: {($dt): [$bs]}, sweep: {($dt): [$bs]}}' + JSON_OUTPUT="${JSON_MODEL_NAME}.cudnn.json" + jq -n --indent 4 --arg mn $JSON_MODEL_NAME --arg rl $JSON_README_LINK --arg dt $APILOG_PRECISION --arg bs $BATCHSIZE "$JSON_FMT" > ${API_LOG_DIR}/$JSON_OUTPUT +fi +if [ "${JET:-0}" -eq 1 ]; then + _cont_mounts="${_cont_mounts},${JET_DIR}:/root/.jet,${LOGDIR}:/results" +fi + +# make sure the results directory exists on the host +( umask 0002; mkdir -p "${LOGDIR}" ) + +# Setup container +echo MELLANOX_VISIBLE_DEVICES="${MELLANOX_VISIBLE_DEVICES:-}" +srun --mpi="${SLURM_MPI_TYPE:-pmix}" --ntasks="${SLURM_JOB_NUM_NODES}" --container-image="${CONT}" --container-name="${_cont_name}" true +srun -N1 -n1 --container-name="${_cont_name}" ibv_devinfo --list +srun -N1 -n1 --container-name="${_cont_name}" nvidia-smi topo -m + +#ssh to nodes for power measurements +NODELIST=$(scontrol show hostnames ${SLURM_JOB_NODELIST}) +NODELIST=(${NODELIST[*]}) 
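# Editor's note: illustrative sketch, not part of the original patch. For reference, a
# typical way to submit this script; the placeholder values are assumptions, and the
# script itself only requires DGXSYSTEM, CONT and DATADIR plus whatever the sourced
# config file exports (e.g. DGXNNODES, DGXNGPU, BATCHSIZE):
#
#   source "config_${DGXSYSTEM}.sh"
#   export CONT=<hugectr-training-image> DATADIR=</path/to/raw/criteo/dataset>
#   sbatch -N "${DGXNNODES}" run.sub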
+if [ -f "$POWERCMDDIR/power_monitor.sh" ]; then + ( umask 0002; mkdir -p "${ABSLOGDIR}" ) + for i in "${NODELIST[@]}" + do + ssh $i 'export NODENAME='"'$i'"';export ABSLOGDIR='"'$ABSLOGDIR'"';export SLURM_JOB_NODELIST='"'$SLURM_JOB_NODELIST'"';export SLURM_JOB_ID='"'$SLURM_JOB_ID'"';POWERCMDDIR='"'$POWERCMDDIR'"';bash ${POWERCMDDIR}/power_monitor.sh' & +# break + done +fi + +if [[ "${SET_MAXQ_CLK:-}" == "1" ]] || [[ "${SET_MINEDP_CLK:-}" == "1" ]]; then + if [[ "${SET_MAXQ_CLK:-}" == "1" ]]; then + GPCCLK=${MAXQ_CLK} + fi + if [[ "${SET_MINEDP_CLK:-}" == "1" ]]; then + GPCCLK=${MINEDP_CLK} + fi + for i in "${NODELIST[@]}" + do + ssh $i 'export GPCCLK='"'$GPCCLK'"';sudo nvidia-smi -lgc ${GPCCLK}' + done +fi + +# Run experiments +for _experiment_index in $(seq -w 1 "${NEXP}"); do + ( + echo ":::DLPAL ${CONT} ${SLURM_JOB_ID} ${SLURM_JOB_NUM_NODES} ${SLURM_JOB_NODELIST} ${MLPERF_CLUSTER_NAME} ${DGXSYSTEM}" + + # Print system info + echo ":::SYSJSON $(srun --ntasks=1 --container-name="${_cont_name}" mlperf-sysjson.sh)" + + if [[ $CLEAR_CACHES == 1 ]]; then + srun --mpi="${SLURM_MPI_TYPE:-pmix}" --ntasks="${SLURM_JOB_NUM_NODES}" bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" + srun --mpi="${SLURM_MPI_TYPE:-pmix}" --ntasks="${SLURM_JOB_NUM_NODES}" --container-name="${_cont_name}" python3 -c " +import mlperf_logging.mllog as mllog +mllogger = mllog.get_mllogger() +mllogger.event(key=mllog.constants.CACHE_CLEAR, value=True)" + fi + echo "Beginning trial ${_experiment_index} of ${NEXP}" + srun --mpi="${SLURM_MPI_TYPE:-pmix}" --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 \ + --container-name="${_cont_name}" --container-mounts="${_cont_mounts}" \ + ./run_and_time.sh + ) |& tee "${_logfile_base}_raw_${_experiment_index}.log" + + # Sorting the MLPerf compliance logs by timestamps + grep ":::.L..." "${_logfile_base}_raw_${_experiment_index}.log" | sort -k5 -n -s | tee "${_logfile_base}_${_experiment_index}.log" + if [ "${CHECK_COMPLIANCE}" -eq 1 ]; then + srun --ntasks=1 --nodes=1 --container-name="${_cont_name}" \ + --container-mounts="$(realpath ${LOGDIR}):/results" \ + --container-workdir="/results" \ + python3 -m mlperf_logging.compliance_checker --usage training \ + --ruleset "${MLPERF_RULESET}" \ + --log_output "/results/compliance_${DATESTAMP}.out" \ + "/results/${DATESTAMP}_${_experiment_index}.log" \ + || true + fi + + if [ "${JET:-0}" -eq 1 ]; then + JET_CREATE=${JET_CREATE:-}" --data workload.spec.nodes=${DGXNNODES} --data workload.spec.name=${MODEL_NAME}_${MODEL_FRAMEWORK}_${DGXSYSTEM} --data workload.key=${MODEL_NAME}_${MODEL_FRAMEWORK}_${DGXSYSTEM} --mllogger " + srun -N1 -n1 --container-name="${_cont_name}" --container-mounts="${_cont_mounts}" bash -c "${JET_CREATE} /results/${DATESTAMP}_${_experiment_index}.log --asset /results/slurm-${SLURM_JOB_ID}.out --data source_image.name=${CONT} --data slurm.job=${SLURM_JOB_ID} && ${JET_UPLOAD}" + fi + +done diff --git a/samples/dlrm/run_and_time.sh b/samples/dlrm/run_and_time.sh new file mode 100755 index 0000000000..8ce27f9703 --- /dev/null +++ b/samples/dlrm/run_and_time.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# runs benchmark and reports time to convergence + +# default value for DLRM_BIND only if it is not already defined +#: ${DLRM_BIND:="numactl --membind=1,3,5,7"} +: ${DLRM_BIND:=} + +set -ex + +ARGS="" +[ -n "${OPTIMIZER:-}" ] && ARGS+=" --optimizer ${OPTIMIZER}" +[ -n "${BATCHSIZE:-}" ] && ARGS+=" --batchsize ${BATCHSIZE}" +[ -n "${BATCHSIZE_EVAL:-}" ] && ARGS+=" --batchsize_eval ${BATCHSIZE_EVAL}" +[ -n "${LEARNING_RATE:-}" ] && ARGS+=" --lr ${LEARNING_RATE}" +[ -n "${WARMUP_STEPS:-}" ] && ARGS+=" --warmup_steps ${WARMUP_STEPS}" +[ -n "${DECAY_START:-}" ] && ARGS+=" --decay_start ${DECAY_START}" +[ -n "${DECAY_STEPS:-}" ] && ARGS+=" --decay_steps ${DECAY_STEPS}" +[ "$ENABLE_TF32_COMPUTE" = true ] && ARGS+=" --enable_tf32_compute" +[ "$USE_MIXED_PRECISION" = true ] && ARGS+=" --use_mixed_precision" +[ -n "${SCALER:-}" ] && ARGS+=" --scaler ${SCALER}" +[ "$GEN_LOSS_SUMMARY" = true ] && ARGS+=" --gen_loss_summary" +[ "$USE_ALGORITHM_SEARCH" = false ] && ARGS+=" --disable_algorithm_search" +[ -n "${SHARDING_PLAN:-}" ] && ARGS+=" --sharding_plan ${SHARDING_PLAN}" +[ -n "${DP_SHARDING_THRESHOLD:-}" ] && ARGS+=" --dp_sharding_threshold ${DP_SHARDING_THRESHOLD}" +[ -n "${MAX_ITER:-}" ] && ARGS+=" --max_iter ${MAX_ITER}" +[ -n "${DISPLAY_INTERVAL:-}" ] && ARGS+=" --display_interval ${DISPLAY_INTERVAL}" +[ -n "${EVAL_INTERVAL:-}" ] && ARGS+=" --eval_interval ${EVAL_INTERVAL}" +[ -n "${MAX_EVAL_BATCHES:-}" ] && ARGS+=" --max_eval_batches ${MAX_EVAL_BATCHES}" +[ -n "${AUC_THRESHOLD:-}" ] && ARGS+=" --auc_threshold ${AUC_THRESHOLD}" +[ -n "${DGXNGPU:-}" ] && ARGS+=" --num_gpus_per_node ${DGXNGPU}" +[ -n "${MEM_COMM_BW_RATIO:-}" ] && ARGS+=" --mem_comm_bw_ratio ${MEM_COMM_BW_RATIO}" +[ -n "${SEED:-}" ] && ARGS+=" --seed ${SEED}" +[ -n "${MLPERF_POWER_TRAIN_AFTER_RUN_STOP:-}" ] && ARGS+=" --minimum_training_time ${MINIMUM_TRAINING_TIME:-0}" + +readonly node_rank="${SLURM_NODEID:-0}" +readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}" + +if [ "$LOGGER" = "apiLog.sh" ]; +then + LOGGER="${LOGGER} -p MLPerf/${MODEL_NAME} -v ${FRAMEWORK}/train/${DGXSYSTEM}" + if [ "$node_rank" -eq 0 ] && [ "$local_rank" -eq 0 ]; + then + LOGGER=$LOGGER + else + LOGGER="" + fi +fi + +echo "DLRM_BIND is set to \"${DLRM_BIND}\"" +${LOGGER} ${DLRM_BIND} python3 ${RUN_SCRIPT} ${ARGS} | tee /tmp/dlrm_hugectr.log + + ret_code=${PIPESTATUS[0]} + if [[ $ret_code != 0 ]]; then exit $ret_code; fi diff --git a/samples/dlrm/run_with_docker.sh b/samples/dlrm/run_with_docker.sh new file mode 100755 index 0000000000..a80eab6225 --- /dev/null +++ b/samples/dlrm/run_with_docker.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#SBATCH --job-name dlrm.hugectr +#SBATCH -t 00:30:00 + +set -euxo pipefail + +# Vars without defaults +: "${DGXSYSTEM:?DGXSYSTEM not set}" +: "${CONT:?CONT not set}" +: "${DATADIR:?DATADIR not set}" + +# Vars with defaults +: "${NEXP:=1}" +: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}" +: "${CLEAR_CACHES:=1}" +: "${CHECK_COMPLIANCE:=1}" +: "${MLPERF_RULESET:=3.1.0}" +: "${MOUNTS:=${DATADIR}:/data}" +: "${LOGDIR:=./results}" +# default DLRM_BIND to null because we don't know what user's system actually is +: "${DLRM_BIND:=}" + +# Other vars +readonly _config_file="./config_${DGXSYSTEM}.sh" +readonly _logfile_base="${LOGDIR}/${DATESTAMP}" +readonly _cont_name=dlrm_hugectr +_cont_mounts=("--volume=${DATADIR}:/data" "--volume=${DATADIR}:/data_val" "--volume=${LOGDIR}:${LOGDIR}") + + +# Setup directories +mkdir -p "${LOGDIR}" + +# Get list of envvars to pass to docker +mapfile -t _config_env < <(env -i bash -c ". ${_config_file} && compgen -e" | grep -E -v '^(PWD|SHLVL)') +_config_env+=(DATADIR) +_config_env+=(DATASET_TYPE) +_config_env+=(DGXSYSTEM) +mapfile -t _config_env < <(for v in "${_config_env[@]}"; do echo "--env=$v"; done) + +# Cleanup container +cleanup_docker() { + docker container rm -f "${_cont_name}" || true +} +cleanup_docker +trap 'set -eux; cleanup_docker' EXIT + +# Setup container +nvidia-docker run --rm --init --detach \ + --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \ + --name="${_cont_name}" "${_cont_mounts[@]}" \ + "${CONT}" sleep infinity +# Make sure container has time to finish initialization +sleep 30 +docker exec -it "${_cont_name}" true + + +# Run experiments +for _experiment_index in $(seq 1 "${NEXP}"); do + ( + echo "Beginning trial ${_experiment_index} of ${NEXP}" + if [[ $CLEAR_CACHES == 1 ]]; then + bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3" + docker exec -it "${_cont_name}" python3 -c " +import mlperf_logging.mllog as mllog +mllogger = mllog.get_mllogger() +mllogger.event(key=mllog.constants.CACHE_CLEAR, value=True)" + fi + + docker exec -it ${_config_env[@]} ${_cont_name} bash ./run_and_time.sh + ) |& tee "${_logfile_base}_${_experiment_index}.log" + + if [ "${CHECK_COMPLIANCE}" -eq 1 ]; then + docker exec -it "${_config_env[@]}" "${_cont_name}" \ + python3 -m mlperf_logging.compliance_checker --usage training \ + --ruleset "${MLPERF_RULESET}" \ + --log_output "/results/compliance_${DATESTAMP}.out" \ + "/results/${DATESTAMP}_${_experiment_index}.log" \ + || true + fi +done diff --git a/samples/dlrm/sharding/__init__.py b/samples/dlrm/sharding/__init__.py new file mode 100644 index 0000000000..2357451f46 --- /dev/null +++ b/samples/dlrm/sharding/__init__.py @@ -0,0 +1,2 @@ +from .generate_plan import generate_plan +from .planner import Cost, CostModel, Planner, ShardingState diff --git a/samples/dlrm/sharding/generate_plan.py b/samples/dlrm/sharding/generate_plan.py new file mode 100644 index 0000000000..dac8f2d3e3 --- /dev/null +++ b/samples/dlrm/sharding/generate_plan.py @@ -0,0 +1,131 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from argparse import Namespace +from itertools import chain +from typing import List + +from .planner import CostModel, Planner + + +def generate_plan( + slot_size_array: List[int], + multi_hot_sizes: List[int], + num_nodes: int, + num_gpus: int, + args: Namespace, + log_result: bool, +): + def sanity_check(shard_matrix, shard_strategy): + # mainly to make sure all the tables are sharded + msg = "Not all tables covered in the sharding plan" + assert set(chain(*shard_matrix)) == set(range(len(slot_size_array))), msg + shard_strategy_list = [x for strategy_pair in shard_strategy for x in strategy_pair[1]] + assert set(shard_strategy_list) == set(range(len(slot_size_array))), msg + + for table_list in shard_matrix: + if len(table_list) == 0: + raise Exception("Currently no empty shard list is allowed") + + def int_to_string(shard_matrix_int, shard_strategy_int): + shard_strategy, shard_matrix = [], [] + for pair in shard_strategy_int: + if len(pair[1]) != 0: + shard_strategy.append((pair[0], [str(x) for x in pair[1]])) + for sub_matrix_ in shard_matrix_int: + shard_matrix.append([str(x) for x in sub_matrix_]) + return shard_matrix, shard_strategy + + if args.sharding_plan in ["round_robin", "uniform"]: + # sharding strategies that don't exploit system configs + if args.sharding_plan == "round_robin": + mp_table = [i for i in range(len(slot_size_array))] + shard_matrix_ = [[] for _ in range(num_gpus)] + shard_strategy_ = [("mp", [i for i in mp_table])] + + for i, table_id in enumerate(mp_table): + target_gpu = i % num_gpus + shard_matrix_[target_gpu].append(table_id) + + elif args.sharding_plan == "uniform": + shard_matrix_ = [[x for x in range(len(slot_size_array))] for _ in range(num_gpus)] + shard_strategy_ = [("mp", [i for i in range(len(slot_size_array))])] + + elif args.sharding_plan in ["auto", "hier_auto"]: + # sharding strategies that exploit system configs + dram_cap = args.memory_cap_for_embedding + if args.optimizer == "adagrad": + byte_per_elem = 8 + elif args.optimizer == "sgd": + byte_per_elem = 4 + + if args.sharding_plan == "auto": + cost_model = CostModel( + 1, + args.mem_comm_bw_ratio / args.mem_comm_work_ratio, + args.ev_size * byte_per_elem * 1e-9, + dram_cap, + slot_size_array, + ) + planner = Planner( + multi_hot_sizes, + num_gpus, + cost_model, + log_result=log_result, + dp_threshold=args.dp_sharding_threshold, + ) + shard_strategy_, shard_matrix_ = planner.plan() + + elif args.sharding_plan == "hier_auto": + if num_nodes <= 1: + raise Exception( + "hier_auto plan is only applicable to configs with more than one node" + ) + cost_model = CostModel( + 1, + args.mem_comm_bw_ratio / args.mem_comm_work_ratio, + args.ev_size * byte_per_elem * 1e-9, + dram_cap * args.num_gpus_per_node, + slot_size_array, + ) + planner = Planner( + multi_hot_sizes, + num_nodes, + cost_model, + log_result=log_result, + dp_threshold=args.dp_sharding_threshold, + ) + shard_strategy_, shard_matrix_node_ = planner.plan() + shard_matrix_ 
= [] + for node_shard_matrix in shard_matrix_node_: + for i in range(args.num_gpus_per_node): + shard_matrix_.append(node_shard_matrix) + else: + raise Exception("unknown sharding plan") + + sanity_check(shard_matrix_, shard_strategy_) + shard_matrix, shard_strategy = int_to_string(shard_matrix_, shard_strategy_) + + if log_result: + logging.info("Provided system info: ") + logging.info("num_gpu_per_nodes: %d", args.num_gpus_per_node) + logging.info("Memory to communication BW ratio: %f", args.mem_comm_bw_ratio) + logging.info("Memory to communication work ratio: %f", args.mem_comm_work_ratio) + logging.info("DRAM capacity: %f GB", args.memory_cap_for_embedding) + logging.info("shard_matrix:") + logging.info(shard_matrix) + logging.info("\n") + + return shard_matrix, shard_strategy diff --git a/samples/dlrm/sharding/planner.py b/samples/dlrm/sharding/planner.py new file mode 100644 index 0000000000..c68a5d7b23 --- /dev/null +++ b/samples/dlrm/sharding/planner.py @@ -0,0 +1,327 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import time +from typing import List, Tuple + +import numpy as np + + +class ShardingState: + """ + Containing the state of a sharding process. + The plan iteratively update the sharding state based on a given heuristic and obtain + solutions. + """ + + def __init__( + self, + array_hotness: np.array, + num_bucket: int, + dp_table_id: np.array(int) = np.array([]), + ) -> None: + mp_table_id = np.setdiff1d(np.arange(array_hotness.size), dp_table_id) + array_hotness_mp = array_hotness[mp_table_id] + sorted_idx = np.argsort(array_hotness_mp)[::-1] + self.array_unshard_hotness = array_hotness + self.array_hotness = array_hotness_mp[sorted_idx] + self.num_bucket = num_bucket + self.array_table_id = mp_table_id[sorted_idx] + self.array_num_split = np.zeros(self.array_unshard_hotness.size, dtype=int) + self.array_num_split[mp_table_id] = 1 + self.shard_ll = [[] for i in range(self.num_bucket)] + + def split_hot_shard(self): + """ + split the shard with the largest hotness + """ + # shards are sorted based on the hotness. 
Find the first hot shard that + # can be split further + for shard_id in range(self.array_table_id.size): + table_id = self.array_table_id[shard_id] + hotness = self.array_unshard_hotness[table_id] + if self.array_num_split[table_id] * 2 <= self.num_bucket: + # if this table can be further split and we can put it into + # more buckets + idx = np.where(self.array_table_id == table_id)[0] + self.array_hotness = np.delete(self.array_hotness, idx) + self.array_table_id = np.delete(self.array_table_id, idx) + self.array_num_split[table_id] *= 2 + self.array_hotness = np.concatenate( + ( + self.array_hotness, + np.ones(self.array_num_split[table_id]) + * (hotness / self.array_num_split[table_id]), + ) + ) + self.array_table_id = np.concatenate( + ( + self.array_table_id, + np.ones(self.array_num_split[table_id], dtype=int) * table_id, + ) + ) + break + + # sort after splitting to maintain the shard hotness in order + sorted_idx = np.argsort(self.array_hotness)[::-1] + self.array_hotness = self.array_hotness[sorted_idx] + self.array_table_id = self.array_table_id[sorted_idx] + + def split_oom_shard(self, table_id: int) -> bool: + hotness = self.array_unshard_hotness[table_id] + if self.array_num_split[table_id] * 2 <= self.num_bucket: + idx = np.where(self.array_table_id == table_id)[0] + self.array_hotness = np.delete(self.array_hotness, idx) + self.array_table_id = np.delete(self.array_table_id, idx) + self.array_num_split[table_id] *= 2 + self.array_hotness = np.concatenate( + ( + self.array_hotness, + np.ones(self.array_num_split[table_id]) + * (hotness / self.array_num_split[table_id]), + ) + ) + self.array_table_id = np.concatenate( + (self.array_table_id, np.ones(self.array_num_split[table_id], dtype=int) * table_id) + ) + sorted_idx = np.argsort(self.array_hotness)[::-1] + self.array_hotness = self.array_hotness[sorted_idx] + self.array_table_id = self.array_table_id[sorted_idx] + return True + else: + return False + + def update_split_num(self): + self.array_num_split = np.zeros_like(self.array_unshard_hotness) + for shard_list in self.shard_ll: + for table_id in shard_list: + self.array_num_split[table_id] += 1 + + def reset_shard_ll(self): + self.shard_ll = [[] for i in range(self.num_bucket)] + + def push_bucket( + self, + bucket_id: int, + table_id: int, + ) -> None: + self.shard_ll[bucket_id].append(table_id) + + def pop_bucket( + self, + bucket_id: int, + ) -> None: + self.shard_ll[bucket_id].pop() + + +class Cost: + def __init__( + self, + cost: np.array(float), + hotness_cost: np.array(float), + table_cost: np.array(float), + mem_cost: np.array(float), + ) -> None: + self.cost = cost + self.hotness_cost = hotness_cost + self.table_cost = table_cost + self.mem_cost = mem_cost + + +class CostModel: + def __init__( + self, + hotness_cost: float, + table_cost: float, + mem_cost: float, + mem_capacity: float, + table_size: List[int], + ) -> None: + self.unit_hotness_cost = hotness_cost + self.unit_table_cost = table_cost + self.unit_mem_cost = mem_cost + self.mem_capacity = mem_capacity + self.array_table_size = np.array(table_size) + + def get_cost( + self, + ss: ShardingState, + ) -> Tuple[Cost, bool]: + list_cost = [] + list_hotness_cost = [] + list_table_cost = [] + list_mem_cost = [] + + for shard_list in ss.shard_ll: + hotness_cost = ( + self.unit_hotness_cost + * ( + ss.array_unshard_hotness[shard_list] / np.array(ss.array_num_split)[shard_list] + ).sum() + ) + table_cost = self.unit_table_cost * len(shard_list) + mem_cost = ( + self.unit_mem_cost + * ( + 
self.array_table_size[shard_list] / np.array(ss.array_num_split)[shard_list] + ).sum() + ) + list_cost.append(hotness_cost + table_cost) + list_hotness_cost.append(hotness_cost) + list_table_cost.append(table_cost) + list_mem_cost.append(mem_cost) + + return ( + Cost( + np.array(list_cost), + np.array(list_hotness_cost), + np.array(list_table_cost), + np.array(list_mem_cost), + ), + max(list_mem_cost) > self.mem_capacity, + ) + + def deduct_mem_cap_for_dp( + self, + dp_table_id: list, + ) -> None: + self.mem_capacity -= self.array_table_size[dp_table_id].sum() * self.unit_mem_cost + if self.mem_capacity < 0: + raise Exception("OOM due to DP. Please considering increase the DP threshold") + + +class Planner: + """ + The planner work out a series of plans iteratively. + In each iteration, the planner tries to split the hottest shard and place the shards into + a bucket based on a give heuristic. When the shard is too large to fit into the best bucket + suggested by the heuristic, it finds the next best bucket until it iterates through all the + buckets. In that case, it tries to split the shard further. If the shard cannot be split + further, the planner aborts and returns the default sharding plan. + """ + + def __init__( + self, + list_hotness: list, + num_bucket: int, + cost_model: CostModel, + dp_threshold: int = 0, + max_search_iter: int = 20, + log_result: bool = False, + ) -> None: + self.array_hotness = np.array(list_hotness) + self.num_bucket = num_bucket + self.cost_model = cost_model + self.list_candidate = [] + self.max_search_iter = max_search_iter + self.log_result = log_result + + # Create the default sharding plan. Throw if even this default sharding plan cannot fit, as + # it should be the most memory-efficient + sharding_state_default = ShardingState(self.array_hotness, self.num_bucket) + for b in range(self.num_bucket): + for t in range(self.array_hotness.size): + sharding_state_default.push_bucket(b, t) + sharding_state_default.update_split_num() + cost, oom = self.cost_model.get_cost(sharding_state_default) + if oom: + raise Exception("OOM even with the most memory-efficient sharding plan") + self.list_candidate.append( + ( + cost.cost.max(), + cost.hotness_cost, + cost.table_cost, + cost.mem_cost, + sharding_state_default.shard_ll, + ) + ) + + # Create DP sharding plan based on the DP threshold + self.dp_table_id = np.where( + cost_model.array_table_size < dp_threshold / cost_model.unit_mem_cost + )[0] + self.mp_table_id = np.setdiff1d(np.arange(self.array_hotness.size), self.dp_table_id) + self.sharding_state = ShardingState(self.array_hotness, self.num_bucket, self.dp_table_id) + self.cost_model.deduct_mem_cap_for_dp(self.dp_table_id) + + logging.basicConfig(level=logging.INFO, format="%(message)s") + + def greedy_plan(self, ss): + """ + This is a heuristic based on greedy policy. The shard is placed to the bucket with the + lowest hotness cost + """ + array_cost = np.zeros(ss.num_bucket) + ss.reset_shard_ll() + for i in range(ss.array_hotness.size): + sorted_idx = np.argsort(array_cost) + sharded = False + for bucket_id in sorted_idx: + if ss.array_table_id[i] not in ss.shard_ll[bucket_id]: + # for now, only uniform sharding is supported. Hence cannot put two shards + # from the same table into the same bucket + ss.push_bucket(bucket_id, ss.array_table_id[i]) + cost, oom = self.cost_model.get_cost(ss) + if not oom: + sharded = True + array_cost = cost.cost + break + else: + # Current bucket cannot fit. 
Iterate to the next best bucket + ss.pop_bucket(bucket_id) + if not sharded: + # This means the shard is too large to fit within any bucket + return ss.array_table_id[i], ss, cost + return None, ss, cost + + def plan(self): + t0 = time.time() + for i in range(self.max_search_iter): + oom_table_id, self.sharding_state, cost = self.greedy_plan(self.sharding_state) + if oom_table_id is None: + self.list_candidate.append( + ( + cost.cost.max(), + cost.hotness_cost, + cost.table_cost, + cost.mem_cost, + self.sharding_state.shard_ll, + ) + ) + self.sharding_state.split_hot_shard() + else: + oom_table_can_split = self.sharding_state.split_oom_shard(oom_table_id) + if not oom_table_can_split: + break + + self.list_candidate.sort(key=lambda x: x[0]) + + shard_strategy = [("mp", self.mp_table_id.tolist())] + shard_strategy.append(("dp", self.dp_table_id.tolist())) + shard_matrix = self.list_candidate[0][-1] + for table_id in self.dp_table_id: + for shard_list in shard_matrix: + shard_list.append(table_id) + if self.log_result: + logging.info("Planner took %f sec" % (time.time() - t0)) + logging.info(shard_strategy) + logging.info(shard_matrix) + logging.info("hotness cost is:") + logging.info(self.list_candidate[0][1]) + logging.info("table cost is:") + logging.info(self.list_candidate[0][2]) + logging.info("mem cost is:") + logging.info(self.list_candidate[0][3]) + return shard_strategy, shard_matrix diff --git a/samples/dlrm/train.py b/samples/dlrm/train.py new file mode 100644 index 0000000000..c7289b8577 --- /dev/null +++ b/samples/dlrm/train.py @@ -0,0 +1,485 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
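# Editor's note: illustrative sketch, not part of the original patch. It drives the
# Planner/CostModel pair from sharding/planner.py (added above) roughly the way the
# "auto" branch of sharding.generate_plan() does; the helper name, the toy table sizes
# and the cost constants are assumptions, and it is meant to be run from samples/dlrm
# just like train.py.
def _toy_sharding_plan():
    from sharding.planner import CostModel, Planner

    multi_hot_sizes = [3, 2, 1, 100]                # per-table lookup hotness
    table_sizes = [40_000_000, 39_060, 63, 12_973]  # rows per embedding table
    cost_model = CostModel(
        hotness_cost=1.0,                  # relative cost per looked-up row
        table_cost=(3.35e12 / 450e9) / 4,  # memory/comm bandwidth ratio over work ratio
        mem_cost=128 * 4 * 1e-9,           # GB per row: ev_size * bytes per element (SGD)
        mem_capacity=60.0,                 # usable GB of embedding memory per bucket (GPU)
        table_size=table_sizes,
    )
    planner = Planner(multi_hot_sizes, num_bucket=2, cost_model=cost_model)
    # plan() returns (shard_strategy, shard_matrix): e.g. [("mp", [0, 1, 2, 3]), ("dp", [])]
    # plus one list of table ids per GPU bucket.
    return planner.plan()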
+ +import argparse +import logging +import math + +import mlperf_logging.mllog.constants as mllog_constants +from mlperf_common.frameworks.hugectr import HCTRCommunicationHandler +from mlperf_common.logging import MLLoggerWrapper +from mpi4py import MPI + +import hugectr +import mlperf_logger +import sharding + +TRAIN_NUM_SAMPLES = 4195197692 +EVAL_NUM_SAMPLES = 89137319 +TABLE_SIZE_ARRAY = [ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 36, +] +MULTI_HOT_SIZES = [ + 3, + 2, + 1, + 2, + 6, + 1, + 1, + 1, + 1, + 7, + 3, + 8, + 1, + 6, + 9, + 5, + 1, + 1, + 1, + 12, + 100, + 27, + 10, + 3, + 1, + 1, +] +NUM_TABLE = len(TABLE_SIZE_ARRAY) +NUM_DENSE = 13 + +mllogger = MLLoggerWrapper(HCTRCommunicationHandler(), value=None) +mllogger.start(key=mllog_constants.INIT_START) + +parser = argparse.ArgumentParser(description="HugeCTR DCN V2 model training script.") +parser.add_argument( + "--optimizer", + help="Optimizer to use", + type=str, + choices=["adagrad", "sgd"], + default="adagrad", +) +parser.add_argument( + "--batchsize", + help="Batch size for training", + type=int, + default=8192, +) +parser.add_argument( + "--batchsize_eval", + help="Batch size for evaluation", + type=int, + default=16384, +) +parser.add_argument( + "--max_eval_batches", + help="The number of evaluation batches to use", + type=int, + default=None, +) +parser.add_argument( + "--lr", + help="Learning rate", + type=float, + default=0.005, +) +parser.add_argument( + "--eps", + help="Epsilon value for Adagrad", + type=float, + default=1e-8, +) +parser.add_argument( + "--init_accu", + help="Initial accumulator value for Adagrad", + type=float, + default=0.0, +) +parser.add_argument( + "--warmup_steps", + help="Warmup steps", + type=int, + default=0, +) +parser.add_argument( + "--decay_start", + help="Decay start", + type=int, + default=0, +) +parser.add_argument( + "--decay_steps", + help="Decay steps", + type=int, + default=0, +) +parser.add_argument( + "--use_mixed_precision", + action="store_true", +) +parser.add_argument( + "--scaler", + help="Loss scaling constant", + type=float, + default=1.0, +) +parser.add_argument( + "--enable_tf32_compute", + action="store_true", +) +parser.add_argument( + "--disable_algorithm_search", + help="Disables GEMM algorithm search for fully connected layers", + dest="use_algorithm_search", + action="store_false", +) +parser.add_argument( + "--gen_loss_summary", + help="Compute loss summary during training (loss = 0 if not set)", + action="store_true", +) +parser.add_argument( + "--max_iter", + help="Number of training iterations to run", + type=int, + default=None, +) +parser.add_argument( + "--display_interval", + help="Display throughput stats every number of iterations", + type=int, + default=100, +) +parser.add_argument( + "--eval_interval", + help="Evaluate every number of iterations given", + type=int, + default=None, +) +parser.add_argument( + "--auc_threshold", + help="AUC threshold to reach to stop training", + type=float, + default=0.80275, +) +parser.add_argument( + "--sharding_plan", + help="Sharding plan to use", + type=str, + choices=["round_robin", "uniform", "auto", "hier_auto"], + default="round_robin", +) + +parser.add_argument( + "--dp_sharding_threshold", + help="threshold for DP sharding in GiB.", + type=float, + default=0, +) + +parser.add_argument( + "--num_gpus_per_node", + help="The number of 
GPUs per node",
+    type=int,
+    default=8,
+)
+parser.add_argument(
+    "--mem_comm_bw_ratio",
+    help="The ratio between the memory bandwidth and the communication bandwidth of the system",
+    type=float,
+    default=3.35e12 / 450e9,
+)
+parser.add_argument(
+    "--mem_comm_work_ratio",
+    help="The ratio between the memory work and the communication work of the network",
+    type=float,
+    default=8 / 2,
+)
+parser.add_argument(
+    "--memory_cap_for_embedding",
+    help="The amount of memory that can be used for storing embeddings, in GB",
+    type=float,
+    default=60,
+)
+parser.add_argument(
+    "--ev_size",
+    help="The width of the embedding vector",
+    type=int,
+    default=128,
+)
+parser.add_argument(
+    "--seed",
+    help="The global seed for training.",
+    type=int,
+    default=0,
+)
+parser.add_argument(
+    "--minimum_training_time",
+    help="If set to a value greater than 0, training continues until minimum_training_time (in minutes) is reached, even if the target AUC has already been hit",
+    type=int,
+    default=0,
+)
+
+args = parser.parse_args()
+comm = MPI.COMM_WORLD
+num_nodes = comm.Get_size()
+rank = comm.Get_rank()
+num_gpus = num_nodes * args.num_gpus_per_node
+is_rank_zero = rank == 0
+# If args.minimum_training_time is specified, set max_iter to a larger value.
+if args.minimum_training_time > 0:
+    args.max_iter = 1000000
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+# Dependent parameters (if not set)
+iter_per_epoch = TRAIN_NUM_SAMPLES / args.batchsize
+if args.max_iter is None:
+    args.max_iter = math.ceil(iter_per_epoch)
+if args.eval_interval is None:
+    args.eval_interval = math.floor(0.05 * iter_per_epoch)
+if args.max_eval_batches is None:
+    args.max_eval_batches = math.ceil(EVAL_NUM_SAMPLES / args.batchsize_eval)
+iter_per_epoch = math.ceil(iter_per_epoch)
+
+# Log submission metadata and relevant hyperparameters
+mllogger.mlperf_submission_log(mllog_constants.DLRM_DCNv2, num_nodes, "NVIDIA")
+mlperf_logger.param_info(mllogger, args)
+
+shard_matrix, shard_strategy = sharding.generate_plan(
+    TABLE_SIZE_ARRAY, MULTI_HOT_SIZES, num_nodes, num_gpus, args, is_rank_zero
+)
+
+# 0. Callback for logging evaluation AUC
+logging_callback = mlperf_logger.LoggingCallback(
+    mllogger,
+    args.auc_threshold,
+    iter_per_epoch,
+    args.batchsize,
+)
+logging_callback.minimum_training_time = args.minimum_training_time
+
+# 1.
Create Solver, DataReaderParams and Optimizer +solver = hugectr.CreateSolver( + model_name=mllog_constants.DLRM_DCNv2, + seed=args.seed, + max_eval_batches=args.max_eval_batches, + batchsize_eval=args.batchsize_eval, + batchsize=args.batchsize, + vvgpu=[[x for x in range(args.num_gpus_per_node)] for _ in range(num_nodes)], + repeat_dataset=True, + lr=args.lr, + warmup_steps=args.warmup_steps, + decay_start=args.decay_start, + decay_steps=args.decay_steps, + decay_power=2.0, + end_lr=0.0, + use_mixed_precision=args.use_mixed_precision, + enable_tf32_compute=args.enable_tf32_compute, + scaler=args.scaler, + use_cuda_graph=True, + gen_loss_summary=args.gen_loss_summary, + train_intra_iteration_overlap=True, + train_inter_iteration_overlap=True, + eval_intra_iteration_overlap=False, + eval_inter_iteration_overlap=True, + all_reduce_algo=hugectr.AllReduceAlgo.NCCL, + grouped_all_reduce=True, + num_iterations_statistics=20, + perf_logging=False, + drop_incomplete_batch=True, + use_embedding_collection=True, + use_algorithm_search=args.use_algorithm_search, + training_callbacks=[logging_callback], +) + +optimizer = None +if args.optimizer == "adagrad": + optimizer = hugectr.CreateOptimizer( + optimizer_type=hugectr.Optimizer_t.AdaGrad, + update_type=hugectr.Update_t.Global, + initial_accu_value=args.init_accu, + epsilon=args.eps, + ) +elif args.optimizer == "sgd": + optimizer = hugectr.CreateOptimizer( + optimizer_type=hugectr.Optimizer_t.SGD, + update_type=hugectr.Update_t.Local, + atomic_update=True, + ) + +reader = hugectr.DataReaderParams( + data_reader_type=hugectr.DataReaderType_t.RawAsync, + source=["/data/train_data.bin"], + eval_source="/data_val/val_data.bin", + check_type=hugectr.Check_t.Non, + num_samples=TRAIN_NUM_SAMPLES, + eval_num_samples=EVAL_NUM_SAMPLES, + cache_eval_data=1, + slot_size_array=TABLE_SIZE_ARRAY, + async_param=hugectr.AsyncParam( + num_threads=1, + num_batches_per_thread=16, + shuffle=False, + multi_hot_reader=True, + is_dense_float=True, + ), +) + +# 2. Initialize the Model instance +model = hugectr.Model(solver, reader, optimizer) +# 3. 
Construct the Model graph +model.add( + hugectr.Input( + label_dim=1, + label_name="label", + dense_dim=NUM_DENSE, + dense_name="dense", + data_reader_sparse_param_array=[ + hugectr.DataReaderSparseParam("data{}".format(i), MULTI_HOT_SIZES[i], True, 1) + for i in range(NUM_TABLE) + ], + ) +) + +# create embedding table +embedding_table_list = [] +for i in range(NUM_TABLE): + embedding_table_list.append( + hugectr.EmbeddingTableConfig( + name=str(i), max_vocabulary_size=TABLE_SIZE_ARRAY[i], ev_size=args.ev_size + ) + ) +# create embedding planner and embedding collection +comm_strategy = ( + hugectr.CommunicationStrategy.Hierarchical + if num_nodes > 1 + else hugectr.CommunicationStrategy.Uniform +) +ebc_config = hugectr.EmbeddingCollectionConfig(use_exclusive_keys=True, comm_strategy=comm_strategy) +ebc_config.embedding_lookup( + table_config=[embedding_table_list[i] for i in range(NUM_TABLE)], + bottom_name=["data{}".format(i) for i in range(NUM_TABLE)], + top_name="sparse_embedding", + combiner=["sum" for _ in range(NUM_TABLE)], +) + +ebc_config.shard(shard_matrix=shard_matrix, shard_strategy=shard_strategy) + +model.add(ebc_config) + +# configure compute knobs for bottom & top MLP layers +compute_config = hugectr.DenseLayerComputeConfig( + async_wgrad=True, + fuse_wb=False, +) + +model.add( + hugectr.DenseLayer( + layer_type=hugectr.Layer_t.MLP, + bottom_names=["dense"], + top_names=["mlp1"], + num_outputs=[512, 256, 128], + act_type=hugectr.Activation_t.Relu, + compute_config=compute_config, + ) +) +model.add( + hugectr.DenseLayer( + layer_type=hugectr.Layer_t.Concat, + bottom_names=["sparse_embedding", "mlp1"], + top_names=["concat1"], + ) +) +model.add( + hugectr.DenseLayer( + layer_type=hugectr.Layer_t.MultiCross, + bottom_names=["concat1"], + top_names=["interaction1"], + projection_dim=512, + num_layers=3, + compute_config=compute_config, + ) +) +model.add( + hugectr.DenseLayer( + layer_type=hugectr.Layer_t.MLP, + bottom_names=["interaction1"], + top_names=["mlp2"], + num_outputs=[1024, 1024, 512, 256, 1], + activations=[ + hugectr.Activation_t.Relu, + hugectr.Activation_t.Relu, + hugectr.Activation_t.Relu, + hugectr.Activation_t.Relu, + hugectr.Activation_t.Non, + ], + compute_config=compute_config, + ) +) +model.add( + hugectr.DenseLayer( + layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss, + bottom_names=["mlp2", "label"], + top_names=["loss"], + ) +) +# 4. 
Compile & Fit +model.compile() +model.summary() +num_columns = 1 + NUM_DENSE + sum(MULTI_HOT_SIZES) # +1 for the label +mllogger.event( + key=mllog_constants.TRAIN_SAMPLES, + value=mlperf_logger.get_row_count("/data/train_data.bin", num_columns, 4), + metadata={mllog_constants.EPOCH_NUM: 0.0}, +) +mllogger.event( + key=mllog_constants.EVAL_SAMPLES, + value=mlperf_logger.get_row_count("/data_val/val_data.bin", num_columns, 4), + metadata={mllog_constants.EPOCH_NUM: 0.0}, +) +model.fit( + max_iter=args.max_iter, + display=args.display_interval, + eval_interval=args.eval_interval, + snapshot=2000000, + snapshot_prefix="dlrm", +) diff --git a/sparse_operation_kit/sparse_operation_kit/dynamic_variable.py b/sparse_operation_kit/sparse_operation_kit/dynamic_variable.py index a52af1932a..c274023d80 100644 --- a/sparse_operation_kit/sparse_operation_kit/dynamic_variable.py +++ b/sparse_operation_kit/sparse_operation_kit/dynamic_variable.py @@ -486,7 +486,6 @@ def export(var): ) # sort_indice_tensor = tf.argsort(indices) with tf.device("CPU"): - indices = tf.identity(indices) values = tf.identity(values) return indices, values diff --git a/test/utest/communication/ar_oneshot_test.cu b/test/utest/communication/ar_oneshot_test.cu index 802e5aa165..2e4d2563df 100644 --- a/test/utest/communication/ar_oneshot_test.cu +++ b/test/utest/communication/ar_oneshot_test.cu @@ -17,10 +17,11 @@ #ifndef ENABLE_MPI #include +#include #include #include #include -#include +#include #include #include #include @@ -116,9 +117,10 @@ struct arTest { use_mixed_precision_ = true; } - resource_manager_ = ResourceManagerExt::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); - resource_manager_->set_ar_comm(AllReduceAlgo::ONESHOT, use_mixed_precision_); - ar_comm_ = resource_manager_->get_ar_comm(); + resource_manager_ = ResourceManagerCore::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); + collective_manager_ = std::make_shared(resource_manager_); + collective_manager_->set_ar_comm(AllReduceAlgo::ONESHOT, use_mixed_precision_); + ar_comm_ = collective_manager_->get_ar_comm(); init_buffers(); } @@ -129,6 +131,7 @@ struct arTest { bool use_mixed_precision_; AllReduceInPlaceComm* ar_comm_; std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; std::vector> h_ar_buff_; std::vector> d_ar_buff_; diff --git a/test/utest/communication/ib_comms_a2a_v_integ_test.cu b/test/utest/communication/ib_comms_a2a_v_integ_test.cu index c454adade0..8edb23166c 100644 --- a/test/utest/communication/ib_comms_a2a_v_integ_test.cu +++ b/test/utest/communication/ib_comms_a2a_v_integ_test.cu @@ -19,12 +19,13 @@ #include #include +#include #include #include #include #include #include -#include +#include #include #include #include @@ -119,9 +120,10 @@ struct IbCommsTest { for (int i = 0; i < num_procs_; i++) { vvgpu.push_back(device_list); } - resource_manager_ = ResourceManagerExt::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); - resource_manager_->init_ib_comm(); - ib_comm_ = resource_manager_->get_ib_comm(); + resource_manager_ = ResourceManagerCore::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); + collective_manager_ = std::make_shared(resource_manager_); + collective_manager_->init_ib_comm(); + ib_comm_ = collective_manager_->get_ib_comm(); init_buffers(); gen_uniform_size(max_size_); @@ -488,6 +490,7 @@ struct IbCommsTest { bool inter_graph_captured_ = false; std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; IbComm* ib_comm_; // TODO: Make it shared so we have only one instance of ibcomm HierA2AvCollHandle coll_handle_; diff --git 
a/test/utest/communication/ib_comms_a2a_v_test.cu b/test/utest/communication/ib_comms_a2a_v_test.cu index b1ebd933d8..95a6511dcc 100644 --- a/test/utest/communication/ib_comms_a2a_v_test.cu +++ b/test/utest/communication/ib_comms_a2a_v_test.cu @@ -18,12 +18,13 @@ #include +#include #include #include #include #include #include -#include +#include #include #include #include @@ -91,9 +92,11 @@ struct IbCommsTest { for (int i = 0; i < num_procs_; i++) { vvgpu.push_back(device_list); } - resource_manager_ = ResourceManagerExt::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); - resource_manager_->init_ib_comm(); - ib_comm_ = resource_manager_->get_ib_comm(); + resource_manager_ = ResourceManagerCore::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); + collective_manager_ = std::make_shared(resource_manager_); + + collective_manager_->init_ib_comm(); + ib_comm_ = collective_manager_->get_ib_comm(); comm_stream_.resize(num_gpus_); comm_events_.resize(num_gpus_); @@ -408,6 +411,8 @@ struct IbCommsTest { int num_procs_ = 1; std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; + IbComm* ib_comm_; // TODO: Make it shared so we have only one instance of ibcomm HierA2AvCollHandle coll_handle_; diff --git a/test/utest/communication/ib_comms_ar_test.cu b/test/utest/communication/ib_comms_ar_test.cu index 2e8e89bdd0..357a63949a 100644 --- a/test/utest/communication/ib_comms_ar_test.cu +++ b/test/utest/communication/ib_comms_ar_test.cu @@ -18,12 +18,13 @@ #include +#include #include #include #include #include #include -#include +#include #include #include #include @@ -115,9 +116,10 @@ struct IbCommsTest { for (int i = 0; i < num_procs_; i++) { vvgpu.push_back(device_list); } - resource_manager_ = ResourceManagerExt::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); - resource_manager_->init_ib_comm(); - ib_comm_ = resource_manager_->get_ib_comm(); + resource_manager_ = ResourceManagerCore::create(vvgpu, 0, DeviceMap::LOCAL_FIRST); + collective_manager_ = std::make_shared(resource_manager_); + collective_manager_->init_ib_comm(); + ib_comm_ = collective_manager_->get_ib_comm(); init_buffers(); } @@ -129,6 +131,7 @@ struct IbCommsTest { int num_procs_; std::shared_ptr resource_manager_; + std::shared_ptr collective_manager_; IbComm* ib_comm_; // TODO: Make it shared so we have only one instance of ibcomm std::vector> h_ar_buff_; diff --git a/test/utest/data_distributor/data_distributor_tests.cpp b/test/utest/data_distributor/data_distributor_tests.cpp index 588245074e..8793ef58eb 100644 --- a/test/utest/data_distributor/data_distributor_tests.cpp +++ b/test/utest/data_distributor/data_distributor_tests.cpp @@ -19,7 +19,7 @@ #include #include #include -#include +#include using namespace HugeCTR; using namespace embedding; @@ -87,7 +87,7 @@ void test_data_distributor(const std::vector& device_list, auto emb_type = core23::ToScalarType::value; // doesn't matter auto wgrad_type = HugeCTR::core23::ToScalarType::value; // doesn't matter - auto resource_manager = ResourceManagerExt::create({device_list}, 424242); + auto resource_manager = ResourceManagerCore::create({device_list}, 424242); auto core_list = get_core_resource_managers(resource_manager); int num_gpus = device_list.size(); int num_lookup = lookup_params.size(); diff --git a/test/utest/data_reader/CMakeLists.txt b/test/utest/data_reader/CMakeLists.txt index 5bd9ab037f..63b8755c8e 100644 --- a/test/utest/data_reader/CMakeLists.txt +++ b/test/utest/data_reader/CMakeLists.txt @@ -16,10 +16,6 @@ cmake_minimum_required(VERSION 3.20) find_package(CUDAToolkit) 
-file(GLOB async_reader_src - data_reader_async_adapter_test.cpp - data_reader_async_test.cpp -) if (NOT DISABLE_CUDF) file(GLOB data_reader_test_src data_reader_parquet_test.cpp @@ -30,13 +26,10 @@ if (NOT DISABLE_CUDF) target_link_libraries(data_reader_test PUBLIC /usr/local/cuda/lib64/stubs/libcuda.so) endif() -add_executable(async_reader ${async_reader_src}) add_executable(multi_hot_async_data_reader_test multi_hot_async_data_reader_test.cpp) add_executable(batch_locations_test batch_locations_test.cpp) add_executable(v2_async_reader_test data_reader_v2_async_test.cpp) add_executable(benchmark_async_reader data_reader_benchmark.cu) -target_link_libraries(async_reader PUBLIC CUDA::nvml huge_ctr_shared gtest gtest_main) -target_link_libraries(async_reader PUBLIC /usr/local/cuda/lib64/stubs/libcuda.so) target_link_libraries(v2_async_reader_test PUBLIC CUDA::nvml huge_ctr_shared gtest gtest_main) target_link_libraries(v2_async_reader_test PUBLIC /usr/local/cuda/lib64/stubs/libcuda.so) target_link_libraries(multi_hot_async_data_reader_test PUBLIC CUDA::nvml huge_ctr_shared gtest gtest_main /usr/local/cuda/lib64/stubs/libcuda.so) diff --git a/test/utest/data_reader/data_reader_async_adapter_test.cpp b/test/utest/data_reader/data_reader_async_adapter_test.cpp deleted file mode 100644 index 328b4dd55a..0000000000 --- a/test/utest/data_reader/data_reader_async_adapter_test.cpp +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace HugeCTR::hybrid_embedding; - -size_t global_seed = 321654; -size_t io_alignment = 4096; -// threads = 32. 
-const size_t num_batches = 10; -template -void reader_adapter_test(std::vector device_list, size_t batch_size, int num_threads, - int batches_per_thread, int label_dim, int dense_dim, int sparse_dim, - int num_passes, int seed, bool wait_for_gpu_idle = false, - bool shuffle = false) { - using DataReaderType = AsyncReader; - - const std::string fname = "__tmp_test.dat"; - size_t io_block_size = io_alignment * 8; - int bytes_per_batch = sizeof(int) * (label_dim + dense_dim + sparse_dim) * batch_size; - int actual_nr_requests = 2; - for (int io_blk = io_alignment;; io_blk += io_alignment) { - actual_nr_requests = batches_per_thread * num_threads * (bytes_per_batch / io_blk + 2); - if (actual_nr_requests <= 1023) { - io_block_size = io_blk; - break; - } - } - - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: num_threads = " << num_threads << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: num_batches_per_thread = " << batches_per_thread - << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_block_size = " << io_block_size << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_nr_requests = " << actual_nr_requests << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_depth = " << 2 << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: io_alignment = " << io_alignment << std::endl; - HCTR_LOG_S(INFO, ROOT) << "AsyncReader: shuffle = " << (shuffle ? "ON" : "OFF") << std::endl; - - const bool mixed_precision = true; - const float epsilon = mixed_precision ? 1e0f : 1e-3f; - - HCTR_LIB_THROW(nvmlInit_v2()); - - std::vector> vvgpu; - vvgpu.push_back(device_list); - const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242); - - size_t local_gpu_count = resource_manager->get_local_gpu_count(); - const int sample_dim = label_dim + dense_dim + sparse_dim; - const size_t file_size = num_batches * batch_size * sample_dim; - - std::vector ref_data(file_size); - -#pragma omp parallel - { - std::mt19937 gen(seed + omp_get_thread_num()); - std::uniform_int_distribution dis(10000, 99999); - std::uniform_real_distribution disf(0.1, 1.1); - -#pragma omp for - for (size_t i = 0; i < num_batches * batch_size; i++) { - for (int j = 0; j < label_dim; j++) { - ref_data[i * sample_dim + j] = dis(gen); - } - - for (int j = 0; j < dense_dim; j++) { - ref_data[i * sample_dim + label_dim + j] = dis(gen); - } - - for (int j = 0; j < sparse_dim; j++) { - auto dtype_ref = - reinterpret_cast(ref_data.data() + i * sample_dim + label_dim + dense_dim); - dtype_ref[j] = dis(gen); - } - } - } - - { - std::ofstream fout(fname); - fout.write((char*)ref_data.data(), file_size * sizeof(int)); - } - - std::vector params{ - DataReaderSparseParam("dummy", std::vector(sparse_dim, 1), true, sparse_dim)}; - - DataReaderType data_reader(fname, batch_size, label_dim, dense_dim, params, true, - resource_manager, num_threads, batches_per_thread, io_block_size, 2, - io_alignment, shuffle, wait_for_gpu_idle); - - auto label_tensors = data_reader.get_label_tensor23s(); - auto dense_tensors = data_reader.get_dense_tensor23s(); - auto sparse_tensors = data_reader.get_value_tensor23s(); - - data_reader.start(); - - for (int pass = 0; pass < num_passes; pass++) { - size_t total_read = 0; - for (size_t batch = 0; batch < num_batches; batch++) { - size_t sz = data_reader.read_a_batch_to_device(); - HCTR_LOG_S(INFO, ROOT) << "iter " << batch << " batchsize " << sz << std::endl; - - std::vector device_batch_offsets(local_gpu_count + 1); - size_t total_offset = 0; - for (size_t id = 0; id < local_gpu_count + 1; id++) { - 
device_batch_offsets[id] = total_offset; - if (id < local_gpu_count) { - total_offset += data_reader.get_current_batchsize_per_device(id); - } - } - - //#pragma omp parallel for num_threads(local_gpu_count) - for (size_t id = 0; id < local_gpu_count; id++) { - auto device = resource_manager->get_local_gpu(id); - CudaDeviceContext context(device->get_device_id()); - - std::vector labels(label_tensors[id].num_elements()); - std::vector<__half> denses(dense_tensors[id].num_elements()); - std::vector sparses(sparse_tensors[id].get_value_tensor().num_elements()); - - core23::copy_sync(labels, label_tensors[id]); - core23::copy_sync(denses, dense_tensors[id]); - core23::copy_sync(sparses, sparse_tensors[id].get_value_tensor()); - - auto cur_ref = ref_data.data() + total_read * sample_dim; - - for (size_t sample = device_batch_offsets[id]; sample < device_batch_offsets[id + 1]; - sample++) { - for (int j = 0; j < label_dim; j++) { - ASSERT_EQ((float)cur_ref[sample * sample_dim + j], - labels[(sample - device_batch_offsets[id]) * label_dim + j]); - } - - for (int j = 0; j < dense_dim; j++) { - ASSERT_NEAR(std::log((double)cur_ref[sample * sample_dim + label_dim + j] + 1.0), - (double)denses[(sample - device_batch_offsets[id]) * dense_dim + j], - epsilon); - } - } - - for (size_t sample = 0; sample < sz; sample++) { - for (int j = 0; j < sparse_dim; j++) { - auto dtype_ref = cur_ref + sample * sample_dim + label_dim + dense_dim; - ASSERT_EQ(static_cast(dtype_ref[j]), sparses[sample * sparse_dim + j]); - } - } - } - - total_read += sz; - } - } -} - -class MPIEnvironment : public ::testing::Environment { - protected: - virtual void SetUp() { test::mpi_init(); } - virtual void TearDown() { test::mpi_finalize(); } - virtual ~MPIEnvironment(){}; -}; - -::testing::Environment* const mpi_env = ::testing::AddGlobalTestEnvironment(new MPIEnvironment); - -// device_list batch threads batch_per_thread label dense sparse num_passes seed -// -TEST(reader_adapter_test, dgxa100_longlong) { - reader_adapter_test({0, 1, 2, 3, 4, 5, 6, 7}, 256, 32, 4, 1, 1, 26, 1, - global_seed += 128); -} - -TEST(reader_adapter_test, test1) { - reader_adapter_test({0}, 100, 1, 1, 2, 1, 1, 1, global_seed += 128); -} -TEST(reader_adapter_test, test2) { - reader_adapter_test({0}, 100, 1, 1, 2, 1, 1, 2, global_seed += 128); -} -TEST(reader_adapter_test, test3) { - reader_adapter_test({0}, 100, 1, 1, 2, 3, 1, 3, global_seed += 128); -} -TEST(reader_adapter_test, test4) { - reader_adapter_test({0}, 100, 1, 1, 2, 3, 6, 7, global_seed += 128); -} -TEST(reader_adapter_test, test5) { - reader_adapter_test({0}, 1012, 2, 1, 2, 3, 7, 18, global_seed += 128); -} -TEST(reader_adapter_test, test6) { - reader_adapter_test({0}, 101256, 2, 1, 2, 3, 7, 8, global_seed += 128); -} -TEST(reader_adapter_test, test7) { - reader_adapter_test({0}, 101256, 2, 4, 2, 3, 7, 5, global_seed += 128); -} -TEST(reader_adapter_test, test8) { - reader_adapter_test({0}, 101256, 2, 3, 3, 3, 9, 2, global_seed += 128); -} -TEST(reader_adapter_test, test9) { - reader_adapter_test({0}, 101256, 4, 4, 1, 8, 6, 4, global_seed += 128); -} -TEST(reader_adapter_test, test10) { - reader_adapter_test({0, 1}, 10, 2, 2, 7, 2, 1, 21, global_seed += 128); -} -TEST(reader_adapter_test, test11) { - reader_adapter_test({1, 4}, 6000, 3, 2, 7, 13, 26, 1, global_seed += 128); -} -TEST(reader_adapter_test, dgxa100_48slots) { - reader_adapter_test({0, 1, 2, 3, 4, 5, 6, 7}, 256, 32, 4, 1, 1, 48, 1, - global_seed += 128); -} -TEST(reader_adapter_test, dgxa100_48slots_wait_for_idle) { - 
reader_adapter_test({0, 1, 2, 3, 4, 5, 6, 7}, 256, 32, 4, 1, 1, 48, 1, - global_seed += 128, true); -} -TEST(reader_adapter_test, dgxa100_26slots) { - reader_adapter_test({0, 1, 2, 3, 4, 5, 6, 7}, 256, 32, 4, 1, 1, 800, 1, - global_seed += 128); -} diff --git a/test/utest/data_reader/data_reader_async_test.cpp b/test/utest/data_reader/data_reader_async_test.cpp deleted file mode 100644 index e2a4e3cdde..0000000000 --- a/test/utest/data_reader/data_reader_async_test.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; - -void reader_test(std::vector device_list, size_t file_size, size_t batch_size, int num_threads, - int batches_per_thread, int io_block_size, int io_depth, int wait_time_us) { - const std::string fname = "__tmp_test.dat"; - char* ref_data; - char* read_data; - - HCTR_LIB_THROW(nvmlInit_v2()); - - std::vector> vvgpu; - vvgpu.push_back(device_list); - const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242); - - HCTR_LIB_THROW(cudaMallocManaged(&ref_data, file_size)); - HCTR_LIB_THROW(cudaMallocManaged(&read_data, file_size)); - -#pragma omp parallel - { - std::mt19937 gen(424242 + omp_get_thread_num()); - // std::uniform_int_distribution dis(0, 255); - std::uniform_int_distribution dis('a', 'z'); - -#pragma omp for - for (size_t i = 0; i < file_size; i++) { - ref_data[i] = dis(gen); - } - } - - { - std::ofstream fout(fname); - fout.write(ref_data, file_size); - } - - AsyncReaderImpl reader_impl(fname, batch_size, resource_manager.get(), num_threads, - batches_per_thread, io_block_size, io_depth, 4096); - - reader_impl.load_async(); - - size_t total_sz = 0; - while (true) { - BatchDesc desc = reader_impl.get_batch(); - size_t sz = desc.size_bytes; - - if (sz > 0) { - HCTR_LIB_THROW( - cudaMemcpy(read_data + total_sz, desc.dev_data[0], sz, cudaMemcpyDeviceToDevice)); - total_sz += sz; - usleep(wait_time_us); - reader_impl.finalize_batch(); - } else { - break; - } - if (total_sz >= file_size) { - break; - } - } - - ASSERT_EQ(total_sz, file_size); - for (size_t i = 0; i < std::min(file_size, total_sz); i++) { - // HCTR_LOG_S(DEBUG, WORLD) << "Symbols differ at index " << i << " : expected " - // << ref_data[i] << " got " << read_data[i] << std::endl; - ASSERT_EQ(ref_data[i], read_data[i]) << "Symbols differ at index " << i << " : expected " - << ref_data[i] << " got " << read_data[i]; - } - - cudaFree(ref_data); - cudaFree(read_data); -} - -// device_list file_size batch threads batch_per_thread io_block io_depth wait_time -// -TEST(reader_test, test1) { reader_test({0}, 100, 20, 1, 1, 4096 * 2, 1, 0); } -TEST(reader_test, test2) { reader_test({0}, 100, 20, 2, 1, 4096 * 2, 1, 0); } -TEST(reader_test, test3) { reader_test({0}, 1012, 20, 2, 1, 4096 * 2, 1, 0); } -TEST(reader_test, test4) { reader_test({0}, 1012, 32, 
2, 2, 4096 * 2, 1, 0); } -TEST(reader_test, test5) { reader_test({0}, 10120, 32, 2, 2, 4096 * 2, 2, 0); } -TEST(reader_test, test6) { reader_test({0}, 101256, 1000, 2, 4, 4096 * 2, 2, 0); } -TEST(reader_test, test7) { reader_test({0}, 101256, 1000, 2, 4, 4096 * 2, 2, 100); } -TEST(reader_test, test8) { reader_test({0}, 101256, 1000, 2, 4, 4096 * 2, 2, 1000); } -TEST(reader_test, test9) { reader_test({0, 1}, 100, 20, 2, 1, 4096 * 2, 1, 0); } -TEST(reader_test, test10) { reader_test({0, 1}, 101256, 1000, 2, 4, 4096 * 2, 2, 0); } -TEST(reader_test, test11) { reader_test({0, 1}, 101256, 1000, 2, 4, 4096 * 2, 2, 100); } -TEST(reader_test, test12) { reader_test({0, 1}, 101256, 1000, 2, 4, 4096 * 2, 2, 1000); } -TEST(reader_test, test13) { reader_test({0, 1}, 1014252, 14352, 6, 4, 4096 * 2, 2, 0); } -TEST(reader_test, test14) { reader_test({0, 1, 2, 3}, 100980, 1980, 4, 4, 4096 * 2, 2, 1000); } -TEST(reader_test, test15) { reader_test({0, 1, 2, 3, 4}, 101256, 7616, 8, 4, 4096 * 2, 2, 0); } -TEST(reader_test, test16) { - reader_test({0, 1, 2, 3, 4, 5, 6, 7}, 8012516, 38720, 8, 4, 4096 * 2, 2, 0); -} -TEST(reader_test, test17) { - reader_test({0, 1, 2, 3, 4, 5, 6, 7}, 8012516, 38720, 16, 4, 4096 * 2, 2, 0); -} -TEST(reader_test, test18) { - reader_test({0, 1, 2, 3, 4, 5, 6, 7}, 18012516, 38720, 8, 4, 4096 * 2, 2, 2000); -} diff --git a/test/utest/data_reader/data_reader_benchmark.cu b/test/utest/data_reader/data_reader_benchmark.cu index 8d12668dbc..8c76838fdd 100644 --- a/test/utest/data_reader/data_reader_benchmark.cu +++ b/test/utest/data_reader/data_reader_benchmark.cu @@ -19,14 +19,13 @@ #include #include -#include #include #include #include #include #include #include -#include +#include #include #include #include @@ -69,7 +68,7 @@ void reader_test(std::vector device_list, size_t batch_size_bytes, int num_ std::vector> vvgpu; vvgpu.push_back(device_list); - const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242); + const auto resource_manager = ResourceManagerCore::create(vvgpu, 424242); MultiHot::FileSource source; source.name = fname; diff --git a/test/utest/data_reader/data_reader_parquet_test.cpp b/test/utest/data_reader/data_reader_parquet_test.cpp index f23d4314b0..2a8253058e 100644 --- a/test/utest/data_reader/data_reader_parquet_test.cpp +++ b/test/utest/data_reader/data_reader_parquet_test.cpp @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" @@ -334,7 +334,7 @@ void data_reader_group_iter_strided_batch_test_impl(int num_files, long long sam ASSERT_TRUE(num_files % device_list.size() == 0); int files_per_worker = num_files / device_list.size(); - const auto& resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto& resource_manager = ResourceManagerCore::create(vvgpu, 0); const DataReaderSparseParam param = {"distributed", std::vector(slot_num, max_nnz), false, slot_num}; std::vector params; @@ -506,7 +506,7 @@ void data_reader_group_iter_squential_batch_test_impl(int num_files, long long s vvgpu.push_back(device_list); } - const auto& resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto& resource_manager = ResourceManagerCore::create(vvgpu, 0); const DataReaderSparseParam param = {"distributed", std::vector(slot_num, max_nnz), false, slot_num}; std::vector params; @@ -645,7 +645,7 @@ void data_reader_group_epoch_strided_batch_test_impl(int num_files, long long sa vvgpu.push_back(device_list); } - const auto& resource_manager = 
ResourceManagerExt::create(vvgpu, 0); + const auto& resource_manager = ResourceManagerCore::create(vvgpu, 0); const DataReaderSparseParam param = {"distributed", std::vector(slot_num, max_nnz), false, slot_num}; std::vector params; @@ -862,7 +862,7 @@ void data_reader_group_epoch_squential_batch_test_impl(int num_files, long long vvgpu.push_back(device_list); } - const auto& resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto& resource_manager = ResourceManagerCore::create(vvgpu, 0); const DataReaderSparseParam param = {"distributed", std::vector(slot_num, max_nnz), false, slot_num}; std::vector params; @@ -1020,7 +1020,7 @@ void data_reader_worker_test_impl(const int num_files, const long long sample_pe for (int i = 0; i < numprocs; i++) { vvgpu.push_back(device_list); } - auto gpu_resource_group = ResourceManagerExt::create(vvgpu, 0); + auto gpu_resource_group = ResourceManagerCore::create(vvgpu, 0); // const int num_devices = 1; const DataReaderSparseParam param = {"localized", std::vector(slot_num, max_nnz), true, slot_num}; diff --git a/test/utest/data_reader/data_reader_v2_async_test.cpp b/test/utest/data_reader/data_reader_v2_async_test.cpp index e9b532c0f8..1fd0902087 100644 --- a/test/utest/data_reader/data_reader_v2_async_test.cpp +++ b/test/utest/data_reader/data_reader_v2_async_test.cpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -41,7 +41,7 @@ void reader_test(std::vector device_list, size_t file_size, size_t batch_si std::vector> vvgpu; vvgpu.push_back(device_list); - const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242); + const auto resource_manager = ResourceManagerCore::create(vvgpu, 424242); HCTR_LIB_THROW(cudaMallocHost(&ref_data, file_size)); HCTR_LIB_THROW(cudaMallocHost(&read_data, file_size)); diff --git a/test/utest/data_reader/multi_hot_async_data_reader_test.cpp b/test/utest/data_reader/multi_hot_async_data_reader_test.cpp index c0eaab6e32..d9d8fb30d1 100644 --- a/test/utest/data_reader/multi_hot_async_data_reader_test.cpp +++ b/test/utest/data_reader/multi_hot_async_data_reader_test.cpp @@ -20,13 +20,12 @@ #include #include #include -#include #include #include #include #include #include -#include +#include #include #include #include @@ -34,7 +33,6 @@ using namespace HugeCTR; using namespace HugeCTR::MultiHot; -using namespace HugeCTR::hybrid_embedding; size_t global_seed = 321654; size_t num_batches = 13; @@ -69,7 +67,7 @@ void async_data_reader_test(std::vector device_list, size_t batch_size, std::vector> vvgpu; vvgpu.push_back(device_list); - const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242); + const auto resource_manager = ResourceManagerCore::create(vvgpu, 424242); size_t local_gpu_count = resource_manager->get_local_gpu_count(); const int sample_dim = label_dim + dense_dim + (total_sparse_dim * (sizeof(dtype) / sizeof(int))); diff --git a/test/utest/embedding/distributed_slot_sparse_embedding_hash_test.cu b/test/utest/embedding/distributed_slot_sparse_embedding_hash_test.cu index 10e077a914..e9f1e58b43 100644 --- a/test/utest/embedding/distributed_slot_sparse_embedding_hash_test.cu +++ b/test/utest/embedding/distributed_slot_sparse_embedding_hash_test.cu @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -166,7 +166,7 @@ void train_and_test(const std::vector &device_list, const Optimizer_t &opti for (int i = 0; i < numprocs; i++) { vvgpu.push_back(device_list); } - const auto &resource_manager = 
ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); if (pid == 0) { // re-generate the dataset files @@ -472,7 +472,7 @@ void load_and_dump(const std::vector &device_list, const Optimizer_t &optim std::vector> vvgpu; vvgpu.push_back(device_list); - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); // re-generate the dataset files { @@ -652,7 +652,7 @@ void load_and_dump_file(const std::vector &device_list, const Optimizer_t & for (int i = 0; i < numprocs; i++) { vvgpu.push_back(device_list); } - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); if (pid == 0) { // re-generate the dataset files diff --git a/test/utest/embedding/hybrid_embedding/data_test.cpp b/test/utest/embedding/hybrid_embedding/data_test.cpp deleted file mode 100644 index 3fed01d324..0000000000 --- a/test/utest/embedding/hybrid_embedding/data_test.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace hybrid_embedding; - -namespace { -template -void data_test() { - size_t batch_size = 4; - size_t num_iterations = 2; - std::vector table_sizes{100, 10, 10, 20}; - std::vector data_in{99, 3, 7, 19, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, - 3, 3, 3, 3, 50, 2, 4, 10, 2, 2, 2, 2, 1, 1, 1, 1}; - std::vector data_to_unique_categories_ref{ - 99, 103, 117, 139, 0, 100, 110, 120, 1, 101, 111, 121, 2, 102, 112, 122, - 3, 103, 113, 123, 50, 102, 114, 130, 2, 102, 112, 122, 1, 101, 111, 121}; - - Tensor2 d_data_in; - // HCTR_LOG_S(DEBUG, WORLD) << "debug2" << std::endl; - std::shared_ptr> buff = GeneralBuffer2::create(); - buff->reserve({batch_size * num_iterations * table_sizes.size()}, &d_data_in); - buff->allocate(); - upload_tensor(data_in, d_data_in, 0); - // HCTR_LOG_S(DEBUG, WORLD) << "debug3" << std::endl; - Data data(table_sizes, batch_size, num_iterations); - // HCTR_LOG_S(DEBUG, WORLD) << "debug" << std::endl; - data.data_to_unique_categories(d_data_in, 0); - // HCTR_LOG_S(DEBUG, WORLD) << "debug1" << std::endl; - std::vector data_to_unique_categories_ret; - download_tensor(data_to_unique_categories_ret, data.samples, 0); - EXPECT_THAT(data_to_unique_categories_ret, - ::testing::ElementsAreArray(data_to_unique_categories_ref)); -}; - -} // namespace - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -void test_raw_data(dtype *d_raw_data, size_t num_samples, size_t num_tables, size_t num_iterations, - const std::vector &table_sizes) { - size_t num_elements = num_samples * num_tables * num_iterations; - std::cout << " test_raw_data:\tnum_samples " << num_samples << " num_tables " << num_tables - << std::endl; - std::vector h_raw_data(num_elements, (dtype)0); - cudaStream_t 
stream = 0; - HCTR_LIB_THROW(cudaMemcpyAsync(h_raw_data.data(), d_raw_data, num_elements * sizeof(dtype), - cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - for (size_t iteration = 0; iteration < num_iterations; ++iteration) { - for (size_t sample = 0; sample < num_samples; ++sample) { - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t category = (size_t) - h_raw_data[iteration * num_samples * num_tables + sample * num_tables + embedding]; - if (category >= table_sizes[embedding]) { - std::cout << " sample " << sample << " embedding " << embedding << " category " - << category << " table sizes " << table_sizes[embedding] << std::endl; - } - EXPECT_TRUE(category < table_sizes[embedding]); - } - } - } -} - -template -void test_samples(dtype *d_raw_data, Data &data) { - const size_t num_iterations = data.num_iterations; - const size_t num_samples = data.batch_size; - const size_t num_tables = data.table_sizes.size(); - - size_t num_elements = num_iterations * num_samples * num_tables; - - const size_t num_categories = EmbeddingTableFunctors::get_num_categories(data.table_sizes); - std::vector embedding_offsets; - EmbeddingTableFunctors::get_embedding_offsets(embedding_offsets, data.table_sizes); - - cudaStream_t stream = 0; - std::vector h_raw_data(num_elements, (dtype)0); - HCTR_LIB_THROW(cudaMemcpyAsync(h_raw_data.data(), d_raw_data, num_elements * sizeof(dtype), - cudaMemcpyDeviceToHost, stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - std::vector h_samples; - download_tensor(h_samples, data.samples, stream); - - for (size_t iteration = 0; iteration < num_iterations; ++iteration) { - for (size_t sample = 0; sample < num_samples; ++sample) { - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t indx = iteration * num_samples * num_tables + sample * num_tables + embedding; - size_t unique_category = (size_t)h_samples[indx]; - size_t category_samples = (size_t)unique_category - embedding_offsets[embedding]; - size_t category_data = (size_t)h_raw_data[indx]; - - EXPECT_TRUE(category_samples == category_data); - EXPECT_TRUE(unique_category < num_categories); - } - } - } -} - -template void test_raw_data(uint32_t *d_raw_data, size_t num_samples, size_t num_tables, - size_t num_iterations, - const std::vector &table_sizes); -template void test_raw_data(long long *d_raw_data, size_t num_samples, size_t num_tables, - size_t num_iterations, - const std::vector &table_sizes); -template void test_samples(uint32_t *d_raw_data, Data &data); -template void test_samples(long long *d_raw_data, Data &data); - -/** - * Tests we pad the incomplete batch (e.g last batch in eval) with NULL category - */ -template -void test_padding() { - const size_t batch_size = 8; - const size_t current_batch_size = 5; - - std::vector table_sizes{100, 10, 10, 20}; - std::vector data_in{99, 3, 7, 19, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; - std::vector data_to_unique_categories_ref{ - 99, 103, 117, 139, 0, 100, 110, 120, 1, 101, 111, 121, 2, 102, 112, 122, 3, 103, 113, 123}; - - Tensor2 d_data_in; - std::shared_ptr> buff = GeneralBuffer2::create(); - buff->reserve({current_batch_size * table_sizes.size()}, &d_data_in); - buff->allocate(); - upload_tensor(data_in, d_data_in, 0); - - d_data_in.reset_shape({current_batch_size, table_sizes.size()}); - - Data data(table_sizes, batch_size, 1); - data.data_to_unique_categories(d_data_in, 0); - std::vector data_to_unique_categories_ret; - 
download_tensor(data_to_unique_categories_ret, data.samples, 0); - - const auto NULL_category = EmbeddingTableFunctors::get_num_categories(table_sizes); - - // Ensure valid samples calculated correctly - size_t i; - for (i = 0; i < current_batch_size * table_sizes.size(); ++i) { - EXPECT_TRUE(data_to_unique_categories_ret[i] == data_to_unique_categories_ref[i]); - } - - // Ensure padded correctly - for (; i < batch_size * table_sizes.size(); ++i) { - EXPECT_TRUE(data_to_unique_categories_ret[i] == NULL_category); - } -} -} // namespace hybrid_embedding - -} // namespace HugeCTR - -TEST(data_test, uint32) { data_test(); }; -TEST(data_test, long_long) { data_test(); }; -TEST(data_test, incomplete_batch_uint32) { test_padding(); } -TEST(data_test, incomplete_batch_long_long) { test_padding(); } diff --git a/test/utest/embedding/hybrid_embedding/data_test.hpp b/test/utest/embedding/hybrid_embedding/data_test.hpp deleted file mode 100644 index e4cb78d38a..0000000000 --- a/test/utest/embedding/hybrid_embedding/data_test.hpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -std::shared_ptr> create_data_from_distribution( - const std::vector> &distribution, const size_t batch_size, - const size_t num_iterations) { - std::vector table_sizes(distribution.size()); - size_t num_categories = (size_t)0; - for (size_t i = 0; i < distribution.size(); ++i) { - table_sizes[i] = distribution[i].size(); - num_categories += table_sizes[i]; - } - - std::vector acc_prob(num_categories); - double sum_p = 0.; - size_t category = (size_t)0; - for (size_t embedding = 0; embedding < table_sizes.size(); ++embedding) { - for (size_t em_category = 0; em_category < table_sizes[embedding]; ++em_category) { - sum_p += distribution[embedding][em_category]; - acc_prob[category++] = sum_p; - } - } - - return std::make_shared>(table_sizes, batch_size, num_iterations); -} - -template -void test_raw_data(dtype *raw_data, size_t num_samples, size_t num_tables, size_t num_iterations, - const std::vector &table_sizes); - -template -void test_samples(dtype *raw_data, Data &data); - -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file diff --git a/test/utest/embedding/hybrid_embedding/end_to_end_test.cpp b/test/utest/embedding/hybrid_embedding/end_to_end_test.cpp deleted file mode 100644 index 7978c126f8..0000000000 --- a/test/utest/embedding/hybrid_embedding/end_to_end_test.cpp +++ /dev/null @@ -1,766 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// all your base are belong to us -#define private public -#define protected public -#include - -using namespace HugeCTR; - -constexpr bool debug_print = false; -int global_seed = 0; - -template -void end_to_end_impl(std::vector device_list, HybridEmbeddingInputGenerator *generator, - size_t batch_size, size_t embedding_vec_size, double bw_ratio_a2a_over_ar, - size_t seed, size_t num_evals) { - constexpr double epsilon = sizeof(emtype) < 4 ? 1e-2 : 1e-3; - - const int rank{core23::MpiInitService::get().world_rank()}; - const int num_procs{core23::MpiInitService::get().world_size()}; - - HCTR_LIB_THROW(nvmlInit_v2()); - - std::vector> vvgpu; - - // if there are multi-node, we assume each node has the same gpu device_list - for (int i = 0; i < num_procs; i++) { - vvgpu.push_back(device_list); - } - const auto resource_manager = ResourceManagerExt::create(vvgpu, seed); - - size_t total_gpu_count = resource_manager->get_global_gpu_count(); - size_t local_gpu_count = resource_manager->get_local_gpu_count(); - size_t local_batch_size = batch_size / total_gpu_count; - assert(batch_size % total_gpu_count == 0); - - auto table_sizes = generator->get_table_sizes(); - size_t num_tables = table_sizes.size(); - size_t total_categories = std::accumulate(table_sizes.begin(), table_sizes.end(), 0); - HCTR_LOG(INFO, WORLD, "total categories: %lu\n", total_categories); - - size_t num_init_batches = 50; - - SparseTensors inputs; - SparseTensors inits; - for (size_t i = 0; i < local_gpu_count; i++) { - CudaDeviceContext context(resource_manager->get_local_gpu(i)->get_device_id()); - auto buf = GeneralBuffer2::create(); - Tensor2 value_tensor; - buf->reserve({batch_size, num_tables}, &value_tensor); - auto dummy_row_offset_tensor = Tensor2(); - std::shared_ptr dummy_nnz(new size_t); - inputs.emplace_back(SparseTensor(value_tensor, dummy_row_offset_tensor, dummy_nnz)); - - buf->reserve({num_init_batches * batch_size, num_tables}, &value_tensor); - inits.emplace_back(SparseTensor(value_tensor, dummy_row_offset_tensor, dummy_nnz)); - buf->allocate(); - } - - const float lr = 0.42f; - - GpuLearningRateSchedulers lr_scheds; - for (size_t i = 0; i < local_gpu_count; i++) { - lr_scheds.emplace_back(new GpuLearningRateScheduler(2 * lr, 2, 0, 1, 2.f, 0.f, - resource_manager->get_local_gpu(i))); - lr_scheds.back()->update(); - } - - HybridSparseEmbeddingParams params = { - batch_size, - batch_size, - num_init_batches, - 2 * num_tables * batch_size, - -1, - 0.01, // p_max_dup ? - embedding_vec_size, - num_tables, - generator->get_table_sizes(), - num_procs == 1 ? 
hybrid_embedding::CommunicationType::NVLink_SingleNode - : hybrid_embedding::CommunicationType::IB_NVLink, - 1.0, - bw_ratio_a2a_over_ar, - 1.0, - HybridEmbeddingType::Distributed, - OptParams{Optimizer_t::SGD, lr, {}, Update_t::Global, 1.0f}}; - - std::vector>> placeholder( - resource_manager->get_local_gpu_count(), NULL); - auto embedding = std::make_unique>( - inputs, inputs, params, placeholder, lr_scheds, false, resource_manager); - - // Table offsets - std::vector table_offsets(num_tables); - size_t total = 0; - for (size_t table = 0; table < num_tables; table++) { - table_offsets[table] = total; - total += generator->get_table_sizes()[table]; - } - - auto initial_input = generator->generate_categorical_input(num_init_batches * batch_size); - - if (debug_print) { - std::map unique_cat; - HCTR_LOG(INFO, ROOT, "Generated INIT unique categories: "); - for (size_t i = 0; i < num_init_batches * batch_size; i++) { - for (size_t j = 0; j < num_tables; j++) { - unique_cat[initial_input[i * num_tables + j] + table_offsets[j]] = 1; - } - } - for (auto c : unique_cat) { - HCTR_PRINT(INFO, " %d", static_cast(c.first)); - } - HCTR_PRINT(INFO, "\n"); - } - - for (size_t lgpu = 0; lgpu < local_gpu_count; ++lgpu) { - CudaDeviceContext context(resource_manager->get_local_gpu(lgpu)->get_device_id()); - auto stream = resource_manager->get_local_gpu(lgpu)->get_stream(); - upload_tensor(initial_input, inits[lgpu].get_value_tensor(), stream); - } - size_t tmp_size = 0; - embedding->init_model(inits, tmp_size); - - size_t num_frequent = embedding->model_[0].num_frequent; - if (rank == 0) { - HCTR_LOG(INFO, WORLD, "Number of frequent categories: %ld\n", num_frequent); - } - std::vector num_infrequent(local_gpu_count); - for (size_t i = 0; i < local_gpu_count; i++) { - num_infrequent[i] = embedding->model_[i].h_infrequent_model_table_offsets[num_tables]; - // if (debug_print) { - HCTR_LOG(INFO, WORLD, "local_gpu = %ld, Number of infrequent categories: %ld\n", i, - num_infrequent[i]); - //} - } - - std::vector full_emb_table(total_categories * embedding_vec_size); - { - std::mt19937 gen(seed + 2); - std::uniform_real_distribution distr(-1, 1); - for (auto &e : full_emb_table) { - e = distr(gen); - } - } - - // Set frequent embeddings - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - - std::vector h_frequent_categories; - download_tensor(h_frequent_categories, embedding->model_[device].frequent_categories, 0); - - for (size_t i = 0; i < num_frequent; ++i) { - dtype cat = h_frequent_categories[i]; - HCTR_LIB_THROW(cudaMemcpy(embedding->frequent_embeddings_single_node_[device] - .frequent_data_.frequent_embedding_vectors_.get_ptr() + - i * embedding_vec_size, - full_emb_table.data() + cat * embedding_vec_size, - sizeof(float) * embedding_vec_size, cudaMemcpyHostToDevice)); - } - - if (debug_print && device == 0) { - HCTR_LOG(INFO, ROOT, "Frequent categories: "); - for (size_t i = 0; i < num_frequent; i++) { - HCTR_PRINT(INFO, " %d", h_frequent_categories[i]); - } - HCTR_PRINT(INFO, "\n"); - } - } - - // Set infrequent embeddings - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - int global_id = resource_manager->get_local_gpu(device)->get_global_id(); - HCTR_LIB_THROW(cudaDeviceSynchronize()); - - size_t num_infrequent = embedding->model_[device].h_infrequent_model_table_offsets[num_tables]; - - float 
*h_infrequent_embedding_vectors; - dtype *h_category_location; - HCTR_LIB_THROW(cudaMallocHost((void **)&h_infrequent_embedding_vectors, - (num_infrequent + 1) * embedding_vec_size * sizeof(float))); - HCTR_LIB_THROW( - cudaMallocHost((void **)&h_category_location, total_categories * 2 * sizeof(dtype))); - - HCTR_LIB_THROW(cudaMemcpy(h_category_location, - embedding->model_[device].category_location.get_ptr(), - total_categories * 2 * sizeof(dtype), cudaMemcpyDeviceToHost)); - - if (debug_print) { - HCTR_LOG(INFO, ROOT, "Category location array:\n"); - for (size_t i = 0; i < total_categories; i++) { - HCTR_PRINT(INFO, " (%d, %d)\n", h_category_location[2 * i], - h_category_location[2 * i + 1]); - } - } - - for (size_t i = 0; i < total_categories; ++i) { - if (static_cast(h_category_location[2 * i]) == global_id && - static_cast(h_category_location[2 * i + 1]) < total_categories) { - auto loc = h_category_location[2 * i + 1]; - memcpy(h_infrequent_embedding_vectors + loc * embedding_vec_size, - full_emb_table.data() + i * embedding_vec_size, sizeof(float) * embedding_vec_size); - /* - if(device == 0) - { - HCTR_LOG(INFO, WORLD, "i = %ld, loc = %d, embed[0] = %f\n", i, loc, - *(h_infrequent_embedding_vectors+loc*embedding_vec_size)); - } - */ - } - } - - if (embedding->embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - HCTR_LIB_THROW(cudaMemcpy(embedding->infrequent_embeddings_single_node_[device] - .infrequent_embedding_vectors_.get_ptr(), - h_infrequent_embedding_vectors, - num_infrequent * embedding_vec_size * sizeof(float), - cudaMemcpyHostToDevice)); - } - - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - HCTR_LIB_THROW(cudaMemcpy(embedding->infrequent_embeddings_ib_nvlink_[device] - .infrequent_embedding_vectors_.get_ptr(), - h_infrequent_embedding_vectors, - num_infrequent * embedding_vec_size * sizeof(float), - cudaMemcpyHostToDevice)); - } - - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - HCTR_LIB_THROW(cudaMemcpy(embedding->infrequent_embeddings_ib_nvlink_hier_[device] - .infrequent_embedding_vectors_.get_ptr(), - h_infrequent_embedding_vectors, - num_infrequent * embedding_vec_size * sizeof(float), - cudaMemcpyHostToDevice)); - } - // HCTR_LOG(INFO, WORLD, "gpu = %ld, num_infrequent = %ld, infrequent_embedding_vectors_ = - // 0x%lx\n", device, num_infrequent, - // (size_t)(embedding->infrequent_embeddings_[device].infrequent_embedding_vectors_.get_ptr())); - HCTR_LIB_THROW(cudaFreeHost(h_infrequent_embedding_vectors)); - HCTR_LIB_THROW(cudaFreeHost(h_category_location)); - } - - if (debug_print) { - HCTR_LOG(INFO, ROOT, "Generated full embedding table\n"); - for (size_t i = 0; i < full_emb_table.size(); i++) { - HCTR_PRINT(INFO, "%8.5f ", full_emb_table[i]); - if (i % embedding_vec_size == embedding_vec_size - 1) { - HCTR_PRINT(INFO, "\n"); - } - } - HCTR_PRINT(INFO, "\n"); - } - - auto outputs = embedding->get_train_output_tensors(); - //====================================================================================== - // Do the forward step - //====================================================================================== - auto input = generator->generate_categorical_input(batch_size); - for (size_t lgpu = 0; lgpu < local_gpu_count; ++lgpu) { - CudaDeviceContext context(resource_manager->get_local_gpu(lgpu)->get_device_id()); - auto stream = resource_manager->get_local_gpu(lgpu)->get_stream(); - upload_tensor(input, 
inputs[lgpu].get_value_tensor(), stream); - } - - if (debug_print) { - HCTR_LOG(INFO, ROOT, "Generated input:\n"); - HCTR_PRINT(INFO, " Table sizes: "); - for (auto sz : generator->get_table_sizes()) { - HCTR_PRINT(INFO, "%ld ", sz); - } - HCTR_PRINT(INFO, "\n"); - HCTR_PRINT(INFO, " Input:\n"); - for (size_t i = 0; i < batch_size; i++) { - HCTR_PRINT(INFO, " [ "); - for (size_t j = 0; j < num_tables; j++) { - HCTR_PRINT(INFO, "%7d ", input[i * num_tables + j]); - } - HCTR_PRINT(INFO, " ]\n"); - } - } - - embedding->forward(true); - - if (debug_print) { - const int device = 0; - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - int global_id = resource_manager->get_local_gpu(device)->get_global_id(); - HCTR_LIB_THROW(cudaDeviceSynchronize()); - - { - std::vector tmp; - if (embedding->embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - download_tensor( - tmp, embedding->infrequent_embeddings_single_node_[device].indices_->model_indices_, 0); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - download_tensor( - tmp, embedding->infrequent_embeddings_ib_nvlink_[device].indices_->model_indices_, 0); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - download_tensor( - tmp, embedding->infrequent_embeddings_ib_nvlink_hier_[device].indices_->model_indices_, - 0); - } - - // download_tensor(tmp, embedding->infrequent_embeddings_[device].indices_->model_indices_, - // 0); - - HCTR_LOG(INFO, ROOT, "Instance %d model indices: ", global_id); - for (size_t j = 0; j < tmp.size(); j++) { - HCTR_PRINT(INFO, " %d", static_cast(tmp[j])); - } - HCTR_PRINT(INFO, "\n"); - - HCTR_LOG(INFO, ROOT, "Instance %d model indices OFFSETS: ", global_id); - for (int j = 0; j < num_procs + 1; j++) { - if (embedding->embedding_params_.communication_type == - CommunicationType::NVLink_SingleNode) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_single_node_[device] - .indices_->model_indices_offsets_.get_ptr()[j]); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_ib_nvlink_[device] - .indices_->model_indices_offsets_.get_ptr()[j]); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_ib_nvlink_hier_[device] - .indices_->model_indices_offsets_.get_ptr()[j]); - } - } - HCTR_PRINT(INFO, "\n"); - - int num_batch_frequent; - HCTR_LIB_THROW(cudaMemcpy(&num_batch_frequent, - embedding->frequent_embeddings_single_node_[device] - .indices_->d_num_frequent_sample_indices_.get_ptr(), - sizeof(uint32_t), cudaMemcpyDeviceToHost)); - HCTR_LOG(INFO, ROOT, "Instance %d found %d frequent categories in positions: ", global_id, - num_batch_frequent); - download_tensor( - tmp, - embedding->frequent_embeddings_single_node_[device].indices_->frequent_sample_indices_, - 0); - for (int j = 0; j < num_batch_frequent; j++) { - HCTR_PRINT(INFO, " %d", static_cast(tmp[j])); - } - HCTR_PRINT(INFO, "\n"); - } - - { - std::vector tmp; - if (embedding->embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - download_tensor( - tmp, embedding->infrequent_embeddings_single_node_[device].indices_->network_indices_, - 0); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - download_tensor( - tmp, 
embedding->infrequent_embeddings_ib_nvlink_[device].indices_->network_indices_, 0); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - download_tensor( - tmp, - embedding->infrequent_embeddings_ib_nvlink_hier_[device].indices_->network_indices_, 0); - } - - HCTR_LOG(INFO, ROOT, "Instance %d network indices: ", global_id); - for (size_t j = 0; j < tmp.size(); j++) { - HCTR_PRINT(INFO, " %d", static_cast(tmp[j])); - } - HCTR_PRINT(INFO, "\n"); - - HCTR_LOG(INFO, ROOT, "Instance %d network indices OFFSETS: ", global_id); - for (int j = 0; j < num_procs + 1; j++) { - // HCTR_PRINT(INFO, " %d", - //(int)embedding->infrequent_embeddings_[device] - //.indices_->network_indices_offsets_.get_ptr()[j]); - - if (embedding->embedding_params_.communication_type == - CommunicationType::NVLink_SingleNode) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_single_node_[device] - .indices_->network_indices_offsets_.get_ptr()[j]); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_ib_nvlink_[device] - .indices_->network_indices_offsets_.get_ptr()[j]); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - HCTR_PRINT(INFO, " %u", - embedding->infrequent_embeddings_ib_nvlink_hier_[device] - .indices_->network_indices_offsets_.get_ptr()[j]); - } - } - HCTR_PRINT(INFO, "\n"); - } - } - - // Check - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - int global_id = resource_manager->get_local_gpu(device)->get_global_id(); - HCTR_LIB_THROW(cudaDeviceSynchronize()); - - std::vector h_output; - std::vector expected(embedding_vec_size); - ASSERT_EQ(local_batch_size, embedding->get_batch_size_per_gpu(true)); - - download_tensor(h_output, Tensor2::stretch_from(outputs[device]), 0); - ASSERT_EQ(h_output.size() % embedding_vec_size, 0); - ASSERT_EQ(h_output.size(), local_batch_size * num_tables * embedding_vec_size); - - for (size_t i = 0; i < h_output.size() / embedding_vec_size; i++) { - size_t table = i % num_tables; - size_t cat_id = table_offsets[table] + input[i + global_id * local_batch_size * num_tables]; - auto expected_ptr = full_emb_table.data() + cat_id * embedding_vec_size; - auto actual_ptr = h_output.data() + i * embedding_vec_size; - - if (debug_print) { - HCTR_LOG(INFO, ROOT, " Instance %d sample %ld slot %ld comparing category %ld: ", global_id, - i, table, cat_id); - for (size_t j = 0; j < embedding_vec_size; j++) { - HCTR_PRINT(INFO, " (%8.5f : %8.5f) ", static_cast(actual_ptr[j]), - static_cast(expected_ptr[j])); - } - HCTR_PRINT(INFO, "\n"); - } - - for (size_t j = 0; j < embedding_vec_size; j++) { - expected[j] = (emtype)expected_ptr[j]; - } - - ASSERT_EQ(memcmp(expected.data(), actual_ptr, embedding_vec_size * sizeof(emtype)), 0) - << "Data mismatch on instance " << global_id << " in sample " << i / num_tables - << " feature " << table << std::endl; - } - } - - //====================================================================================== - // Do the backward step and update - //====================================================================================== - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - - std::vector h_output(local_batch_size * num_tables * 
embedding_vec_size); - - // Per-GPU generator - std::mt19937 gen(seed + 3 + resource_manager->get_local_gpu(device)->get_global_id()); - std::uniform_real_distribution distr(-1, 1); - for (auto &grad : h_output) { - grad = (emtype)distr(gen); - } - upload_tensor(h_output, Tensor2::stretch_from(outputs[device]), 0); - } - - // We can't allreduce __half type with MPI, so need to recreate all the output tensors locally. - std::vector gradients(total_categories * embedding_vec_size, 0); - for (size_t device = 0; device < total_gpu_count; device++) { - std::mt19937 gen(seed + 3 + device); - std::uniform_real_distribution distr(-1, 1); - - for (size_t i = 0; i < local_batch_size * num_tables; i++) { - size_t table = i % num_tables; - size_t cat_id = table_offsets[table] + input[i + device * local_batch_size * num_tables]; - auto grad_ptr = gradients.data() + cat_id * embedding_vec_size; - - for (size_t j = 0; j < embedding_vec_size; j++) { - grad_ptr[j] += distr(gen); - } - } - } - - if (debug_print) { - HCTR_LOG(INFO, ROOT, "Generated embedding gradients"); - for (size_t i = 0; i < gradients.size(); i++) { - if (i % embedding_vec_size == 0) { - HCTR_PRINT(INFO, "\nRank %d cat %ld :: ", rank, i / embedding_vec_size); - } - HCTR_PRINT(INFO, "%8.5f ", static_cast(gradients[i])); - } - HCTR_PRINT(INFO, "\n"); - } - - embedding->backward(); - embedding->update_params(); - - // Check - // Check frequent embeddings - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - int global_id = resource_manager->get_local_gpu(device)->get_global_id(); - HCTR_LIB_THROW(cudaDeviceSynchronize()); - - std::vector h_frequent_categories; - download_tensor(h_frequent_categories, embedding->model_[device].frequent_categories, 0); - - float *h_frequent_embedding_vectors; - HCTR_LIB_THROW( - cudaMallocHost((void **)&h_frequent_embedding_vectors, embedding_vec_size * sizeof(float))); - - // Only checking the categories that the instance owns - size_t chunk = num_frequent / resource_manager->get_global_gpu_count(); - ASSERT_EQ(num_frequent % resource_manager->get_global_gpu_count(), 0); - - size_t start = device * chunk; - size_t end = (device + 1) * chunk; - for (size_t i = start; i < end; ++i) { - dtype cat_id = h_frequent_categories[i]; - HCTR_LIB_THROW(cudaMemcpy(h_frequent_embedding_vectors, - embedding->frequent_embeddings_single_node_[device] - .frequent_data_.frequent_embedding_vectors_.get_ptr() + - i * embedding_vec_size, - sizeof(float) * embedding_vec_size, cudaMemcpyDeviceToHost)); - for (size_t j = 0; j < embedding_vec_size; j++) { - ASSERT_NEAR(static_cast(h_frequent_embedding_vectors[j]), - static_cast(full_emb_table.data()[cat_id * embedding_vec_size + j]) - - static_cast(gradients.data()[cat_id * embedding_vec_size + j]) * lr, - epsilon) - << "Gradient (frequent) mismatch on instance " << global_id << " in category " << cat_id - << " dimension " << j << "/" << embedding_vec_size << std::endl; - } - } - HCTR_LIB_THROW(cudaFreeHost(h_frequent_embedding_vectors)); - } - - // Check infrequent embeddings - for (size_t device = 0; device < local_gpu_count; device++) { - CudaDeviceContext context(resource_manager->get_local_gpu(device)->get_device_id()); - int global_id = resource_manager->get_local_gpu(device)->get_global_id(); - - size_t num_infrequent = embedding->model_[device].h_infrequent_model_table_offsets[num_tables]; - - float *h_infrequent_embedding_vectors; - dtype *h_category_location; - 
HCTR_LIB_THROW(cudaMallocHost((void **)&h_infrequent_embedding_vectors, - num_infrequent * embedding_vec_size * sizeof(float))); - HCTR_LIB_THROW( - cudaMallocHost((void **)&h_category_location, total_categories * 2 * sizeof(dtype))); - - HCTR_LIB_THROW(cudaMemcpy(h_category_location, - embedding->model_[device].category_location.get_ptr(), - total_categories * 2 * sizeof(dtype), cudaMemcpyDeviceToHost)); - - // if (embedding_params_.) - // cudaMemcpy(h_infrequent_embedding_vectors, - // embedding->infrequent_embeddings_[device].infrequent_embedding_vectors_.get_ptr(), - // num_infrequent * embedding_vec_size * sizeof(float), cudaMemcpyDeviceToHost); - - if (embedding->embedding_params_.communication_type == CommunicationType::NVLink_SingleNode) { - cudaMemcpy(h_infrequent_embedding_vectors, - embedding->infrequent_embeddings_single_node_[device] - .infrequent_embedding_vectors_.get_ptr(), - num_infrequent * embedding_vec_size * sizeof(float), cudaMemcpyDeviceToHost); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink) { - cudaMemcpy(h_infrequent_embedding_vectors, - embedding->infrequent_embeddings_ib_nvlink_[device] - .infrequent_embedding_vectors_.get_ptr(), - num_infrequent * embedding_vec_size * sizeof(float), cudaMemcpyDeviceToHost); - } - if (embedding->embedding_params_.communication_type == CommunicationType::IB_NVLink_Hier) { - cudaMemcpy(h_infrequent_embedding_vectors, - embedding->infrequent_embeddings_ib_nvlink_hier_[device] - .infrequent_embedding_vectors_.get_ptr(), - num_infrequent * embedding_vec_size * sizeof(float), cudaMemcpyDeviceToHost); - } - - for (size_t cat_id = 0; cat_id < total_categories; ++cat_id) { - if (static_cast(h_category_location[2 * cat_id]) == global_id) { - auto local_cat_id = h_category_location[2 * cat_id + 1]; - - for (size_t j = 0; j < embedding_vec_size; j++) { - ASSERT_NEAR( - static_cast( - h_infrequent_embedding_vectors[local_cat_id * embedding_vec_size + j]), - static_cast(full_emb_table.data()[cat_id * embedding_vec_size + j]) - - static_cast(gradients.data()[cat_id * embedding_vec_size + j]) * lr, - epsilon) - << "Gradient (infrequent) mismatch on instance " << global_id << " in category " - << cat_id << " dimension " << j << "/" << embedding_vec_size << std::endl; - } - } - } - - HCTR_LIB_THROW(cudaFreeHost(h_infrequent_embedding_vectors)); - HCTR_LIB_THROW(cudaFreeHost(h_category_location)); - } -} - -template -void end_to_end(std::vector device_list, size_t num_tables, size_t total_categories, - size_t batch_size, size_t embedding_vec_size, double bw_ratio_a2a_over_ar, - size_t seed = 42, size_t num_evals = 1) { - const int num_procs{core23::MpiInitService::get().world_size()}; - size_t num_total_gpus = num_procs * device_list.size(); - - HybridEmbeddingConfig test_config = { - static_cast(num_procs), - num_total_gpus, - num_tables, - embedding_vec_size, - static_cast(total_categories), - {}, // irrelevant here - 1.0f, // irrelevant here - num_procs == 1 ? 
hybrid_embedding::CommunicationType::NVLink_SingleNode - : hybrid_embedding::CommunicationType::IB_NVLink, - }; - - auto generator = std::make_unique>(test_config, seed + 1); - end_to_end_impl(device_list, generator.get(), batch_size, embedding_vec_size, - bw_ratio_a2a_over_ar, seed, num_evals); -} - -template -void end_to_end(std::vector device_list, std::vector table_sizes, size_t batch_size, - size_t embedding_vec_size, double bw_ratio_a2a_over_ar, size_t seed = 42, - size_t num_evals = 1) { - const int num_procs{core23::MpiInitService::get().world_size()}; - size_t num_total_gpus = num_procs * device_list.size(); - - HybridEmbeddingConfig test_config = { - static_cast(num_procs), - num_total_gpus, - 0, // irrelevant here - embedding_vec_size, - {}, // irrelevant here - {}, // irrelevant here - 1.0f, // irrelevant here - num_procs == 1 ? hybrid_embedding::CommunicationType::NVLink_SingleNode - : hybrid_embedding::CommunicationType::IB_NVLink, - }; - - auto generator = - std::make_unique>(test_config, table_sizes, seed + 1); - end_to_end_impl(device_list, generator.get(), batch_size, embedding_vec_size, - bw_ratio_a2a_over_ar, seed, num_evals); -} - -class MPIEnvironment : public ::testing::Environment { - protected: - virtual void SetUp() { test::mpi_init(); } - virtual void TearDown() { test::mpi_finalize(); } - virtual ~MPIEnvironment(){}; -}; - -::testing::Environment *const mpi_env = ::testing::AddGlobalTestEnvironment(new MPIEnvironment); -// -TEST(hybrid_e2e, test1) { end_to_end({0}, 2, 16, 20, 2, 1.0e10, global_seed); } -TEST(hybrid_e2e, test2) { end_to_end({0}, 2, 16, 20, 2, 1.0e-10, global_seed++); } -TEST(hybrid_e2e, test3) { - end_to_end({0, 1}, 2, 128, 20, 2, 1.0e10, global_seed++); -} -TEST(hybrid_e2e, test4) { - end_to_end({0, 1}, 2, 128, 20, 2, 1.0e-10, global_seed++); -} -TEST(hybrid_e2e, test5) { end_to_end({0, 1}, 2, 128, 20, 2, 1.0, global_seed++); } -TEST(hybrid_e2e, test6) { end_to_end({0, 1}, 7, 128, 20, 2, 1.0, global_seed++); } -TEST(hybrid_e2e, test7) { - end_to_end({0, 1, 2}, 3, 192, 96, 5, 1.0, global_seed++); -} -TEST(hybrid_e2e, test8) { - end_to_end({0, 1, 2, 3}, 6, 651, 96, 128, 1.5, global_seed++); -} -TEST(hybrid_e2e, test9) { - end_to_end({0, 1, 2, 3}, 18, 6531, 256, 64, 1.7, global_seed++); -} -TEST(hybrid_e2e, test10) { - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, 18, 6531, 256, 64, 1.7, global_seed++); -} -TEST(hybrid_e2e, test11) { - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, 26, 16531, 512, 48, 1.33, global_seed++); -} -TEST(hybrid_e2e, test12) { - end_to_end({0, 1, 6, 7}, 13, 21345, 256, 32, 0.6, global_seed++); -} -TEST(hybrid_e2e, test13) { - std::vector slot_size_array{ - 39884406, 39043, 17289, 7420, 20263, 3, 7120, 1543, 63, - 38532951, 2953546, 403346, 10, 2208, 11938, 155, 4, 976, - 14, 39979771, 25641295, 39664984, 585935, 12972, 108, 36}; - // for (auto& s : slot_size_array) { - // s = s/16 + 1; - // } - - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, slot_size_array, 1024, 128, 1.9 / 1.3, - global_seed++); -} - -TEST(hybrid_e2e, test21) { end_to_end({0}, 2, 16, 20, 2, 1.0e10, global_seed++); } -TEST(hybrid_e2e, test22) { - end_to_end({0}, 2, 16, 20, 2, 1.0e-10, global_seed++); -} -TEST(hybrid_e2e, test23) { - end_to_end({0, 1}, 2, 128, 20, 2, 1.0e10, global_seed++); -} -TEST(hybrid_e2e, test24) { - end_to_end({0, 1}, 2, 128, 20, 2, 1.0e-10, global_seed++); -} -TEST(hybrid_e2e, test25) { - end_to_end({0, 1}, 2, 128, 20, 2, 1.0, global_seed++); -} -TEST(hybrid_e2e, test26) { - end_to_end({0, 1}, 7, 128, 20, 2, 1.0, global_seed++); -} -TEST(hybrid_e2e, test27) 
{ - end_to_end({0, 1, 2}, 3, 192, 96, 5, 1.0, global_seed++); -} -TEST(hybrid_e2e, test28) { - end_to_end({0, 1, 2, 3}, 6, 651, 96, 128, 1.5, global_seed++); -} -TEST(hybrid_e2e, test29) { - end_to_end({0, 1, 2, 3}, 18, 6531, 256, 64, 1.7, global_seed++); -} -TEST(hybrid_e2e, test30) { - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, 18, 6531, 256, 64, 1.7, global_seed++); -} -TEST(hybrid_e2e, test31) { - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, 26, 16531, 512, 48, 1.33, global_seed++); -} -TEST(hybrid_e2e, test32) { - end_to_end({0, 1, 6, 7}, 13, 21345, 256, 32, 0.6, global_seed++); -} -TEST(hybrid_e2e, test33) { - std::vector slot_size_array{ - 39884406, 39043, 17289, 7420, 20263, 3, 7120, 1543, 63, - 38532951, 2953546, 403346, 10, 2208, 11938, 155, 4, 976, - 14, 39979771, 25641295, 39664984, 585935, 12972, 108, 36}; - // for (auto& s : slot_size_array) { - // s = s/16 + 1; - // } - - end_to_end({0, 1, 2, 3, 4, 5, 6, 7}, slot_size_array, 1024, 128, 1.9 / 1.3, - global_seed++); -} diff --git a/test/utest/embedding/hybrid_embedding/forward_test.cpp b/test/utest/embedding/hybrid_embedding/forward_test.cpp deleted file mode 100644 index 87e3c665ee..0000000000 --- a/test/utest/embedding/hybrid_embedding/forward_test.cpp +++ /dev/null @@ -1,475 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -/****************** Frequent and infrequent forward network ******************/ - -template -class ForwardNetworkTest : public HybridEmbeddingUnitTest { - protected: - bool single_node; - - public: - ForwardNetworkTest(const HybridEmbeddingConfig config, size_t batch_size, bool single_node, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed), - single_node(single_node) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.generate_embedding_vectors(); - cpu_embedding.forward_network(); - if (!single_node) { - cpu_embedding.calculate_infrequent_model_indices(); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - cpu_embedding.forward_a2a_messages_hier(); - } else { - cpu_embedding.forward_a2a_messages(); - } - } - - /* Tensors for the interaction layer input and messages */ - std::shared_ptr> buff = GeneralBuffer2::create(); - std::vector> interaction_layer_input(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({local_batch_size * this->config.num_tables, this->config.embedding_vec_size}, - &interaction_layer_input[i]); - } - std::vector> received_messages(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({this->num_instances * local_batch_size * this->config.num_tables, - this->config.embedding_vec_size}, - &received_messages[i]); - } - buff->allocate(); - - /* In single-node case, make an array of the interaction mayer input pointers */ - std::vector interaction_layer_input_pointers_; - if (single_node) { - for (size_t i = 0; i < this->num_instances; i++) { - interaction_layer_input_pointers_.push_back(interaction_layer_input[i].get_ptr()); - } - } - - /* Frequent and infrequent forward_network */ - this->build_infrequent(); - this->build_frequent(); - for (size_t i = 0; i < this->num_instances; i++) { - upload_tensor(cpu_embedding.frequent_embedding_vectors[i], - this->get_frequent_embedding_data(i).frequent_embedding_vectors_, this->stream); - - if (single_node) { - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_single_node[i].infrequent_embedding_vectors_, - this->stream); - } - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink_hier[i].infrequent_embedding_vectors_, - this->stream); - } - if (this->config.comm_type == CommunicationType::IB_NVLink) { - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink[i].infrequent_embedding_vectors_, - this->stream); - } - } - - for (size_t i = 0; i < this->num_instances; i++) { - // this->frequent_embeddings[i].set_current_indices(&this->frequent_embedding_indices[i], - // this->stream); - this->get_frequent_embedding(i).set_current_indices(&this->frequent_embedding_indices[i]); - if (single_node) { - this->infrequent_embeddings_single_node[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - } - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - } - if 
(this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - } - - if (single_node) { - this->frequent_embeddings_single_node[i].indices_->calculate_cache_masks(this->stream); - this->frequent_embeddings_single_node[i].indices_->calculate_model_cache_indices( - 80, this->stream); - this->frequent_embeddings_single_node[i].forward_model(this->stream); - } - } - for (size_t i = 0; i < this->num_instances; i++) { - this->get_frequent_embedding(i).indices_->calculate_frequent_sample_indices(this->stream); - if (single_node) { - this->frequent_embeddings_single_node[i].forward_network( - interaction_layer_input[i].get_ptr(), this->stream); - } else { - this->frequent_embeddings_multi_node[i].forward_network( - interaction_layer_input[i].get_ptr(), this->stream); - } - if (single_node) { - this->infrequent_embeddings_single_node[i].indices_->calculate_model_indices(this->stream); - HCTR_LIB_THROW(cudaMemcpyAsync(this->infrequent_embeddings_single_node[i] - .interaction_layer_input_pointers_train_.get_ptr(), - interaction_layer_input_pointers_.data(), - this->num_instances * sizeof(emtype *), - cudaMemcpyHostToDevice, this->stream)); - this->infrequent_embeddings_single_node[i].forward_network_direct(true, this->stream); - } else { - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_network_indices( - 80, this->stream); - } else { - this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_network_indices( - 80, this->stream); - } - // this->infrequent_embeddings[i].indices_->calculate_network_indices(80, this->stream); - upload_tensor(cpu_embedding.forward_received_messages[i], received_messages[i], - this->stream); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].hier_forward_network( - received_messages[i].get_ptr(), interaction_layer_input[i].get_ptr(), this->stream); - } else { // ib_nvlink - this->infrequent_embeddings_ib_nvlink[i].forward_network( - received_messages[i].get_ptr(), interaction_layer_input[i].get_ptr(), this->stream); - } - } - } - - std::vector> h_interaction_layer_input(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - download_tensor(h_interaction_layer_input[i], interaction_layer_input[i], this->stream); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - ASSERT_TRUE(compare_array( - local_batch_size * this->config.num_tables * this->config.embedding_vec_size, - h_interaction_layer_input[i].data(), cpu_embedding.interaction_layer_input[i].data(), - 1e-2)); - } - } -}; - -/************** Frequent embedding forward model (single node) **************/ - -template -class FrequentForwardModelTest : public HybridEmbeddingUnitTest { - public: - FrequentForwardModelTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_frequent_network_cache_indices(); - cpu_embedding.generate_embedding_vectors(); - cpu_embedding.generate_gradients(); - cpu_embedding.frequent_reduce_gradients(); - - /* Tensors for the gradients */ - std::shared_ptr> buff = 
GeneralBuffer2::create(); - std::vector> gradients(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({local_batch_size * this->config.num_tables, this->config.embedding_vec_size}, - &gradients[i]); - } - buff->allocate(); - - /* Frequent update_model */ - this->build_frequent(); - std::vector frequent_partial_gradients_pointers(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - upload_tensor( - cpu_embedding.frequent_embedding_vectors[i], - this->frequent_embeddings_single_node[i].frequent_data_.frequent_embedding_vectors_, - this->stream); - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - frequent_partial_gradients_pointers[i] = - this->frequent_embeddings_single_node[i].frequent_data_.get_gradients().get_ptr(); - this->frequent_embeddings_single_node[i].set_current_indices( - &this->frequent_embedding_indices[i]); - } - for (size_t i = 0; i < this->num_instances; i++) { - this->frequent_embeddings_single_node[i].indices_->calculate_cache_masks(this->stream); - this->frequent_embeddings_single_node[i].indices_->calculate_network_cache_indices( - this->stream); - this->frequent_embeddings_single_node[i].indices_->calculate_model_cache_indices( - 80, this->stream); - this->frequent_embeddings_single_node[i].indices_->calculate_frequent_sample_indices( - this->stream); - this->frequent_embeddings_single_node[i].local_reduce(gradients[i].get_ptr(), this->stream); - } - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW(cudaMemcpyAsync( - this->frequent_embeddings_single_node[i].partial_gradients_pointers_.get_ptr(), - frequent_partial_gradients_pointers.data(), this->num_instances * sizeof(emtype *), - cudaMemcpyHostToDevice, this->stream)); - this->frequent_embeddings_single_node[i].update_model_direct(this->dev_lr, 1.f, this->stream); - } - - /* Set cache to zero for easy comparison with CPU version */ - if (sizeof(emtype) != sizeof(float)) { - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW(cudaMemsetAsync( - this->frequent_embeddings_single_node[i].get_embedding_vectors_cache().get_ptr(), 0, - this->config.num_frequent * this->config.embedding_vec_size * sizeof(emtype), - this->stream)); - } - } - - /* Frequent forward_model */ - for (size_t i = 0; i < this->num_instances; i++) { - this->frequent_embeddings_single_node[i].forward_model(this->stream); - } - - std::vector> updated_vectors_cache(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - download_tensor(updated_vectors_cache[i], - this->frequent_embeddings_single_node[i].get_embedding_vectors_cache(), - this->stream); - } - - /* Reference update_model */ - cpu_embedding.frequent_update_single_node(); - - /* Reference forward_model */ - cpu_embedding.frequent_forward_model(); - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - ASSERT_TRUE(compare_array(this->config.num_frequent * this->config.embedding_vec_size, - updated_vectors_cache[i].data(), - cpu_embedding.frequent_embedding_vectors_cache[i].data(), 5e-2)); - } - } -}; - -/**************************** Test instantiations ****************************/ - -static const HybridEmbeddingConfig config_uint32 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_int64 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_uint32_single_node = { - 1, 8, 10, 128, 1000, 128, 0.5f, 
CommunicationType::NVLink_SingleNode}; -static const HybridEmbeddingConfig config_int64_single_node = { - 1, 8, 10, 128, 1000, 128, 0.5f, CommunicationType::NVLink_SingleNode}; - -// Edge cases: no frequent, all frequent -static const HybridEmbeddingConfig config_no_freq = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_all_freq = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_no_freq_single_node = { - 1, 8, 10, 128, 1000, 0, 0.5f, CommunicationType::NVLink_SingleNode}; -static const HybridEmbeddingConfig config_all_freq_single_node = { - 1, 8, 10, 128, 1000, 1000, 0.5f, CommunicationType::NVLink_SingleNode}; - -// Hierarchical A2A -static const HybridEmbeddingConfig config_uint32_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_int64_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_no_freq_hier = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_all_freq_hier = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink_Hier}; - -/* hybrid_embedding_forward_network_test */ - -TEST(hybrid_embedding_forward_network_test, uint32_half_64) { - ForwardNetworkTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, int64_half_64) { - ForwardNetworkTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, uint32_half_2048) { - ForwardNetworkTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, int64_half_2048) { - ForwardNetworkTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, uint32_float_64) { - ForwardNetworkTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, int64_float_64) { - ForwardNetworkTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, uint32_float_2048) { - ForwardNetworkTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, int64_float_2048) { - ForwardNetworkTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, uint32_float_128_no_freq) { - ForwardNetworkTest(config_no_freq, 128, false).run(); -} - -TEST(hybrid_embedding_forward_network_test, uint32_float_128_all_freq) { - ForwardNetworkTest(config_all_freq, 128, false).run(); -} - -/* hybrid_embedding_forward_network_single_node_test */ - -TEST(hybrid_embedding_forward_network_single_node_test, uint32_half_64) { - ForwardNetworkTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, int64_half_64) { - ForwardNetworkTest(config_int64_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, uint32_half_2048) { - ForwardNetworkTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, int64_half_2048) { - ForwardNetworkTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, uint32_float_64) { - ForwardNetworkTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, int64_float_64) { - ForwardNetworkTest(config_int64_single_node, 64, true).run(); -} - 
-TEST(hybrid_embedding_forward_network_single_node_test, uint32_float_2048) { - ForwardNetworkTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, int64_float_2048) { - ForwardNetworkTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, uint32_float_128_no_freq) { - ForwardNetworkTest(config_no_freq_single_node, 128, true).run(); -} - -TEST(hybrid_embedding_forward_network_single_node_test, uint32_float_128_all_freq) { - ForwardNetworkTest(config_all_freq_single_node, 128, true).run(); -} - -/* hybrid_embedding_forward_network_hier_test */ - -TEST(hybrid_embedding_forward_network_hier_test, uint32_half_64) { - ForwardNetworkTest(config_uint32_hier, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, int64_half_64) { - ForwardNetworkTest(config_int64_hier, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, uint32_half_2048) { - ForwardNetworkTest(config_uint32_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, int64_half_2048) { - ForwardNetworkTest(config_int64_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, uint32_float_64) { - ForwardNetworkTest(config_uint32_hier, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, int64_float_64) { - ForwardNetworkTest(config_int64_hier, 64, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, uint32_float_2048) { - ForwardNetworkTest(config_uint32_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, int64_float_2048) { - ForwardNetworkTest(config_int64_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, uint32_float_128_no_freq) { - ForwardNetworkTest(config_no_freq_hier, 128, false).run(); -} - -TEST(hybrid_embedding_forward_network_hier_test, uint32_float_128_all_freq) { - ForwardNetworkTest(config_all_freq_hier, 128, false).run(); -} - -/* hybrid_embedding_frequent_forward_model_test */ - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_half_64) { - FrequentForwardModelTest(config_uint32_single_node, 64).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, int64_half_64) { - FrequentForwardModelTest(config_int64_single_node, 64).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_half_2048) { - FrequentForwardModelTest(config_uint32_single_node, 2048).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, int64_half_2048) { - FrequentForwardModelTest(config_int64_single_node, 2048).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_float_64) { - FrequentForwardModelTest(config_uint32_single_node, 64).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, int64_float_64) { - FrequentForwardModelTest(config_int64_single_node, 64).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_float_2048) { - FrequentForwardModelTest(config_uint32_single_node, 2048).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, int64_float_2048) { - FrequentForwardModelTest(config_int64_single_node, 2048).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_float_128_no_freq) { - FrequentForwardModelTest(config_no_freq_single_node, 128).run(); -} - -TEST(hybrid_embedding_frequent_forward_model_test, uint32_float_128_all_freq) { - FrequentForwardModelTest(config_all_freq_single_node, 128).run(); -} diff --git 
a/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.cpp b/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.cpp deleted file mode 100644 index 16e45de486..0000000000 --- a/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.cpp +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace HugeCTR::hybrid_embedding; - -namespace utils { -template -struct TypeConvertFunc; - -template <> -struct TypeConvertFunc<__half, float> { - static inline __half convert(float val) { return __float2half(val); } -}; - -template <> -struct TypeConvertFunc { - static inline float convert(__half val) { return __half2float(val); } -}; - -template <> -struct TypeConvertFunc { - static inline float convert(float val) { return val; } -}; - -template -static bool lesser_by_first(const std::pair& a, const std::pair& b) { - return (a.first < b.first); -} - -} // namespace utils - -template -void HybridEmbeddingCpu::calculate_infrequent_model_indices() { - model_indices.resize(num_instances); - model_indices_offsets.resize(num_instances); - - for (uint32_t model_id = 0; model_id < num_instances; model_id++) { - model_indices[model_id].resize(batch_size * num_tables); - model_indices_offsets[model_id].resize(num_instances + 1); - - // Prefix sum - uint32_t sum = 0; - for (uint32_t j = 0; j < batch_size; j++) { - if (j % local_batch_size == 0) { - model_indices_offsets[model_id][j / local_batch_size] = sum; - } - for (uint32_t i = 0; i < num_tables; i++) { - uint32_t idx = j * num_tables + i; - - dtype category = samples[idx]; - bool mask = category_location[2 * category] == model_id; - - sum += static_cast(mask); - - if (mask) model_indices[model_id][sum - 1] = idx; - } - } - // Total size stored at the end of the offsets vector - model_indices_offsets[model_id][num_instances] = sum; - model_indices[model_id].resize(sum); - } -} - -template -void HybridEmbeddingCpu::calculate_infrequent_network_indices() { - network_indices.resize(num_instances); - network_indices_offsets.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - network_indices[network_id].resize(local_batch_size * num_tables); - network_indices_offsets[network_id].resize(num_instances + 1); - - std::vector> network_sources_indices = - std::vector>(local_batch_size * num_tables); - - // Prefix sum only of this GPU's sub-batch - uint32_t sum = 0; - for (uint32_t j = local_batch_size * network_id; - j < std::min(batch_size, local_batch_size * (network_id + 1)); j++) { - for (uint32_t i = 0; i < num_tables; i++) { - uint32_t idx = j * num_tables + i; - dtype category = samples[idx]; - dtype model_id = category_location[2 * category]; - bool mask = model_id < num_instances; - sum += static_cast(mask); - uint32_t local_mlp_index = (j - local_batch_size * network_id) * num_tables + i; - if (mask) - network_sources_indices[sum 
- 1] = - std::make_pair(static_cast(model_id), local_mlp_index); - } - } - // Sort by source only, otherwise stable - std::stable_sort(network_sources_indices.begin(), network_sources_indices.begin() + sum, - utils::lesser_by_first); - - // Retrieve indices - for (uint32_t idx = 0; idx < sum; idx++) { - network_indices[network_id][idx] = network_sources_indices[idx].second; - } - // Compute offsets - for (uint32_t i = 0; i < num_instances; i++) { - network_indices_offsets[network_id][i] = - std::lower_bound(network_sources_indices.begin(), network_sources_indices.begin() + sum, - std::make_pair(i, (uint32_t)0), utils::lesser_by_first) - - network_sources_indices.begin(); - } - // Total size stored at the end of the offsets vector - network_indices_offsets[network_id][num_instances] = sum; - network_indices[network_id].resize(sum); - } -} - -template -void HybridEmbeddingCpu::calculate_frequent_sample_indices() { - frequent_sample_indices.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - frequent_sample_indices[network_id].resize(local_batch_size * num_tables); - - // Prefix sum only of this GPU's sub-batch - uint32_t sum = 0; - for (uint32_t j = local_batch_size * network_id; - j < std::min(batch_size, local_batch_size * (network_id + 1)); j++) { - for (uint32_t i = 0; i < num_tables; i++) { - uint32_t idx = j * num_tables + i; - - dtype category = samples[idx]; - dtype model_id = category_location[2 * category]; - bool mask = model_id == num_instances; - - sum += static_cast(mask); - - uint32_t local_mlp_index = (j - local_batch_size * network_id) * num_tables + i; - - if (mask) frequent_sample_indices[network_id][sum - 1] = local_mlp_index; - } - } - - frequent_sample_indices[network_id].resize(sum); - } -} - -template -void HybridEmbeddingCpu::calculate_frequent_model_cache_indices() { - const uint32_t num_frequent_per_model = num_frequent / num_instances; - - model_cache_indices.resize(num_instances); - model_cache_indices_offsets.resize(num_instances); - - for (uint32_t model_id = 0; model_id < num_instances; model_id++) { - model_cache_indices[model_id].resize(num_frequent); - model_cache_indices_offsets[model_id].resize(num_instances + 1); - - /* Compute the mask (for each network, frequent categories that belong to my model id) */ - std::vector network_frequent_mask = std::vector(num_frequent, false); - for (uint32_t i = 0; i < num_instances; i++) { - for (uint32_t j = 0; j < local_batch_size * num_tables; j++) { - uint32_t global_j = local_batch_size * num_tables * i + j; - - dtype category = samples[global_j]; - dtype frequent_index = category_location[2 * category + 1]; - - if (category_location[2 * category] == num_instances && - frequent_index / num_frequent_per_model == model_id) { - network_frequent_mask[i * num_frequent_per_model + - frequent_index % num_frequent_per_model] = true; - } - } - } - - /* Select categories according to the mask */ - uint32_t sum = 0; - for (uint32_t idx = 0; idx < num_frequent; idx++) { - bool mask = network_frequent_mask[idx]; - sum += static_cast(mask); - if (mask) model_cache_indices[model_id][sum - 1] = idx; - } - - /* Compute offsets */ - for (uint32_t i = 0; i < num_instances; i++) { - model_cache_indices_offsets[model_id][i] = - std::lower_bound(model_cache_indices[model_id].begin(), - model_cache_indices[model_id].begin() + sum, - i * num_frequent_per_model) - - model_cache_indices[model_id].begin(); - } - model_cache_indices_offsets[model_id][num_instances] = sum; - - /* Convert to 
buffer indices */ - for (uint32_t idx = 0; idx < sum; idx++) { - model_cache_indices[model_id][idx] = - model_cache_indices[model_id][idx] % num_frequent_per_model + - num_frequent_per_model * model_id; - } - - model_cache_indices[model_id].resize(sum); - } -} - -template -void HybridEmbeddingCpu::calculate_frequent_network_cache_indices() { - const uint32_t num_frequent_per_model = num_frequent / num_instances; - - if (network_cache_mask.size() == 0) calculate_frequent_network_cache_mask(); - - network_cache_indices.resize(num_instances); - network_cache_indices_offsets.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - network_cache_indices[network_id].resize(num_frequent); - network_cache_indices_offsets[network_id].resize(num_instances + 1); - - uint32_t sum = 0; - for (uint32_t i = 0; i < num_frequent; ++i) { - if (network_cache_mask[network_id][i]) { - network_cache_indices[network_id][sum] = i; - sum++; - } - } - - /* Compute offsets */ - for (uint32_t i = 0; i < num_instances; i++) { - network_cache_indices_offsets[network_id][i] = - std::lower_bound(network_cache_indices[network_id].begin(), - network_cache_indices[network_id].begin() + sum, - i * num_frequent_per_model) - - network_cache_indices[network_id].begin(); - } - network_cache_indices_offsets[network_id][num_instances] = sum; - - network_cache_indices[network_id].resize(sum); - } -} - -template -void HybridEmbeddingCpu::calculate_frequent_network_cache_mask() { - network_cache_mask.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - network_cache_mask[network_id].resize(num_frequent); - - for (uint32_t j = local_batch_size * network_id; - j < std::min(batch_size, local_batch_size * (network_id + 1)); j++) { - for (uint32_t i = 0; i < num_tables; i++) { - uint32_t idx = j * num_tables + i; - dtype category = samples[idx]; - if (category_location[2 * category] == num_instances) { - dtype frequent_index = category_location[2 * category + 1]; - network_cache_mask[network_id][frequent_index] = 1; - } - } - } - } -} - -template -void HybridEmbeddingCpu::generate_embedding_vectors() { - frequent_embedding_vectors.resize(num_instances); - infrequent_embedding_vectors.resize(num_instances); - - // Fixed seed for reproducibility - std::default_random_engine generator(1234UL); - std::uniform_real_distribution distribution(-10.0f, 10.0f); - - for (size_t i = 0; i < num_instances; i++) { - frequent_embedding_vectors[i].resize(num_frequent * embedding_vec_size); - infrequent_embedding_vectors[i].resize( - utils::ceildiv(num_categories - num_frequent, num_instances) * embedding_vec_size); - } - for (dtype category = 0; category < num_categories; category++) { - dtype model_id = category_location[2 * category]; - dtype location = category_location[2 * category + 1]; - if (model_id == num_instances) { - dtype freq_index = location; - HCTR_CHECK(freq_index < num_frequent); - for (uint32_t k = 0; k < embedding_vec_size; k++) { - float value = distribution(generator); - for (uint32_t i = 0; i < num_instances; i++) - frequent_embedding_vectors[i][freq_index * embedding_vec_size + k] = value; - } - } else { - for (uint32_t k = 0; k < embedding_vec_size; k++) - infrequent_embedding_vectors[model_id][location * embedding_vec_size + k] = - distribution(generator); - } - } -} - -template -void HybridEmbeddingCpu::generate_gradients() { - gradients.resize(num_instances); - - // Fixed seed for reproducibility - std::default_random_engine 
generator(1234UL); - std::uniform_real_distribution distribution(-10.0f, 10.0f); - - for (size_t i = 0; i < num_instances; i++) - gradients[i].resize(local_samples_size * embedding_vec_size); - for (size_t i = 0; i < num_instances; i++) { - for (size_t j = 0; j < local_samples_size; j++) { - for (size_t k = 0; k < embedding_vec_size; k++) { - gradients[i][j * embedding_vec_size + k] = - utils::TypeConvertFunc::convert(distribution(generator)); - } - } - } -} - -template -void HybridEmbeddingCpu::forward_a2a_messages() { - forward_sent_messages.resize(num_instances); - forward_received_messages.resize(num_instances); - - for (uint32_t i = 0; i < num_instances; i++) { - for (uint32_t j = 0; j < num_instances; j++) { - uint32_t k0 = model_indices_offsets[i][j]; - uint32_t k1 = model_indices_offsets[i][j + 1]; - for (uint32_t k = k0; k < k1; ++k) { - uint32_t model_indices_to_dst = model_indices[i][k]; - dtype category_to_dst = samples[model_indices_to_dst]; - uint32_t embedding_vec_indices = category_location[2 * category_to_dst + 1]; - for (uint32_t m = 0; m < embedding_vec_size; ++m) { - emtype value = utils::TypeConvertFunc::convert( - infrequent_embedding_vectors[i][embedding_vec_indices * embedding_vec_size + m]); - forward_received_messages[j].push_back(value); - forward_sent_messages[i].push_back(value); - } - } - } - } -} - -template -void HybridEmbeddingCpu::forward_a2a_messages_hier() { - forward_sent_messages.resize(num_instances); - forward_received_messages.resize(num_instances); - for (uint32_t i = 0; i < num_instances; i++) { - forward_received_messages[i].resize(num_instances * local_samples_size * embedding_vec_size); - forward_sent_messages[i].resize(num_instances * local_samples_size * embedding_vec_size); - } - - uint32_t instances_per_node = num_instances / num_nodes; - - for (uint32_t model_id = 0; model_id < num_instances; model_id++) { - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - uint32_t k0 = model_indices_offsets[model_id][network_id]; - uint32_t k1 = model_indices_offsets[model_id][network_id + 1]; - for (uint32_t k = k0; k < k1; ++k) { - uint32_t index = model_indices[model_id][k]; - dtype category = samples[index]; - uint32_t location = category_location[2 * category + 1]; - for (uint32_t m = 0; m < embedding_vec_size; ++m) { - emtype value = utils::TypeConvertFunc::convert( - infrequent_embedding_vectors[model_id][location * embedding_vec_size + m]); - forward_received_messages[network_id] - [(model_id * local_samples_size + k - k0) * embedding_vec_size + - m] = value; - forward_sent_messages - [model_id - model_id % instances_per_node + network_id % instances_per_node] - [((network_id - network_id % instances_per_node + model_id % instances_per_node) * - local_samples_size + - k - k0) * - embedding_vec_size + - m] = value; - } - } - } - } -} - -template -void HybridEmbeddingCpu::backward_a2a_messages() { - backward_sent_messages.resize(num_instances); - backward_received_messages.resize(num_instances); - - for (size_t i = 0; i < num_instances; i++) { - for (size_t j = 0; j < num_instances; j++) { - uint32_t k0 = model_indices_offsets[i][j]; - uint32_t k1 = model_indices_offsets[i][j + 1]; - for (size_t k = k0; k < k1; ++k) { - uint32_t index = model_indices[i][k]; - uint32_t local_index = index % local_samples_size; - for (uint32_t m = 0; m < embedding_vec_size; ++m) { - backward_sent_messages[j].push_back(gradients[j][local_index * embedding_vec_size + m]); - backward_received_messages[i].push_back( - gradients[j][local_index * 
embedding_vec_size + m]); - } - } - } - } -} - -template -void HybridEmbeddingCpu::backward_a2a_messages_hier() { - backward_sent_messages.resize(num_instances); - backward_received_messages.resize(num_instances); - for (uint32_t i = 0; i < num_instances; i++) { - backward_received_messages[i].resize(num_instances * local_samples_size * embedding_vec_size); - backward_sent_messages[i].resize(num_instances * local_samples_size * embedding_vec_size); - } - - uint32_t instances_per_node = num_instances / num_nodes; - - for (size_t model_id = 0; model_id < num_instances; model_id++) { - for (size_t network_id = 0; network_id < num_instances; network_id++) { - uint32_t k0 = model_indices_offsets[model_id][network_id]; - uint32_t k1 = model_indices_offsets[model_id][network_id + 1]; - for (size_t k = k0; k < k1; ++k) { - uint32_t index = model_indices[model_id][k]; - uint32_t local_index = index % local_samples_size; - for (uint32_t m = 0; m < embedding_vec_size; ++m) { - emtype value = gradients[network_id][local_index * embedding_vec_size + m]; - backward_received_messages[model_id][(network_id * local_samples_size + k - k0) * - embedding_vec_size + - m] = value; - backward_sent_messages - [network_id - network_id % instances_per_node + model_id % instances_per_node] - [((model_id - model_id % instances_per_node + network_id % instances_per_node) * - local_samples_size + - k - k0) * - embedding_vec_size + - m] = value; - } - } - } - } -} - -template -void HybridEmbeddingCpu::infrequent_update() { - for (size_t network_id = 0; network_id < num_instances; network_id++) { - for (size_t j = 0; j < local_samples_size; j++) { - dtype category = samples[network_id * local_samples_size + j]; - dtype model_id = category_location[2 * category]; - dtype location = category_location[2 * category + 1]; - if (model_id < num_instances) { - { - for (uint32_t k = 0; k < embedding_vec_size; k++) - infrequent_embedding_vectors[model_id][location * embedding_vec_size + k] -= - lr * utils::TypeConvertFunc::convert( - gradients[network_id][j * embedding_vec_size + k]); - } - } - } - } -} - -template -void HybridEmbeddingCpu::frequent_reduce_gradients() { - // Reduce to a float32 array - std::vector reduced_gradients_f32(num_frequent * embedding_vec_size, 0.0f); - for (size_t network_id = 0; network_id < num_instances; network_id++) { - for (size_t j = 0; j < local_samples_size; j++) { - dtype category = samples[network_id * local_samples_size + j]; - dtype model_id = category_location[2 * category]; - if (model_id == num_instances) { - dtype freq_index = category_location[2 * category + 1]; - HCTR_CHECK(freq_index < num_frequent); - for (uint32_t k = 0; k < embedding_vec_size; k++) { - reduced_gradients_f32[freq_index * embedding_vec_size + k] += - utils::TypeConvertFunc::convert( - gradients[network_id][j * embedding_vec_size + k]); - } - } - } - } - - // Copy to the emtype array - reduced_gradients.resize(num_frequent * embedding_vec_size); - for (size_t i = 0; i < num_frequent * embedding_vec_size; i++) { - reduced_gradients[i] = utils::TypeConvertFunc::convert(reduced_gradients_f32[i]); - } -} - -template -void HybridEmbeddingCpu::frequent_update() { - for (size_t model_id = 0; model_id < num_instances; model_id++) { - for (size_t i = 0; i < num_frequent * embedding_vec_size; i++) { - frequent_embedding_vectors[model_id][i] -= - lr * utils::TypeConvertFunc::convert(reduced_gradients[i]); - } - } -} - -template -void HybridEmbeddingCpu::frequent_update_single_node() { - uint32_t num_frequent_per_model = 
num_frequent / num_instances; - for (size_t network_id = 0; network_id < num_instances; network_id++) { - for (size_t j = 0; j < local_samples_size; j++) { - dtype category = samples[network_id * local_samples_size + j]; - dtype model_id = category_location[2 * category]; - if (model_id == num_instances) { - dtype freq_index = category_location[2 * category + 1]; - HCTR_CHECK(freq_index < num_frequent); - uint32_t frequent_model_id = freq_index / num_frequent_per_model; - for (uint32_t k = 0; k < embedding_vec_size; k++) - frequent_embedding_vectors[frequent_model_id][freq_index * embedding_vec_size + k] -= - lr * utils::TypeConvertFunc::convert( - gradients[network_id][j * embedding_vec_size + k]); - } - } - } -} - -template -void HybridEmbeddingCpu::forward_network() { - interaction_layer_input.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - interaction_layer_input[network_id].resize(local_samples_size * embedding_vec_size); - - for (uint32_t i = 0; i < local_samples_size; i++) { - dtype category = samples[local_samples_size * network_id + i]; - dtype model_id = category_location[2 * category]; - dtype location = category_location[2 * category + 1]; - if (model_id == num_instances) { - dtype freq_index = location; - HCTR_CHECK(freq_index < num_frequent); - for (uint32_t k = 0; k < embedding_vec_size; k++) { - interaction_layer_input[network_id][embedding_vec_size * i + k] = - frequent_embedding_vectors[network_id][embedding_vec_size * freq_index + k]; - } - } else { - for (uint32_t k = 0; k < embedding_vec_size; k++) { - interaction_layer_input[network_id][embedding_vec_size * i + k] = - infrequent_embedding_vectors[model_id][embedding_vec_size * location + k]; - } - } - } - } -} - -template -void HybridEmbeddingCpu::frequent_forward_model() { - frequent_embedding_vectors_cache.resize(num_instances); - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - if (sizeof(emtype) != sizeof(float)) { - // Separate buffers, initialize with zeros - frequent_embedding_vectors_cache[network_id].resize(num_frequent * embedding_vec_size, - (emtype)0.0); - } else { - // Same buffers, copy previous values - frequent_embedding_vectors_cache[network_id].resize(num_frequent * embedding_vec_size); - for (size_t i = 0; i < num_frequent * embedding_vec_size; i++) { - frequent_embedding_vectors_cache[network_id][i] = - utils::TypeConvertFunc::convert( - frequent_embedding_vectors[network_id][i]); - } - } - } - - for (uint32_t network_id = 0; network_id < num_instances; network_id++) { - for (uint32_t model_id = 0; model_id < num_instances; model_id++) { - uint32_t i0 = network_cache_indices_offsets[network_id][model_id]; - uint32_t i1 = network_cache_indices_offsets[network_id][model_id + 1]; - for (uint32_t i = i0; i < i1; i++) { - uint32_t freq_index = network_cache_indices[network_id][i]; - for (uint32_t k = 0; k < embedding_vec_size; k++) { - frequent_embedding_vectors_cache[network_id][embedding_vec_size * freq_index + k] = - utils::TypeConvertFunc::convert( - frequent_embedding_vectors[model_id][embedding_vec_size * freq_index + k]); - } - } - } - } -} - -template class HybridEmbeddingCpu; -template class HybridEmbeddingCpu; -template class HybridEmbeddingCpu; -template class HybridEmbeddingCpu; \ No newline at end of file diff --git a/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.hpp b/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.hpp deleted file mode 100644 index eaa091de82..0000000000 --- 
a/test/utest/embedding/hybrid_embedding/hybrid_embedding_cpu.hpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -using namespace HugeCTR; -using namespace HugeCTR::hybrid_embedding; - -namespace utils { -template -constexpr static inline IntType ceildiv(IntType a, IntType b) { - return (a + b - 1) / b; -} -} // namespace utils - -template -class HybridEmbeddingCpu { - public: - uint32_t num_instances; - uint32_t num_nodes; - uint32_t num_tables; - float lr; - - uint32_t batch_size; - uint32_t num_categories; - uint32_t num_frequent; - uint32_t embedding_vec_size; - const std::vector& category_location; - const std::vector& samples; - - uint32_t local_batch_size; - uint32_t local_samples_size; - - std::vector> model_indices; - std::vector> model_indices_offsets; - std::vector> network_indices; - std::vector> network_indices_offsets; - std::vector> frequent_sample_indices; - std::vector> model_cache_indices; - std::vector> model_cache_indices_offsets; - std::vector> network_cache_mask; - std::vector> network_cache_indices; - std::vector> network_cache_indices_offsets; - - std::vector> frequent_embedding_vectors; - std::vector> infrequent_embedding_vectors; - std::vector> gradients; - std::vector> frequent_embedding_vectors_cache; - - std::vector> forward_sent_messages; - std::vector> forward_received_messages; - - std::vector> backward_sent_messages; - std::vector> backward_received_messages; - - std::vector reduced_gradients; - - std::vector> interaction_layer_input; - - HybridEmbeddingCpu(const HybridEmbeddingConfig& config, size_t batch_size, - const std::vector& category_location, const std::vector& samples) - : num_instances(config.num_instances), - num_nodes(config.num_nodes), - num_tables(config.num_tables), - lr(config.lr), - batch_size(batch_size), - num_categories(config.num_categories), - num_frequent(config.num_frequent), - embedding_vec_size(config.embedding_vec_size), - category_location(category_location), - samples(samples), - local_batch_size(utils::ceildiv(batch_size, num_instances)), - local_samples_size(local_batch_size * num_tables) {} - - void calculate_infrequent_model_indices(); - void calculate_infrequent_network_indices(); - void calculate_frequent_sample_indices(); - void calculate_frequent_model_cache_indices(); - void calculate_frequent_network_cache_indices(); - void calculate_frequent_network_cache_mask(); - - void generate_embedding_vectors(); - void generate_gradients(); - - void forward_a2a_messages(); - void forward_a2a_messages_hier(); - void backward_a2a_messages(); - void backward_a2a_messages_hier(); - - void infrequent_update(); - void frequent_reduce_gradients(); - void frequent_update(); - void frequent_update_single_node(); - - void forward_network(); - void frequent_forward_model(); -}; diff --git a/test/utest/embedding/hybrid_embedding/indices_test.cpp b/test/utest/embedding/hybrid_embedding/indices_test.cpp 
deleted file mode 100644 index f3320d890b..0000000000 --- a/test/utest/embedding/hybrid_embedding/indices_test.cpp +++ /dev/null @@ -1,445 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -/******************** Infrequent embedding: model indices ********************/ - -template -class CalculateModelIndicesTest : public HybridEmbeddingUnitTest { - public: - CalculateModelIndicesTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_infrequent_model_indices(); - - /* Compute indices */ - this->build_infrequent(); - std::vector> h_model_indices(this->num_instances); - std::vector> h_model_indices_offsets(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - if (this->config.comm_type == CommunicationType::NVLink_SingleNode) { - this->infrequent_embeddings_single_node[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_single_node[i].indices_->calculate_model_indices(this->stream); - download_tensor(h_model_indices[i], - this->infrequent_embeddings_single_node[i].indices_->model_indices_, - this->stream); - download_tensor(h_model_indices_offsets[i], - this->infrequent_embeddings_single_node[i].indices_->model_indices_offsets_, - this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_model_indices(this->stream); - download_tensor(h_model_indices[i], - this->infrequent_embeddings_ib_nvlink[i].indices_->model_indices_, - this->stream); - download_tensor(h_model_indices_offsets[i], - this->infrequent_embeddings_ib_nvlink[i].indices_->model_indices_offsets_, - this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_model_indices( - this->stream); - download_tensor(h_model_indices[i], - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->model_indices_, - this->stream); - download_tensor( - h_model_indices_offsets[i], - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->model_indices_offsets_, - this->stream); - } - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - h_model_indices[i].resize(h_model_indices_offsets[i][this->num_instances]); - EXPECT_THAT(h_model_indices[i], ::testing::ElementsAreArray(cpu_embedding.model_indices[i])); - 
EXPECT_THAT(h_model_indices_offsets[i], - ::testing::ElementsAreArray(cpu_embedding.model_indices_offsets[i])); - } - } -}; - -/******************* Infrequent embedding: network indices *******************/ - -template -class CalculateNetworkIndicesTest : public HybridEmbeddingUnitTest { - public: - CalculateNetworkIndicesTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_infrequent_network_indices(); - - /* Compute indices */ - this->build_infrequent(); - std::vector> h_network_indices(this->num_instances); - std::vector> h_network_indices_offsets(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - if (this->config.comm_type == CommunicationType::NVLink_SingleNode) { - this->infrequent_embeddings_single_node[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_single_node[i].indices_->calculate_network_indices( - 80, this->stream); - download_tensor(h_network_indices[i], - this->infrequent_embeddings_single_node[i].indices_->network_indices_, - this->stream); - download_tensor( - h_network_indices_offsets[i], - this->infrequent_embeddings_single_node[i].indices_->network_indices_offsets_, - this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_network_indices(80, - this->stream); - download_tensor(h_network_indices[i], - this->infrequent_embeddings_ib_nvlink[i].indices_->network_indices_, - this->stream); - download_tensor(h_network_indices_offsets[i], - this->infrequent_embeddings_ib_nvlink[i].indices_->network_indices_offsets_, - this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_network_indices( - 80, this->stream); - download_tensor(h_network_indices[i], - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->network_indices_, - this->stream); - download_tensor( - h_network_indices_offsets[i], - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->network_indices_offsets_, - this->stream); - } - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - h_network_indices[i].resize(h_network_indices_offsets[i][this->num_instances]); - EXPECT_THAT(h_network_indices[i], - ::testing::ElementsAreArray(cpu_embedding.network_indices[i])); - EXPECT_THAT(h_network_indices_offsets[i], - ::testing::ElementsAreArray(cpu_embedding.network_indices_offsets[i])); - } - } -}; - -/**************** Frequent embedding: frequent sample indices ****************/ - -template -class CalculateFrequentSampleIndicesTest : public HybridEmbeddingUnitTest { - public: - CalculateFrequentSampleIndicesTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - 
cpu_embedding.calculate_frequent_sample_indices(); - /* Compute indices */ - this->build_frequent(); - std::vector> h_frequent_sample_indices(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - this->get_frequent_embedding(i).set_current_indices(&this->frequent_embedding_indices[i]); - this->get_frequent_embedding(i).indices_->calculate_frequent_sample_indices(this->stream); - download_tensor(h_frequent_sample_indices[i], - this->get_frequent_embedding(i).indices_->frequent_sample_indices_, - this->stream); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - uint32_t num_frequent_sample_indices; - HCTR_LIB_THROW(cudaMemcpyAsync( - &num_frequent_sample_indices, - this->get_frequent_embedding(i).indices_->d_num_frequent_sample_indices_.get_ptr(), - sizeof(uint32_t), cudaMemcpyDeviceToHost, this->stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(this->stream)); - h_frequent_sample_indices[i].resize(num_frequent_sample_indices); - EXPECT_THAT(h_frequent_sample_indices[i], - ::testing::ElementsAreArray(cpu_embedding.frequent_sample_indices[i])); - } - } -}; - -/****************** Frequent embedding: model cache indices ******************/ - -template -class CalculateModelCacheIndicesTest : public HybridEmbeddingUnitTest { - public: - CalculateModelCacheIndicesTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_frequent_model_cache_indices(); - - /* Compute indices */ - this->build_frequent(); - std::vector> h_model_cache_indices(this->num_instances); - std::vector> h_model_cache_indices_offsets(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - this->get_frequent_embedding(i).set_current_indices(&this->frequent_embedding_indices[i]); - this->get_frequent_embedding(i).indices_->calculate_cache_masks(this->stream); - this->get_frequent_embedding(i).indices_->calculate_model_cache_indices(80, this->stream); - download_tensor(h_model_cache_indices[i], - this->get_frequent_embedding(i).indices_->model_cache_indices_, this->stream); - download_tensor(h_model_cache_indices_offsets[i], - this->get_frequent_embedding(i).indices_->model_cache_indices_offsets_, - this->stream); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - h_model_cache_indices[i].resize(h_model_cache_indices_offsets[i][this->num_instances]); - EXPECT_THAT(h_model_cache_indices[i], - ::testing::ElementsAreArray(cpu_embedding.model_cache_indices[i])); - EXPECT_THAT(h_model_cache_indices_offsets[i], - ::testing::ElementsAreArray(cpu_embedding.model_cache_indices_offsets[i])); - } - } -}; - -/***************** Frequent embedding: network cache indices *****************/ - -template -class CalculateNetworkCacheIndicesTest : public HybridEmbeddingUnitTest { - public: - CalculateNetworkCacheIndicesTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_frequent_network_cache_mask(); - cpu_embedding.calculate_frequent_network_cache_indices(); - - /* Compute mask and indices */ - 
this->build_frequent(); - std::vector> h_network_cache_mask(this->num_instances); - std::vector> h_network_cache_indices(this->num_instances); - std::vector> h_network_cache_indices_offsets(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - this->get_frequent_embedding(i).set_current_indices(&this->frequent_embedding_indices[i]); - this->get_frequent_embedding(i).indices_->calculate_cache_masks(this->stream); - this->get_frequent_embedding(i).indices_->calculate_network_cache_indices(this->stream); - download_tensor(h_network_cache_indices[i], - this->get_frequent_embedding(i).indices_->network_cache_indices_, - this->stream); - download_tensor(h_network_cache_indices_offsets[i], - this->get_frequent_embedding(i).indices_->network_cache_indices_offsets_, - this->stream); - h_network_cache_mask[i].resize(this->config.num_frequent); - HCTR_LIB_THROW( - cudaMemcpyAsync(h_network_cache_mask[i].data(), - reinterpret_cast( - this->get_frequent_embedding(i).indices_->cache_masks_.get_ptr()), - this->config.num_frequent, cudaMemcpyDeviceToHost, this->stream)); - HCTR_LIB_THROW(cudaStreamSynchronize(this->stream)); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - h_network_cache_indices[i].resize( - cpu_embedding.network_cache_indices_offsets[i][this->num_instances]); - EXPECT_THAT(h_network_cache_indices[i], - ::testing::ElementsAreArray(cpu_embedding.network_cache_indices[i])); - EXPECT_THAT(h_network_cache_indices_offsets[i], - ::testing::ElementsAreArray(cpu_embedding.network_cache_indices_offsets[i])); - EXPECT_THAT(h_network_cache_mask[i], - ::testing::ElementsAreArray(cpu_embedding.network_cache_mask[i])); - } - } -}; - -/**************************** Test instantiations ****************************/ - -static const HybridEmbeddingConfig config_uint32 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_int64 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; - -// Edge cases: no frequent, all frequent -static const HybridEmbeddingConfig config_no_freq = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_all_freq = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink}; - -/* hybrid_embedding_model_indices_test */ - -TEST(hybrid_embedding_model_indices_test, uint32_float_64) { - CalculateModelIndicesTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_model_indices_test, int64_float_64) { - CalculateModelIndicesTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_model_indices_test, uint32_float_2048) { - CalculateModelIndicesTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_model_indices_test, int64_float_2048) { - CalculateModelIndicesTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_model_indices_test, uint32_float_128_no_freq) { - CalculateModelIndicesTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_model_indices_test, uint32_float_128_all_freq) { - CalculateModelIndicesTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_network_indices_test */ - -TEST(hybrid_embedding_network_indices_test, uint32_float_64) { - CalculateNetworkIndicesTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_network_indices_test, int64_float_64) { - CalculateNetworkIndicesTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_network_indices_test, uint32_float_2048) { - CalculateNetworkIndicesTest(config_uint32, 2048).run(); -} - 
-TEST(hybrid_embedding_network_indices_test, int64_float_2048) { - CalculateNetworkIndicesTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_network_indices_test, uint32_float_128_no_freq) { - CalculateNetworkIndicesTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_network_indices_test, uint32_float_128_all_freq) { - CalculateNetworkIndicesTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_frequent_sample_indices_test */ - -TEST(hybrid_embedding_frequent_sample_indices_test, uint32_float_64) { - CalculateFrequentSampleIndicesTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_frequent_sample_indices_test, int64_float_64) { - CalculateFrequentSampleIndicesTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_frequent_sample_indices_test, uint32_float_2048) { - CalculateFrequentSampleIndicesTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_frequent_sample_indices_test, int64_float_2048) { - CalculateFrequentSampleIndicesTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_frequent_sample_indices_test, uint32_float_128_no_freq) { - CalculateFrequentSampleIndicesTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_frequent_sample_indices_test, uint32_float_128_all_freq) { - CalculateFrequentSampleIndicesTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_model_cache_indices_test */ - -TEST(hybrid_embedding_model_cache_indices_test, uint32_float_64) { - CalculateModelCacheIndicesTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_model_cache_indices_test, int64_float_64) { - CalculateModelCacheIndicesTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_model_cache_indices_test, uint32_float_2048) { - CalculateModelCacheIndicesTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_model_cache_indices_test, int64_float_2048) { - CalculateModelCacheIndicesTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_model_cache_indices_test, uint32_float_128_no_freq) { - CalculateModelCacheIndicesTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_model_cache_indices_test, uint32_float_128_all_freq) { - CalculateModelCacheIndicesTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_network_cache_indices_test */ - -TEST(hybrid_embedding_network_cache_indices_test, uint32_float_64) { - CalculateNetworkCacheIndicesTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_network_cache_indices_test, int64_float_64) { - CalculateNetworkCacheIndicesTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_network_cache_indices_test, uint32_float_2048) { - CalculateNetworkCacheIndicesTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_network_cache_indices_test, int64_float_2048) { - CalculateNetworkCacheIndicesTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_network_cache_indices_test, uint32_float_128_no_freq) { - CalculateNetworkCacheIndicesTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_network_cache_indices_test, uint32_float_128_all_freq) { - CalculateNetworkCacheIndicesTest(config_all_freq, 128).run(); -} diff --git a/test/utest/embedding/hybrid_embedding/input_generator.cpp b/test/utest/embedding/hybrid_embedding/input_generator.cpp deleted file mode 100644 index 960068d2dd..0000000000 --- a/test/utest/embedding/hybrid_embedding/input_generator.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -std::vector HybridEmbeddingInputGenerator::generate_rand_table_sizes( - size_t num_tables, size_t vec_size, double max_mem) { - std::vector table_sizes(num_tables); - - // mem = sizeof(float) * vec_size * num_tables * max_table_size; - // => - const size_t max_table_size = (size_t)(max_mem / (sizeof(float) * vec_size * num_tables)); - const double max_exp = log(max_table_size) / log(10.); - - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - double r = rand() / (double)RAND_MAX; - // MATTHIAS. Remark: @alex & fan: There is a potential underflow here. - table_sizes[embedding] = std::max((size_t)2, (size_t)floor(pow(10., 1. + r * (max_exp - 1)))); - } - - return table_sizes; -} - -template -void HybridEmbeddingInputGenerator::generate_uniform_rand_table_sizes(size_t num_categories, - size_t num_tables) { - if (num_categories > 0) config_.num_categories = num_categories; - if (num_tables > 0) config_.num_tables = num_tables; - - std::set separators; - separators.insert(0); - separators.insert(config_.num_categories); - std::uniform_int_distribution dist(1, config_.num_categories - 1); - - for (size_t i = 0; i < config_.num_tables - 1; i++) { - size_t sep; - do { - sep = dist(gen_); - } while (separators.find(sep) != separators.end()); - separators.insert(sep); - } - - for (auto it = std::next(separators.begin()); it != separators.end(); it++) { - table_sizes_.push_back(*it - *(std::prev(it))); - } -} - -template -void HybridEmbeddingInputGenerator::create_probability_distribution() { - const size_t num_embeddings = table_sizes_.size(); - std::uniform_real_distribution distr(0.3, 0.8); - - embedding_prob_distribution_.resize(num_embeddings); - for (size_t embedding = 0; embedding < num_embeddings; ++embedding) { - embedding_prob_distribution_[embedding].resize(table_sizes_[embedding]); - } - - for (size_t embedding = 0; embedding < num_embeddings; ++embedding) { - const size_t embedding_size = table_sizes_[embedding]; - std::vector embedding_shuffle_arg(table_sizes_[embedding]); - std::iota(embedding_shuffle_arg.begin(), embedding_shuffle_arg.end(), (size_t)0); - std::shuffle(embedding_shuffle_arg.begin(), embedding_shuffle_arg.end(), gen_); - embedding_shuffle_args.push_back(embedding_shuffle_arg); - if (embedding_size < 30) { - // choose uniform distribution - for (size_t c_e = 0; c_e < table_sizes_[embedding]; ++c_e) - embedding_prob_distribution_[embedding][c_e] = 1. / (double)embedding_size; - } else { - // MATTHIAS. Remark: @alex & fan: There is a potential underflow here. - size_t size_first = std::max((size_t)1, size_t(4. 
* log10((double)embedding_size))); - size_first = std::min((size_t)embedding_size, (size_t)size_first); - double acc_prob_first = distr(gen_); - // a * (1 - r^n) / (1 - r) = acc_p - // Let a * r^{n} = 0.02 * acc_prob_first - // a - 0.02 * acc_prob_first = acc_prob_first * (1-r) - - // (1 + 0.02) * acc_prob_first - a = r * acc_prob_first - double r = 0.9; - double a = acc_prob_first * (1. - r) / (1. - pow(r, (double)size_first)); - for (size_t c_e = 0; c_e < size_first; ++c_e) - embedding_prob_distribution_[embedding][c_e] = a * pow(r, (double)c_e); - - // the following is approximate, will be normalized.. - // - // now apply power law to the remaining elements: - // - // p = a * n^{-2} - // => 1 - acc_prob_first = a / N - a / n - // => a ( 1/n - 1/N ) = 1 - acc_prob_first - // => a (N-n) / (nN) = 1 - acc_prob_first - // => a = n * N / (N-n) * (1 - acc_prob_first) - - a = size_first * embedding_size / (embedding_size - size_first) * (1. - acc_prob_first); - for (size_t c_e = size_first; c_e < table_sizes_[embedding]; ++c_e) - embedding_prob_distribution_[embedding][c_e] = a * pow((double)c_e, -2.); - - // normalize probability distribution - // calculate norm - double sum_p = 0.; - for (size_t c_e = 0; c_e < table_sizes_[embedding]; ++c_e) - sum_p += embedding_prob_distribution_[embedding][c_e]; - // correct - for (size_t c_e = 0; c_e < table_sizes_[embedding]; ++c_e) - embedding_prob_distribution_[embedding][c_e] /= sum_p; - } - } -} - -template -void HybridEmbeddingInputGenerator::generate_categories(dtype* data, size_t batch_size, - bool normalized) { - const size_t num_embeddings = table_sizes_.size(); - std::uniform_real_distribution distr(0, 1); - std::vector embedding_offsets; - HugeCTR::hybrid_embedding::EmbeddingTableFunctors::get_embedding_offsets(embedding_offsets, - table_sizes_); - // create samples - for (size_t embedding = 0; embedding < num_embeddings; ++embedding) { - std::vector& embedding_shuffle_arg = embedding_shuffle_args[embedding]; - std::vector& f_prob_e = embedding_prob_distribution_[embedding]; - std::vector acc_prob(f_prob_e.size() + 1, 0.0); - double acc = 0.; - for (size_t c_e = 0; c_e < f_prob_e.size(); ++c_e) { - acc_prob[c_e] = acc; - acc += f_prob_e[c_e]; - } - - acc_prob.front() = -42.0; - acc_prob.back() = 42.0; - - for (size_t sample = 0; sample < batch_size; ++sample) { - double r = distr(gen_); - size_t category = - (size_t)(std::lower_bound(acc_prob.begin(), acc_prob.end(), r) - acc_prob.begin()) - 1; - - // category index within table - size_t category_shuffled = embedding_shuffle_arg[category]; - data[sample * num_embeddings + embedding] = category_shuffled; - - if (normalized) { - data[sample * num_embeddings + embedding] += (size_t)embedding_offsets[embedding]; - } - } - } -} - -template -void HybridEmbeddingInputGenerator::generate_category_location() { - std::uniform_int_distribution distr(0, config_.num_instances - 1); - - std::vector all_probabilities; - for (auto& v : embedding_prob_distribution_) { - all_probabilities.insert(all_probabilities.end(), v.begin(), v.end()); - } - std::vector original_index(config_.num_categories); - std::iota(original_index.begin(), original_index.end(), (dtype)0); - - std::sort(original_index.begin(), original_index.end(), [&all_probabilities](dtype i1, dtype i2) { - return all_probabilities[i1] < all_probabilities[i2]; - }); - - // First num_frequent categories are frequent - category_location_.resize(2 * config_.num_categories, config_.num_instances); - for (dtype i = 0; i < config_.num_frequent; i++) { - 
dtype cat = original_index[i]; - category_location_[2 * cat + 1] = i; - } - - dtype max_size_per_instance = - (config_.num_categories - config_.num_frequent + config_.num_instances - 1) / - config_.num_instances; - std::vector sizes_per_instance(config_.num_instances, 0); - for (dtype i = config_.num_frequent; i < config_.num_categories; i++) { - dtype cat = original_index[i]; - dtype instance; - do { - instance = distr(gen_); - // If the selected instance is already full, pick another one - } while (sizes_per_instance[instance] == max_size_per_instance); - category_location_[2 * cat + 0] = instance; - category_location_[2 * cat + 1] = sizes_per_instance[instance]++; - } -} - -template -HybridEmbeddingInputGenerator::HybridEmbeddingInputGenerator( - HybridEmbeddingConfig config, size_t seed) - : config_(config), seed_(seed), gen_(seed) { - generate_uniform_rand_table_sizes(config_.num_categories, config_.num_tables); - create_probability_distribution(); -} - -template -HybridEmbeddingInputGenerator::HybridEmbeddingInputGenerator( - HybridEmbeddingConfig config, const std::vector& table_sizes, size_t seed) - : config_(config), table_sizes_(table_sizes), seed_(seed), gen_(seed) { - config_.num_tables = table_sizes.size(); - config_.num_categories = std::accumulate(table_sizes.begin(), table_sizes.end(), 0); - create_probability_distribution(); -} - -template -std::vector HybridEmbeddingInputGenerator::generate_categorical_input( - size_t batch_size, size_t num_tables) { - table_sizes_ = generate_rand_table_sizes(num_tables); - config_.num_tables = table_sizes_.size(); - config_.num_categories = std::accumulate(table_sizes_.begin(), table_sizes_.end(), 0); - create_probability_distribution(); - - std::vector data(batch_size * config_.num_tables); - generate_categories(data.data(), batch_size, false); - return data; -} - -template -std::vector HybridEmbeddingInputGenerator::generate_flattened_categorical_input( - size_t batch_size, size_t num_tables) { - table_sizes_ = generate_rand_table_sizes(num_tables); - config_.num_tables = table_sizes_.size(); - config_.num_categories = std::accumulate(table_sizes_.begin(), table_sizes_.end(), 0); - create_probability_distribution(); - - std::vector data(batch_size * config_.num_tables); - generate_categories(data.data(), batch_size, true); - return data; -} - -template -std::vector HybridEmbeddingInputGenerator::generate_categorical_input( - size_t batch_size) { - std::vector data(batch_size * config_.num_tables); - generate_categories(data.data(), batch_size, false); - return data; -} - -template -std::vector HybridEmbeddingInputGenerator::generate_flattened_categorical_input( - size_t batch_size) { - std::vector data(batch_size * config_.num_tables); - generate_categories(data.data(), batch_size, true); - return data; -} - -template -void HybridEmbeddingInputGenerator::generate_categorical_input(dtype* batch, - size_t batch_size) { - generate_categories(batch, batch_size, false); -} - -template -void HybridEmbeddingInputGenerator::generate_flattened_categorical_input(dtype* batch, - size_t batch_size) { - generate_categories(batch, batch_size, true); -} - -template -std::vector& HybridEmbeddingInputGenerator::get_category_location() { - return category_location_; -} - -template -std::vector& HybridEmbeddingInputGenerator::get_table_sizes() { - return table_sizes_; -} - -template class HybridEmbeddingInputGenerator; -template class HybridEmbeddingInputGenerator; - -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file 
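For reference, the category sampling implemented by the deleted input generator above reduces to inverse-CDF sampling over a per-table probability vector (a short skewed head followed by a power-law tail). The standalone C++ sketch below illustrates only that sampling step; sample_category is a hypothetical name, not part of HugeCTR, and the probability vector is assumed to be already built and normalized as in create_probability_distribution.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <random>
#include <vector>

// Build the cumulative distribution once, then map a uniform draw onto a
// category index with std::lower_bound, the same idea as the acc_prob /
// lower_bound logic in the deleted generate_categories above.
std::size_t sample_category(const std::vector<double>& probabilities, std::mt19937& gen) {
  std::vector<double> cdf(probabilities.size());
  std::partial_sum(probabilities.begin(), probabilities.end(), cdf.begin());
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  const double r = uniform(gen) * cdf.back();  // cdf.back() is ~1.0 for a normalized distribution
  return static_cast<std::size_t>(std::lower_bound(cdf.begin(), cdf.end(), r) - cdf.begin());
}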
diff --git a/test/utest/embedding/hybrid_embedding/input_generator.hpp b/test/utest/embedding/hybrid_embedding/input_generator.hpp deleted file mode 100644 index 394b1775d2..0000000000 --- a/test/utest/embedding/hybrid_embedding/input_generator.hpp +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -struct HybridEmbeddingConfig { - size_t num_nodes; - size_t num_instances; - size_t num_tables; - size_t embedding_vec_size; - dtype num_categories; - dtype num_frequent; - float lr; - CommunicationType comm_type; -}; - -template -class HybridEmbeddingInputGenerator { - public: - HybridEmbeddingInputGenerator(size_t seed) : gen_(seed) {} - HybridEmbeddingInputGenerator(HybridEmbeddingConfig config, size_t seed); - HybridEmbeddingInputGenerator(HybridEmbeddingConfig config, - const std::vector &table_sizes, size_t seed); - // Multiple calls return different data - - // By default the data is provided in the 'raw' format: each data point is - // a category which is indexed according to the table it belongs to. - // Each sample contains elements and its - // value lies within the integer range [0, number of categories in category feature) - - /// @param batch_size number of samples to return - /// @param num_categories required sum of table sizes - /// @param num_tables required number of tables - /// @param flatten_input indicator whether generated categories have an associated unique value - std::vector generate_categorical_input(size_t batch_size, size_t num_tables); - // _flattened means that the category indices are unique - // (i.e., table offsets are added to the raw data) - std::vector generate_flattened_categorical_input(size_t batch_size, size_t num_tables); - - // regenerate data with precalculated table_sizes_ - std::vector generate_categorical_input(size_t batch_size); - std::vector generate_flattened_categorical_input(size_t batch_size); - - void generate_categorical_input(dtype *batch, size_t batch_size); - void generate_flattened_categorical_input(dtype *batch, size_t batch_size); - void generate_category_location(); - - // Multiple calls return the same data - std::vector &get_category_location(); - std::vector &get_table_sizes(); - - private: - HybridEmbeddingConfig config_; - std::vector> embedding_prob_distribution_; - std::vector table_sizes_; - size_t seed_; - std::mt19937 gen_; - - std::vector category_location_; - std::vector> embedding_shuffle_args; - - void generate_uniform_rand_table_sizes(size_t num_categories = 0, size_t num_tables = 0); - static std::vector generate_rand_table_sizes(size_t num_tables, - size_t embedding_vec_size = 128, - double max_mem = 8.e9); - void create_probability_distribution(); - void generate_categories(dtype *data, size_t batch_size, bool normalized); -}; - -} // namespace hybrid_embedding - -} // namespace HugeCTR \ No newline at end of file 
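The deleted header above distinguishes two sample layouts: "raw" categories indexed within their own table, and "flattened" categories made globally unique by adding per-table offsets. A minimal sketch of that flattening step follows; flatten_categories is a hypothetical helper rather than a HugeCTR API, and it assumes the same sample-major layout the generator used (one entry per table, repeated per sample).

#include <cstddef>
#include <vector>

// Add the exclusive prefix sum of the table sizes to every raw category so
// that indices coming from different tables no longer collide.
void flatten_categories(std::vector<std::size_t>& samples,
                        const std::vector<std::size_t>& table_sizes) {
  std::vector<std::size_t> offsets(table_sizes.size(), 0);
  for (std::size_t t = 1; t < table_sizes.size(); ++t) {
    offsets[t] = offsets[t - 1] + table_sizes[t - 1];
  }
  const std::size_t num_tables = table_sizes.size();
  for (std::size_t i = 0; i < samples.size(); ++i) {
    samples[i] += offsets[i % num_tables];  // i % num_tables is the table (column) index
  }
}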
diff --git a/test/utest/embedding/hybrid_embedding/messages_test.cpp b/test/utest/embedding/hybrid_embedding/messages_test.cpp deleted file mode 100644 index 6f6973a585..0000000000 --- a/test/utest/embedding/hybrid_embedding/messages_test.cpp +++ /dev/null @@ -1,466 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -/**************** Infrequent embedding: forward sent message ****************/ - -template -class ForwardSentMessageTest : public HybridEmbeddingUnitTest { - public: - ForwardSentMessageTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - uint32_t instances_per_node = this->num_instances / this->config.num_nodes; - - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_infrequent_model_indices(); - cpu_embedding.generate_embedding_vectors(); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - cpu_embedding.forward_a2a_messages_hier(); - } else { - cpu_embedding.forward_a2a_messages(); - } - - /* Tensors and vectors for the generated messages */ - std::shared_ptr> buff = GeneralBuffer2::create(); - std::vector> sent_messages(this->num_instances); - std::vector> message_buffer_pointers(this->num_instances); - std::vector> h_sent_messages(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({this->num_instances * local_batch_size * this->config.num_tables, - this->config.embedding_vec_size}, - &sent_messages[i]); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - buff->reserve({instances_per_node, 1}, &message_buffer_pointers[i]); - } - } - buff->allocate(); - - this->build_infrequent(); - - std::vector> h_message_buffer_pointers(this->config.num_nodes); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - /* Construct the arrays of pointers for each node */ - for (size_t i = 0; i < this->config.num_nodes; i++) { - h_message_buffer_pointers[i].resize(instances_per_node); - } - for (size_t i = 0; i < this->num_instances; i++) { - h_message_buffer_pointers[i / instances_per_node][i % instances_per_node] = - sent_messages[i].get_ptr(); - } - - /* Copy the arrays to device */ - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW(cudaMemcpyAsync(message_buffer_pointers[i].get_ptr(), - h_message_buffer_pointers[i / instances_per_node].data(), - instances_per_node * sizeof(emtype*), cudaMemcpyHostToDevice, - this->stream)); - } - - /* Fill buffers with zeroes */ - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW( - cudaMemsetAsync(sent_messages[i].get_ptr(), 0, - this->num_instances * local_batch_size * 
this->config.num_tables * - this->config.embedding_vec_size * sizeof(emtype), - this->stream)); - } - } - - /* Infrequent forward_model */ - for (size_t i = 0; i < this->num_instances; i++) { - if (this->config.comm_type == CommunicationType::NVLink_SingleNode) { - this->infrequent_embeddings_single_node[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_single_node[i].infrequent_embedding_vectors_, - this->stream); - this->infrequent_embeddings_single_node[i].indices_->calculate_model_indices(this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink[i].infrequent_embedding_vectors_, - this->stream); - this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_model_indices(this->stream); - - this->infrequent_embeddings_ib_nvlink[i].forward_model(sent_messages[i].get_ptr(), - this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink_hier[i].infrequent_embedding_vectors_, - this->stream); - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_model_indices( - this->stream); - - this->infrequent_embeddings_ib_nvlink_hier[i].fused_intra_forward_model( - message_buffer_pointers[i].get_ptr(), this->stream); - } - } - - for (size_t i = 0; i < this->num_instances; i++) { - download_tensor(h_sent_messages[i], sent_messages[i], this->stream); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - uint32_t message_size = this->config.comm_type == CommunicationType::IB_NVLink_Hier - ? 
(this->num_instances * local_batch_size * - this->config.num_tables * this->config.embedding_vec_size) - : (this->config.embedding_vec_size * - cpu_embedding.model_indices_offsets[i][this->num_instances]); - ASSERT_TRUE(compare_array(message_size, h_sent_messages[i].data(), - cpu_embedding.forward_sent_messages[i].data(), 1e-2)); - } - } -}; - -/**************** Infrequent embedding: backward sent message ****************/ - -template -class BackwardSentMessageTest : public HybridEmbeddingUnitTest { - public: - BackwardSentMessageTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - uint32_t instances_per_node = this->num_instances / this->config.num_nodes; - - /* Compute expected results on host */ - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_infrequent_model_indices(); - cpu_embedding.calculate_infrequent_network_indices(); - cpu_embedding.generate_gradients(); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - cpu_embedding.backward_a2a_messages_hier(); - } else { - cpu_embedding.backward_a2a_messages(); - } - - /* Tensors and vectors for the gradients and generated messages */ - std::shared_ptr> buff = GeneralBuffer2::create(); - std::vector> sent_messages(this->num_instances); - std::vector> message_buffer_pointers(this->num_instances); - std::vector> h_sent_messages(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({this->num_instances * local_batch_size * this->config.num_tables, - this->config.embedding_vec_size}, - &sent_messages[i]); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - buff->reserve({instances_per_node, 1}, &message_buffer_pointers[i]); - } - } - std::vector> gradients(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({local_batch_size * this->config.num_tables, this->config.embedding_vec_size}, - &gradients[i]); - } - buff->allocate(); - - this->build_infrequent(); - - std::vector> h_message_buffer_pointers(this->config.num_nodes); - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - /* Construct the arrays of pointers for each node */ - for (size_t i = 0; i < this->config.num_nodes; i++) { - h_message_buffer_pointers[i].resize(instances_per_node); - } - for (size_t i = 0; i < this->num_instances; i++) { - h_message_buffer_pointers[i / instances_per_node][i % instances_per_node] = - sent_messages[i].get_ptr(); - } - - /* Copy the arrays to device */ - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW(cudaMemcpyAsync(message_buffer_pointers[i].get_ptr(), - h_message_buffer_pointers[i / instances_per_node].data(), - instances_per_node * sizeof(emtype*), cudaMemcpyHostToDevice, - this->stream)); - } - - /* Fill buffers with zeroes */ - for (size_t i = 0; i < this->num_instances; i++) { - HCTR_LIB_THROW( - cudaMemsetAsync(sent_messages[i].get_ptr(), 0, - this->num_instances * local_batch_size * this->config.num_tables * - this->config.embedding_vec_size * sizeof(emtype), - this->stream)); - } - } - - /* Infrequent update_network */ - for (size_t i = 0; i < this->num_instances; i++) { - if (this->config.comm_type == CommunicationType::NVLink_SingleNode) { - this->infrequent_embeddings_single_node[i].set_current_indices( - 
&this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - this->infrequent_embeddings_single_node[i].indices_->calculate_network_indices( - 80, this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_network_indices(80, - this->stream); - - this->infrequent_embeddings_ib_nvlink[i].update_network( - gradients[i].get_ptr(), sent_messages[i].get_ptr(), this->stream); - } - - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_network_indices( - 80, this->stream); - - this->infrequent_embeddings_ib_nvlink_hier[i].fused_intra_update_network( - gradients[i].get_ptr(), message_buffer_pointers[i].get_ptr(), this->stream); - } - } - - for (size_t i = 0; i < this->num_instances; i++) { - download_tensor(h_sent_messages[i], sent_messages[i], this->stream); - } - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - uint32_t message_size = this->config.comm_type == CommunicationType::IB_NVLink_Hier - ? (this->num_instances * local_batch_size * - this->config.num_tables * this->config.embedding_vec_size) - : (this->config.embedding_vec_size * - cpu_embedding.network_indices_offsets[i][this->num_instances]); - ASSERT_TRUE(compare_array(message_size, h_sent_messages[i].data(), - cpu_embedding.backward_sent_messages[i].data(), 1e-2)); - } - } -}; - -/**************************** Test instantiations ****************************/ - -static const HybridEmbeddingConfig config_uint32 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_int64 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; - -// Edge cases: no frequent, all frequent -static const HybridEmbeddingConfig config_no_freq = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_all_freq = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink}; - -// Hierarchical A2A -static const HybridEmbeddingConfig config_uint32_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_int64_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_no_freq_hier = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_all_freq_hier = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink_Hier}; - -/* hybrid_embedding_forward_sent_message_test */ - -TEST(hybrid_embedding_forward_sent_message_test, uint32_half_64) { - ForwardSentMessageTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, int64_half_64) { - ForwardSentMessageTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, uint32_half_2048) { - ForwardSentMessageTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, int64_half_2048) { - 
ForwardSentMessageTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, uint32_float_64) { - ForwardSentMessageTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, int64_float_64) { - ForwardSentMessageTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, uint32_float_2048) { - ForwardSentMessageTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, int64_float_2048) { - ForwardSentMessageTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, uint32_float_128_no_freq) { - ForwardSentMessageTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_forward_sent_message_test, uint32_float_128_all_freq) { - ForwardSentMessageTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_forward_sent_message_hier_test */ - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_half_64) { - ForwardSentMessageTest(config_uint32_hier, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, int64_half_64) { - ForwardSentMessageTest(config_int64_hier, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_half_2048) { - ForwardSentMessageTest(config_uint32_hier, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, int64_half_2048) { - ForwardSentMessageTest(config_int64_hier, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_float_64) { - ForwardSentMessageTest(config_uint32_hier, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, int64_float_64) { - ForwardSentMessageTest(config_int64_hier, 64).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_float_2048) { - ForwardSentMessageTest(config_uint32_hier, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, int64_float_2048) { - ForwardSentMessageTest(config_int64_hier, 2048).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_float_128_no_freq) { - ForwardSentMessageTest(config_no_freq_hier, 128).run(); -} - -TEST(hybrid_embedding_forward_sent_message_hier_test, uint32_float_128_all_freq) { - ForwardSentMessageTest(config_all_freq_hier, 128).run(); -} - -/* hybrid_embedding_backward_sent_message_test */ - -TEST(hybrid_embedding_backward_sent_message_test, uint32_half_64) { - BackwardSentMessageTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, int64_half_64) { - BackwardSentMessageTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, uint32_half_2048) { - BackwardSentMessageTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, int64_half_2048) { - BackwardSentMessageTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, uint32_float_64) { - BackwardSentMessageTest(config_uint32, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, int64_float_64) { - BackwardSentMessageTest(config_int64, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, uint32_float_2048) { - BackwardSentMessageTest(config_uint32, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, int64_float_2048) { - BackwardSentMessageTest(config_int64, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, uint32_float_128_no_freq) { - BackwardSentMessageTest(config_no_freq, 128).run(); -} - -TEST(hybrid_embedding_backward_sent_message_test, 
uint32_float_128_all_freq) { - BackwardSentMessageTest(config_all_freq, 128).run(); -} - -/* hybrid_embedding_backward_sent_message_hier_test */ - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_half_64) { - BackwardSentMessageTest(config_uint32_hier, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, int64_half_64) { - BackwardSentMessageTest(config_int64_hier, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_half_2048) { - BackwardSentMessageTest(config_uint32_hier, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, int64_half_2048) { - BackwardSentMessageTest(config_int64_hier, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_float_64) { - BackwardSentMessageTest(config_uint32_hier, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, int64_float_64) { - BackwardSentMessageTest(config_int64_hier, 64).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_float_2048) { - BackwardSentMessageTest(config_uint32_hier, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, int64_float_2048) { - BackwardSentMessageTest(config_int64_hier, 2048).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_float_128_no_freq) { - BackwardSentMessageTest(config_no_freq_hier, 128).run(); -} - -TEST(hybrid_embedding_backward_sent_message_hier_test, uint32_float_128_all_freq) { - BackwardSentMessageTest(config_all_freq_hier, 128).run(); -} diff --git a/test/utest/embedding/hybrid_embedding/model_test.cpp b/test/utest/embedding/hybrid_embedding/model_test.cpp deleted file mode 100644 index 5a04e4a286..0000000000 --- a/test/utest/embedding/hybrid_embedding/model_test.cpp +++ /dev/null @@ -1,630 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace hybrid_embedding; - -namespace { - -template -void print_vector(const std::vector &vec) { - for (auto v : vec) { - std::cout << v << " ,"; - } - std::cout << std::endl; -} - -template -void model_test() { - Tensor2 tmp_categories; - size_t batch_size = 4; - size_t num_iterations = 2; - CommunicationType comm_type = CommunicationType::IB_NVLink; - uint32_t global_instance_id = 1; - std::vector num_instances_per_node{2, 2}; - std::vector table_sizes{100, 10, 10, 20}; - std::vector data_in{99, 3, 7, 19, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, - 3, 3, 3, 3, 50, 2, 4, 10, 2, 2, 2, 2, 1, 1, 1, 1}; - std::vector data_to_unique_categories_ref{ - 99, 103, 117, 139, 0, 100, 110, 120, 1, 101, 111, 121, 2, 102, 112, 122, - 3, 103, 113, 123, 50, 102, 114, 130, 2, 102, 112, 122, 1, 101, 111, 121}; - - Tensor2 d_data_in; - std::shared_ptr> buff = GeneralBuffer2::create(); - size_t num_categories = EmbeddingTableFunctors::get_num_categories(table_sizes); - buff->reserve({batch_size * num_iterations * table_sizes.size()}, &d_data_in); - buff->reserve({num_categories, 1}, &tmp_categories); - buff->allocate(); - upload_tensor(data_in, d_data_in, 0); - - /*1. Data() and data.data_to_unique_categories()*/ - Data data(table_sizes, batch_size, num_iterations); - data.data_to_unique_categories(d_data_in, 0); - std::vector data_to_unique_categories_ret; - download_tensor(data_to_unique_categories_ret, data.samples, 0); - EXPECT_THAT(data_to_unique_categories_ret, - ::testing::ElementsAreArray(data_to_unique_categories_ref)); - - /*2. Model()*/ - // std::cout << "debug0:" << num_categories << std::endl; - Model model(comm_type, global_instance_id, num_instances_per_node, num_categories); - - /*3. CalibrationData()*/ - size_t num_nodes = num_instances_per_node.size(); - CalibrationData calibration_data(num_nodes, 1.0 / 10.0, 4.0, 1.0, 1.0); - - /*4. Statistics()*/ - Statistics statistics(data.batch_size * data.num_iterations * data.table_sizes.size(), - data.table_sizes.size(), model.num_instances, num_categories); - statistics.sort_categories_by_count(data.samples, 0); - std::vector categories_sorted_ret; - std::vector counts_sorted_ret; - download_tensor(categories_sorted_ret, statistics.categories_sorted, 0); - download_tensor(counts_sorted_ret, statistics.counts_sorted, 0); - std::vector categories_sorted_ref{102, 1, 2, 101, 103, 111, 112, 121, 122, 0, 3, - 50, 99, 100, 110, 113, 114, 117, 120, 123, 130, 139}; - std::vector counts_sorted_ref{3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - EXPECT_THAT(categories_sorted_ret, ::testing::ElementsAreArray(categories_sorted_ref)); - EXPECT_THAT(counts_sorted_ret, ::testing::ElementsAreArray(counts_sorted_ref)); - // print_vector(counts_sorted_ret); - // print_vector(categories_sorted_ret); - - /*5. 
Model::init_hybrid_model*/ - model.init_hybrid_model(calibration_data, statistics, data, tmp_categories, 0); - EXPECT_EQ(model.num_frequent, 12); - std::vector category_location_ret; - download_tensor(category_location_ret, model.category_location, 0); - - std::vector category_location_ref{ - 4, 4, 4, 3, 4, 6, 4, 7, 0, 0, 1, 0, 2, 0, 3, 0, 0, 1, 1, 1, 2, 1, 3, 1, 0, 2, - 1, 2, 2, 2, 3, 2, 0, 3, 1, 3, 2, 3, 3, 3, 0, 4, 1, 4, 2, 4, 3, 4, 0, 5, 1, 5, - 2, 5, 3, 5, 0, 6, 1, 6, 2, 6, 3, 6, 0, 7, 1, 7, 2, 7, 3, 7, 0, 8, 1, 8, 2, 8, - 3, 8, 0, 9, 1, 9, 2, 9, 3, 9, 0, 10, 1, 10, 2, 10, 3, 10, 0, 11, 1, 11, 4, 9, 2, 11, - 3, 11, 0, 12, 1, 12, 2, 12, 3, 12, 0, 13, 1, 13, 2, 13, 3, 13, 0, 14, 1, 14, 2, 14, 3, 14, - 0, 15, 1, 15, 2, 15, 3, 15, 0, 16, 1, 16, 2, 16, 3, 16, 0, 17, 1, 17, 2, 17, 3, 17, 0, 18, - 1, 18, 2, 18, 3, 18, 0, 19, 1, 19, 2, 19, 3, 19, 0, 20, 1, 20, 2, 20, 3, 20, 0, 21, 1, 21, - 2, 21, 3, 21, 0, 22, 1, 22, 2, 22, 3, 22, 0, 23, 1, 23, 2, 23, 3, 23, 4, 10, 4, 0, 4, 1, - 0, 24, 1, 24, 2, 24, 3, 24, 0, 25, 1, 25, 2, 25, 4, 5, 4, 8, 3, 25, 0, 26, 1, 26, 2, 26, - 3, 26, 0, 27, 1, 27, 2, 27, 4, 11, 4, 2, 3, 27, 0, 28, 1, 28, 2, 28, 3, 28, 0, 29, 1, 29, - 2, 29, 3, 29, 0, 30, 1, 30, 2, 30, 3, 30, 0, 31, 1, 31, 2, 31, 3, 31, 140, 140}; - EXPECT_THAT(category_location_ret, ::testing::ElementsAreArray(category_location_ref)); - - std::vector h_frequent_model_table_offsets_ref{0, 0, 2, 2, 3, 3, 5, 5, 6, 6, - 6, 8, 8, 9, 9, 9, 10, 11, 11, 12}; - std::vector h_infrequent_model_table_offsets_ref{0, 24, 26, 28, 32}; - EXPECT_THAT(model.h_frequent_model_table_offsets, - ::testing::ElementsAreArray(h_frequent_model_table_offsets_ref)); - EXPECT_THAT(model.h_infrequent_model_table_offsets, - ::testing::ElementsAreArray(h_infrequent_model_table_offsets_ref)); -}; - -template -void model_init_test(const size_t num_instances, const size_t num_tables, const size_t batch_size, - CommunicationType ctype) { - // 1. generate the reference model from reference stats and corresponding data - // std::vector categories; - // std::vector counts; - - const size_t num_iterations = 1; - std::cout << "Model init test ... " << std::endl << std::endl; - std::cout << "number of instances : " << num_instances << std::endl; - std::cout << "Number of tables : " << num_tables << std::endl; - std::cout << "Batch size : " << batch_size << std::endl; - std::cout << "Number of iterations : " << num_iterations << std::endl; - - HybridEmbeddingInputGenerator input_generator(848484); - std::vector raw_data = input_generator.generate_categorical_input(batch_size, num_tables); - std::vector table_sizes = input_generator.get_table_sizes(); - const size_t num_categories = - std::accumulate(table_sizes.begin(), table_sizes.end(), static_cast(0)); - std::cout << "Table sizes : "; - for (size_t embedding = 0; embedding < table_sizes.size(); ++embedding) - std::cout << '\t' << table_sizes[embedding]; - std::cout << std::endl; - - // create the gpu tensor for the raw data - cudaStream_t stream = 0; - Tensor2 d_raw_data; - std::shared_ptr> buf = GeneralBuffer2::create(); - std::cout << "number of samples : " << raw_data.size() << std::endl; - buf->reserve({raw_data.size(), 1}, &d_raw_data); - buf->allocate(); - upload_tensor(raw_data, d_raw_data, stream); - - std::cout << "Testing raw data..." << std::endl; - test_raw_data(d_raw_data.get_ptr(), batch_size, num_tables, num_iterations, table_sizes); - std::cout << "Done testing raw data..." << std::endl; - - // 2. 
perform model initialization, data - std::cout << "performing statistics and calibration initialization..." << std::endl; - Data data(table_sizes, batch_size, num_iterations); - data.data_to_unique_categories(d_raw_data, stream); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - std::cout << "Testing samples..." << std::endl; - test_samples(d_raw_data.get_ptr(), data); - std::cout << "Done testing samples!" << std::endl; - - Statistics statistics(data, num_instances); - std::cout << "Statistics construction " << std::endl; - CalibrationData calibration(1, 1. / 10., 130.e9, 190.e9, 1.0); - - // model creation - std::cout << "performing model initialization..." << std::endl; - std::vector num_instances_per_node(1); - num_instances_per_node[0] = (uint32_t)num_instances; - // Model model(ctype, 0, num_instances_per_node, num_categories); - // = {(uint32_t)num_instances}; - std::vector> models; - std::vector> frequent_infrequent_categories(num_instances); - for (size_t instance = 0; instance < num_instances; ++instance) { - models.emplace_back(ctype, (uint32_t)instance, num_instances_per_node, num_categories); - std::cout << "instance : " << instance << " out of " << num_instances << std::endl; - std::shared_ptr> buf = GeneralBuffer2::create(); - // std::shared_ptr> temp_block_buffer = buf->create_block<>(dtype); - // Tensor2 tmp_infrequent_categories; - buf->reserve({(size_t)num_categories, 1}, &frequent_infrequent_categories[instance]); - buf->allocate(); - // std::cout << "constructing instance, allocating memory..." << std::endl; - // std::cout << "initializing model..." << std::endl; - models[instance].init_hybrid_model(calibration, statistics, data, - frequent_infrequent_categories[instance], stream); - // std::cout << "done initializing model" << std::endl; - } - std::vector categories_sorted_stats; - std::vector counts_sorted_stats; - download_tensor(categories_sorted_stats, statistics.categories_sorted, stream); - download_tensor(counts_sorted_stats, statistics.counts_sorted, stream); - // TODO: check consistency of - // global_instance_id, - // num_instances_per_node, - // node_id, - - // Check defining properties - - std::cout << "Checking consistency and completeness of infrequent embedding..." 
<< std::endl; - // check order of categories for infrequent - // - assuming default distributed embedding - std::vector num_infrequent_model_vec(num_instances); - size_t num_infrequent_tables = 0; - for (size_t instance = 0; instance < num_instances; ++instance) { - Model &model = models[instance]; - - std::vector category_location; - download_tensor(category_location, model.category_location, stream); - - size_t indx_infrequent = 0; - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] < num_instances) { - size_t instance_location = category_location[2 * category]; - size_t buffer_index = category_location[2 * category + 1]; - - EXPECT_EQ(instance_location, indx_infrequent % num_instances); - EXPECT_EQ(buffer_index, indx_infrequent / num_instances); - - indx_infrequent++; - } - } - const size_t num_infrequent_model = indx_infrequent; - num_infrequent_model_vec[instance] = num_infrequent_model; - - // check consistency table offsets - size_t num_infrequent_tables_instance = 0; - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t cur_offset = model.h_infrequent_model_table_offsets[embedding]; - size_t next_offset = model.h_infrequent_model_table_offsets[embedding + 1]; - size_t indx_infrequent_instance = 0; - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] == instance) { - if (indx_infrequent_instance >= cur_offset && indx_infrequent_instance < next_offset) { - size_t embedding_category = - EmbeddingTableFunctors::get_embedding_table_index(table_sizes, category); - EXPECT_EQ(embedding_category, embedding); - } - indx_infrequent_instance++; - } - } - num_infrequent_tables_instance = indx_infrequent_instance; - } - num_infrequent_tables += num_infrequent_tables_instance; - } - // Check that the total number of embedding vectors in all instances for all tables equals - // the total number of infrequent embedding vectors - if (num_infrequent_model_vec.size() > 0) { - EXPECT_EQ(num_infrequent_tables, num_infrequent_model_vec[0]); - if (num_infrequent_tables != num_infrequent_model_vec[0]) { - std::cout << "num_infrequent_tables = " << num_infrequent_tables << std::endl; - std::cout << "num_infrequent_model_vec[0] = " << num_infrequent_model_vec[0] << std::endl; - } - } - // Check that the number of infrequent categories is the same for all instances. - for (size_t instance = 1; instance < num_instances; ++instance) { - EXPECT_EQ(num_infrequent_model_vec[instance], num_infrequent_model_vec[0]); - } - std::cout << "Checking consistency and completeness of frequent embedding..." << std::endl; - // Check that the frequent embedding model is complete and self-consistent - // - // - num_frequent is consistent with data and num_categories - i.e. 
table_sizes - // - category_frequent_index and frequent_categories are consistent - // - both are consistent with num_frequent - // - table offsets frequent embedding are consistent with frequent_categories array - // - for (size_t instance = 0; instance < num_instances; ++instance) { - Model &model = models[instance]; - const size_t num_categories = model.num_categories; - - std::vector &frequent_table_offsets = model.h_frequent_model_table_offsets; - std::vector category_location; - download_tensor(category_location, model.category_location, stream); - std::vector frequent_categories; - download_tensor(frequent_categories, model.frequent_categories, stream); - - // check that number of frequent categories in category_location == model.num_frequent - size_t num_frequent_model = 0; - for (size_t i = 0; i < num_categories; ++i) { - num_frequent_model += (size_t)(category_location[2 * i] == num_instances ? 1 : 0); - } - EXPECT_EQ(num_frequent_model, model.num_frequent); - - // check that category in frequent_categories has corresponding index in category_frequent_index - for (size_t i = 0; i < frequent_categories.size(); ++i) { - size_t category = frequent_categories[i]; - EXPECT_EQ(category_location[2 * category + 1], i); - } - - std::map category_to_stats_map; - for (size_t i = 0; i < categories_sorted_stats.size(); ++i) { - category_to_stats_map[categories_sorted_stats[i]] = i; - } - - // check that table offsets are consistent with the frequent_categories array - // - check that categories corresponding to embedding actually part of embedding - std::set set_categories_from_table_offsets; - std::set set_categories_frequent_categories_array(frequent_categories.begin(), - frequent_categories.end()); - for (size_t em_instance = 0; em_instance < num_instances; ++em_instance) { - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t cur_offset = frequent_table_offsets[em_instance * (num_tables + 1) + embedding]; - size_t next_offset = frequent_table_offsets[em_instance * (num_tables + 1) + embedding + 1]; - size_t counts_cur = 0; - size_t counts_prev = 0; - for (size_t frequent_category_index = cur_offset; frequent_category_index < next_offset; - ++frequent_category_index) { - size_t category = frequent_categories[frequent_category_index]; - size_t embedding_category = - EmbeddingTableFunctors::get_embedding_table_index(table_sizes, category); - - EXPECT_EQ(embedding, embedding_category); - - // find category in category_sorted_stats array - size_t indx_stats = category_to_stats_map[category]; - counts_cur = (size_t)counts_sorted_stats[indx_stats]; - if (frequent_category_index > cur_offset) { - // find category in category_sorted_stats array - EXPECT_TRUE(counts_prev >= counts_cur); - } - counts_prev = counts_cur; - - set_categories_from_table_offsets.insert(category); - } - } - } - // - check that the table offsets cover all frequent categories - EXPECT_TRUE(set_categories_from_table_offsets == set_categories_frequent_categories_array); - - // check that infrequent categories as per category_location are not present in - // frequent_categories array - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] < num_instances) { - EXPECT_TRUE(set_categories_frequent_categories_array.find(category) == - set_categories_frequent_categories_array.end()); - } - } - } - - // TODO: - // // Check that the models of all the instances are identical - // std::vector category_frequent_index; - // std::vector category_location; - // 
download_tensor(category_frequent_index, models[0].category_frequent_index, stream); - // download_tensor(category_location, models[0].category_location, stream); - // for (size_t instance = 0; instance < num_instances; ++instance) { - // for (size_t category = 0; category < num_categories; ++category) { - - // } - // } - - std::cout << "Finished the unit test for model init()!" << std::endl; -} -template -void model_init_test(const size_t batch_size, HybridEmbeddingConfig config, - std::vector &table_sizes) { - // 1. generate the reference model from reference stats and corresponding data - // std::vector categories; - // std::vector counts; - long long num_instances = config.num_instances; - size_t num_tables = config.num_tables; - auto ctype = config.comm_type; - const size_t num_iterations = 1; - std::cout << "Model init test ... " << std::endl << std::endl; - std::cout << "number of instances : " << num_instances << std::endl; - std::cout << "Number of tables : " << num_tables << std::endl; - std::cout << "Batch size : " << batch_size << std::endl; - std::cout << "Number of iterations : " << num_iterations << std::endl; - - HybridEmbeddingInputGenerator input_generator(config, table_sizes, 848484); - std::vector raw_data = input_generator.generate_categorical_input(batch_size); - // std::vector table_sizes = input_generator.get_table_sizes(); - const size_t num_categories = - std::accumulate(table_sizes.begin(), table_sizes.end(), static_cast(0)); - std::cout << "Table sizes : "; - for (size_t embedding = 0; embedding < table_sizes.size(); ++embedding) - std::cout << '\t' << table_sizes[embedding]; - std::cout << std::endl; - - // create the gpu tensor for the raw data - cudaStream_t stream = 0; - Tensor2 d_raw_data; - std::shared_ptr> buf = GeneralBuffer2::create(); - std::cout << "number of samples : " << raw_data.size() / num_tables << std::endl; - buf->reserve({raw_data.size(), 1}, &d_raw_data); - buf->allocate(); - upload_tensor(raw_data, d_raw_data, stream); - - std::cout << "Testing raw data..." << std::endl; - test_raw_data(d_raw_data.get_ptr(), batch_size, num_tables, num_iterations, table_sizes); - std::cout << "Done testing raw data..." << std::endl; - - // 2. perform model initialization, data - std::cout << "performing statistics and calibration initialization..." << std::endl; - Data data(table_sizes, batch_size, num_iterations); - data.data_to_unique_categories(d_raw_data, stream); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - - std::cout << "Testing samples..." << std::endl; - test_samples(d_raw_data.get_ptr(), data); - std::cout << "Done testing samples!" << std::endl; - - Statistics statistics(data, num_instances); - CalibrationData calibration(1, 1. / 10., 130.e9, 190.e9, 1.0); - - // model creation - std::cout << "performing model initialization..." 
<< std::endl; - std::vector num_instances_per_node(1); - num_instances_per_node[0] = (uint32_t)num_instances; - // Model model(ctype, 0, num_instances_per_node, num_categories); - // = {(uint32_t)num_instances}; - std::vector> models; - std::vector> frequent_infrequent_categories(num_instances); - for (long long instance = 0; instance < num_instances; ++instance) { - std::shared_ptr> buf = GeneralBuffer2::create(); - // std::shared_ptr> temp_block_buffer = buf->create_block<>(dtype); - // Tensor2 tmp_infrequent_categories; - buf->reserve({(size_t)num_categories, 1}, &frequent_infrequent_categories[instance]); - buf->allocate(); - - // std::cout << "instance : " << instance << std::endl; - std::cout << "constructing instance, allocating memory..." << std::endl; - models.emplace_back(ctype, (uint32_t)instance, num_instances_per_node, num_categories); - std::cout << "initializing model..." << std::endl; - models[instance].init_hybrid_model(calibration, statistics, data, - frequent_infrequent_categories[instance], stream); - std::cout << "done initializing model" << std::endl; - } - std::vector categories_sorted_stats; - std::vector counts_sorted_stats; - download_tensor(categories_sorted_stats, statistics.categories_sorted, stream); - download_tensor(counts_sorted_stats, statistics.counts_sorted, stream); - - // TODO: check consistency of - // global_instance_id, - // num_instances_per_node, - // node_id, - - // Check defining properties - - std::cout << "Checking consistency and completeness of infrequent embedding..." << std::endl; - // check order of categories for infrequent - // - assuming default distributed embedding - std::vector num_infrequent_model_vec(num_instances); - size_t num_infrequent_tables = 0; - for (long long instance = 0; instance < num_instances; ++instance) { - Model &model = models[instance]; - - std::vector category_location; - download_tensor(category_location, model.category_location, stream); - - size_t indx_infrequent = 0; - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] < num_instances) { - size_t instance_location = category_location[2 * category]; - size_t buffer_index = category_location[2 * category + 1]; - - EXPECT_EQ(instance_location, indx_infrequent % num_instances); - EXPECT_EQ(buffer_index, indx_infrequent / num_instances); - - indx_infrequent++; - } - } - const size_t num_infrequent_model = indx_infrequent; - num_infrequent_model_vec[instance] = num_infrequent_model; - - // check consistency table offsets - size_t num_infrequent_tables_instance = 0; - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t cur_offset = model.h_infrequent_model_table_offsets[embedding]; - size_t next_offset = model.h_infrequent_model_table_offsets[embedding + 1]; - size_t indx_infrequent_instance = 0; - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] == (dtype)instance) { - if (indx_infrequent_instance >= cur_offset && indx_infrequent_instance < next_offset) { - size_t embedding_category = - EmbeddingTableFunctors::get_embedding_table_index(table_sizes, category); - EXPECT_EQ(embedding_category, embedding); - } - indx_infrequent_instance++; - } - } - num_infrequent_tables_instance = indx_infrequent_instance; - } - num_infrequent_tables += num_infrequent_tables_instance; - } - // Check that the total number of embedding vectors in all instances for all tables equals - // the total number of infrequent embedding vectors - if 
(num_infrequent_model_vec.size() > 0) { - EXPECT_EQ(num_infrequent_tables, num_infrequent_model_vec[0]); - if (num_infrequent_tables != num_infrequent_model_vec[0]) { - std::cout << "num_infrequent_tables = " << num_infrequent_tables << std::endl; - std::cout << "num_infrequent_model_vec[0] = " << num_infrequent_model_vec[0] << std::endl; - } - } - // Check that the number of infrequent categories is the same for all instances. - for (long long instance = 1; instance < num_instances; ++instance) { - EXPECT_EQ(num_infrequent_model_vec[instance], num_infrequent_model_vec[0]); - } - - std::cout << "Checking consistency and completeness of frequent embedding..." << std::endl; - // Check that the frequent embedding model is complete and self-consistent - // - // - num_frequent is consistent with data and num_categories - i.e. table_sizes - // - category_frequent_index and frequent_categories are consistent - // - both are consistent with num_frequent - // - table offsets frequent embedding are consistent with frequent_categories array - // - for (long long instance = 0; instance < num_instances; ++instance) { - Model &model = models[instance]; - const size_t num_categories = model.num_categories; - - std::vector &frequent_table_offsets = model.h_frequent_model_table_offsets; - std::vector category_location; - download_tensor(category_location, model.category_location, stream); - std::vector frequent_categories; - download_tensor(frequent_categories, model.frequent_categories, stream); - - // check that number of frequent categories in category_location == model.num_frequent - size_t num_frequent_model = 0; - for (size_t i = 0; i < num_categories; ++i) { - num_frequent_model += (size_t)(category_location[2 * i] == num_instances ? 1 : 0); - } - EXPECT_EQ(num_frequent_model, model.num_frequent); - - // check that category in frequent_categories has corresponding index in category_frequent_index - for (size_t i = 0; i < frequent_categories.size(); ++i) { - size_t category = frequent_categories[i]; - EXPECT_EQ(category_location[2 * category + 1], i); - } - - std::map category_to_stats_map; - for (size_t i = 0; i < categories_sorted_stats.size(); ++i) { - category_to_stats_map[categories_sorted_stats[i]] = i; - } - - // check that table offsets are consistent with the frequent_categories array - // - check that categories corresponding to embedding actually part of embedding - std::set set_categories_from_table_offsets; - std::set set_categories_frequent_categories_array(frequent_categories.begin(), - frequent_categories.end()); - for (long long em_instance = 0; em_instance < num_instances; ++em_instance) { - for (size_t embedding = 0; embedding < num_tables; ++embedding) { - size_t cur_offset = frequent_table_offsets[em_instance * (num_tables + 1) + embedding]; - size_t next_offset = frequent_table_offsets[em_instance * (num_tables + 1) + embedding + 1]; - size_t counts_cur = 0; - size_t counts_prev = 0; - for (size_t frequent_category_index = cur_offset; frequent_category_index < next_offset; - ++frequent_category_index) { - size_t category = frequent_categories[frequent_category_index]; - size_t embedding_category = - EmbeddingTableFunctors::get_embedding_table_index(table_sizes, category); - - EXPECT_EQ(embedding, embedding_category); - - // find category in category_sorted_stats array - size_t indx_stats = category_to_stats_map[category]; - counts_cur = (size_t)counts_sorted_stats[indx_stats]; - if (frequent_category_index > cur_offset) { - // find category in category_sorted_stats array - 
EXPECT_TRUE(counts_prev >= counts_cur); - } - counts_prev = counts_cur; - - set_categories_from_table_offsets.insert(category); - } - } - } - // - check that the table offsets cover all frequent categories - EXPECT_TRUE(set_categories_from_table_offsets == set_categories_frequent_categories_array); - - // check that infrequent categories as per category_location are not present in - // frequent_categories array - for (size_t category = 0; category < num_categories; ++category) { - if (category_location[2 * category] < num_instances) { - EXPECT_TRUE(set_categories_frequent_categories_array.find(category) == - set_categories_frequent_categories_array.end()); - } - } - } - std::cout << "Finished the unit test for model init()!" << std::endl; -} - -} // namespace - -// TEST(hybrid_embedding_model_test, uint32) { model_test(); } -// TEST(hybrid_embedding_model_test, long_long) { model_test(); } -TEST(hybrid_embedding_model_test, init_model) { - const size_t N = 5; - const size_t batch_size = 15 * 64 * 1024; - - for (size_t num_instances = 1; num_instances <= 16; num_instances = 4 * num_instances) { - for (size_t num_tables = 1; num_tables <= 32; num_tables = 4 * num_tables) { - for (size_t i = 0; i < N; ++i) { - model_init_test(num_instances, num_tables, batch_size, - CommunicationType::NVLink_SingleNode); - } - } - } -} -TEST(hybrid_embedding_model_test, large_table_sizes) { - const size_t batch_size = 64 * 1024; - size_t num_nodes = 1; - size_t num_instances = 1; - size_t num_tables = 2; - size_t embedding_vec_size = 4; - long long num_categories = -1; - long long num_frequent = -1; - float lr = 0.0001f; - std::vector table_sizes{(1ul << 28), 1ul << 10}; - HybridEmbeddingConfig config{ - num_nodes, num_instances, num_tables, embedding_vec_size, num_categories, num_frequent, lr}; - model_init_test(batch_size, config, table_sizes); -} -TEST(hybrid_embedding_model_test, debug) { - const size_t batch_size = 64 * 1024; - model_init_test(2, 1, batch_size, CommunicationType::NVLink_SingleNode); -} diff --git a/test/utest/embedding/hybrid_embedding/select_test.cu b/test/utest/embedding/hybrid_embedding/select_test.cu deleted file mode 100644 index efc379c7ff..0000000000 --- a/test/utest/embedding/hybrid_embedding/select_test.cu +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; - -namespace Predict { -template -struct is_odd { - __host__ __device__ __forceinline__ bool operator()(const T &a) const { return (a & 1); } - is_odd() = default; -}; -} // namespace Predict - -template -void check(std::vector &h_ref, std::vector &h_gpu) { - for (size_t i = 0; i < h_ref.size(); i++) { - if (h_ref[i] != h_gpu[i]) { - std::cerr << " error at index " << i << std::endl; - exit(-1); - } - } - std::cout << "check pass" << std::endl; -} -template -struct SelectTest { - Pred Op_; - size_t len_; - std::vector keys_; - std::vector ref_cpu_; - std::vector ref_gpu_; - T *d_keys_; - T *d_output_; - T *d_num_selected_out_; - T ref_count_; - - void gather_if(const std::vector &input, std::vector &output) { - output.clear(); - if (input.empty()) { - for (size_t i = 0; i < len_; i++) { - if (Op_(i)) { - output.push_back(i); - } - } - } else { - for (auto in : input) { - if (Op_(in)) { - output.push_back(in); - } - } - } - } - - SelectTest(size_t len, Pred Op, bool no_input = false) : len_(len), Op_(Op), ref_count_(0) { - if (!no_input) { - cudaMalloc((void **)(&d_keys_), sizeof(T) * len); - keys_.resize(len, 0); - for (size_t i = 0; i < keys_.size(); i++) { - keys_[i] = std::rand(); - } - std::cout << "keys init done" << std::endl; - } else { - d_keys_ = nullptr; - keys_.clear(); - } - cudaMalloc((void **)(&d_num_selected_out_), sizeof(T)); - cudaMalloc((void **)(&d_output_), sizeof(T) * len); - } - - void test() { - if (d_keys_) { - cudaMemcpy(d_keys_, keys_.data(), sizeof(T) * len_, cudaMemcpyHostToDevice); - } - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - HugeCTR::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_keys_, d_output_, - d_num_selected_out_, len_, Op_); - std::cout << "temp storage bytes\n" << temp_storage_bytes << std::endl; - cudaMalloc((void **)&d_temp_storage, temp_storage_bytes); - HugeCTR::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_keys_, d_output_, - d_num_selected_out_, len_, Op_); - cudaDeviceSynchronize(); - cudaMemcpy(&ref_count_, d_num_selected_out_, sizeof(T), cudaMemcpyDeviceToHost); - gather_if(keys_, ref_cpu_); - if (ref_count_ != static_cast(ref_cpu_.size())) { - std::cerr << "selected num mismatches\n" << std::endl; - std::cerr << "expected: " << ref_cpu_.size() << " got " << ref_count_ << std::endl; - exit(-1); - } - std::cout << "get num_selected " << ref_count_ << std::endl; - ref_gpu_.resize(ref_count_); - cudaMemcpy(ref_gpu_.data(), d_output_, sizeof(T) * ref_gpu_.size(), cudaMemcpyDeviceToHost); - check(ref_cpu_, ref_gpu_); - cudaFree(d_temp_storage); - } - ~SelectTest() { - if (d_keys_) { - cudaFree(d_keys_); - } - cudaFree(d_num_selected_out_); - cudaFree(d_output_); - } -}; - -TEST(select, is_odd_31) { - SelectTest> select_test((1ul << 32), Predict::is_odd()); - select_test.test(); -} -TEST(select, counting) { - SelectTest> select_test((1ul << 20), Predict::is_odd(), - true); - select_test.test(); -} -TEST(select, large_counting) { - SelectTest> select_test((1ul << 31), Predict::is_odd(), - true); - select_test.test(); -} diff --git a/test/utest/embedding/hybrid_embedding/statistics_test.cpp b/test/utest/embedding/hybrid_embedding/statistics_test.cpp deleted file mode 100644 index 0d900bce5b..0000000000 --- a/test/utest/embedding/hybrid_embedding/statistics_test.cpp +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace HugeCTR::hybrid_embedding; - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -void arg_sort(const std::vector &v, std::vector &arg) { - arg.resize(v.size()); - std::iota(arg.begin(), arg.end(), (size_t)0); - std::stable_sort(arg.begin(), arg.end(), [&v](size_t i1, size_t i2) { return v[i1] > v[i2]; }); -} - -template -void generate_reference_stats(const std::vector &data, std::vector &samples, - std::vector &categories_stats, - std::vector &counts_stats, - const std::vector &table_sizes, const size_t batch_size) { - const size_t num_embeddings = table_sizes.size(); - - std::vector embedding_offsets; - EmbeddingTableFunctors::get_embedding_offsets(embedding_offsets, table_sizes); - - samples.resize(data.size()); - for (size_t sample = 0; sample < batch_size; ++sample) { - for (size_t embedding = 0; embedding < num_embeddings; ++embedding) { - size_t indx = sample * num_embeddings + embedding; - samples[indx] = embedding_offsets[embedding] + data[indx]; - } - } - - // create statistics - std::set category_set(samples.begin(), samples.end()); - const size_t num_unique_categories = category_set.size(); - - // helper structures - std::map category_index; - std::vector categories(num_unique_categories); - size_t indx = (size_t)0; - for (const auto &category : category_set) { - category_index[category] = indx; - categories[indx] = category; - indx++; - } - - std::vector counts(num_unique_categories, (size_t)0); - for (size_t sample = 0; sample < samples.size(); ++sample) { - size_t indx = category_index[samples[sample]]; - counts[indx]++; - } - - // sort categories and counts by argument - std::vector arg; - arg_sort(counts, arg); - categories_stats.resize(num_unique_categories); - counts_stats.resize(num_unique_categories); - for (indx = 0; indx < num_unique_categories; ++indx) { - categories_stats[indx] = categories[arg[indx]]; - counts_stats[indx] = counts[arg[indx]]; - - // check order counts - if (indx > 0 && counts_stats[indx] > counts_stats[indx - 1]) { - HCTR_LOG_S(DEBUG, WORLD) << "incorrect counts order!" << std::endl; - } - } -} - -} // namespace hybrid_embedding - -} // namespace HugeCTR - -template -void statistics_test(const size_t batch_size, const size_t num_tables) { - // 1. 
generate reference samples and stats - cudaStream_t stream = 0; - - std::vector categories; - std::vector counts; - - HugeCTR::hybrid_embedding::HybridEmbeddingInputGenerator input_generator(848484); - std::vector raw_data = input_generator.generate_categorical_input(batch_size, num_tables); - std::vector table_sizes = input_generator.get_table_sizes(); - HCTR_LOG_S(DEBUG, WORLD) << "Number of tables : " << num_tables << std::endl; - { - auto log = HCTR_LOG_S(DEBUG, WORLD); - log << "Table sizes : "; - for (size_t embedding = 0; embedding < table_sizes.size(); ++embedding) { - log << '\t' << table_sizes[embedding]; - } - log << std::endl; - } - - std::vector samples_ref; - HugeCTR::hybrid_embedding::generate_reference_stats(raw_data, samples_ref, categories, - counts, table_sizes, batch_size); - - size_t tot_count = 0; - for (size_t c = 0; c < categories.size(); ++c) { - tot_count += counts[c]; - } - EXPECT_EQ(tot_count, raw_data.size()); - - // create the gpu tensor for the raw data - HCTR_LOG_S(DEBUG, WORLD) << "placing raw data on gpu..." << std::endl; - Tensor2 d_raw_data; - std::shared_ptr> buf = GeneralBuffer2::create(); - EXPECT_EQ(raw_data.size(), batch_size * num_tables); - HCTR_LOG_S(DEBUG, WORLD) << "number of samples : " << raw_data.size() << std::endl; - HCTR_LOG_S(DEBUG, WORLD) << "number of unique categories : " << categories.size() << std::endl; - buf->reserve({raw_data.size(), 1}, &d_raw_data); - buf->allocate(); - upload_tensor(raw_data, d_raw_data, stream); - - // 2. perform hybrid_embedding statistics on gpu - Data data(table_sizes, batch_size, 1); - data.data_to_unique_categories(d_raw_data, stream); - size_t num_instances = 8; // not important here - HugeCTR::hybrid_embedding::Statistics statistics(data, num_instances); - statistics.sort_categories_by_count(data.samples, stream); - HCTR_LIB_THROW(cudaStreamSynchronize(stream)); - EXPECT_EQ(statistics.num_samples, raw_data.size()); - EXPECT_EQ(categories.size(), statistics.num_unique_categories); - - // check that the samples are the same.. - std::vector h_samples(samples_ref.size()); - download_tensor(h_samples, data.samples, stream); - EXPECT_EQ(h_samples.size(), samples_ref.size()); - for (size_t sample = 0; sample < samples_ref.size(); ++sample) { - EXPECT_EQ(h_samples[sample], samples_ref[sample]); - } - - // 3. check that hybrid_embedding calculated stats == ref stats - std::vector h_categories_sorted; - std::vector h_counts_sorted; - download_tensor(h_categories_sorted, statistics.categories_sorted, stream); - download_tensor(h_counts_sorted, statistics.counts_sorted, stream); - - size_t tot_count_stats = 0; - for (size_t c = 0; c < categories.size(); ++c) { - tot_count_stats += h_counts_sorted[c]; - } - EXPECT_EQ(tot_count_stats, raw_data.size()); - - for (size_t c = 0; c < categories.size(); ++c) { - EXPECT_EQ(h_categories_sorted[c], categories[c]); - EXPECT_EQ(h_counts_sorted[c], counts[c]); - } - - const size_t num_categories_sorted_test = statistics.num_unique_categories; - if (num_categories_sorted_test != categories.size()) { - HCTR_LOG_S(DEBUG, WORLD) << "Number of categories_sorted is NOT the same as the reference!" - << std::endl; - } else { - HCTR_LOG_S(DEBUG, WORLD) << "Number of categories_sorted is the same as the reference!" 
- << std::endl; - } - EXPECT_EQ(num_categories_sorted_test, categories.size()); - std::unordered_set category_set_test( - h_categories_sorted.begin(), h_categories_sorted.begin() + num_categories_sorted_test); - std::unordered_set category_set_samples_test(h_samples.begin(), h_samples.end()); - if (category_set_test == category_set_samples_test) { - HCTR_LOG_S(DEBUG, WORLD) - << "The sorted categories are the same as in the samples and cover all samples!" - << std::endl; - } else { - HCTR_LOG_S(DEBUG, WORLD) - << "The sorted categories are NOT the same as in the samples and cover all samples!" - << std::endl; - } - EXPECT_TRUE(category_set_test == category_set_samples_test); - std::unordered_set category_set_ref(categories.begin(), categories.end()); - if (category_set_test == category_set_ref) { - HCTR_LOG_S(DEBUG, WORLD) << "The sorted categories are the same as the reference sorted!" - << std::endl; - } else { - HCTR_LOG_S(DEBUG, WORLD) << "The sorted categories are NOT the same as the reference sorted!" - << std::endl; - } - EXPECT_TRUE(category_set_test == category_set_ref); - size_t count_ne = (size_t)0; - for (size_t c = 0; c < categories.size(); ++c) { - count_ne += ((size_t)h_categories_sorted[c] != (size_t)categories[c] ? 1 : 0); - } - if (count_ne > 0) - HCTR_LOG_S(DEBUG, WORLD) << "Number of different categories : " - << static_cast(count_ne) / - static_cast(categories.size()) * 100. - << " %" << std::endl; - EXPECT_EQ(count_ne, 0); -} - -TEST(calculate_statistics_test, dtype_uint32) { - const size_t N = 5; - for (size_t batch_size = 128; batch_size < 15 * 64 * 1024; batch_size = 4 * batch_size) { - for (size_t num_tables = 1; num_tables <= 32; num_tables = 4 * num_tables) { - for (size_t i = 0; i < N; ++i) { - statistics_test(batch_size, num_tables); - } - } - } -} - -TEST(calculate_statistics_test, dtype_long_long) { - const size_t N = 5; - for (size_t batch_size = 128; batch_size < 15 * 64 * 1024; batch_size = 4 * batch_size) { - for (size_t num_tables = 1; num_tables <= 32; num_tables = 4 * num_tables) { - for (size_t i = 0; i < N; ++i) { - statistics_test(batch_size, num_tables); - } - } - } -} diff --git a/test/utest/embedding/hybrid_embedding/statistics_test.hpp b/test/utest/embedding/hybrid_embedding/statistics_test.hpp deleted file mode 100644 index 9e8f0a26a2..0000000000 --- a/test/utest/embedding/hybrid_embedding/statistics_test.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -namespace HugeCTR { - -namespace hybrid_embedding { - -template -void generate_reference_stats(const std::vector &data, std::vector &samples, - std::vector &categories_stats, - std::vector &counts_stats, - const std::vector &table_sizes, const size_t batch_size); - -} // namespace hybrid_embedding - -} // namespace HugeCTR diff --git a/test/utest/embedding/hybrid_embedding/test_common.cuh b/test/utest/embedding/hybrid_embedding/test_common.cuh deleted file mode 100644 index 4c07abed3c..0000000000 --- a/test/utest/embedding/hybrid_embedding/test_common.cuh +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include - -template -constexpr inline IntType ceildiv(IntType a, IntType b) { - return (a + b - 1) / b; -} - -using namespace HugeCTR; -using namespace HugeCTR::hybrid_embedding; - -template -class HybridEmbeddingUnitTest { - protected: - const HybridEmbeddingConfig config; - HybridEmbeddingInputGenerator input_generator; - const uint32_t batch_size; - const uint32_t num_instances; - const uint32_t embedding_vec_size; - - const std::vector category_location; - const std::vector samples; - const std::vector table_sizes; - - cudaStream_t stream; - // std::shared_ptr fake_resource_manager; - GPUResource fake_resource; - std::vector> model_list; - std::vector> data_list; - std::vector> frequent_embeddings_single_node; - std::vector> frequent_embeddings_multi_node; - - // std::vector> infrequent_embeddings; - std::vector> - infrequent_embeddings_single_node; - std::vector> infrequent_embeddings_ib_nvlink; - std::vector> - infrequent_embeddings_ib_nvlink_hier; - - std::vector> frequent_embedding_indices; - std::vector> infrequent_embedding_indices; - - float *dev_lr; - - FrequentEmbeddingData &get_frequent_embedding_data(size_t i) { - if (frequent_embeddings_single_node.size()) { - return frequent_embeddings_single_node[i].frequent_data_; - } else { - return frequent_embeddings_multi_node[i].frequent_data_; - } - } - - FrequentEmbeddingBase &get_frequent_embedding(size_t i) { - if (frequent_embeddings_single_node.size()) { - return frequent_embeddings_single_node[i]; - } else { - return frequent_embeddings_multi_node[i]; - } - } - - public: - void build_model() { - model_list.reserve(num_instances); - std::vector num_instances_per_node_list(config.num_nodes, - config.num_instances / config.num_nodes); - for (size_t i = 0; i < num_instances; i++) { - model_list.emplace_back(config.comm_type, i, num_instances_per_node_list, - config.num_categories); - model_list[i].num_frequent = config.num_frequent; - } - - for (size_t i = 0; i < num_instances; i++) { - upload_tensor(category_location, model_list[i].category_location, stream); - } - } - - void build_data() { - data_list.reserve(num_instances); - for (size_t i = 0; i < num_instances; i++) { - data_list.emplace_back(table_sizes, batch_size, 1); - } - - for (size_t i = 0; i < 
num_instances; i++) { - upload_tensor(samples, data_list[i].samples, stream); - upload_tensor(samples, data_list[i].samples, stream); - } - - HCTR_LIB_THROW(cudaMalloc(&dev_lr, sizeof(float))); - HCTR_LIB_THROW(cudaMemcpy(dev_lr, &config.lr, sizeof(float), cudaMemcpyHostToDevice)); - } - - void build_frequent() { - if (config.comm_type == CommunicationType::NVLink_SingleNode) { - frequent_embeddings_single_node.reserve(num_instances); - } else { - frequent_embeddings_multi_node.reserve(num_instances); - } - - for (size_t i = 0; i < num_instances; i++) { - std::shared_ptr> placeholder = NULL; - if (config.comm_type == CommunicationType::NVLink_SingleNode) { - frequent_embeddings_single_node.emplace_back(model_list[i], fake_resource, placeholder, - embedding_vec_size, config.num_frequent); - } else { - frequent_embeddings_multi_node.emplace_back(model_list[i], fake_resource, placeholder, - embedding_vec_size, config.num_frequent); - } - frequent_embedding_indices.emplace_back(config.num_frequent, data_list[i], model_list[i]); - } - - if (config.comm_type == CommunicationType::NVLink_SingleNode) { - std::vector h_vectors_cache_pointers(num_instances); - for (uint32_t i = 0; i < num_instances; i++) { - h_vectors_cache_pointers[i] = - frequent_embeddings_single_node[i].get_embedding_vectors_cache().get_ptr(); - } - for (uint32_t i = 0; i < num_instances; i++) { - HCTR_LIB_THROW(cudaMemcpyAsync( - frequent_embeddings_single_node[i].embedding_vectors_cache_pointers_.get_ptr(), - h_vectors_cache_pointers.data(), num_instances * sizeof(emtype *), - cudaMemcpyHostToDevice, stream)); - } - } - } - - void build_infrequent() { - if (config.comm_type == CommunicationType::NVLink_SingleNode) { - infrequent_embeddings_single_node.reserve(num_instances); - for (size_t i = 0; i < num_instances; i++) { - infrequent_embeddings_single_node.emplace_back(model_list[i], fake_resource, - embedding_vec_size); - infrequent_embedding_indices.emplace_back(data_list[i], model_list[i]); - } - } - - if (config.comm_type == CommunicationType::IB_NVLink) { - infrequent_embeddings_ib_nvlink.reserve(num_instances); - for (size_t i = 0; i < num_instances; i++) { - infrequent_embeddings_ib_nvlink.emplace_back(model_list[i], fake_resource, - embedding_vec_size); - infrequent_embedding_indices.emplace_back(data_list[i], model_list[i]); - } - } - - if (config.comm_type == CommunicationType::IB_NVLink_Hier) { - infrequent_embeddings_ib_nvlink_hier.reserve(num_instances); - for (size_t i = 0; i < num_instances; i++) { - infrequent_embeddings_ib_nvlink_hier.emplace_back(model_list[i], fake_resource, - embedding_vec_size); - infrequent_embedding_indices.emplace_back(data_list[i], model_list[i]); - uint32_t samples_size = data_list[i].batch_size * data_list[i].table_sizes.size(); - infrequent_embeddings_ib_nvlink_hier[i].max_num_infrequent_per_batch_ = samples_size; - infrequent_embeddings_ib_nvlink_hier[i].max_num_infrequent_per_train_batch_ = samples_size; - } - } - } - - ncclComm_t get_fake_comm() { - ncclComm_t comm; - int device_list[1] = {0}; - ncclCommInitAll(&comm, 1, device_list); - return comm; - } - - HybridEmbeddingUnitTest(const HybridEmbeddingConfig config, size_t batch_size, - size_t seed = 1234ll) - : config(config), - input_generator(config, seed), - batch_size(batch_size), - num_instances(config.num_instances), - embedding_vec_size(config.embedding_vec_size), - category_location((input_generator.generate_category_location(), - input_generator.get_category_location())), - 
samples(input_generator.generate_flattened_categorical_input(batch_size)), - table_sizes(input_generator.get_table_sizes()), - fake_resource(0, 0, 0, seed, seed, get_fake_comm()) { - HCTR_LIB_THROW(cudaStreamCreate(&stream)); - build_model(); - build_data(); - } -}; - -inline bool compare_element(float a, float b, float epsilon) { - // compare absolute error - if (fabs(a - b) < epsilon) return true; - - // compare relative error - if (fabs(a) >= fabs(b)) - if (fabs((a - b) / a) < epsilon) - return true; - else - return false; - else if (fabs((a - b) / b) < epsilon) - return true; - else - return false; -} - -inline bool compare_array(size_t len, const float *a, const float *b, float epsilon) { - for (size_t i = 0; i < len; i++) { - if (!compare_element(a[i], b[i], epsilon)) { - HCTR_LOG(INFO, WORLD, "Error in compare_array: i=%zu, a=%.8f, b=%.8f\n", i, a[i], b[i]); - return false; - } - } - - return true; -} - -// overload for fp16 on GPU -inline bool compare_array(size_t len, const __half *a, const __half *b, float epsilon) { - for (size_t i = 0; i < len; i++) { - float fa = __half2float(a[i]); - float fb = __half2float(b[i]); - if (!compare_element(fa, fb, epsilon)) { - HCTR_LOG(INFO, WORLD, "Error in compare_array: i=%zu, a=%.8f, b=%.8f\n", i, fa, fb); - return false; - } - } - - return true; -} diff --git a/test/utest/embedding/hybrid_embedding/update_test.cpp b/test/utest/embedding/hybrid_embedding/update_test.cpp deleted file mode 100644 index 4224a24b07..0000000000 --- a/test/utest/embedding/hybrid_embedding/update_test.cpp +++ /dev/null @@ -1,488 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include -#include -#include -#include - -/************************ Infrequent embedding update ************************/ - -template -class InfrequentUpdateTest : public HybridEmbeddingUnitTest { - protected: - bool single_node; - - public: - InfrequentUpdateTest(const HybridEmbeddingConfig config, size_t batch_size, - bool single_node, size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed), - single_node(single_node) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_infrequent_model_indices(); - cpu_embedding.calculate_infrequent_network_indices(); - cpu_embedding.generate_embedding_vectors(); - cpu_embedding.generate_gradients(); - if (this->config.comm_type == CommunicationType::IB_NVLink) { - cpu_embedding.backward_a2a_messages(); - } else if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - cpu_embedding.backward_a2a_messages_hier(); - } - - /* Tensors for the messages and gradients */ - std::shared_ptr> buff = GeneralBuffer2::create(); - std::vector> received_messages(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({this->batch_size * this->config.num_tables, this->config.embedding_vec_size}, - &received_messages[i]); - } - std::vector> gradients(this->num_instances); - if (single_node) { - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({this->num_instances * local_batch_size * this->config.num_tables, - this->config.embedding_vec_size}, - &gradients[i]); - } - } - buff->allocate(); - - /* Single-node: upload gradients */ - this->build_infrequent(); - if (single_node) { - for (size_t i = 0; i < this->num_instances; i++) { - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - } - } - - /* Infrequent update_model */ - std::vector> updated_vectors(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - if (this->config.comm_type == CommunicationType::NVLink_SingleNode) { - this->infrequent_embeddings_single_node[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_single_node[i].infrequent_embedding_vectors_, - this->stream); - this->infrequent_embeddings_single_node[i].indices_->calculate_model_indices(this->stream); - - std::vector gradients_pointers(this->num_instances); - for (uint32_t network_id = 0; network_id < this->num_instances; network_id++) - gradients_pointers[network_id] = gradients[network_id].get_ptr(); - HCTR_LIB_THROW(cudaMemcpyAsync( - this->infrequent_embeddings_single_node[i].gradients_pointers_.get_ptr(), - gradients_pointers.data(), this->num_instances * sizeof(emtype *), - cudaMemcpyHostToDevice, this->stream)); - this->infrequent_embeddings_single_node[i].update_model_direct(this->dev_lr, 1.f, - this->stream); - - download_tensor(updated_vectors[i], - this->infrequent_embeddings_single_node[i].infrequent_embedding_vectors_, - this->stream); - } - if (this->config.comm_type == CommunicationType::IB_NVLink) { - this->infrequent_embeddings_ib_nvlink[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink[i].infrequent_embedding_vectors_, - this->stream); - 
this->infrequent_embeddings_ib_nvlink[i].indices_->calculate_model_indices(this->stream); - - upload_tensor(cpu_embedding.backward_received_messages[i], received_messages[i], - this->stream); - this->infrequent_embeddings_ib_nvlink[i].update_model(received_messages[i].get_ptr(), - this->dev_lr, 1.f, this->stream); - - download_tensor(updated_vectors[i], - this->infrequent_embeddings_ib_nvlink[i].infrequent_embedding_vectors_, - this->stream); - } - if (this->config.comm_type == CommunicationType::IB_NVLink_Hier) { - this->infrequent_embeddings_ib_nvlink_hier[i].set_current_indices( - &this->infrequent_embedding_indices[i]); - upload_tensor(cpu_embedding.infrequent_embedding_vectors[i], - this->infrequent_embeddings_ib_nvlink_hier[i].infrequent_embedding_vectors_, - this->stream); - this->infrequent_embeddings_ib_nvlink_hier[i].indices_->calculate_model_indices( - this->stream); - - upload_tensor(cpu_embedding.backward_received_messages[i], received_messages[i], - this->stream); - this->infrequent_embeddings_ib_nvlink_hier[i].hier_update_model( - received_messages[i].get_ptr(), this->dev_lr, 1.f, this->stream); - - download_tensor(updated_vectors[i], - this->infrequent_embeddings_ib_nvlink_hier[i].infrequent_embedding_vectors_, - this->stream); - } - } - - /* Reference update_model */ - cpu_embedding.infrequent_update(); - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - updated_vectors[i].resize( - ceildiv(this->config.num_categories - this->config.num_frequent, - this->num_instances) * - this->config.embedding_vec_size); - EXPECT_THAT(updated_vectors[i], - ::testing::Pointwise(::testing::FloatNear(1e-2), - cpu_embedding.infrequent_embedding_vectors[i])); - } - } -}; - -/************************* Frequent embedding update *************************/ - -template -class FrequentUpdateTest : public HybridEmbeddingUnitTest { - protected: - bool single_node; - - public: - FrequentUpdateTest(const HybridEmbeddingConfig config, size_t batch_size, bool single_node, - size_t seed = 1234ll) - : HybridEmbeddingUnitTest(config, batch_size, seed), - single_node(single_node) {} - - void run() { - uint32_t local_batch_size = ceildiv(this->batch_size, this->num_instances); - - HybridEmbeddingCpu cpu_embedding(this->config, this->batch_size, - this->category_location, this->samples); - cpu_embedding.calculate_frequent_network_cache_indices(); - cpu_embedding.generate_embedding_vectors(); - cpu_embedding.generate_gradients(); - cpu_embedding.frequent_reduce_gradients(); - - /* Tensors for the gradients (single-node) */ - std::shared_ptr> buff = GeneralBuffer2::create(); - std::vector> gradients(this->num_instances); - if (single_node) { - for (size_t i = 0; i < this->num_instances; i++) { - buff->reserve({local_batch_size * this->config.num_tables, this->config.embedding_vec_size}, - &gradients[i]); - } - } - buff->allocate(); - - /* Frequent update_model */ - this->build_frequent(); - std::vector> updated_vectors(this->num_instances); - std::vector frequent_partial_gradients_pointers(this->num_instances); - for (size_t i = 0; i < this->num_instances; i++) { - this->get_frequent_embedding(i).set_current_indices(&this->frequent_embedding_indices[i]); - upload_tensor(cpu_embedding.frequent_embedding_vectors[i], - this->get_frequent_embedding_data(i).frequent_embedding_vectors_, this->stream); - if (single_node) { - upload_tensor(cpu_embedding.gradients[i], gradients[i], this->stream); - frequent_partial_gradients_pointers[i] = - 
this->get_frequent_embedding_data(i).get_gradients().get_ptr(); - } else - upload_tensor(cpu_embedding.reduced_gradients, - this->get_frequent_embedding_data(i).get_gradients(), this->stream); - } - for (size_t i = 0; i < this->num_instances; i++) { - if (single_node) { - this->get_frequent_embedding(i).indices_->calculate_cache_masks(this->stream); - this->get_frequent_embedding(i).indices_->calculate_network_cache_indices(this->stream); - this->get_frequent_embedding(i).indices_->calculate_model_cache_indices(80, this->stream); - this->get_frequent_embedding(i).indices_->calculate_frequent_sample_indices(this->stream); - this->frequent_embeddings_single_node[i].local_reduce(gradients[i].get_ptr(), this->stream); - } else { - this->frequent_embeddings_multi_node[i].update_model(this->dev_lr, 1.f, this->stream); - } - } - for (size_t i = 0; i < this->num_instances; i++) { - if (single_node) { - HCTR_LIB_THROW(cudaMemcpyAsync( - this->frequent_embeddings_single_node[i].partial_gradients_pointers_.get_ptr(), - frequent_partial_gradients_pointers.data(), this->num_instances * sizeof(emtype *), - cudaMemcpyHostToDevice, this->stream)); - this->frequent_embeddings_single_node[i].update_model_direct(this->dev_lr, 1.f, - this->stream); - } - download_tensor(updated_vectors[i], - this->get_frequent_embedding_data(i).frequent_embedding_vectors_, - this->stream); - } - - /* Reference update_model */ - if (single_node) - cpu_embedding.frequent_update_single_node(); - else - cpu_embedding.frequent_update(); - - /* Compare */ - for (size_t i = 0; i < this->num_instances; i++) { - updated_vectors[i].resize(this->config.num_frequent * this->config.embedding_vec_size); - EXPECT_THAT(updated_vectors[i], - ::testing::Pointwise(::testing::FloatNear(5e-2), - cpu_embedding.frequent_embedding_vectors[i])); - } - } -}; - -/**************************** Test instantiations ****************************/ - -static const HybridEmbeddingConfig config_uint32 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_int64 = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_uint32_single_node = { - 1, 8, 10, 128, 1000, 128, 0.5f, CommunicationType::NVLink_SingleNode}; -static const HybridEmbeddingConfig config_int64_single_node = { - 1, 8, 10, 128, 1000, 128, 0.5f, CommunicationType::NVLink_SingleNode}; - -// Edge cases: no frequent, all frequent -static const HybridEmbeddingConfig config_no_freq = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_all_freq = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink}; -static const HybridEmbeddingConfig config_no_freq_single_node = { - 1, 8, 10, 128, 1000, 0, 0.5f, CommunicationType::NVLink_SingleNode}; -static const HybridEmbeddingConfig config_all_freq_single_node = { - 1, 8, 10, 128, 1000, 1000, 0.5f, CommunicationType::NVLink_SingleNode}; - -// Hierarchical A2A -static const HybridEmbeddingConfig config_uint32_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_int64_hier = { - 4, 32, 10, 128, 1000, 128, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_no_freq_hier = { - 4, 32, 10, 128, 1000, 0, 0.5f, CommunicationType::IB_NVLink_Hier}; -static const HybridEmbeddingConfig config_all_freq_hier = { - 4, 32, 10, 128, 1000, 1000, 0.5f, CommunicationType::IB_NVLink_Hier}; - -/* 
hybrid_embedding_infrequent_update_test */ - -TEST(hybrid_embedding_infrequent_update_test, uint32_half_64) { - InfrequentUpdateTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, int64_half_64) { - InfrequentUpdateTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, uint32_half_2048) { - InfrequentUpdateTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, int64_half_2048) { - InfrequentUpdateTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, uint32_float_64) { - InfrequentUpdateTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, int64_float_64) { - InfrequentUpdateTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, uint32_float_2048) { - InfrequentUpdateTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, int64_float_2048) { - InfrequentUpdateTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, uint32_float_128_no_freq) { - InfrequentUpdateTest(config_no_freq, 128, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_test, uint32_float_128_all_freq) { - InfrequentUpdateTest(config_all_freq, 128, false).run(); -} - -/* hybrid_embedding_infrequent_update_single_node_test */ - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_half_64) { - InfrequentUpdateTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, int64_half_64) { - InfrequentUpdateTest(config_int64_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_half_2048) { - InfrequentUpdateTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, int64_half_2048) { - InfrequentUpdateTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_float_64) { - InfrequentUpdateTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, int64_float_64) { - InfrequentUpdateTest(config_int64_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_float_2048) { - InfrequentUpdateTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, int64_float_2048) { - InfrequentUpdateTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_float_128_no_freq) { - InfrequentUpdateTest(config_no_freq_single_node, 128, true).run(); -} - -TEST(hybrid_embedding_infrequent_update_single_node_test, uint32_float_128_all_freq) { - InfrequentUpdateTest(config_all_freq_single_node, 128, true).run(); -} - -/* hybrid_embedding_infrequent_update_hier_test */ - -TEST(hybrid_embedding_infrequent_update_hier_test, uint32_half_64) { - InfrequentUpdateTest(config_uint32_hier, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, int64_half_64) { - InfrequentUpdateTest(config_int64_hier, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, uint32_half_2048) { - InfrequentUpdateTest(config_uint32_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, int64_half_2048) { - InfrequentUpdateTest(config_int64_hier, 2048, false).run(); -} - 
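
The update tests above and below all key off the category_location layout produced by Model::init_hybrid_model: two entries per category, where a frequent category stores num_instances in its first slot and its position in frequent_categories in the second, while an infrequent category stores the owning model instance and its local buffer slot. A minimal decoding sketch follows, assuming exactly that layout; the helper and struct names are illustrative only and are not part of HugeCTR.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical helper, for illustration only: decodes the two-entry-per-category
    // category_location array that the removed tests assert against.
    struct CategoryPlacement {
      bool frequent;          // true when the first entry equals num_instances
      uint32_t instance;      // owning model instance (infrequent categories only)
      uint32_t buffer_index;  // frequent: index into frequent_categories;
                              // infrequent: slot in the owning instance's buffer
    };

    inline CategoryPlacement decode_category_location(
        const std::vector<uint32_t> &category_location, size_t category,
        uint32_t num_instances) {
      const uint32_t first = category_location[2 * category];
      const uint32_t second = category_location[2 * category + 1];
      if (first == num_instances) {
        return {true, 0u, second};
      }
      return {false, first, second};
    }

Infrequent categories are distributed round-robin, so the i-th infrequent category is expected on instance i % num_instances at buffer slot i / num_instances, which is exactly what the consistency loops in the removed model_test.cpp assert.
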
-TEST(hybrid_embedding_infrequent_update_hier_test, uint32_float_64) { - InfrequentUpdateTest(config_uint32_hier, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, int64_float_64) { - InfrequentUpdateTest(config_int64_hier, 64, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, uint32_float_2048) { - InfrequentUpdateTest(config_uint32_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, int64_float_2048) { - InfrequentUpdateTest(config_int64_hier, 2048, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, uint32_float_128_no_freq) { - InfrequentUpdateTest(config_no_freq_hier, 128, false).run(); -} - -TEST(hybrid_embedding_infrequent_update_hier_test, uint32_float_128_all_freq) { - InfrequentUpdateTest(config_all_freq_hier, 128, false).run(); -} - -/* hybrid_embedding_frequent_update_test */ - -TEST(hybrid_embedding_frequent_update_test, uint32_half_64) { - FrequentUpdateTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, int64_half_64) { - FrequentUpdateTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, uint32_half_2048) { - FrequentUpdateTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, int64_half_2048) { - FrequentUpdateTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, uint32_float_64) { - FrequentUpdateTest(config_uint32, 64, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, int64_float_64) { - FrequentUpdateTest(config_int64, 64, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, uint32_float_2048) { - FrequentUpdateTest(config_uint32, 2048, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, int64_float_2048) { - FrequentUpdateTest(config_int64, 2048, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, uint32_float_128_no_freq) { - FrequentUpdateTest(config_no_freq, 128, false).run(); -} - -TEST(hybrid_embedding_frequent_update_test, uint32_float_128_all_freq) { - FrequentUpdateTest(config_all_freq, 128, false).run(); -} - -/* hybrid_embedding_frequent_update_single_node_test */ - -TEST(hybrid_embedding_frequent_update_single_node_test, uint32_half_64) { - FrequentUpdateTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, int64_half_64) { - FrequentUpdateTest(config_int64_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, uint32_half_2048) { - FrequentUpdateTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, int64_half_2048) { - FrequentUpdateTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, uint32_float_64) { - FrequentUpdateTest(config_uint32_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, int64_float_64) { - FrequentUpdateTest(config_int64_single_node, 64, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, uint32_float_2048) { - FrequentUpdateTest(config_uint32_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, int64_float_2048) { - FrequentUpdateTest(config_int64_single_node, 2048, true).run(); -} - -TEST(hybrid_embedding_frequent_update_single_node_test, uint32_float_128_no_freq) { - FrequentUpdateTest(config_no_freq_single_node, 128, true).run(); -} - 
-TEST(hybrid_embedding_frequent_update_single_node_test, uint32_float_128_all_freq) { - FrequentUpdateTest(config_all_freq_single_node, 128, true).run(); -} diff --git a/test/utest/embedding/hybrid_sparse_embedding_test.cpp b/test/utest/embedding/hybrid_sparse_embedding_test.cpp deleted file mode 100644 index 0ad6d1dfe1..0000000000 --- a/test/utest/embedding/hybrid_sparse_embedding_test.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#define private public -#define protected public -#include -#include -#include - -using namespace HugeCTR; -using namespace hybrid_embedding; - -namespace { -// const int numprocs = 8; -// const size_t train_batch_size = 55296; -// const size_t evaluate_batch_size = 55296; -const size_t num_iterations_statistics = 100; -const size_t max_num_frequent_categories = 10; -const double p_dup_max = 1. / 100; -const double max_all_reduce_bandwidth = 1.3e11; -const double max_all_to_all_bandwidth = 1.9e11; -const size_t slot_num = 26; -const size_t embedding_vec_size = 128; -std::vector slot_size_array{39884406, 39043, 17289, 7420, 20263, 3, 7120, - 1543, 63, 38532951, 2953546, 403346, 10, 2208, - 11938, 155, 4, 976, 14, 39979771, 25641295, - 39664984, 585935, 12972, 108, 36}; -const float scaler = 1.0f; -const float lr = 0.01f; -const DeviceMap::Layout layout = DeviceMap::LOCAL_FIRST; -template -void print_vector(const std::vector &vec, size_t num_elment, const std::string &vec_name) { - auto log = HCTR_LOG_S(INFO, WORLD); - log << "vector name: " << vec_name << ",vector size: " << vec.size() << std::endl; - for (size_t i = 0; i < std::min(num_elment, vec.size()); ++i) { - log << vec[i] << ","; - } - log << std::endl; -} -template -void hybrid_sparse_embedding_construct(const std::vector &device_list, size_t train_batch_size, - size_t evaluate_batch_size, int numprocs, - hybrid_embedding::CommunicationType communication_type, - hybrid_embedding::HybridEmbeddingType hybrid_embedding_type, - const Optimizer_t &optimizer, const Update_t &update_type) { - // HCTR_LIB_THROW(nvmlInit_v2()); - std::vector> vvgpu; - for (int i = 0; i < numprocs; i++) { - vvgpu.push_back(device_list); - } - - auto resource_manager = ResourceManagerExt::create(vvgpu, (unsigned long long)1234); - size_t total_gpu_count = resource_manager->get_global_gpu_count(); - size_t local_gpu_count = resource_manager->get_local_gpu_count(); - size_t total_categories = 0; - for (size_t i = 0; i < slot_size_array.size(); ++i) { - // slot_size_array[i] = (slot_size_array[i] + 8)/8; - total_categories += slot_size_array[i]; - } - - HybridEmbeddingConfig test_config = { - (size_t)numprocs, - total_gpu_count, - slot_num, - embedding_vec_size, - (TypeKey)total_categories, - (TypeKey)0, // irrelevant here - 1.0 // irrelevant here - }; - HybridEmbeddingInputGenerator generator(test_config, slot_size_array, 848484); - - OptHyperParams hyper_params; - hyper_params.sgd.atomic_update = true; - const 
OptParams opt_params = {optimizer, lr, hyper_params, update_type, scaler}; - const HybridSparseEmbeddingParams embedding_params = { - train_batch_size, - evaluate_batch_size, - num_iterations_statistics, - max_num_frequent_categories * train_batch_size, - p_dup_max, - embedding_vec_size, - slot_num, - slot_size_array, - communication_type, - max_all_reduce_bandwidth, - max_all_to_all_bandwidth, - false, - hybrid_embedding_type, - opt_params}; - - Tensors2 train_input_tensors; - Tensors2 evaluate_input_tensors; - Tensors2 inits; - auto initial_input = - generator.generate_categorical_input(train_batch_size * num_iterations_statistics); - auto input = generator.generate_categorical_input(train_batch_size); - CudaDeviceContext context; - - GpuLearningRateSchedulers lr_scheds; - - for (size_t lgpu = 0; lgpu < local_gpu_count; ++lgpu) { - std::shared_ptr> buf = GeneralBuffer2::create(); - int cur_device = resource_manager->get_local_gpu(lgpu)->get_device_id(); - - context.set_device(cur_device); - - auto stream = resource_manager->get_local_gpu(lgpu)->get_stream(); - - Tensor2 tensor0; - buf->reserve({train_batch_size, slot_num}, &tensor0); - train_input_tensors.push_back(tensor0); - - Tensor2 tensor1; - buf->reserve({evaluate_batch_size, slot_num}, &tensor1); - evaluate_input_tensors.push_back(tensor1); - Tensor2 tensor2; - buf->reserve({train_batch_size * num_iterations_statistics, slot_num}, &tensor2); - inits.push_back(tensor2); - buf->allocate(); - // print_vector(initial_input, 26, "initial_input"); - // print_vector(input, 26, "input"); - upload_tensor(initial_input, inits[lgpu], stream); - upload_tensor(input, train_input_tensors[lgpu], stream); - - lr_scheds.emplace_back( - new GpuLearningRateScheduler(lr, 1, 0, 1, 2.f, 0.f, resource_manager->get_local_gpu(lgpu))); - } - HCTR_LOG_S(INFO, WORLD) << "hybridEmbdeding" << std::endl; - std::vector>> placeholder( - resource_manager->get_local_gpu_count(), NULL); - std::unique_ptr> embedding( - new HybridSparseEmbedding(train_input_tensors, evaluate_input_tensors, - embedding_params, placeholder, lr_scheds, false, - resource_manager)); - HCTR_LOG_S(INFO, WORLD) << "init_model" << std::endl; - embedding->init_model(inits); - // HCTR_LOG_S(DEBUG, WORLD) << "forward" << std::endl; - HCTR_LOG_S(INFO, WORLD) << "batch size = " << train_batch_size << std::endl; - HCTR_LOG_S(INFO, WORLD) << "total_categories = " << total_categories - << ", num_frequent = " << embedding->model_[0].num_frequent << std::endl; - for (size_t lgpu = 0; lgpu < local_gpu_count; ++lgpu) { - HCTR_LOG_S(INFO, WORLD) << "GPU[" << lgpu << "]" - << " num_infrequent = " - << embedding->model_[lgpu].h_infrequent_model_table_offsets[slot_num] - << std::endl; - } - - std::chrono::time_point check; - for (int j = 0; j < 10000; ++j) { - for (int i = 0; i < int(resource_manager->get_local_gpu_count()); i++) { - auto device_id = resource_manager->get_local_gpu(i)->get_device_id(); - context.set_device(device_id); - HCTR_LIB_THROW(cudaDeviceSynchronize()); - } - if (j % 100 == 0) { - auto cost = std::chrono::duration_cast( - std::chrono::steady_clock::now() - check) - .count() / - 1000000.0; - HCTR_LOG_S(INFO, ROOT) << "100 iter time: " << cost << std::endl; - check = std::chrono::steady_clock::now(); - } - - embedding->forward(true); - // HCTR_LOG_S(DEBUG, WORLD) << i << ": fwd" << std::endl; - embedding->backward(); - // HCTR_LOG_S(DEBUG, WORLD) << i << ": bwd" << std::endl; - embedding->update_params(); - // HCTR_LOG_S(DEBUG, WORLD) << i << ": update" << std::endl; - // 
HCTR_LOG_S(DEBUG, WORLD) << "forward, i = " << i << std::endl; - } - // HCTR_LOG_S(DEBUG, WORLD) << "backward" << std::endl; -} - -} // namespace - -// TEST(hybrid_sparse_embedding_profile, multi_node_uin32_float) { -// std::vector local_batch_sizes{1024, 2048, 3072, 4096, 6144, 8192}; -// // std::vector local_batch_sizes{6912}; -// size_t num_procs = 8; -// for (auto local_batch : local_batch_sizes) { -// hybrid_sparse_embedding_construct( -// {0}, local_batch * num_procs, local_batch * num_procs, num_procs, -// hybrid_embedding::CommunicationType::IB_NVLink, -// hybrid_embedding::HybridEmbeddingType::Distributed, Optimizer_t::SGD, Update_t::Local); -// } -// } - -// TEST(hybrid_sparse_embedding_profile, single_node_uin32_float) { -// std::vector local_batch_sizes{1024, 2048, 3072, 4096, 6144, 8192}; -// // std::vector local_batch_sizes{6912}; -// size_t num_procs = 1; -// for (auto local_batch : local_batch_sizes) { -// hybrid_sparse_embedding_construct( -// {0, 1, 2, 3, 4, 5, 6, 7}, local_batch * 8, local_batch * 8, num_procs, -// hybrid_embedding::CommunicationType::NVLink_SingleNode, -// hybrid_embedding::HybridEmbeddingType::Distributed, Optimizer_t::SGD, Update_t::Local); -// } -// } diff --git a/test/utest/embedding/localized_slot_sparse_embedding_hash_test.cu b/test/utest/embedding/localized_slot_sparse_embedding_hash_test.cu index 378fa84cb1..305422795e 100644 --- a/test/utest/embedding/localized_slot_sparse_embedding_hash_test.cu +++ b/test/utest/embedding/localized_slot_sparse_embedding_hash_test.cu @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -209,7 +209,7 @@ void train_and_test(const std::vector &device_list, const Optimizer_t &opti for (int i = 0; i < numprocs; i++) { vvgpu.push_back(device_list); } - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); if (resource_manager->is_master_process()) { HCTR_LOG_S(DEBUG, WORLD) << "rank " << resource_manager->get_process_id() << " is generating data" << std::endl; @@ -550,7 +550,7 @@ void load_and_dump(const std::vector &device_list, const Optimizer_t &optim std::vector> vvgpu; vvgpu.push_back(device_list); - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); // re-generate the dataset files { @@ -755,7 +755,7 @@ void load_and_dump_file(const std::vector &device_list, const Optimizer_t & for (int i = 0; i < numprocs; i++) { vvgpu.push_back(device_list); } - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); + const auto &resource_manager = ResourceManagerCore::create(vvgpu, 0); if (pid == 0) { // re-generate the dataset files diff --git a/test/utest/embedding/localized_slot_sparse_embedding_one_hot_test.cu b/test/utest/embedding/localized_slot_sparse_embedding_one_hot_test.cu deleted file mode 100644 index 9b853baebc..0000000000 --- a/test/utest/embedding/localized_slot_sparse_embedding_one_hot_test.cu +++ /dev/null @@ -1,847 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace embedding_test; - -namespace { - -//--------------------------------------------------------------------------------------- -// global params for all testing -const int train_batch_num = 10; // can not more than 32 -const int test_batch_num = 1; -const int train_batchsize = 1024; -const int test_batchsize = 2560; -const int slot_num = 26; -const int max_nnz_per_slot = 1; -const int max_feature_num = max_nnz_per_slot * slot_num; // max_feature_num in a sample -const long long vocabulary_size = slot_num * 100; -const int embedding_vec_size = 128; -const int combiner = 0; // 0-sum, 1-mean -const long long label_dim = 1; -const long long dense_dim = 0; -typedef long long T; -using SparseTensor23s = std::vector; - -const float scaler = 1.0f; // used in mixed precision training -const float lr = 0.01f; - -// In order to not allocate the total size of hash table on each GPU, the users need to set the -// size of max_vocabulary_size_per_gpu, which should be more than vocabulary_size/gpu_count, -// eg: 1.25x of that. - -const int num_chunk_threads = 1; // must be 1 for CPU and GPU results comparison -const int num_files = 1; -const Check_t CHK = Check_t::Sum; // Check_t::Sum -const char *train_file_list_name = "train_file_list.txt"; -const char *test_file_list_name = "test_file_list.txt"; - -const char *train_file_list_parquet_name = "train_file_list_parquet.txt"; -const char *test_file_list_parquet_name = "test_file_list_parquet.txt"; - -const char *prefix = "./data_reader_test_data/temp_dataset_"; - -const char *sparse_model_file = "localized_hash_table"; - -// std::vector slot_sizes; // null means use vocabulary_size/gpu_count/load_factor as -// max_vocabulary_size_per_gpu - -// CAUTION: must match vocabulary_size -// std::vector slot_sizes = {39884406,39043,17289,7420,20263,3,7120,1543,63,38532951, -// 2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36}; // -// for cretio dataset -std::vector slot_sizes = {100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100}; // just for verify - -//----------------------------------------------------------------------------------------- - -auto load_sparse_model_to_map = [](std::vector &key_vec, std::vector &slot_vec, - std::vector &vec_vec, const std::string &sparse_model) { - const std::string key_file(sparse_model + "/key"); - const std::string slot_file(sparse_model + "/slot_id"); - const std::string vec_file(sparse_model + "/emb_vector"); - - std::ifstream fs_key(key_file, std::ifstream::binary); - std::ifstream fs_slot(slot_file, std::ifstream::binary); - std::ifstream fs_vec(vec_file, std::ifstream::binary); - - const size_t key_file_size_in_B = std::filesystem::file_size(key_file); - const size_t slot_file_size_in_B = std::filesystem::file_size(slot_file); - const size_t vec_file_size_in_B = 
std::filesystem::file_size(vec_file); - - const long long num_key = key_file_size_in_B / sizeof(long long); - const long long num_slot = slot_file_size_in_B / sizeof(size_t); - const long long num_vec = vec_file_size_in_B / (sizeof(float) * embedding_vec_size); - if (num_key != num_vec || num_key != num_slot || num_key != vocabulary_size) { - HCTR_OWN_THROW(Error_t::BrokenFile, - "num_key != num_vec (num_slot) || num_key != vocabulary_size"); - } - key_vec.clear(); - key_vec.resize(num_key); - slot_vec.clear(); - slot_vec.resize(num_key); - vec_vec.clear(); - vec_vec.resize(num_vec * embedding_vec_size); - - using TypeKey = typename std::decay::type; - if (std::is_same::value) { - fs_key.read(reinterpret_cast(key_vec.data()), key_file_size_in_B); - } else { - std::vector i64_key_vec(num_key, 0); - fs_key.read(reinterpret_cast(i64_key_vec.data()), key_file_size_in_B); - std::transform(i64_key_vec.begin(), i64_key_vec.end(), key_vec.begin(), - [](long long key) { return static_cast(key); }); - } - fs_slot.read(reinterpret_cast(slot_vec.data()), slot_file_size_in_B); - fs_vec.read(reinterpret_cast(vec_vec.data()), vec_file_size_in_B); -}; - -void init_sparse_model(const char *sparse_model) { - HCTR_LOG_S(DEBUG, WORLD) << "Init hash table" << std::endl; - // init hash table file: - if (!std::filesystem::exists(sparse_model)) { - std::filesystem::create_directories(sparse_model); - } - const std::string key_file = std::string(sparse_model) + "/key"; - const std::string slot_file = std::string(sparse_model) + "/slot_id"; - const std::string vec_file = std::string(sparse_model) + "/emb_vector"; - std::ofstream fs_key(key_file); - std::ofstream fs_slot(slot_file); - std::ofstream fs_vec(vec_file); - if (!fs_key.is_open() || !fs_slot.is_open() || !fs_vec.is_open()) { - HCTR_LOG_S(ERROR, WORLD) << "File not open for writing. 
" << HCTR_LOCATION() << std::endl; - } - - // UnifiedDataSimulator ldata_sim(0, slot_num-1); // for slot_id - test::UniformDataSimulator fdata_sim; // for value - std::unique_ptr buf(new float[embedding_vec_size]); - for (long long i = 0; i < vocabulary_size; i++) { - T key = (T)i; - // T key = ldata_sim.get_num(); - // CAUTION: can not set random keys here, because we need to ensure that: - // 1) we can find keys in the data file from this hash table - // 2) there are no repeated keys - fs_key.write((char *)&key, sizeof(T)); - T slot_id; - if (slot_sizes.size() == 0) { - // slot_id = key % slot_num; // CAUTION: need to dedicate the slot_id for each key for - // // correctness verification - HCTR_OWN_THROW( - Error_t::WrongInput, - "Must set slot_sizes since there is no hashtable in LocalizedSlotSpasrseEmbeddingOneHot"); - } else { - size_t offset = 0; - for (size_t j = 0; j < slot_sizes.size(); j++) { - if ((key >= static_cast(offset)) && (key < static_cast(offset + slot_sizes[j]))) { - slot_id = (T)j; - break; - } - offset += slot_sizes[j]; - } - } - fs_slot.write((char *)&slot_id, sizeof(T)); - // float val = (float)i; - // float val = 0.1f; - fdata_sim.fill(buf.get(), embedding_vec_size, -0.1f, 0.1f); - fs_vec.write(reinterpret_cast(buf.get()), embedding_vec_size * sizeof(float)); - } - HCTR_LOG_S(DEBUG, WORLD) << " Done" << std::endl; -} - -template -void train_and_test(const std::vector &device_list, const Optimizer_t &optimizer, - const Update_t &update_type, - const DeviceMap::Layout layout = DeviceMap::LOCAL_FIRST) { - OptHyperParams hyper_params; - hyper_params.sgd.atomic_update = true; - const OptParams opt_params = {optimizer, lr, hyper_params, update_type, scaler}; - float tolerance; - if (std::is_same::value) { - tolerance = 5e-3f; - } else { - tolerance = 1e-4f; - } - - test::mpi_init(); - const int numprocs{core23::MpiInitService::get().world_size()}; - - // if there are multi-node, we assume each node has the same gpu device_list - std::vector> vvgpu; - for (int i = 0; i < numprocs; i++) { - vvgpu.push_back(device_list); - } - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0, layout); - - if (resource_manager->is_master_process()) { - HCTR_LOG_S(DEBUG, WORLD) << "rank " << resource_manager->get_process_id() - << " is generating data" << std::endl; - { - // re-generate the dataset files - std::ifstream file(train_file_list_name); - if (file.good()) { - std::remove(train_file_list_name); - } - } - { - // re-generate the dataset files - std::ifstream file(test_file_list_name); - if (file.good()) { - std::remove(test_file_list_name); - } - } - std::vector>> train_generated_value; - std::vector>> train_generated_rowoffset; - std::vector>> train_generated_label; - std::vector>> train_generated_dense; - - std::vector>> test_generated_value; - std::vector>> test_generated_rowoffset; - std::vector>> test_generated_label; - std::vector>> test_generated_dense; - // data generation: key's corresponding slot_id=(key%slot_num) - // TODO currently, generate norm file for CPU reference while parquet for GPU reader - if (slot_sizes.size() > 0) { - HugeCTR::data_generation_for_localized_test( - train_file_list_name, prefix, num_files, train_batch_num * train_batchsize, slot_num, - vocabulary_size, label_dim, dense_dim, max_nnz_per_slot, slot_sizes, false, 0.0, - &train_generated_value, &train_generated_rowoffset, &train_generated_label, - &train_generated_dense); - HugeCTR::data_generation_for_localized_test( - test_file_list_name, prefix, num_files, test_batch_num * 
test_batchsize, slot_num, - vocabulary_size, label_dim, dense_dim, max_nnz_per_slot, slot_sizes, false, 0.0, - &test_generated_value, &test_generated_rowoffset, &test_generated_label, - &test_generated_dense); - } else { - HCTR_OWN_THROW( - Error_t::WrongInput, - "Must set slot_sizes since there is no hashtable in LocalizedSlotSpasrseEmbeddingOneHot"); - } - HugeCTR::data_generation_for_parquet(train_file_list_parquet_name, prefix, - train_generated_value, train_generated_rowoffset, - train_generated_label, train_generated_dense); - HugeCTR::data_generation_for_parquet(test_file_list_parquet_name, prefix, - test_generated_value, test_generated_rowoffset, - test_generated_label, test_generated_dense); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); - HCTR_LOG_S(DEBUG, WORLD) << "This is rank: " << resource_manager->get_process_id() << std::endl; -#endif - - // setup a data reader - const DataReaderSparseParam param = {"localized", max_nnz_per_slot, true, slot_num}; - std::vector params; - params.push_back(param); - - std::unique_ptr> train_data_reader( - new DataReader(train_batchsize, label_dim, dense_dim, params, resource_manager, true, - num_chunk_threads, false)); - - train_data_reader->create_drwg_parquet( - train_file_list_parquet_name, false, std::vector(slot_num, 0), true, - std::max(train_batch_num * train_batchsize, test_batch_num * test_batchsize), - label_dim + dense_dim, label_dim + dense_dim); - - std::unique_ptr> test_data_reader( - new DataReader(test_batchsize, label_dim, dense_dim, params, resource_manager, true, - num_chunk_threads, false)); - - test_data_reader->create_drwg_parquet( - test_file_list_parquet_name, false, std::vector(slot_num, 0), true, - std::max(train_batch_num * train_batchsize, test_batch_num * test_batchsize), - label_dim + dense_dim, label_dim + dense_dim); - - // generate hashtable - if (resource_manager->is_master_process()) { - init_sparse_model(sparse_model_file); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - const SparseEmbeddingHashParams embedding_params = {train_batchsize, - test_batchsize, - 0, - slot_sizes, - embedding_vec_size, - max_feature_num, - slot_num, - combiner, - opt_params, - true, - false}; - - SparseTensor23s train_input = train_data_reader->get_sparse_tensor23s("localized"); - SparseTensor23s test_input = test_data_reader->get_sparse_tensor23s("localized"); - - std::unique_ptr> embedding( - new LocalizedSlotSparseEmbeddingOneHot( - core_helper::convert_sparse_tensors23_to_sparse_tensors(train_input), - core_helper::convert_sparse_tensors23_to_sparse_tensors(test_input), embedding_params, - resource_manager)); - - // upload hash table to device - embedding->load_parameters(sparse_model_file); - - // for SparseEmbeddingCpu - std::unique_ptr> embedding_cpu( - new SparseEmbeddingHashCpu( - train_batchsize, max_feature_num, vocabulary_size, embedding_vec_size, slot_num, - label_dim, dense_dim, CHK, train_batch_num * train_batchsize, combiner, opt_params, - train_file_list_name, sparse_model_file, SparseEmbedding_t::Localized)); - - TypeEmbeddingComp *embedding_feature_from_cpu = embedding_cpu->get_forward_results(); - TypeEmbeddingComp *wgrad_from_cpu = embedding_cpu->get_backward_results(); - T *hash_table_key_from_cpu = embedding_cpu->get_hash_table_key_ptr(); - float *hash_table_value_from_cpu = embedding_cpu->get_hash_table_value_ptr(); - - // for results check - std::shared_ptr> buf = GeneralBuffer2::create(); - - Tensor2 embedding_feature_from_gpu; - 
buf->reserve({train_batchsize * slot_num * embedding_vec_size}, &embedding_feature_from_gpu); - Tensor2 wgrad_from_gpu; - buf->reserve({train_batchsize * slot_num * embedding_vec_size}, &wgrad_from_gpu); - Tensor2 embedding_feature_from_gpu_eval; - buf->reserve({test_batchsize * slot_num * embedding_vec_size}, &embedding_feature_from_gpu_eval); - - buf->allocate(); - - typedef struct TypeHashValue_ { - float data[embedding_vec_size]; - } TypeHashValue; - - for (int i = 0; i < train_batch_num; i++) { - HCTR_LOG(INFO, WORLD, "Rank%d: Round %d start training:\n", resource_manager->get_process_id(), - i); - - // call read a batch - HCTR_LOG(INFO, WORLD, "Rank%d: data_reader->read_a_batch_to_device()\n", - resource_manager->get_process_id()); - train_data_reader->read_a_batch_to_device(); - - // GPU forward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding->forward()\n", resource_manager->get_process_id()); - embedding->forward(true); - - // check the result of forward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding->get_forward_results()\n", - resource_manager->get_process_id()); - embedding->get_forward_results(true, embedding_feature_from_gpu); // memcpy from GPU to CPU - - if (resource_manager->is_master_process()) { - // CPU forward - HCTR_LOG(INFO, WORLD, "Rank0: embedding_cpu->forward()\n"); - embedding_cpu->forward(); - - HCTR_LOG(INFO, WORLD, "Rank0: check forward results\n"); - ASSERT_TRUE(compare_embedding_feature(train_batchsize * slot_num * embedding_vec_size, - embedding_feature_from_gpu.get_ptr(), - embedding_feature_from_cpu, tolerance)); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - // GPU backward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding->backward()\n", resource_manager->get_process_id()); - embedding->backward(); - - // check the result of backward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding->get_backward_results()\n", - resource_manager->get_process_id()); - embedding->get_backward_results(wgrad_from_gpu, 0); - - if (resource_manager->is_master_process()) { - // CPU backward - HCTR_LOG(INFO, WORLD, "Rank0: embedding_cpu->backward()\n"); - embedding_cpu->backward(); - - HCTR_LOG(INFO, WORLD, "Rank0: check backward results: GPU and CPU\n"); - ASSERT_TRUE(compare_wgrad(train_batchsize * slot_num * embedding_vec_size, - wgrad_from_gpu.get_ptr(), wgrad_from_cpu, tolerance)); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - // GPU update_params - HCTR_LOG(INFO, WORLD, "Rank%d: embedding->update_params()\n", - resource_manager->get_process_id()); - embedding->update_params(); - - if (resource_manager->is_master_process()) { - // CPU update_params - HCTR_LOG(INFO, WORLD, "Rank0: embedding_cpu->update_params()\n"); - embedding_cpu->update_params(); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - HCTR_LOG(INFO, WORLD, "Rank%d: Round %d end:\n", resource_manager->get_process_id(), i); - } - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // create new obj for eval() - embedding->dump_parameters(sparse_model_file); - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - // for SparseEmbeddingCpu eval - std::unique_ptr> test_embedding_cpu( - new SparseEmbeddingHashCpu( - test_batchsize, max_feature_num, vocabulary_size, embedding_vec_size, slot_num, label_dim, - dense_dim, CHK, test_batch_num * test_batchsize, combiner, opt_params, - test_file_list_name, sparse_model_file, 
SparseEmbedding_t::Localized)); - - TypeEmbeddingComp *embedding_feature_from_cpu_eval = test_embedding_cpu->get_forward_results(); - - { - ///////////////////////////////////////////////////////////////////////////////////////////// - // eval - HCTR_LOG(INFO, WORLD, "\nRank%d: Round start eval:\n", resource_manager->get_process_id()); - - // call read a batch - HCTR_LOG(INFO, WORLD, "Rank%d: data_reader_eval->read_a_batch_to_device()\n", - resource_manager->get_process_id()); - test_data_reader->read_a_batch_to_device(); - - // GPU forward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding_eval->forward()\n", - resource_manager->get_process_id()); - embedding->forward(false); - - // check the result of forward - HCTR_LOG(INFO, WORLD, "Rank%d: embedding_eval->get_forward_results()\n", - resource_manager->get_process_id()); - embedding->get_forward_results(false, - embedding_feature_from_gpu_eval); // memcpy from GPU to CPU - - if (resource_manager->is_master_process()) { - // CPU forward - HCTR_LOG(INFO, WORLD, "Rank0: embedding_cpu_eval->forward()\n"); - test_embedding_cpu->forward(); - - HCTR_LOG(INFO, WORLD, "Rank0: check forward results\n"); - ASSERT_TRUE(compare_embedding_feature(test_batchsize * slot_num * embedding_vec_size, - embedding_feature_from_gpu_eval.get_ptr(), - embedding_feature_from_cpu_eval, tolerance)); - } - } - - test::mpi_finalize(); -} - -template -void load_and_dump(const std::vector &device_list, const Optimizer_t &optimizer, - const Update_t &update_type) { - float tolerance = 1e-4f; - OptHyperParams hyper_params; - hyper_params.sgd.atomic_update = true; - const OptParams opt_params = {optimizer, lr, hyper_params, update_type, scaler}; - std::vector> vvgpu; - vvgpu.push_back(device_list); - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); - - // re-generate the dataset files - { - std::ifstream fs(train_file_list_name); - if (fs.good()) { - std::remove(train_file_list_name); - } - } - std::vector>> train_generated_value; - std::vector>> train_generated_rowoffset; - std::vector>> train_generated_label; - std::vector>> train_generated_dense; - - // data generation: key's corresponding slot_id=(key%slot_num) - if (slot_sizes.size() > 0) { - HugeCTR::data_generation_for_localized_test( - train_file_list_name, prefix, num_files, train_batch_num * train_batchsize, slot_num, - vocabulary_size, label_dim, dense_dim, max_nnz_per_slot, slot_sizes, false, 0.0, - &train_generated_value, &train_generated_rowoffset, &train_generated_label, - &train_generated_dense); - } else { - HCTR_OWN_THROW( - Error_t::WrongInput, - "Must set slot_sizes since there is no hashtable in LocalizedSlotSpasrseEmbeddingOneHot"); - HugeCTR::data_generation_for_parquet(train_file_list_parquet_name, prefix, - train_generated_value, train_generated_rowoffset, - train_generated_label, train_generated_dense); - } - - // setup a data reader - const DataReaderSparseParam param = {"localized", max_nnz_per_slot, true, slot_num}; - std::vector params; - params.push_back(param); - - std::unique_ptr> train_data_reader( - new DataReader(train_batchsize, label_dim, dense_dim, params, resource_manager, true, - num_chunk_threads, false)); - - train_data_reader->create_drwg_parquet( - train_file_list_parquet_name, false, std::vector(slot_num, 0), true, - std::max(train_batch_num * train_batchsize, test_batch_num * test_batchsize), - label_dim + dense_dim, label_dim + dense_dim); - - // generate hashtable - init_sparse_model(sparse_model_file); - - const SparseEmbeddingHashParams embedding_params = 
{train_batchsize, - test_batchsize, - 0, - slot_sizes, - embedding_vec_size, - max_feature_num, - slot_num, - combiner, - opt_params, - true, - false}; - - SparseTensor23s train_input = train_data_reader->get_sparse_tensor23s("localized"); - - std::unique_ptr> embedding( - new LocalizedSlotSparseEmbeddingOneHot( - core_helper::convert_sparse_tensors23_to_sparse_tensors(train_input), - core_helper::convert_sparse_tensors23_to_sparse_tensors(train_input), embedding_params, - resource_manager)); - - // upload hash table to device - embedding->load_parameters(sparse_model_file); - - HCTR_LOG(INFO, WORLD, "max_vocabulary_size=%zu, vocabulary_size=%zu\n", - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - std::shared_ptr> blobs_buff = - GeneralBuffer2::create(); - - Tensor2 keys; - blobs_buff->reserve({embedding->get_max_vocabulary_size()}, &keys); - - Tensor2 slot_id; - blobs_buff->reserve({embedding->get_max_vocabulary_size()}, &slot_id); - - Tensor2 embeddings; - blobs_buff->reserve({embedding->get_max_vocabulary_size(), embedding_vec_size}, &embeddings); - - blobs_buff->allocate(); - - BufferBag buf_bag; - buf_bag.keys = keys.shrink(); - buf_bag.slot_id = slot_id.shrink(); - buf_bag.embedding = embeddings; - - size_t dump_size; - embedding->dump_parameters(buf_bag, &dump_size); - - HCTR_LOG(INFO, WORLD, "dump_size=%zu, max_vocabulary_size=%zu, vocabulary_size=%zu\n", dump_size, - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - embedding->dump_parameters(buf_bag, &dump_size); - - HCTR_LOG(INFO, WORLD, "dump_size=%zu, max_vocabulary_size=%zu, vocabulary_size=%zu\n", dump_size, - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - embedding->reset(); - - HCTR_LOG(INFO, WORLD, "max_vocabulary_size=%zu, vocabulary_size=%zu\n", - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - embedding->load_parameters(buf_bag, dump_size); - - HCTR_LOG(INFO, WORLD, "max_vocabulary_size=%zu, vocabulary_size=%zu\n", - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - embedding->dump_parameters(buf_bag, &dump_size); - - HCTR_LOG(INFO, WORLD, "dump_size=%zu, max_vocabulary_size=%zu, vocabulary_size=%zu\n", dump_size, - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - - std::string tmp_sparse_model_file{"tmp_sparse_model"}; - embedding->dump_parameters(tmp_sparse_model_file); - - std::vector hash_table_key_from_cpu; - std::vector slot_id_from_cpu; - std::vector hash_table_value_from_cpu; - load_sparse_model_to_map(hash_table_key_from_cpu, slot_id_from_cpu, hash_table_value_from_cpu, - sparse_model_file); - - std::vector hash_table_key_from_gpu; - std::vector slot_id_from_gpu; - std::vector hash_table_value_from_gpu; - load_sparse_model_to_map(hash_table_key_from_gpu, slot_id_from_gpu, hash_table_value_from_gpu, - tmp_sparse_model_file); - - typedef struct TypeHashValue_ { - float data[embedding_vec_size]; - } TypeHashValue; - - ASSERT_TRUE(compare_hash_table( - vocabulary_size, hash_table_key_from_gpu.data(), - reinterpret_cast(hash_table_value_from_gpu.data()), - hash_table_key_from_cpu.data(), - reinterpret_cast(hash_table_value_from_cpu.data()), tolerance)); - - ASSERT_TRUE(compare_key_slot(vocabulary_size, hash_table_key_from_gpu.data(), - slot_id_from_gpu.data(), hash_table_key_from_cpu.data(), - slot_id_from_cpu.data())); -} - -template -void load_and_dump_file(const std::vector &device_list, const Optimizer_t &optimizer, - const Update_t 
&update_type) { - std::string sparse_model_src("sparse_model_src"); - std::string sparse_model_dst("sparse_model_dst"); - - float tolerance = 1e-4f; - OptHyperParams hyper_params; - hyper_params.sgd.atomic_update = true; - const OptParams opt_params = {optimizer, lr, hyper_params, update_type, scaler}; - - int numprocs = 1, pid = 0; - std::vector> vvgpu; - test::mpi_init(); - for (int i = 0; i < numprocs; i++) { - vvgpu.push_back(device_list); - } - const auto &resource_manager = ResourceManagerExt::create(vvgpu, 0); - - if (pid == 0) { - // re-generate the dataset files - if (std::filesystem::exists(train_file_list_name)) { - std::filesystem::remove(train_file_list_name); - } - std::vector>> train_generated_value; - std::vector>> train_generated_rowoffset; - std::vector>> train_generated_label; - std::vector>> train_generated_dense; - // data generation: key's corresponding slot_id=(key%slot_num) - if (slot_sizes.size() > 0) { - HugeCTR::data_generation_for_localized_test( - train_file_list_name, prefix, num_files, train_batch_num * train_batchsize, slot_num, - vocabulary_size, label_dim, dense_dim, max_nnz_per_slot, slot_sizes, false, 0.0, - &train_generated_value, &train_generated_rowoffset, &train_generated_label, - &train_generated_dense); - } else { - HCTR_OWN_THROW( - Error_t::WrongInput, - "Must set slot_sizes since there is no hashtable in LocalizedSlotSpasrseEmbeddingOneHot"); - } - HugeCTR::data_generation_for_parquet(train_file_list_parquet_name, prefix, - train_generated_value, train_generated_rowoffset, - train_generated_label, train_generated_dense); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - // setup a data reader - const DataReaderSparseParam param = {"localized", max_nnz_per_slot, true, slot_num}; - std::vector params; - params.push_back(param); - - std::unique_ptr> train_data_reader( - new DataReader(train_batchsize, label_dim, dense_dim, params, resource_manager, true, - num_chunk_threads, false)); - - train_data_reader->create_drwg_parquet( - train_file_list_parquet_name, false, std::vector(slot_num, 0), true, - std::max(train_batch_num * train_batchsize, test_batch_num * test_batchsize), - label_dim + dense_dim, label_dim + dense_dim); - - const SparseEmbeddingHashParams embedding_params = {train_batchsize, - test_batchsize, - 0, - slot_sizes, - embedding_vec_size, - max_feature_num, - slot_num, - combiner, - opt_params, - true, - false}; - - SparseTensor23s train_input = train_data_reader->get_sparse_tensor23s("localized"); - std::unique_ptr> embedding( - new LocalizedSlotSparseEmbeddingOneHot( - core_helper::convert_sparse_tensors23_to_sparse_tensors(train_input), - core_helper::convert_sparse_tensors23_to_sparse_tensors(train_input), embedding_params, - resource_manager)); - - // init hash table file - if (pid == 0) { - init_sparse_model(sparse_model_src.c_str()); - } - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - // upload hash table to device - embedding->load_parameters(sparse_model_src); - - if (pid == 0) { - HCTR_LOG(INFO, WORLD, "max_vocabulary_size=%zu, vocabulary_size=%zu\n", - embedding->get_max_vocabulary_size(), embedding->get_vocabulary_size()); - } - - // dump sparse model to file - embedding->dump_parameters(sparse_model_dst); - -#ifdef ENABLE_MPI - HCTR_MPI_THROW(MPI_Barrier(MPI_COMM_WORLD)); -#endif - - std::vector hash_table_key_from_cpu; - std::vector slot_id_from_cpu; - std::vector hash_table_value_from_cpu; - load_sparse_model_to_map(hash_table_key_from_cpu, 
slot_id_from_cpu, hash_table_value_from_cpu, - sparse_model_src); - - std::vector hash_table_key_from_gpu; - std::vector slot_id_from_gpu; - std::vector hash_table_value_from_gpu; - load_sparse_model_to_map(hash_table_key_from_gpu, slot_id_from_gpu, hash_table_value_from_gpu, - sparse_model_dst); - - typedef struct TypeHashValue_ { - float data[embedding_vec_size]; - } TypeHashValue; - - ASSERT_TRUE(compare_hash_table( - vocabulary_size, hash_table_key_from_gpu.data(), - reinterpret_cast(hash_table_value_from_gpu.data()), - hash_table_key_from_cpu.data(), - reinterpret_cast(hash_table_value_from_cpu.data()), tolerance)); - - ASSERT_TRUE(compare_key_slot(vocabulary_size, hash_table_key_from_gpu.data(), - slot_id_from_gpu.data(), hash_table_key_from_cpu.data(), - slot_id_from_cpu.data())); - - test::mpi_finalize(); -} - -} // namespace - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_1gpu) { - train_and_test({0}, Optimizer_t::SGD, Update_t::Local); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_4gpu) { - train_and_test({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Local); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_global_update_1gpu) { - train_and_test({0}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_global_update_4gpu) { - train_and_test({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_1gpu) { - train_and_test<__half>({0}, Optimizer_t::SGD, Update_t::Local); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_4gpu) { - train_and_test<__half>({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Local); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_global_update_1gpu) { - train_and_test<__half>({0}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_global_update_4gpu) { - train_and_test<__half>({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, load_and_dump_1gpu) { - load_and_dump({0}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, load_and_dump_4gpu) { - load_and_dump({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, load_and_dump_file_1gpu) { - load_and_dump_file({0}, Optimizer_t::SGD, Update_t::Global); -} - -TEST(localized_sparse_embedding_one_hot_test, load_and_dump_file_4gpu) { - load_and_dump_file({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global); -} -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_1gpu_nf) { - train_and_test({0}, Optimizer_t::SGD, Update_t::Local, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_4gpu_nf) { - train_and_test({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Local, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_global_update_1gpu_nf) { - train_and_test({0}, Optimizer_t::SGD, Update_t::Global, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp32_sgd_global_update_4gpu_nf) { - train_and_test({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_1gpu_nf) { - train_and_test<__half>({0}, Optimizer_t::SGD, Update_t::Local, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_4gpu_nf) { - train_and_test<__half>({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Local, DeviceMap::NODE_FIRST); -} - 
-TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_global_update_1gpu_nf) { - train_and_test<__half>({0}, Optimizer_t::SGD, Update_t::Global, DeviceMap::NODE_FIRST); -} - -TEST(localized_sparse_embedding_one_hot_test, fp16_sgd_global_update_4gpu_nf) { - train_and_test<__half>({0, 1, 2, 3}, Optimizer_t::SGD, Update_t::Global, DeviceMap::NODE_FIRST); -} diff --git a/test/utest/embedding/localized_slot_sparse_embedding_one_hot_update_test.cu b/test/utest/embedding/localized_slot_sparse_embedding_one_hot_update_test.cu deleted file mode 100644 index 3ff50682b7..0000000000 --- a/test/utest/embedding/localized_slot_sparse_embedding_one_hot_update_test.cu +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace HugeCTR; -using namespace embedding_test; - -template -class GpuData { - public: - GpuData() {} - ~GpuData() {} - GpuData(const std::vector& h_value_index, const size_t max_vocabulary_size, - const size_t embedding_vec_size) { - size_t num_samples = h_value_index.size(); - init_data(num_samples, max_vocabulary_size, embedding_vec_size); - HCTR_LIB_THROW(cudaMemcpy(value_index.get_ptr(), h_value_index.data(), - sizeof(size_t) * num_samples, cudaMemcpyHostToDevice)); - } - - void init_data(const size_t num_samples, const size_t max_vocabulary_size, - const size_t embedding_vec_size) { - std::shared_ptr> buf = GeneralBuffer2::create(); - - buf->reserve({num_samples}, &value_index); - buf->reserve({max_vocabulary_size * embedding_vec_size}, &weights); - buf->reserve({num_samples * embedding_vec_size}, &wgrad); - - const size_t max_top_categories = get_max_size_top_categories(); - buf->reserve({max_top_categories}, &top_categories); - size_top_categories = 0; - - buf->allocate(); - } - - Tensor2 value_index; - Tensor2 top_categories; - size_t size_top_categories; - - Tensor2 wgrad; - Tensor2 weights; - - void init_weights(size_t num_samples, size_t max_vocabulary_size, size_t embedding_vec_size, - const std::vector& h_wgrad) { - HCTR_LIB_THROW(cudaMemcpy(wgrad.get_ptr(), h_wgrad.data(), - sizeof(TypeEmbeddingComp) * num_samples * embedding_vec_size, - cudaMemcpyHostToDevice)); - HCTR_LIB_THROW(cudaMemset(weights.get_ptr(), 0.f, - sizeof(float) * max_vocabulary_size * embedding_vec_size)); - } -}; - -template -void update_test(const std::vector& value_index, size_t max_vocabulary_size, - size_t embedding_vec_size, const std::vector& wgrad) { - HCTR_LOG_S(DEBUG, WORLD) << "Starting embedding update test..." 
<< std::endl; - cudaStream_t stream = 0; - - // get number of sms - cudaDeviceProp device_prop; - cudaGetDeviceProperties(&device_prop, 0); - - // test sorting - std::map> ref_categorize; - size_t num_samples = value_index.size(); - for (size_t i = 0; i < num_samples; ++i) { - ref_categorize[value_index[i]].insert(i); - } - size_t num_unique_categories_ref = ref_categorize.size(); - - std::vector value_index_sort; - std::vector sample_id_sort; - std::vector sorted_sample_offset_category; - - GpuData gpu_data(value_index, max_vocabulary_size, embedding_vec_size); - - // now for the update - size_t weight_size = max_vocabulary_size * embedding_vec_size; - std::vector weights_test(weight_size, 0.0f); - std::vector weights_ref(weight_size, 0.0f); - - // ref weight update : - for (auto const& pair : ref_categorize) { - for (size_t j = 0; j < embedding_vec_size; ++j) { - float sum_j = 0.f; - for (auto const& sample_index : pair.second) { - sum_j += (float)wgrad[sample_index * embedding_vec_size + j]; - } - weights_ref[pair.first * embedding_vec_size + j] = -sum_j; - } - } - // done with calculating ref weights - - // init wgrad and weights on gpu: - gpu_data.init_weights(num_samples, max_vocabulary_size, embedding_vec_size, wgrad); - - HCTR_LOG_S(DEBUG, WORLD) << "performing atomic cached kernel..." << std::endl; - SparseEmbeddingFunctors::opt_sgd_atomic_cached( - num_samples, embedding_vec_size, gpu_data.value_index.get_ptr(), 1.0f, 1.0f, - gpu_data.wgrad.get_ptr(), gpu_data.weights.get_ptr(), gpu_data.top_categories.get_ptr(), - gpu_data.size_top_categories, stream, true); - - HCTR_LOG_S(DEBUG, WORLD) << "done performing kernel, testing results.." << std::endl; - HCTR_LIB_THROW(cudaMemcpy(weights_test.data(), gpu_data.weights.get_ptr(), - sizeof(float) * embedding_vec_size * max_vocabulary_size, - cudaMemcpyDeviceToHost)); - - const float epsilon = 1.0e-4; - double diff_ave = 0.0; - - size_t count_neq = 0; - size_t count_all = 0; - bool all_el_equal = true; - for (auto const& pair : ref_categorize) { - const size_t& category = pair.first; - bool category_equal = true; - for (size_t j = 0; j < embedding_vec_size; ++j) { - size_t index = category * embedding_vec_size + j; - float diff = weights_ref[index] - weights_test[index]; - diff = (diff > 0.f ? diff : -diff); - diff_ave += (double)diff; - all_el_equal = (all_el_equal && (diff < epsilon)); - category_equal = category_equal && (diff < epsilon); - - count_neq += (size_t)(diff >= epsilon); - count_all++; - } - if (!category_equal) { - HCTR_LOG_S(DEBUG, WORLD) << "Fail : the weights of category " << category - << " are wrongly computed." 
<< std::endl; - HCTR_LOG_S(DEBUG, WORLD) << "Weight expected : " << weights_ref[category * embedding_vec_size] - << "\t weight calculated : " - << weights_test[category * embedding_vec_size] << std::endl; - } - } - - diff_ave /= static_cast(count_all); - HCTR_LOG_S(DEBUG, WORLD) << "number of correct elements : " << count_all - count_neq - << " out of " << count_all << " = " - << (double)(count_all - count_neq) / (double)count_all * 100.0 << " % " - << std::endl; - if (!all_el_equal) { - HCTR_LOG_S(DEBUG, WORLD) << "average diff : " << diff_ave << std::endl; - { - auto log = HCTR_LOG_S(DEBUG, WORLD); - log << "CPU : "; - for (size_t i = 0; i < 10; ++i) { - log << '\t' << weights_ref[128 + i]; - } - log << std::endl; - } - { - auto log = HCTR_LOG_S(DEBUG, WORLD); - log << "GPU : "; - for (size_t i = 0; i < 10; ++i) { - log << '\t' << weights_test[128 + i]; - } - log << std::endl; - } - } - ASSERT_TRUE(all_el_equal && "not all embedding vector weights are updated correctly!"); - - bool all_el_zero = true; - for (size_t i = 0; i < max_vocabulary_size; ++i) { - if (ref_categorize.find(i) == ref_categorize.end()) { - for (size_t j = 0; j < embedding_vec_size; ++j) { - all_el_zero = all_el_zero && (weights_test[i * embedding_vec_size + j] == 0.f); - } - } - } - ASSERT_TRUE(all_el_zero && "some embedding vectors that shouldn't be updated were modified!"); - - HCTR_LOG_S(DEBUG, WORLD) << "Finished embedding update test SUCCESSFULLY!" << std::endl; -} - -template -void setup_and_run_randomized_test(const int N_test, const int embedding_vec_size, - const int num_samples) { - std::vector category_size{39884, 3, 63, 10}; - std::vector category_offset(4); - - size_t max_vocabulary_size = 0; - for (size_t i = 0; i < category_size.size(); ++i) { - category_offset[i] = max_vocabulary_size; - max_vocabulary_size += category_size[i]; - } - - std::vector wgrad(num_samples * embedding_vec_size, (etype)1.); - - for (int n = 0; n < N_test; ++n) { - // create test input - std::vector value_index; - for (int i = 0; i < num_samples; ++i) { - int embedding = rand() % 4; - size_t category = category_offset[embedding] + (size_t)rand() % category_size[embedding]; - value_index.push_back(category); - } - - // perform test - update_test(value_index, max_vocabulary_size, embedding_vec_size, wgrad); - } -} - -TEST(localized_one_hot_update_test, fp16_sgd_atomic_cached) { - const int N_test = 5; - const int embedding_vec_size = 128; - const int num_samples = 64 * 1024; - - for (size_t multiplier = 1; multiplier < 32; multiplier *= 2) { - setup_and_run_randomized_test<__half>(N_test, embedding_vec_size, num_samples); - } -} - -TEST(localized_one_hot_update_test, fp32_sgd_atomic_cached) { - const int N_test = 5; - const int embedding_vec_size = 128; - const int num_samples = 64 * 1024; - - for (size_t multiplier = 1; multiplier < 32; multiplier *= 2) { - setup_and_run_randomized_test(N_test, embedding_vec_size, num_samples); - } -} diff --git a/test/utest/embedding/unified_embedding.hpp b/test/utest/embedding/unified_embedding.hpp index ed7d8515d5..920ef2d780 100644 --- a/test/utest/embedding/unified_embedding.hpp +++ b/test/utest/embedding/unified_embedding.hpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/test/utest/embedding/unified_embedding_test.cpp b/test/utest/embedding/unified_embedding_test.cpp index fc1859e25a..a084ad0af7 100644 --- a/test/utest/embedding/unified_embedding_test.cpp +++ b/test/utest/embedding/unified_embedding_test.cpp @@ -15,7 +15,7 @@ */ #include 
-#include +#include #include using namespace HugeCTR; @@ -33,7 +33,7 @@ void unified_embedding_forward(const TestParams &test_param, const std::vectorget_global_gpu_count(); size_t local_gpu_count = resource_manager->get_local_gpu_count(); diff --git a/test/utest/embedding_collection/configuration.hpp b/test/utest/embedding_collection/configuration.hpp index d9f4217fef..26c5e4f94c 100644 --- a/test/utest/embedding_collection/configuration.hpp +++ b/test/utest/embedding_collection/configuration.hpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/test/utest/embedding_collection/embedding_collection_utils.hpp b/test/utest/embedding_collection/embedding_collection_utils.hpp index ebb922a766..c8bab4aae5 100644 --- a/test/utest/embedding_collection/embedding_collection_utils.hpp +++ b/test/utest/embedding_collection/embedding_collection_utils.hpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include using namespace embedding; diff --git a/test/utest/embedding_collection/test_compress_offset.cpp b/test/utest/embedding_collection/test_compress_offset.cpp index 2aa2c52cf8..56bf9a8982 100644 --- a/test/utest/embedding_collection/test_compress_offset.cpp +++ b/test/utest/embedding_collection/test_compress_offset.cpp @@ -21,13 +21,13 @@ #include #include #include -#include +#include #include using namespace embedding; TEST(test_compress_offset, test_compress_offset) { - auto resource_manager = HugeCTR::ResourceManagerExt::create({{0}}, 0); + auto resource_manager = HugeCTR::ResourceManagerCore::create({{0}}, 0); auto core = std::make_shared(resource_manager, 0); HugeCTR::CudaDeviceContext context(core->get_device_id()); HugeCTR::core23::Device device(core23::DeviceType::GPU, core->get_device_id()); diff --git a/test/utest/embedding_collection/test_embedding_collection_load_dump.cpp b/test/utest/embedding_collection/test_embedding_collection_load_dump.cpp index 3401d24806..5f6a5724c4 100644 --- a/test/utest/embedding_collection/test_embedding_collection_load_dump.cpp +++ b/test/utest/embedding_collection/test_embedding_collection_load_dump.cpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include @@ -283,7 +283,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params, {}}; auto table_param_list = get_table_param_list_io(ebc_param.emb_type); - auto resource_manager = HugeCTR::ResourceManagerExt::create({device_list}, 0); + auto resource_manager = HugeCTR::ResourceManagerCore::create({device_list}, 0); EmbeddingIO emb_io = EmbeddingIO(resource_manager); int num_gpus = static_cast(device_list.size()); int batch_size_per_gpu = batch_size / num_gpus; @@ -374,6 +374,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params, std::vector data_distributor_outputs; for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) { + HugeCTR::CudaDeviceContext context(core_resource_manager_list[gpu_id]->get_device_id()); data_distributor_outputs.push_back(HugeCTR::allocate_output_for_data_distributor( core_resource_manager_list[gpu_id], ebc_param)); } @@ -443,6 +444,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params, auto sync_gpus = [&]() { for (auto core : core_resource_manager_list) { + HugeCTR::CudaDeviceContext context(core->get_device_id()); HCTR_LIB_THROW(cudaStreamSynchronize(core->get_local_gpu()->get_stream())); } }; @@ -486,6 +488,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params, emb_ref.embedding_forward_cpu(key_list, bucket_range); #pragma omp parallel for 
   for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) {
+    HugeCTR::CudaDeviceContext context(core_resource_manager_list[gpu_id]->get_device_id());
     data_distributor->distribute(gpu_id, sparse_dp_tensors[gpu_id],
                                  sparse_dp_num_keys_per_bucket[gpu_id],
                                  data_distributor_outputs[gpu_id], batch_size);
@@ -508,6 +511,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params,
   emb_ref.embedding_backward_cpu(top_grads, key_list, bucket_range);
 #pragma omp parallel for num_threads(num_gpus)
   for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) {
+    HugeCTR::CudaDeviceContext context(core_resource_manager_list[gpu_id]->get_device_id());
     ebc->backward_per_gpu(gpu_id, data_distributor_outputs[gpu_id], ebc_top_grads[gpu_id],
                           batch_size);
   }
@@ -517,6 +521,7 @@ void embedding_collection_e2e_io(const std::vector& lookup_params,
   emb_ref.embedding_update_cpu();
 #pragma omp parallel for num_threads(num_gpus)
   for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) {
+    HugeCTR::CudaDeviceContext context(core_resource_manager_list[gpu_id]->get_device_id());
     ebc->update_per_gpu(gpu_id);
   }
   sync_gpus();
diff --git a/test/utest/embedding_collection/test_embedding_collection_v2.cu b/test/utest/embedding_collection/test_embedding_collection_v2.cu
index dd0e0d233c..fdba5e744f 100644
--- a/test/utest/embedding_collection/test_embedding_collection_v2.cu
+++ b/test/utest/embedding_collection/test_embedding_collection_v2.cu
@@ -25,7 +25,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -280,7 +280,7 @@ void embedding_collection_e2e(const Configuration &config) {
   std::iota(device_list_per_node.begin(), device_list_per_node.end(), 0);
   std::vector> device_list(num_nodes, device_list_per_node);
-  auto resource_manager = HugeCTR::ResourceManagerExt::create(device_list, 0);
+  auto resource_manager = HugeCTR::ResourceManagerCore::create(device_list, 0);
   std::vector> core_resource_manager_list;
   for (int gpu_id = 0; gpu_id < num_local_gpus; ++gpu_id) {
diff --git a/test/utest/embedding_collection/test_embedding_table.cpp b/test/utest/embedding_collection/test_embedding_table.cpp
index 9fe943be17..71f77d5c88 100644
--- a/test/utest/embedding_collection/test_embedding_table.cpp
+++ b/test/utest/embedding_collection/test_embedding_table.cpp
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include
+#include
 using namespace embedding;
 int num_embedding_table = 3;
@@ -48,7 +48,7 @@ template
 void test_embedding_table(int device_id, int table_type) {
   std::vector device_list{device_id};
   HugeCTR::CudaDeviceContext context(device_id);
-  auto resource_manager = HugeCTR::ResourceManagerExt::create({device_list}, 0);
+  auto resource_manager = HugeCTR::ResourceManagerCore::create({device_list}, 0);
   auto core = std::make_shared(resource_manager, 0);
   auto key_type = HugeCTR::core23::ToScalarType::value;
diff --git a/test/utest/embedding_collection/test_embedding_table_optimizer.cpp b/test/utest/embedding_collection/test_embedding_table_optimizer.cpp
index c99ecc346b..cd72f15c46 100644
--- a/test/utest/embedding_collection/test_embedding_table_optimizer.cpp
+++ b/test/utest/embedding_collection/test_embedding_table_optimizer.cpp
@@ -23,7 +23,7 @@
 #include
 #include
 #include
-#include
+#include
 using namespace embedding;
@@ -41,7 +41,7 @@ void test_embedding_table_optimizer(int device_id, const char table_type[],
   std::vector device_list{device_id};
   HugeCTR::CudaDeviceContext context(device_id);
-  auto resource_manager = HugeCTR::ResourceManagerExt::create({device_list}, 0);
+  auto resource_manager = HugeCTR::ResourceManagerCore::create({device_list}, 0);
   auto core = std::make_shared(resource_manager, 0);
   const auto key_type = core23::ToScalarType::value;
diff --git a/test/utest/metrics/auc_test.cpp b/test/utest/metrics/auc_test.cpp
index 4643caf561..e974a5e9f6 100644
--- a/test/utest/metrics/auc_test.cpp
+++ b/test/utest/metrics/auc_test.cpp
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -217,7 +217,7 @@ void metric_test(std::vector device_list, size_t batch_size, size_t num_tot
   for (int i = 0; i < num_procs; i++) {
     vvgpu.push_back(device_list);
   }
-  const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242);
+  const auto resource_manager = ResourceManagerCore::create(vvgpu, 424242);
   // Setup the containers
   core23::Shape dims = {static_cast(batch_size / num_classes),
diff --git a/test/utest/metrics/averageloss_test.cpp b/test/utest/metrics/averageloss_test.cpp
index 54c5da6ba0..7decb7b6d7 100644
--- a/test/utest/metrics/averageloss_test.cpp
+++ b/test/utest/metrics/averageloss_test.cpp
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -129,7 +129,7 @@ void averageloss_test(std::vector device_list, size_t batch_size, size_t nu
   for (int i = 0; i < num_procs; i++) {
     vvgpu.push_back(device_list);
   }
-  const auto resource_manager = ResourceManagerExt::create(vvgpu, 424242);
+  const auto resource_manager = ResourceManagerCore::create(vvgpu, 424242);
   // Create AverageLoss metric
   auto metric = std::make_unique>(resource_manager);
diff --git a/test/utest/network/network_build_test.cpp b/test/utest/network/network_build_test.cpp
index a1f089d8ca..421e1d5cac 100644
--- a/test/utest/network/network_build_test.cpp
+++ b/test/utest/network/network_build_test.cpp
@@ -30,7 +30,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
@@ -117,7 +117,7 @@ void network_build_test() {
   std::vector device_vec(core23::Device::count());
   std::generate(device_vec.begin(), device_vec.end(), [dev = 0]() mutable { return dev++; });
   std::vector> vvgpu(1, device_vec);
-  const auto& resource_manager = ResourceManagerExt::create(vvgpu, 0);
+  const auto& resource_manager = ResourceManagerCore::create(vvgpu, 0);
   std::vector> networks;
   std::vector> train_label_and_first_tensors;
diff --git a/test/utest/pipeline/pipeline_test.cu b/test/utest/pipeline/pipeline_test.cu
index 6587f31e4d..b71a2081af 100644
--- a/test/utest/pipeline/pipeline_test.cu
+++ b/test/utest/pipeline/pipeline_test.cu
@@ -19,7 +19,7 @@
 #include
 #include
-#include
+#include
 #include
 using namespace HugeCTR;
@@ -43,7 +43,7 @@ __global__ void setC(float *var, int count) {
 }
 void pipeline_test(const std::vector &device_list, bool use_graph) {
-  const auto &resource_manager = ResourceManager::create({device_list}, 0);
+  const auto &resource_manager = ResourceManagerCore::create({device_list}, 0);
   cudaProfilerStart();
   std::vector pipeline_list;
   std::vector dup_pipeline_list;
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 0e7c96ade7..dc0492db15 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -17,7 +17,6 @@ cmake_minimum_required(VERSION 3.20)
 if(NOT DISABLE_CUDF)
   add_subdirectory(raw_script)
   add_subdirectory(dlrm_script)
-  add_subdirectory(io_benchmark)
   add_subdirectory(db_benchmark)
   add_subdirectory(inference_test_scripts)
 endif()
\ No newline at end of file
diff --git a/tools/io_benchmark/main.cpp b/tools/io_benchmark/main.cpp
deleted file mode 100644
index 0b180cd756..0000000000
--- a/tools/io_benchmark/main.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-using namespace HugeCTR;
-
-std::vector str_to_vec(const std::string& str) {
-  std::istringstream is(str);
-  std::vector tokens{std::istream_iterator{is},
-                     std::istream_iterator{}};
-  std::vector res;
-  for (auto& s : tokens) {
-    res.push_back(std::stoi(s));
-  }
-  return res;
-}
-
-int main(int argc, char** argv) {
-  argparse::ArgumentParser args("read_upload_bench");
-
-  args.add_argument("--num_dense").default_value(13).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--num_categorical").default_value(26).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--batch_size").default_value(65536).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--gpus")
-      .default_value(std::string("0"))
-      .help("Space-delimited list of GPUs to upload the data onto");
-
-  args.add_argument("--num_threads").default_value(1).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--num_batches_per_thread")
-      .default_value(1)
-      .action([](const std::string& value) { return std::stoi(value); });
-
-  args.add_argument("--io_block_size").default_value(524288).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--io_depth").default_value(2).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("--io_alignment").default_value(512).action([](const std::string& value) {
-    return std::stoi(value);
-  });
-
-  args.add_argument("file").remaining();
-
-  try {
-    args.parse_args(argc, argv);
-  } catch (const std::runtime_error& err) {
-    std::cout << err.what() << std::endl;
-    std::cout << args;
-    exit(1);
-  }
-
-  std::string fname;
-  try {
-    fname = args.get("file");
-  } catch (std::logic_error& e) {
-    std::cout << "No input file provided" << std::endl;
-    exit(1);
-  }
-
-  const int sample_dim = args.get("--num_dense") + args.get("--num_categorical") + 1;
-  const int batch_size_bytes = args.get("--batch_size") * sample_dim * sizeof(int);
-
-#ifdef ENABLE_MPI
-  HCTR_MPI_THROW(MPI_Init(&argc, &argv));
-#endif
-  HCTR_LIB_THROW(nvmlInit_v2());
-
-  std::vector> vvgpu;
-  vvgpu.push_back(str_to_vec(args.get("--gpus")));
-  const auto resource_manager = ResourceManager::create(vvgpu, 424242);
-
-  AsyncReaderImpl reader_impl(
-      fname, batch_size_bytes, resource_manager.get(), args.get("--num_threads"),
-      args.get("--num_batches_per_thread"), args.get("--io_block_size"),
-      args.get("--io_depth"), args.get("--io_alignment"));
-
-  HCTR_LOG(INFO, WORLD, "Initialization done, starting to read...\n");
-  fflush(stdout);
-  auto start = std::chrono::high_resolution_clock::now();
-
-  reader_impl.load_async();
-
-  size_t sz = 1;
-  while (sz > 0) {
-    BatchDesc desc = reader_impl.get_batch();
-    sz = desc.size_bytes;
-    // usleep(200);
-    reader_impl.finalize_batch();
-  }
-
-  auto end = std::chrono::high_resolution_clock::now();
-  auto elapsed = std::chrono::duration_cast(end - start);
-  HCTR_LOG(INFO, WORLD, "Reading took %.3fs, B/W %.2f GB/s\n", elapsed.count() / 1000.0,
-           std::filesystem::file_size(fname) / ((double)elapsed.count() * 1e6));
-
-#ifdef ENABLE_MPI
-  HCTR_MPI_THROW(MPI_Finalize());
-#endif
-
-  return 0;
-}