Make the cuDF/RMM dependent code optional and adapt the reader calls to the
installed cuDF version.

* CMakeLists.txt: defines LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE and,
  when cuDF is not disabled, reads the cudf version from `pip show cudf` and
  exports CUDF_VERSION_MAJOR / CUDF_VERSION_MINOR as compile definitions.
  The command output is stripped of trailing whitespace before it is split.
* HugeCTR resource manager headers/sources: the RMM includes, members and
  functions are wrapped in `#ifndef DISABLE_CUDF`.
* file_source_parquet.cpp: read_parquet receives cudf::get_default_stream()
  when CUDF_VERSION_MAJOR >= 24.
* tools/dlrm_script: read_csv receives cudf::get_default_stream() for
  cudf 23.12+ and for majors above 23. In dlrm_raw.cu every branch passes
  `p_mr`, the name the other branches at the same call site already use.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index acde1eb292..cc31568c45 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -122,7 +122,8 @@ if (DISABLE_A2A_WARMUP)
 endif()
 
 option(DISABLE_CUDF "Disable cudf: disable parquet format related features" OFF)
-
+# this manual definition is a WAR and RMM team will fix it in the future
+add_compile_definitions(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
 option(USE_CUDART_STATIC "Setup clangformat target" OFF)
 if(USE_CUDART_STATIC)
   set(DISABLE_CUDF ON)
@@ -137,13 +138,29 @@ if (DISABLE_CUDF)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDISABLE_CUDF")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DDISABLE_CUDF")
 else()
+  execute_process(
+    COMMAND bash -c "pip show cudf|grep Version | sed 's/.*: //'"
+    OUTPUT_VARIABLE CUDF_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
+
+  string(REPLACE "." ";" VERSION_LIST "${CUDF_VERSION}")
+  list(GET VERSION_LIST 0 CUDF_VERSION_MAJOR)
+  list(GET VERSION_LIST 1 CUDF_VERSION_MINOR)
+  # list(GET VERSION_LIST 2 CUDF_VERSION_PATCH)
+  # add_compile_definitions(CUDF_VERSION_PATCH=${CUDF_VERSION_PATCH})
+
+  add_compile_definitions(CUDF_VERSION_MAJOR=${CUDF_VERSION_MAJOR})
+  add_compile_definitions(CUDF_VERSION_MINOR=${CUDF_VERSION_MINOR})
+  message(STATUS "CUDF_VERSION is ${CUDF_VERSION}")
+
   find_package(Parquet REQUIRED CONFIG PATHS /usr/lib/cmake/arrow/ /usr/lib/cmake/Parquet/ NO_DEFAULT_PATH)
   if(Parquet_FOUND AND NOT ENABLE_HDFS AND NOT ENABLE_S3 AND NOT ENABLE_GCS)
-      message (STATUS "Arrow Parquet is found")
-      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_ARROW_PARQUET")
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_ARROW_PARQUET")
-      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DENABLE_ARROW_PARQUET")
+    message (STATUS "Arrow Parquet is found")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_ARROW_PARQUET")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_ARROW_PARQUET")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DENABLE_ARROW_PARQUET")
   endif()
+
 endif()
 
 option(SHARP_A2A "Enable SHARP All2All" OFF)
diff --git a/HugeCTR/include/data_generator.hpp b/HugeCTR/include/data_generator.hpp
index 513663a5f6..f9ab0b9a30 100644
--- a/HugeCTR/include/data_generator.hpp
+++ b/HugeCTR/include/data_generator.hpp
@@ -31,9 +31,8 @@
 #include
 #include
 #include
-#endif
-
 #include
+#endif
 
 namespace HugeCTR {
 
diff --git a/HugeCTR/include/resource_manager.hpp b/HugeCTR/include/resource_manager.hpp
index cfa0090c66..4d05947337 100644
--- a/HugeCTR/include/resource_manager.hpp
+++ b/HugeCTR/include/resource_manager.hpp
@@ -21,7 +21,10 @@
 #include
 #include
 #include
+
+#ifndef DISABLE_CUDF
 #include
+#endif
 
 namespace HugeCTR {
 
@@ -45,8 +48,10 @@ class ResourceManager : public ResourceManagerBase {
 
   virtual DeviceMap::Layout get_device_layout() const = 0;
 
+#ifndef DISABLE_CUDF
   virtual const std::shared_ptr&
   get_device_rmm_device_memory_resource(int local_gpu_id) const = 0;
+#endif
 };
 
 }  // namespace HugeCTR
diff --git a/HugeCTR/include/resource_managers/resource_manager_core.hpp b/HugeCTR/include/resource_managers/resource_manager_core.hpp
index 438319e194..5eedf381dc 100644
--- a/HugeCTR/include/resource_managers/resource_manager_core.hpp
+++ b/HugeCTR/include/resource_managers/resource_manager_core.hpp
@@ -33,13 +33,16 @@ class ResourceManagerCore : public ResourceManager {
   std::vector> gpu_resources_; /**< GPU resource vector */
   std::vector> p2p_matrix_;
 
+  void all2all_warmup();
+  void enable_all_peer_accesses();
+
+#ifndef DISABLE_CUDF
   std::vector> base_cuda_mr_;
   std::vector> memory_resource_;
   std::vector original_device_resource_;
 
-  void all2all_warmup();
-  void enable_all_peer_accesses();
   void initialize_rmm_resources();
+#endif
 
  public:
   ResourceManagerCore(int num_process, int process_id, DeviceMap&& device_map,
@@ -112,7 +115,9 @@ class ResourceManagerCore : public ResourceManager {
 
   DeviceMap::Layout get_device_layout() const override { return device_map_.get_device_layout(); }
 
+#ifndef DISABLE_CUDF
   const std::shared_ptr& get_device_rmm_device_memory_resource(
       int local_gpu_id) const override;
+#endif
 };
 }  // namespace HugeCTR
diff --git a/HugeCTR/src/data_readers/file_source_parquet.cpp b/HugeCTR/src/data_readers/file_source_parquet.cpp
index 4acd6ab6e4..63c2a19fa6 100644
--- a/HugeCTR/src/data_readers/file_source_parquet.cpp
+++ b/HugeCTR/src/data_readers/file_source_parquet.cpp
@@ -154,8 +154,12 @@ cudf_io::table_with_metadata ParquetFileSource::read_group(size_t row_group_id,
   parquet_args_.set_row_groups(rgrps);
   // parquet_args_.set_num_rows(-1);
   parquet_args_.set_timestamp_type(cudf::data_type(cudf::type_id::EMPTY));
-  auto tbl_w_metadata = cudf_io::read_parquet(parquet_args_, mr);
 
+#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR >= 24
+  auto tbl_w_metadata = cudf_io::read_parquet(parquet_args_, cudf::get_default_stream(), mr);
+#else
+  auto tbl_w_metadata = cudf_io::read_parquet(parquet_args_, mr);
+#endif
   if (!counter_) {
     HCTR_OWN_THROW(Error_t::UnspecificError, "Read parquet file first\n");
   }
diff --git a/HugeCTR/src/resource_managers/resource_manager_core.cpp b/HugeCTR/src/resource_managers/resource_manager_core.cpp
index 06fe93b1b4..41dbc97048 100644
--- a/HugeCTR/src/resource_managers/resource_manager_core.cpp
+++ b/HugeCTR/src/resource_managers/resource_manager_core.cpp
@@ -21,12 +21,15 @@
 #include
 #include
 #include
+
+#ifndef DISABLE_CUDF
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable"
 #include
 #include
 #include
 #pragma GCC diagnostic pop
+#endif
 
 namespace HugeCTR {
 
@@ -98,27 +101,6 @@ void ResourceManagerCore::enable_all_peer_accesses() {
   }
 }
 
-void ResourceManagerCore::initialize_rmm_resources() {
-  const size_t pool_alloc_size = 256 * 1024 * 1024;
-  using dmmr = rmm::mr::device_memory_resource;
-  static const char* allow_set_char = getenv("HCTR_RMM_SETTABLE");
-  bool allow_set = true;
-  if (allow_set_char && allow_set_char[0] == '0') {
-    allow_set = false;
-  }
-  CudaDeviceContext context;
-  auto local_gpu_device_id_list = get_local_gpu_device_id_list();
-  for (size_t i = 0; i < local_gpu_device_id_list.size(); i++) {
-    context.set_device(local_gpu_device_id_list[i]);
-    base_cuda_mr_.emplace_back(std::make_shared());
-    memory_resource_.emplace_back(std::make_shared>(
-        base_cuda_mr_.back().get(), pool_alloc_size));
-    if (allow_set) {
-      original_device_resource_.push_back(
-          rmm::mr::set_current_device_resource(memory_resource_.back().get()));
-    }
-  }
-}
 ResourceManagerCore::ResourceManagerCore(int num_process, int process_id, DeviceMap&& device_map,
                                          unsigned long long seed)
     : num_process_(num_process), process_id_(process_id), device_map_(std::move(device_map)) {
@@ -204,13 +186,17 @@ ResourceManagerCore::ResourceManagerCore(int num_process, int process_id, Device
 
   all2all_warmup();
 
+#ifndef DISABLE_CUDF
   initialize_rmm_resources();
+#endif
   // int dev_id = 0;
   // cudaGetDevice(&dev_id);
   // HCTR_LOG(INFO, WORLD, "ResourceManagerCore ctor getCurrentDeviceId after rmm_init %d\n",
   // dev_id);
 }
+
 ResourceManagerCore::~ResourceManagerCore() {
+#ifndef DISABLE_CUDF
   if (original_device_resource_.empty()) {
     return;
   }
@@ -220,6 +206,7 @@ ResourceManagerCore::~ResourceManagerCore() {
     context.set_device(local_gpu_device_id_list[i]);
     rmm::mr::set_current_device_resource(original_device_resource_[i]);
   }
+#endif
 }
 bool ResourceManagerCore::p2p_enabled(int src_device_id, int dst_device_id) const {
   return p2p_matrix_[src_device_id][dst_device_id];
@@ -240,6 +227,29 @@ bool ResourceManagerCore::all_p2p_enabled() const {
   return true;
 }
 
+#ifndef DISABLE_CUDF
+void ResourceManagerCore::initialize_rmm_resources() {
+  const size_t pool_alloc_size = 256 * 1024 * 1024;
+  using dmmr = rmm::mr::device_memory_resource;
+  static const char* allow_set_char = getenv("HCTR_RMM_SETTABLE");
+  bool allow_set = true;
+  if (allow_set_char && allow_set_char[0] == '0') {
+    allow_set = false;
+  }
+  CudaDeviceContext context;
+  auto local_gpu_device_id_list = get_local_gpu_device_id_list();
+  for (size_t i = 0; i < local_gpu_device_id_list.size(); i++) {
+    context.set_device(local_gpu_device_id_list[i]);
+    base_cuda_mr_.emplace_back(std::make_shared());
+    memory_resource_.emplace_back(std::make_shared>(
+        base_cuda_mr_.back().get(), pool_alloc_size));
+    if (allow_set) {
+      original_device_resource_.push_back(
+          rmm::mr::set_current_device_resource(memory_resource_.back().get()));
+    }
+  }
+}
+
 const std::shared_ptr&
 ResourceManagerCore::get_device_rmm_device_memory_resource(int local_gpu_id) const {
   auto dev_list = get_local_gpu_device_id_list();
@@ -247,5 +257,5 @@ ResourceManagerCore::get_device_rmm_device_memory_resource(int local_gpu_id) con
   auto index = std::distance(dev_list.begin(), it);
   return memory_resource_[index];
 }
-
+#endif
 }  // namespace HugeCTR
diff --git a/tools/dlrm_script/CMakeLists.txt b/tools/dlrm_script/CMakeLists.txt
index 2a22ab2dfc..0bbcc07cfb 100644
--- a/tools/dlrm_script/CMakeLists.txt
+++ b/tools/dlrm_script/CMakeLists.txt
@@ -41,7 +41,8 @@ if (NOT CUDF_RESULT)
   list(GET CUDF_VERSION_LIST 0 CUDF_VERSION_MAJOR)
   list(GET CUDF_VERSION_LIST 1 CUDF_VERSION_MINOR)
   list(GET CUDF_VERSION_LIST 1 CUDF_VERSION_PATCH)
-
+  add_compile_definitions(CUDF_VERSION_MAJOR=${CUDF_VERSION_MAJOR})
+  add_compile_definitions(CUDF_VERSION_MINOR=${CUDF_VERSION_MINOR})
   if(${CUDF_VERSION_MAJOR} EQUAL 23 AND ${CUDF_VERSION_MINOR} GREATER 6)
     add_definitions(-DCUDF_GE_2306)
   endif()
diff --git a/tools/dlrm_script/dlrm_raw.cu b/tools/dlrm_script/dlrm_raw.cu
index b5d1608027..56b7aa9355 100644
--- a/tools/dlrm_script/dlrm_raw.cu
+++ b/tools/dlrm_script/dlrm_raw.cu
@@ -136,7 +136,15 @@ void process_kaggle_dataset(const std::string &input_dir_path, const std::string
   int loop_count = 0;
   while (true) {
     total_file_bytes_read += in_args.get_byte_range_size();
+#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR == 23 && defined(CUDF_VERSION_MINOR) && \
+    CUDF_VERSION_MINOR >= 12
+    cudf_io::table_with_metadata tbl_w_metadata =
+        cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
+#elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
+    auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
+#else
     cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
+#endif
     total_row_nums += tbl_w_metadata.tbl->num_rows();
 
     dim3 block(prop.maxThreadsPerBlock, 1, 1);
@@ -488,7 +496,16 @@ void process_terabyte_dataset(const std::string &input_dir_path, const std::stri
   int loop_count = 0;
   while (true) {
     total_file_bytes_read += in_args.get_byte_range_size();
+#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR == 23 && defined(CUDF_VERSION_MINOR) && \
+    CUDF_VERSION_MINOR >= 12
+    cudf_io::table_with_metadata tbl_w_metadata =
+        cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
+#elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
+    auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
+#else
     cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
+
+#endif
     total_row_nums += tbl_w_metadata.tbl->num_rows();
 
     dim3 block(prop.maxThreadsPerBlock, 1, 1);
diff --git a/tools/dlrm_script/dlrm_raw_utils.hpp b/tools/dlrm_script/dlrm_raw_utils.hpp
index 18bfa71f86..5f21102c90 100644
--- a/tools/dlrm_script/dlrm_raw_utils.hpp
+++ b/tools/dlrm_script/dlrm_raw_utils.hpp
@@ -536,7 +536,15 @@ size_t convert_input_binaries(rmm::mr::device_memory_resource *mr, std::string i
 
   while (true) {
     process_read_bytes += in_args.get_byte_range_size();
+#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR == 23 && defined(CUDF_VERSION_MINOR) && \
+    CUDF_VERSION_MINOR >= 12
+    auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+#elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
+    auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
+#else
     auto tbl_w_metadata = cudf_io::read_csv(in_args, mr);
+
+#endif
     int32_t num_rows = tbl_w_metadata.tbl->num_rows();
     read_row_nums += num_rows;