Skip to content

Commit

Permalink
Merge branch 'fix-rmm_build_error-aleliu' into 'main'
Browse files Browse the repository at this point in the history
fix DISABLE_CUDF build option

See merge request dl/hugectr/hugectr!1521
  • Loading branch information
minseokl committed Jan 26, 2024
2 parents 19e7154 + a620bda commit af9c40c
Show file tree
Hide file tree
Showing 9 changed files with 99 additions and 33 deletions.
27 changes: 22 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ if (DISABLE_A2A_WARMUP)
endif()

option(DISABLE_CUDF "Disable cudf: disable parquet format related features" OFF)

# This manual definition is a workaround (WAR); the RMM team will fix it upstream.
add_compile_definitions(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
# Fixed description: the previous text ("Setup clangformat target") was a
# copy-paste error — this option statically links the CUDA runtime, which in
# turn forces DISABLE_CUDF on (see the if(USE_CUDART_STATIC) block below).
option(USE_CUDART_STATIC "Link the CUDA runtime statically" OFF)
if(USE_CUDART_STATIC)
set(DISABLE_CUDF ON)
Expand All @@ -137,13 +138,29 @@ if (DISABLE_CUDF)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDISABLE_CUDF")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DDISABLE_CUDF")
else()
# Detect the installed cudf version via pip so version-dependent cudf API
# calls can be selected at compile time (CUDF_VERSION_MAJOR/MINOR macros).
execute_process(
  COMMAND bash -c "pip show cudf|grep Version | sed 's/.*: //'"
  OUTPUT_VARIABLE CUDF_VERSION
  OUTPUT_STRIP_TRAILING_WHITESPACE  # drop trailing newline so the value splits cleanly
)

# Fail early with a clear message instead of letting string(REPLACE) error out
# on an empty (unquoted) expansion when cudf is not installed.
if("${CUDF_VERSION}" STREQUAL "")
  message(FATAL_ERROR "cudf was not found via pip; install cudf or configure with -DDISABLE_CUDF=ON")
endif()

string(REPLACE "." ";" VERSION_LIST "${CUDF_VERSION}")
list(GET VERSION_LIST 0 CUDF_VERSION_MAJOR)
list(GET VERSION_LIST 1 CUDF_VERSION_MINOR)
# list(GET VERSION_LIST 2 CUDF_VERSION_PATCH)
# add_compile_definitions(CUDF_VERSION_PATCH=${CUDF_VERSION_PATCH})

add_compile_definitions(CUDF_VERSION_MAJOR=${CUDF_VERSION_MAJOR})
add_compile_definitions(CUDF_VERSION_MINOR=${CUDF_VERSION_MINOR})
message(STATUS "CUDF_VERSION is ${CUDF_VERSION}")

# Arrow Parquet can only be used directly when no remote-filesystem backend
# (HDFS/S3/GCS) is enabled.
find_package(Parquet REQUIRED CONFIG PATHS /usr/lib/cmake/arrow/ /usr/lib/cmake/Parquet/ NO_DEFAULT_PATH)
if(Parquet_FOUND AND NOT ENABLE_HDFS AND NOT ENABLE_S3 AND NOT ENABLE_GCS)
  # De-duplicated: this message + flag block appeared twice (merge artifact),
  # appending -DENABLE_ARROW_PARQUET to every flag set twice.
  message(STATUS "Arrow Parquet is found")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_ARROW_PARQUET")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_ARROW_PARQUET")
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DENABLE_ARROW_PARQUET")
endif()

endif()

option(SHARP_A2A "Enable SHARP All2All" OFF)
Expand Down
3 changes: 1 addition & 2 deletions HugeCTR/include/data_generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/bit.hpp>
#endif

#include <rmm/device_buffer.hpp>
#endif

namespace HugeCTR {

Expand Down
5 changes: 5 additions & 0 deletions HugeCTR/include/resource_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@
#include <device_map.hpp>
#include <gpu_resource.hpp>
#include <resource_manager_base.hpp>

#ifndef DISABLE_CUDF
#include <rmm/mr/device/device_memory_resource.hpp>
#endif

namespace HugeCTR {

Expand All @@ -45,8 +48,10 @@ class ResourceManager : public ResourceManagerBase {

virtual DeviceMap::Layout get_device_layout() const = 0;

#ifndef DISABLE_CUDF
virtual const std::shared_ptr<rmm::mr::device_memory_resource>&
get_device_rmm_device_memory_resource(int local_gpu_id) const = 0;
#endif
};

} // namespace HugeCTR
9 changes: 7 additions & 2 deletions HugeCTR/include/resource_managers/resource_manager_core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,16 @@ class ResourceManagerCore : public ResourceManager {
std::vector<std::shared_ptr<GPUResource>> gpu_resources_; /**< GPU resource vector */
std::vector<std::vector<bool>> p2p_matrix_;

void all2all_warmup();
void enable_all_peer_accesses();

#ifndef DISABLE_CUDF
std::vector<std::shared_ptr<rmm::mr::device_memory_resource>> base_cuda_mr_;
std::vector<std::shared_ptr<rmm::mr::device_memory_resource>> memory_resource_;
std::vector<rmm::mr::device_memory_resource*> original_device_resource_;

void all2all_warmup();
void enable_all_peer_accesses();
void initialize_rmm_resources();
#endif

public:
ResourceManagerCore(int num_process, int process_id, DeviceMap&& device_map,
Expand Down Expand Up @@ -112,7 +115,9 @@ class ResourceManagerCore : public ResourceManager {

DeviceMap::Layout get_device_layout() const override { return device_map_.get_device_layout(); }

#ifndef DISABLE_CUDF
const std::shared_ptr<rmm::mr::device_memory_resource>& get_device_rmm_device_memory_resource(
int local_gpu_id) const override;
#endif
};
} // namespace HugeCTR
6 changes: 5 additions & 1 deletion HugeCTR/src/data_readers/file_source_parquet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,12 @@ cudf_io::table_with_metadata ParquetFileSource::read_group(size_t row_group_id,
parquet_args_.set_row_groups(rgrps);
// parquet_args_.set_num_rows(-1);
parquet_args_.set_timestamp_type(cudf::data_type(cudf::type_id::EMPTY));
// cudf >= 24.x requires an explicit stream argument for read_parquet().
// NOTE(review): removed a leftover unconditional declaration of
// tbl_w_metadata that duplicated the one inside this conditional (it would
// be a redefinition of the same name in this scope).
#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR >= 24
  auto tbl_w_metadata = cudf_io::read_parquet(parquet_args_, cudf::get_default_stream(), mr);
#else
  auto tbl_w_metadata = cudf_io::read_parquet(parquet_args_, mr);
#endif
if (!counter_) {
HCTR_OWN_THROW(Error_t::UnspecificError, "Read parquet file first\n");
}
Expand Down
54 changes: 32 additions & 22 deletions HugeCTR/src/resource_managers/resource_manager_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@
#include <random>
#include <resource_managers/resource_manager_core.hpp>
#include <utils.hpp>

#ifndef DISABLE_CUDF
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
#pragma GCC diagnostic pop
#endif

namespace HugeCTR {

Expand Down Expand Up @@ -98,27 +101,6 @@ void ResourceManagerCore::enable_all_peer_accesses() {
}
}

// Set up one RMM pool memory resource per local GPU: a pool_memory_resource
// (256 MiB initial size) layered on a plain cuda_memory_resource. Unless the
// HCTR_RMM_SETTABLE environment variable starts with '0', each pool is also
// installed as the current per-device RMM resource; the previously-current
// resources returned by set_current_device_resource are saved in
// original_device_resource_ (presumably so the destructor can restore them —
// confirm against ~ResourceManagerCore).
void ResourceManagerCore::initialize_rmm_resources() {
// Initial pool size: 256 MiB per device.
const size_t pool_alloc_size = 256 * 1024 * 1024;
using dmmr = rmm::mr::device_memory_resource;
// Read the env var once (static) — "0" disables installing the pools as the
// per-device default resources.
static const char* allow_set_char = getenv("HCTR_RMM_SETTABLE");
bool allow_set = true;
if (allow_set_char && allow_set_char[0] == '0') {
allow_set = false;
}
// RAII device switcher; restores the previous device when it goes out of scope.
CudaDeviceContext context;
auto local_gpu_device_id_list = get_local_gpu_device_id_list();
for (size_t i = 0; i < local_gpu_device_id_list.size(); i++) {
context.set_device(local_gpu_device_id_list[i]);
base_cuda_mr_.emplace_back(std::make_shared<rmm::mr::cuda_memory_resource>());
memory_resource_.emplace_back(std::make_shared<rmm::mr::pool_memory_resource<dmmr>>(
base_cuda_mr_.back().get(), pool_alloc_size));
if (allow_set) {
// set_current_device_resource returns the resource that was current before.
original_device_resource_.push_back(
rmm::mr::set_current_device_resource(memory_resource_.back().get()));
}
}
}
ResourceManagerCore::ResourceManagerCore(int num_process, int process_id, DeviceMap&& device_map,
unsigned long long seed)
: num_process_(num_process), process_id_(process_id), device_map_(std::move(device_map)) {
Expand Down Expand Up @@ -204,13 +186,17 @@ ResourceManagerCore::ResourceManagerCore(int num_process, int process_id, Device

all2all_warmup();

#ifndef DISABLE_CUDF
initialize_rmm_resources();
#endif
// int dev_id = 0;
// cudaGetDevice(&dev_id);
// HCTR_LOG(INFO, WORLD, "ResourceManagerCore ctor getCurrentDeviceId after rmm_init %d\n",
// dev_id);
}

ResourceManagerCore::~ResourceManagerCore() {
#ifndef DISABLE_CUDF
if (original_device_resource_.empty()) {
return;
}
Expand All @@ -220,6 +206,7 @@ ResourceManagerCore::~ResourceManagerCore() {
context.set_device(local_gpu_device_id_list[i]);
rmm::mr::set_current_device_resource(original_device_resource_[i]);
}
#endif
}
bool ResourceManagerCore::p2p_enabled(int src_device_id, int dst_device_id) const {
return p2p_matrix_[src_device_id][dst_device_id];
Expand All @@ -240,12 +227,35 @@ bool ResourceManagerCore::all_p2p_enabled() const {
return true;
}

#ifndef DISABLE_CUDF
// Set up one RMM pool memory resource per local GPU: a pool_memory_resource
// (256 MiB initial size) layered on a plain cuda_memory_resource. Unless the
// HCTR_RMM_SETTABLE environment variable starts with '0', each pool is also
// installed as the current per-device RMM resource; the previously-current
// resources returned by set_current_device_resource are saved in
// original_device_resource_ (presumably so the destructor can restore them —
// confirm against ~ResourceManagerCore).
void ResourceManagerCore::initialize_rmm_resources() {
// Initial pool size: 256 MiB per device.
const size_t pool_alloc_size = 256 * 1024 * 1024;
using dmmr = rmm::mr::device_memory_resource;
// Read the env var once (static) — "0" disables installing the pools as the
// per-device default resources.
static const char* allow_set_char = getenv("HCTR_RMM_SETTABLE");
bool allow_set = true;
if (allow_set_char && allow_set_char[0] == '0') {
allow_set = false;
}
// RAII device switcher; restores the previous device when it goes out of scope.
CudaDeviceContext context;
auto local_gpu_device_id_list = get_local_gpu_device_id_list();
for (size_t i = 0; i < local_gpu_device_id_list.size(); i++) {
context.set_device(local_gpu_device_id_list[i]);
base_cuda_mr_.emplace_back(std::make_shared<rmm::mr::cuda_memory_resource>());
memory_resource_.emplace_back(std::make_shared<rmm::mr::pool_memory_resource<dmmr>>(
base_cuda_mr_.back().get(), pool_alloc_size));
if (allow_set) {
// set_current_device_resource returns the resource that was current before.
original_device_resource_.push_back(
rmm::mr::set_current_device_resource(memory_resource_.back().get()));
}
}
}

// Return the pooled RMM memory resource associated with the given local GPU
// device id (as listed by get_local_gpu_device_id_list()).
//
// BUG FIX: previously an unknown local_gpu_id was not handled — std::find
// returned end(), std::distance produced an index one past the last element,
// and memory_resource_[index] was an out-of-bounds access (undefined
// behavior). Now we throw instead. NOTE(review): the project may prefer its
// HCTR_OWN_THROW macro here; std::out_of_range is used to stay self-contained.
const std::shared_ptr<rmm::mr::device_memory_resource>&
ResourceManagerCore::get_device_rmm_device_memory_resource(int local_gpu_id) const {
  auto dev_list = get_local_gpu_device_id_list();
  auto it = std::find(dev_list.begin(), dev_list.end(), local_gpu_id);
  if (it == dev_list.end()) {
    throw std::out_of_range("get_device_rmm_device_memory_resource: unknown local_gpu_id");
  }
  auto index = std::distance(dev_list.begin(), it);
  return memory_resource_[index];
}

#endif
} // namespace HugeCTR
3 changes: 2 additions & 1 deletion tools/dlrm_script/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ if (NOT CUDF_RESULT)
# Split "MAJOR.MINOR.PATCH" into its components.
list(GET CUDF_VERSION_LIST 0 CUDF_VERSION_MAJOR)
list(GET CUDF_VERSION_LIST 1 CUDF_VERSION_MINOR)
# BUG FIX: the patch component is at index 2, not 1 (index 1 duplicated MINOR).
list(GET CUDF_VERSION_LIST 2 CUDF_VERSION_PATCH)

add_compile_definitions(CUDF_VERSION_MAJOR=${CUDF_VERSION_MAJOR})
add_compile_definitions(CUDF_VERSION_MINOR=${CUDF_VERSION_MINOR})
# CUDF_GE_2306 means "cudf >= 23.06": use GREATER_EQUAL so 23.06 itself
# qualifies (GREATER 6 excluded it), and accept any later major version.
if("${CUDF_VERSION_MAJOR}" GREATER 23 OR
   ("${CUDF_VERSION_MAJOR}" EQUAL 23 AND "${CUDF_VERSION_MINOR}" GREATER_EQUAL 6))
  add_definitions(-DCUDF_GE_2306)
endif()
Expand Down
17 changes: 17 additions & 0 deletions tools/dlrm_script/dlrm_raw.cu
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,15 @@ void process_kaggle_dataset(const std::string &input_dir_path, const std::string
int loop_count = 0;
while (true) {
total_file_bytes_read += in_args.get_byte_range_size();
// Select the read_csv() overload matching the installed cudf version:
// cudf 23.12 added an explicit stream parameter, kept in all later releases.
// BUG FIX: the major>23 branch passed `mr`, which is not in scope here — the
// memory resource in this function is `p_mr` (as the other branches use).
#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR == 23 && defined(CUDF_VERSION_MINOR) && \
    CUDF_VERSION_MINOR >= 12
      cudf_io::table_with_metadata tbl_w_metadata =
          cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
#elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
      cudf_io::table_with_metadata tbl_w_metadata =
          cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
#else
      cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
#endif
total_row_nums += tbl_w_metadata.tbl->num_rows();

dim3 block(prop.maxThreadsPerBlock, 1, 1);
Expand Down Expand Up @@ -488,7 +496,16 @@ void process_terabyte_dataset(const std::string &input_dir_path, const std::stri
int loop_count = 0;
while (true) {
total_file_bytes_read += in_args.get_byte_range_size();
// Select the read_csv() overload matching the installed cudf version:
// cudf 23.12 added an explicit stream parameter, kept in all later releases.
// BUG FIX: the major>23 branch passed `mr`, which is not in scope here — the
// memory resource in this function is `p_mr` (as the other branches use).
// Also removed a stray blank line inside the conditional.
#if defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR == 23 && defined(CUDF_VERSION_MINOR) && \
    CUDF_VERSION_MINOR >= 12
      cudf_io::table_with_metadata tbl_w_metadata =
          cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
#elif defined(CUDF_VERSION_MAJOR) && CUDF_VERSION_MAJOR > 23
      cudf_io::table_with_metadata tbl_w_metadata =
          cudf_io::read_csv(in_args, cudf::get_default_stream(), p_mr);
#else
      cudf_io::table_with_metadata tbl_w_metadata = cudf_io::read_csv(in_args, p_mr);
#endif
total_row_nums += tbl_w_metadata.tbl->num_rows();

dim3 block(prop.maxThreadsPerBlock, 1, 1);
Expand Down
8 changes: 8 additions & 0 deletions tools/dlrm_script/dlrm_raw_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,15 @@ size_t convert_input_binaries(rmm::mr::device_memory_resource *mr, std::string i

while (true) {
process_read_bytes += in_args.get_byte_range_size();
// cudf 23.12 added an explicit stream parameter to read_csv(); every later
// release (major > 23) keeps it. The former #if and #elif branches had
// byte-identical bodies, so they are merged into one condition. Also removed
// a stray blank line inside the conditional.
#if defined(CUDF_VERSION_MAJOR) &&                               \
    (CUDF_VERSION_MAJOR > 23 ||                                  \
     (CUDF_VERSION_MAJOR == 23 && defined(CUDF_VERSION_MINOR) && \
      CUDF_VERSION_MINOR >= 12))
      auto tbl_w_metadata = cudf_io::read_csv(in_args, cudf::get_default_stream(), mr);
#else
      auto tbl_w_metadata = cudf_io::read_csv(in_args, mr);
#endif
int32_t num_rows = tbl_w_metadata.tbl->num_rows();
read_row_nums += num_rows;

Expand Down

0 comments on commit af9c40c

Please sign in to comment.