Skip to content

Commit

Permalink
Provide error message when a pool has taken all memory. (#11173)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Jan 22, 2025
1 parent eb980fc commit a97f379
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
6 changes: 6 additions & 0 deletions src/common/error_msg.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,5 +118,11 @@ constexpr StringView InconsistentFeatureTypes() {
}

void CheckOldNccl(std::int32_t major, std::int32_t minor, std::int32_t patch);

// Message for the situation where no GPU memory is left for XGBoost. The text
// points the user at the two remedies it names: installing XGBoost with RMM
// support, or reserving part of the GPU memory when another memory pool is used.
constexpr StringView ZeroCudaMemory() {
  // Adjacent literals are concatenated by the compiler into a single string;
  // they are split at sentence boundaries purely for readability.
  return "No GPU memory is left, are you using RMM? "
         "If so, please install XGBoost with RMM support. "
         "If you are using other types of memory pool, "
         "please consider reserving a portion of the GPU memory for XGBoost.";
}
} // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_
2 changes: 1 addition & 1 deletion src/common/hist_util.cu
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,12 @@ bst_idx_t SketchBatchNumElements(bst_idx_t sketch_batch_num_elements, SketchShap
return std::min(static_cast<bst_idx_t>(n_max_used_f32), shape.nnz);
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
(void)container_bytes; // We know the remaining size when RMM is not used.

if (sketch_batch_num_elements == detail::UnknownSketchNumElements()) {
auto required_memory =
RequiredMemory(shape.n_samples, shape.n_features, shape.nnz, num_cuts, has_weight);
// use up to 80% of available space
auto avail = dh::AvailableMemory(device) * 0.8;
CHECK_GT(avail, 0) << error::ZeroCudaMemory();
if (required_memory > avail) {
sketch_batch_num_elements = avail / BytesPerElement(has_weight);
} else {
Expand Down

0 comments on commit a97f379

Please sign in to comment.