patch dataproc bm scripts and instructions [skip ci] #819

Merged: 4 commits, Jan 12, 2025
notebooks/dataproc/README.md (3 additions, 4 deletions)

@@ -28,11 +28,10 @@ If you already have a Dataproc account, you can run the example notebooks on a D
 ```
 - Create a cluster with at least two single-gpu workers. **Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS:
 ```
-export CUDA_VERSION=11.8
 export RAPIDS_VERSION=24.12.0
 
 gcloud dataproc clusters create $USER-spark-rapids-ml \
---image-version=2.1-ubuntu \
+--image-version=2.2-ubuntu22 \
 --region ${COMPUTE_REGION} \
 --master-machine-type n1-standard-16 \
 --master-accelerator type=nvidia-tesla-t4,count=1 \
@@ -42,11 +41,11 @@ If you already have a Dataproc account, you can run the example notebooks on a D
 --worker-machine-type n1-standard-16 \
 --num-worker-local-ssds 4 \
 --worker-local-ssd-interface=NVME \
---initialization-actions gs://goog-dataproc-initialization-actions-us-central1/gpu/install_gpu_driver.sh,gs://${GCS_BUCKET}/spark_rapids.sh,gs://${GCS_BUCKET}/spark_rapids_ml.sh \
+--initialization-actions gs://${GCS_BUCKET}/spark-rapids.sh,gs://${GCS_BUCKET}/spark_rapids_ml.sh \
+--initialization-action-timeout=20m \
 --optional-components=JUPYTER \
 --metadata gpu-driver-provider="NVIDIA" \
 --metadata rapids-runtime=SPARK \
---metadata cuda-version=${CUDA_VERSION} \
 --metadata rapids-version=${RAPIDS_VERSION} \
 --bucket ${GCS_BUCKET} \
 --enable-component-gateway \
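With this patch the cluster pulls both initialization actions from the user's own bucket rather than the regional goog-dataproc-initialization-actions bucket, so both scripts must be staged there before the gcloud command runs. A minimal sketch of that staging step, assuming GCS_BUCKET is already exported; the source path for spark-rapids.sh is an assumption, not part of this PR:

```bash
# Hypothetical staging step for the two init actions referenced by
# --initialization-actions above; adjust source paths to your checkout.
gsutil cp spark-rapids.sh gs://${GCS_BUCKET}/spark-rapids.sh
gsutil cp notebooks/dataproc/spark_rapids_ml.sh gs://${GCS_BUCKET}/spark_rapids_ml.sh
```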
notebooks/dataproc/spark_rapids_ml.sh (4 additions, 9 deletions)

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,17 +16,12 @@
 
 RAPIDS_VERSION=24.12.0
 
-# patch existing packages
-mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2"
-
-# dataproc 2.1 pyarrow and arrow conda installation is not compatible with cudf
-mamba uninstall -y pyarrow arrow
 
 # install cudf and cuml
 pip install --upgrade pip
-pip install cudf-cu11~=${RAPIDS_VERSION} cuml-cu11~=${RAPIDS_VERSION} cuvs-cu11~=${RAPIDS_VERSION} \
-    pylibraft-cu11~=${RAPIDS_VERSION} \
-    rmm-cu11~=${RAPIDS_VERSION} \
+pip install cudf-cu12~=${RAPIDS_VERSION} cuml-cu12~=${RAPIDS_VERSION} cuvs-cu12~=${RAPIDS_VERSION} \
+    pylibraft-cu12~=${RAPIDS_VERSION} \
+    rmm-cu12~=${RAPIDS_VERSION} \
    --extra-index-url=https://pypi.nvidia.com
 
 # install spark-rapids-ml
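The init action now installs cu12 wheels, presumably because the Dataproc 2.2 image line pairs with CUDA 12; that also explains dropping the CUDA 11 era llvmlite/numba and pyarrow mamba workarounds. One quick way to confirm the script did its job is to import the wheels on a cluster node. A hypothetical sanity check, not part of the patch:

```bash
# Hypothetical post-install check, run on a cluster node:
# confirm the cu12 wheels resolved to the pinned 24.12 release line.
python -c "import cudf, cuml; print(cudf.__version__, cuml.__version__)"
```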
python/benchmark/dataproc/init_benchmark.sh (4 additions, 9 deletions)

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,19 +24,14 @@ function get_metadata_attribute() {
 
 RAPIDS_VERSION=$(get_metadata_attribute rapids-version 24.12.0)
 
-# patch existing packages
-mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2"
-
 # install cudf and cuml
 # using ~= pulls in lates micro version patches
 pip install --upgrade pip
 
-# dataproc 2.1 pyarrow and arrow conda installation is not compatible with cudf
-mamba uninstall -y pyarrow arrow
 
-pip install cudf-cu11~=${RAPIDS_VERSION} cuml-cu11~=${RAPIDS_VERSION} cuvs-cu11~=${RAPIDS_VERSION} \
-    pylibraft-cu11~=${RAPIDS_VERSION} \
-    rmm-cu11~=${RAPIDS_VERSION} \
+pip install cudf-cu12~=${RAPIDS_VERSION} cuml-cu12~=${RAPIDS_VERSION} cuvs-cu12~=${RAPIDS_VERSION} \
+    pylibraft-cu12~=${RAPIDS_VERSION} \
+    rmm-cu12~=${RAPIDS_VERSION} \
    --extra-index-url=https://pypi.nvidia.com
 
 # install benchmark files
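Here the RAPIDS version pin is read from cluster metadata via get_metadata_attribute, whose definition sits above this hunk. For orientation, a sketch of how that helper is conventionally written in Dataproc init actions (this is the common pattern from the initialization-actions repo, not code from this diff):

```bash
# Conventional Dataproc metadata helper (sketch, not from this diff):
# returns the named cluster metadata attribute, or a default if unset.
function get_metadata_attribute() {
  local -r attribute_name="$1"
  local -r default_value="$2"
  /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
}
```

This is what lets the cluster-creation command's `--metadata rapids-version=${RAPIDS_VERSION}` flag flow through to the pip installs above.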
python/benchmark/dataproc/start_cluster.sh (3 additions, 2 deletions)

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@ gpu_args=$(cat <<EOF
    --master-accelerator type=nvidia-tesla-t4,count=1
    --worker-accelerator type=nvidia-tesla-t4,count=1
    --initialization-actions gs://${BENCHMARK_HOME}/spark-rapids.sh,gs://${BENCHMARK_HOME}/init_benchmark.sh
+   --initialization-action-timeout=20m
    --metadata gpu-driver-provider="NVIDIA"
    --metadata rapids-runtime=SPARK
    --metadata benchmark-home=${BENCHMARK_HOME}
@@ -62,7 +63,7 @@ if [[ $? == 0 ]]; then
 else
    set -x
    gcloud dataproc clusters create ${cluster_name} \
-   --image-version=2.1-ubuntu \
+   --image-version=2.2-ubuntu22 \
    --region ${COMPUTE_REGION} \
    --master-machine-type n1-standard-16 \
    --num-workers 2 \
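Taken together, a hypothetical launch of the patched benchmark cluster. How the script is driven is an assumption here: the diff only shows that it expands ${BENCHMARK_HOME} and ${COMPUTE_REGION}, so this sketch supplies them as environment variables and stages the init actions first:

```bash
# Hypothetical end-to-end launch; paths and variable handling are
# assumptions about how the script is invoked, not part of the diff.
export BENCHMARK_HOME=my-bucket/benchmark   # assumption: GCS prefix holding the init actions
export COMPUTE_REGION=us-central1           # assumption: your region

gsutil cp spark-rapids.sh python/benchmark/dataproc/init_benchmark.sh gs://${BENCHMARK_HOME}/
./python/benchmark/dataproc/start_cluster.sh
```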