From 50226178dde53a3e41d6b22de7fcc0e679dcc143 Mon Sep 17 00:00:00 2001 From: Shiva Krishna Merla Date: Thu, 7 Sep 2023 23:59:38 +0000 Subject: [PATCH] Cherry-picks for v0.5.4 release --- cmd/nvidia-mig-parted/main.go | 2 +- .../nvidia-mig-manager-example-hopper.yaml | 53 ++++++++++++++++++- .../nvidia-mig-manager-example.yaml | 2 +- deployments/systemd/config-default.yaml | 4 +- deployments/systemd/packages/debian/changelog | 13 +++++ .../rpm/SPECS/nvidia-mig-manager.spec | 7 +++ versions.mk | 6 +-- 7 files changed, 78 insertions(+), 9 deletions(-) diff --git a/cmd/nvidia-mig-parted/main.go b/cmd/nvidia-mig-parted/main.go index 955c4c30..38333c71 100644 --- a/cmd/nvidia-mig-parted/main.go +++ b/cmd/nvidia-mig-parted/main.go @@ -43,7 +43,7 @@ func main() { c.UseShortOptionHandling = true c.EnableBashCompletion = true c.Usage = "Manage MIG partitions across the full set of NVIDIA GPUs on a node" - c.Version = "0.5.2" + c.Version = "0.5.4" // Setup the flags for this command c.Flags = []cli.Flag{ diff --git a/deployments/gpu-operator/nvidia-mig-manager-example-hopper.yaml b/deployments/gpu-operator/nvidia-mig-manager-example-hopper.yaml index c9fc540f..d888e662 100644 --- a/deployments/gpu-operator/nvidia-mig-manager-example-hopper.yaml +++ b/deployments/gpu-operator/nvidia-mig-manager-example-hopper.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: nvidia-mig-manager-service-account containers: - name: nvidia-mig-manager - image: nvcr.io/nvidia/cloud-native/k8s-mig-manager:v0.5.2-ubi8 + image: nvcr.io/nvidia/cloud-native/k8s-mig-manager:v0.5.4-ubi8 imagePullPolicy: IfNotPresent env: - name: NODE_NAME @@ -144,10 +144,59 @@ data: mig-devices: "7g.80gb": 1 - all-balanced: + # H100 NVL, H800 NVL + all-1g.12gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.12gb": 7 + + all-1g.12gb.me: + - devices: all + mig-enabled: true + mig-devices: + "1g.12gb+me": 1 + + all-1g.24gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.24gb": 4 + + all-2g.24gb: + - devices: all + mig-enabled: true + mig-devices: + "2g.24gb": 3 + + all-3g.47gb: + - devices: all + mig-enabled: true + mig-devices: + "3g.47gb": 2 + + all-7g.94gb: - devices: all + mig-enabled: true + mig-devices: + "7g.94gb": 1 + + # H100-80GB, H100-NVL, H800-80GB, H800-NVL + all-balanced: + # H100-80GB, H800-80GB + - device-filter: ["0x233110DE", "0x233010DE", "0x232210DE"] + devices: all mig-enabled: true mig-devices: "1g.10gb": 2 "2g.20gb": 1 "3g.40gb": 1 + + # H100 NVL, H800 NVL + - device-filter: ["0x232110DE", "0x233A10DE"] + devices: all + mig-enabled: true + mig-devices: + "1g.12gb": 1 + "2g.24gb": 1 + "3g.47gb": 1 diff --git a/deployments/gpu-operator/nvidia-mig-manager-example.yaml b/deployments/gpu-operator/nvidia-mig-manager-example.yaml index 976a9c87..c457ebaa 100644 --- a/deployments/gpu-operator/nvidia-mig-manager-example.yaml +++ b/deployments/gpu-operator/nvidia-mig-manager-example.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: nvidia-mig-manager-service-account containers: - name: nvidia-mig-manager - image: nvcr.io/nvidia/cloud-native/k8s-mig-manager:v0.5.2-ubi8 + image: nvcr.io/nvidia/cloud-native/k8s-mig-manager:v0.5.4-ubi8 imagePullPolicy: IfNotPresent env: - name: NODE_NAME diff --git a/deployments/systemd/config-default.yaml b/deployments/systemd/config-default.yaml index fc9deaf4..e7e77ef4 100644 --- a/deployments/systemd/config-default.yaml +++ b/deployments/systemd/config-default.yaml @@ -44,7 +44,7 @@ mig-configs: # H100-80GB, H800-80GB, A100-80GB, A800-80GB, A100-40GB, A800-40GB all-1g.10gb: # H100-80GB, H800-80GB, A100-80GB, A800-80GB - - device-filter: ["0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE"] + - device-filter: ["0x233110DE", "0x233010DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE"] devices: all mig-enabled: true mig-devices: @@ -163,7 +163,7 @@ mig-configs: "3g.20gb": 1 # H100-80GB, H800-80GB, A100-80GB, A800-80GB - - device-filter: ["0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE"] + - device-filter: ["0x233110DE", "0x233010DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE"] devices: all mig-enabled: true mig-devices: diff --git a/deployments/systemd/packages/debian/changelog b/deployments/systemd/packages/debian/changelog index a0f1e1a5..c6adced3 100644 --- a/deployments/systemd/packages/debian/changelog +++ b/deployments/systemd/packages/debian/changelog @@ -1,3 +1,16 @@ +nvidia-mig-manager (0.5.4-1) UNRELEASED; urgency=medium + + * Update MIG config for Hopper with device ID of H100 80GB HBM3 SKU + + -- NVIDIA CORPORATION Thu, 7 Sep 2023 21:27:44 +0200 + +nvidia-mig-manager (0.5.3-1) UNRELEASED; urgency=medium + + * Update to latest CUDA image 12.2.0 + * Update example config for Hopper with H100 NVL and H800 NVL + + -- NVIDIA CORPORATION Wed, 12 Jul 2023 11:06:03 +0200 + nvidia-mig-manager (0.5.2-1) UNRELEASED; urgency=medium * Update to latest CUDA image 12.1.0 diff --git a/deployments/systemd/packages/rpm/SPECS/nvidia-mig-manager.spec b/deployments/systemd/packages/rpm/SPECS/nvidia-mig-manager.spec index 66d75408..3c4e8eeb 100644 --- a/deployments/systemd/packages/rpm/SPECS/nvidia-mig-manager.spec +++ b/deployments/systemd/packages/rpm/SPECS/nvidia-mig-manager.spec @@ -133,6 +133,13 @@ maybe_remove_hooks_symlink maybe_remove_config_symlink %changelog +* Thu Sep 7 2023 NVIDIA CORPORATION 0.5.4-1 +- Update MIG config for Hopper with device ID of H100 80GB HBM3 SKU + +* Wed Jul 12 2023 NVIDIA CORPORATION 0.5.3-1 +- Update to latest CUDA image 12.2.0 +- Update example config for Hopper with H100 NVL and H800 NVL + * Tue Mar 28 2023 NVIDIA CORPORATION 0.5.2-1 - Update to latest CUDA image 12.1.0 - Update k8s-mig-manager to support CDI diff --git a/versions.mk b/versions.mk index 5af5de4e..58772345 100644 --- a/versions.mk +++ b/versions.mk @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -VERSION ?= 0.5.2 +VERSION ?= 0.5.4 vVERSION := v$(VERSION:v%=%) -CUDA_VERSION := 12.1.0 +CUDA_VERSION := 12.2.0 GOLANG_VERSION := 1.20.1 -NVIDIA_CTK_VERSION := f6983969ad5d67b84adfda8eee5b43083790ff22 +NVIDIA_CTK_VERSION := v1.13.4