Skip to content

Commit

Permalink
fix: GPU bootstrap, refresh driver versions and list of supported GPU…
Browse files Browse the repository at this point in the history
… VM SKUs (#587)
  • Loading branch information
tallaxes committed Dec 18, 2024
1 parent 7668db3 commit 475fb87
Show file tree
Hide file tree
Showing 15 changed files with 249 additions and 145 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ repos:
hooks:
- id: shellcheck
- repo: https://github.com/crate-ci/typos
rev: v1.17.2
rev: v1.28.1
hooks:
- id: typos
- repo: https://github.com/pre-commit/pre-commit-hooks
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ require (
go.uber.org/multierr v1.11.0
go.uber.org/zap v1.27.0
golang.org/x/crypto v0.27.0
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.29.3
k8s.io/apiextensions-apiserver v0.29.3
k8s.io/apimachinery v0.29.3
Expand Down Expand Up @@ -146,7 +147,6 @@ require (
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/dnaeon/go-vcr.v3 v3.2.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/cloud-provider v0.29.3 // indirect
k8s.io/component-base v0.29.3 // indirect
Expand Down
6 changes: 3 additions & 3 deletions pkg/providers/imagefamily/azlinux.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ func (u AzureLinux) UserData(kubeletConfig *corev1beta1.KubeletConfiguration, ta
CABundle: caBundle,
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
// GPUImageSHA: u.Options.GPUImageSHA - GPU image SHA only applies to Ubuntu
// See: https://github.com/Azure/AgentBaker/blob/f393d6e4d689d9204d6000c85623ad9b764e2a29/vhdbuilder/packer/install-dependencies.sh#L201
SubnetID: u.Options.SubnetID,
GPUDriverType: u.Options.GPUDriverType,
GPUImageSHA: u.Options.GPUImageSHA,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
TenantID: u.Options.TenantID,
Expand Down
2 changes: 2 additions & 0 deletions pkg/providers/imagefamily/bootstrap/aksbootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ type NodeBootstrapVariables struct {
SwapFileSizeMB int // t user input
GPUImageSHA string // s static sha rarely updated
GPUDriverVersion string // k determine by OS + GPU hardware requirements; can be determined automatically, but hard. suggest using GPU operator.
GPUDriverType string // k
GPUInstanceProfile string // t user-specified
CustomSearchDomainName string // c user-specified [presumably cluster-level]
CustomSearchRealmUser string // c user-specified [presumably cluster-level]
Expand Down Expand Up @@ -472,6 +473,7 @@ func (a AKS) applyOptions(nbv *NodeBootstrapVariables) {
nbv.GPUNode = true
nbv.ConfigGPUDriverIfNeeded = true
nbv.GPUDriverVersion = a.GPUDriverVersion
nbv.GPUDriverType = a.GPUDriverType
nbv.GPUImageSHA = a.GPUImageSHA
}

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ type Options struct {
CABundle *string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
SubnetID string
}
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ KUBELET_CONFIG_FILE_CONTENT="{{.KubeletConfigFileContent}}"
SWAP_FILE_SIZE_MB="{{.SwapFileSizeMB}}"
GPU_IMAGE_SHA="{{.GPUImageSHA}}"
GPU_DRIVER_VERSION="{{.GPUDriverVersion}}"
GPU_DRIVER_TYPE="{{.GPUDriverType}}"
GPU_INSTANCE_PROFILE="{{.GPUInstanceProfile}}"
CUSTOM_SEARCH_DOMAIN_NAME="{{.CustomSearchDomainName}}"
CUSTOM_SEARCH_REALM_USER="{{.CustomSearchRealmUser}}"
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/ubuntu_2204.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ func (u Ubuntu2204) UserData(kubeletConfig *corev1beta1.KubeletConfiguration, ta
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
GPUImageSHA: u.Options.GPUImageSHA,
GPUDriverType: u.Options.GPUDriverType,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
Expand Down
53 changes: 27 additions & 26 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -646,12 +646,10 @@ var _ = Describe("InstanceType Provider", func() {
nodes := &v1.NodeList{}
Expect(env.Client.List(ctx, nodes)).To(Succeed())
for _, node := range nodes.Items {
Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2"))

}
}

})

DescribeTable("Should not return unavailable offerings", func(azEnv *test.Environment) {
Expand Down Expand Up @@ -697,7 +695,7 @@ var _ = Describe("InstanceType Provider", func() {
}}}
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2"))
})
It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() {
Expand Down Expand Up @@ -1032,20 +1030,15 @@ var _ = Describe("InstanceType Provider", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
node := ExpectScheduled(ctx, env.Client, pod)

Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM
Expect(vm.Properties).ToNot(BeNil())
Expect(vm.Properties.HardwareProfile).ToNot(BeNil())
Expect(utils.IsNvidiaEnabledSKU(string(*vm.Properties.HardwareProfile.VMSize))).To(BeFalse())

clusterNodes := cluster.Nodes()
node := clusterNodes[0]
if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "0"))
}
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0"))
})

It("should schedule GPU pod on GPU capable node", func() {
Expand Down Expand Up @@ -1075,23 +1068,31 @@ var _ = Describe("InstanceType Provider", func() {
})

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)

// Verify that the node has the GPU label set that the pod was scheduled on
clusterNodes := cluster.Nodes()
Expect(clusterNodes).ToNot(BeEmpty())
Expect(len(clusterNodes)).To(Equal(1))
node := clusterNodes[0]
Expect(node.Node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))

if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
node := ExpectScheduled(ctx, env.Client, pod)

Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-name", "A100"))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "1"))
// the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption
Expect(node.Labels).To(HaveKeyWithValue("node.kubernetes.io/instance-type", "Standard_NC16as_T4_v3"))

// Verify GPU related settings in bootstrap (assuming one Standard_NC16as_T4_v3)
customData := ExpectDecodedCustomData(azureEnv)
Expect(customData).To(SatisfyAll(
ContainSubstring("GPU_NODE=true"),
ContainSubstring("SGX_NODE=false"),
ContainSubstring("MIG_NODE=false"),
ContainSubstring("CONFIG_GPU_DRIVER_IF_NEEDED=true"),
ContainSubstring("ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED=false"),
ContainSubstring("GPU_DRIVER_TYPE=\"cuda\""),
ContainSubstring(fmt.Sprintf("GPU_DRIVER_VERSION=\"%s\"", utils.NvidiaCudaDriverVersion)),
ContainSubstring(fmt.Sprintf("GPU_IMAGE_SHA=\"%s\"", utils.AKSGPUCudaVersionSuffix)),
ContainSubstring("GPU_NEEDS_FABRIC_MANAGER=\"false\""),
ContainSubstring("GPU_INSTANCE_PROFILE=\"\""),
))

}
// Verify that the node the pod was scheduled on has GPU resource and labels set
Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4"))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "1"))
})
})

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/launchtemplate/launchtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ func (p *Provider) getStaticParameters(ctx context.Context, instanceType *cloudp
Arch: arch,
GPUNode: utils.IsNvidiaEnabledSKU(instanceType.Name),
GPUDriverVersion: utils.GetGPUDriverVersion(instanceType.Name),
GPUDriverType: utils.GetGPUDriverType(instanceType.Name),
GPUImageSHA: utils.GetAKSGPUImageSHA(instanceType.Name),
TenantID: p.tenantID,
SubscriptionID: p.subscriptionID,
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/launchtemplate/parameters/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ type StaticParameters struct {
Arch string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
TenantID string
SubscriptionID string
Expand Down
20 changes: 19 additions & 1 deletion pkg/test/expectations/expectations.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,31 @@ limitations under the License.
package expectations

import (
"encoding/base64"
"fmt"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

"github.com/Azure/karpenter-provider-azure/pkg/fake"
"github.com/Azure/karpenter-provider-azure/pkg/test"
. "github.com/onsi/gomega"
)

func ExpectUnavailable(env *test.Environment, instanceType string, zone string, capacityType string) {
Expect(env.UnavailableOfferingsCache.IsUnavailable(instanceType, fmt.Sprintf("%s-%s", fake.Region, zone), capacityType)).To(BeTrue())
}

func ExpectDecodedCustomData(env *test.Environment) string {
GinkgoHelper()
Expect(env.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))

vm := env.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM
customData := *vm.Properties.OSProfile.CustomData
Expect(customData).ToNot(BeNil())

decodedBytes, err := base64.StdEncoding.DecodeString(customData)
Expect(err).To(Succeed())
decodedString := string(decodedBytes[:])

return decodedString
}
Loading

0 comments on commit 475fb87

Please sign in to comment.