Skip to content

Commit

Permalink
Merge pull request #1199 from NVIDIA/add-kmod-type
Browse files Browse the repository at this point in the history
add new field kernelModuleType to the API Spec
  • Loading branch information
tariq1890 authored Jan 14, 2025
2 parents 55cb484 + 85b71aa commit 8efa2d1
Show file tree
Hide file tree
Showing 17 changed files with 129 additions and 126 deletions.
39 changes: 9 additions & 30 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,8 @@ import (
kata_v1alpha1 "github.com/NVIDIA/k8s-kata-manager/api/v1alpha1/config"
upgrade_v1alpha1 "github.com/NVIDIA/k8s-operator-libs/api/upgrade/v1alpha1"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"golang.org/x/mod/semver"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/NVIDIA/gpu-operator/internal/consts"
)

// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
Expand Down Expand Up @@ -482,12 +479,20 @@ type DriverSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
UsePrecompiled *bool `json:"usePrecompiled,omitempty"`

// Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead.
// UseOpenKernelModules indicates if the open GPU kernel modules should be used
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable use of open GPU kernel modules"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
UseOpenKernelModules *bool `json:"useOpenKernelModules,omitempty"`

// KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver.
// Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module
// type is chosen based on the GPU devices on the host and the driver branch used
// +kubebuilder:validation:Enum=auto;open;proprietary
// +kubebuilder:default=auto
KernelModuleType string `json:"kernelModuleType,omitempty"`

// Enabled indicates if deployment of NVIDIA Driver through operator is enabled
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA Driver deployment through GPU Operator"
Expand Down Expand Up @@ -1856,11 +1861,7 @@ func (d *DriverSpec) UsePrecompiledDrivers() bool {

// OpenKernelModulesEnabled returns true if driver install is enabled using open GPU kernel modules
func (d *DriverSpec) OpenKernelModulesEnabled() bool {
if d.UseOpenKernelModules == nil {
// default is false if not specified by user
return false
}
return *d.UseOpenKernelModules
return d.KernelModuleType == "open"
}

// IsEnabled returns true if device-plugin is enabled(default) through gpu-operator
Expand Down Expand Up @@ -2000,28 +2001,6 @@ func (gds *GPUDirectStorageSpec) IsEnabled() bool {
return *gds.Enabled
}

// IsOpenKernelModulesRequired reports whether this GPUDirect Storage
// configuration requires the NVIDIA OpenRM (open kernel module) drivers.
//
// It returns false when GDS is disabled. When GDS is enabled it returns true
// if Version is given as an image digest (a "sha256" prefix), or if Version,
// compared as a semantic version, is greater than or equal to
// consts.MinimumGDSVersionForOpenRM.
func (gds *GPUDirectStorageSpec) IsOpenKernelModulesRequired() bool {
	// Add constraints here which require OpenRM drivers
	if !gds.IsEnabled() {
		return false
	}

	// If image digest is provided instead of the version, assume that OpenRM driver is required
	// (a digest carries no version information to compare against the minimum).
	if strings.HasPrefix(gds.Version, "sha256") {
		return true
	}

	// Normalize the version to carry a leading "v" before comparing, so that
	// values such as "2.17.6" are treated the same as "v2.17.6".
	gdsVersion := gds.Version
	if !strings.HasPrefix(gdsVersion, "v") {
		gdsVersion = fmt.Sprintf("v%s", gdsVersion)
	}
	// OpenRM is required for any version at or above the configured minimum.
	if semver.Compare(gdsVersion, consts.MinimumGDSVersionForOpenRM) >= 0 {
		return true
	}
	return false
}

// IsEnabled returns true if GDRCopy is enabled through gpu-operator
func (gdrcopy *GDRCopySpec) IsEnabled() bool {
if gdrcopy.Enabled == nil {
Expand Down
13 changes: 9 additions & 4 deletions api/nvidia/v1alpha1/nvidiadriver_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,20 @@ type NVIDIADriverSpec struct {
// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="usePrecompiled is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting."
UsePrecompiled *bool `json:"usePrecompiled,omitempty"`

// Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead.
// UseOpenKernelModules indicates if the open GPU kernel modules should be used
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable use of open GPU kernel modules"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
UseOpenKernelModules *bool `json:"useOpenKernelModules,omitempty"`

// KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver.
// Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module
// type is chosen based on the GPU devices on the host and the driver branch used
// +kubebuilder:validation:Enum=auto;open;proprietary
// +kubebuilder:default=auto
KernelModuleType string `json:"kernelModuleType,omitempty"`

// NVIDIA Driver container startup probe settings
StartupProbe *ContainerProbeSpec `json:"startupProbe,omitempty"`

Expand Down Expand Up @@ -642,10 +650,7 @@ func (d *NVIDIADriverSpec) IsGDRCopyEnabled() bool {

// IsOpenKernelModulesEnabled returns true if NVIDIA OpenRM drivers are enabled
func (d *NVIDIADriverSpec) IsOpenKernelModulesEnabled() bool {
if d.UseOpenKernelModules == nil || !*d.UseOpenKernelModules {
return false
}
return true
return d.KernelModuleType == "open"
}

// IsOpenKernelModulesRequired returns true if NVIDIA OpenRM drivers required in this configuration
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ metadata:
"driver": {
"enabled": true,
"useNvidiaDriverCRD": false,
"useOpenKernelModules": false,
"upgradePolicy": {
"autoUpgrade": true,
"drain": {
Expand All @@ -50,6 +49,7 @@ metadata:
"force": false,
"timeoutSeconds": 300
},
"kernelModuleType": "auto",
"maxParallelUpgrades": 1,
"maxUnavailable": "25%",
"podDeletion": {
Expand Down
16 changes: 14 additions & 2 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,17 @@ spec:
name:
type: string
type: object
kernelModuleType:
default: auto
description: |-
KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver.
Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module
type is chosen based on the GPU devices on the host and the driver branch used
enum:
- auto
- open
- proprietary
type: string
licensingConfig:
description: 'Optional: Licensing configuration for NVIDIA vGPU
licensing'
Expand Down Expand Up @@ -978,8 +989,9 @@ spec:
NVIDIA Driver is managed by the NVIDIADriver CRD type
type: boolean
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
description: |-
Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead.
UseOpenKernelModules indicates if the open GPU kernel modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA
Expand Down
16 changes: 14 additions & 2 deletions bundle/manifests/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,17 @@ spec:
name:
type: string
type: object
kernelModuleType:
default: auto
description: |-
KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver.
Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module
type is chosen based on the GPU devices on the host and the driver branch used
enum:
- auto
- open
- proprietary
type: string
labels:
additionalProperties:
type: string
Expand Down Expand Up @@ -684,8 +695,9 @@ spec:
type: object
type: array
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
description: |-
Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead.
UseOpenKernelModules indicates if the open GPU kernel modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA Driver
Expand Down
16 changes: 14 additions & 2 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,17 @@ spec:
name:
type: string
type: object
kernelModuleType:
default: auto
description: |-
KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver.
Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module
type is chosen based on the GPU devices on the host and the driver branch used
enum:
- auto
- open
- proprietary
type: string
licensingConfig:
description: 'Optional: Licensing configuration for NVIDIA vGPU
licensing'
Expand Down Expand Up @@ -978,8 +989,9 @@ spec:
NVIDIA Driver is managed by the NVIDIADriver CRD type
type: boolean
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
description: |-
Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead.
UseOpenKernelModules indicates if the open GPU kernel modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA
Expand Down
16 changes: 14 additions & 2 deletions config/crd/bases/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,17 @@ spec:
name:
type: string
type: object
kernelModuleType:
default: auto
description: |-
KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver.
Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module
type is chosen based on the GPU devices on the host and the driver branch used
enum:
- auto
- open
- proprietary
type: string
labels:
additionalProperties:
type: string
Expand Down Expand Up @@ -684,8 +695,9 @@ spec:
type: object
type: array
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
description: |-
Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead.
UseOpenKernelModules indicates if the open GPU kernel modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA Driver
Expand Down
14 changes: 9 additions & 5 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ const (
DefaultCCModeEnvName = "DEFAULT_CC_MODE"
// OpenKernelModulesEnabledEnvName is the name of the driver-container envvar for enabling open GPU kernel module support
OpenKernelModulesEnabledEnvName = "OPEN_KERNEL_MODULES_ENABLED"
// KernelModuleTypeEnvName is the name of the driver-container envvar to set the desired kernel module type
KernelModuleTypeEnvName = "KERNEL_MODULE_TYPE"
// MPSRootEnvName is the name of the envvar for configuring the MPS root
MPSRootEnvName = "MPS_ROOT"
// DefaultMPSRoot is the default MPS root path on the host
Expand Down Expand Up @@ -2664,9 +2666,6 @@ func transformGDSContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
if config.Driver.UsePrecompiledDrivers() {
return fmt.Errorf("GPUDirect Storage driver (nvidia-fs) is not supported along with pre-compiled NVIDIA drivers")
}
if config.GPUDirectStorage.IsOpenKernelModulesRequired() && !config.Driver.OpenKernelModulesEnabled() {
return fmt.Errorf("GPUDirect Storage driver '%s' is only supported with NVIDIA OpenRM drivers. Please set 'driver.useOpenKernelModules=true' in ClusterPolicy to enable OpenRM mode", config.GPUDirectStorage.Version)
}

gdsContainer := &obj.Spec.Template.Spec.Containers[i]

Expand Down Expand Up @@ -3166,8 +3165,13 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
setContainerEnv(driverContainer, env.Name, env.Value)
}
}
if config.Driver.OpenKernelModulesEnabled() {
setContainerEnv(driverContainer, OpenKernelModulesEnabledEnvName, "true")

if len(config.Driver.KernelModuleType) > 0 {
setContainerEnv(driverContainer, KernelModuleTypeEnvName, config.Driver.KernelModuleType)
// we set the "OPEN_KERNEL_MODULES_ENABLED" envvar for backwards compatibility with older driver containers
if config.Driver.OpenKernelModulesEnabled() {
setContainerEnv(driverContainer, OpenKernelModulesEnabledEnvName, "true")
}
}

// set container probe timeouts
Expand Down
68 changes: 0 additions & 68 deletions controllers/object_controls_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -952,74 +952,6 @@ func TestSandboxDevicePluginAssets(t *testing.T) {
}
}

// TestIsOpenKernelModulesRequired is a table-driven test for
// GPUDirectStorageSpec.IsOpenKernelModulesRequired. Each case supplies a GDS
// spec (enabled flag plus version string) and the expected boolean result.
// The cases cover: GDS disabled, an image digest in place of a version,
// versions below, equal to and above the threshold, versions without the
// leading "v", and versions with a pre-release suffix.
// NOTE(review): the "equal" case implies consts.MinimumGDSVersionForOpenRM is
// v2.17.5 — confirm against the constant's definition.
func TestIsOpenKernelModulesRequired(t *testing.T) {
	enable := true
	disable := false
	testCases := []struct {
		description string
		gds         *gpuv1.GPUDirectStorageSpec
		output      bool
	}{
		{
			// disabled GDS is expected to yield false
			"gds-disabled",
			&gpuv1.GPUDirectStorageSpec{Enabled: &disable, Version: "v2.14.5"},
			false,
		},
		{
			// an image digest instead of a version is expected to yield true
			"digest",
			&gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "sha256:8d1ec78f2b1ddb7f0c47453d0427231190747bda411733a7dd0c8f5196f09e9c"},
			true,
		},
		{
			"lower",
			&gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "v2.14.5"},
			false,
		},
		{
			"equal",
			&gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "v2.17.5"},
			true,
		},
		{
			"greater",
			&gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "v2.17.6"},
			true,
		},
		{
			"major-bump",
			&gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "v3.1.0"},
			true,
		},
		{
			// version strings without the leading "v"
			"non-semver",
			&gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "2.14.5"},
			false,
		},
		{
			"non-semver-greater",
			&gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "2.17.6"},
			true,
		},
		{
			// version strings with a pre-release suffix
			"lower-beta",
			&gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "2.14.6-beta"},
			false,
		},
		{
			"greater-beta",
			&gpuv1.GPUDirectStorageSpec{Enabled: &enable, Version: "2.17.6-beta"},
			true,
		},
	}

	// Run every case as its own subtest, named after its description.
	for _, tc := range testCases {
		t.Run(tc.description, func(t *testing.T) {
			isOpenRMRequired := tc.gds.IsOpenKernelModulesRequired()
			require.Equal(t, tc.output, isOpenRMRequired, "Incorrect status from IsOpenKernelModulesRequired() for GDS driver")
		})
	}
}

// getDCGMExporterTestInput return a ClusterPolicy instance for a particular
// dcgm-exporter test case.
func getDCGMExporterTestInput(testCase string) *gpuv1.ClusterPolicy {
Expand Down
16 changes: 14 additions & 2 deletions deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,17 @@ spec:
name:
type: string
type: object
kernelModuleType:
default: auto
description: |-
KernelModuleType represents the type of driver kernel modules to be used when installing the GPU driver.
Accepted values are auto, proprietary and open. NOTE: If auto is chosen, it means that the recommended kernel module
type is chosen based on the GPU devices on the host and the driver branch used
enum:
- auto
- open
- proprietary
type: string
licensingConfig:
description: 'Optional: Licensing configuration for NVIDIA vGPU
licensing'
Expand Down Expand Up @@ -978,8 +989,9 @@ spec:
NVIDIA Driver is managed by the NVIDIADriver CRD type
type: boolean
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
description: |-
Deprecated: This field is no longer honored by the gpu-operator. Please use KernelModuleType instead.
UseOpenKernelModules indicates if the open GPU kernel modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA
Expand Down
Loading

0 comments on commit 8efa2d1

Please sign in to comment.