Skip to content

Commit

Permalink
[operator-validator] remove redundant daemonset kube get calls
Browse files Browse the repository at this point in the history
Signed-off-by: Tariq Ibrahim <[email protected]>
  • Loading branch information
tariq1890 committed Jun 18, 2024
1 parent 2c4f301 commit 02f5872
Showing 1 changed file with 12 additions and 42 deletions.
54 changes: 12 additions & 42 deletions validator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1106,19 +1106,16 @@ func (p *Plugin) runWorkload() error {
pod.Spec.RuntimeClassName = &runtimeClass
}

// update owner reference
err = setOwnerReference(ctx, p.kubeClient, pod)
validatorDaemonset, err := p.kubeClient.AppsV1().DaemonSets(namespaceFlag).Get(ctx, "nvidia-operator-validator", meta_v1.GetOptions{})
if err != nil {
return fmt.Errorf("unable to set ownerReference for validator pod: %s", err)
return fmt.Errorf("unable to retrieve the operator validator daemonset: %w", err)
}

// update owner reference
pod.SetOwnerReferences(validatorDaemonset.ObjectMeta.OwnerReferences)
// set pod tolerations
err = setTolerations(ctx, p.kubeClient, pod)
if err != nil {
return fmt.Errorf("unable to set tolerations for validator pod: %s", err)
}

// update podSpec with node name so it will just run on current node
pod.Spec.Tolerations = validatorDaemonset.Spec.Template.Spec.Tolerations
// update podSpec with node name, so it will just run on current node
pod.Spec.NodeName = nodeNameFlag

resourceName, err := p.getGPUResourceName()
Expand Down Expand Up @@ -1165,30 +1162,6 @@ func (p *Plugin) runWorkload() error {
return nil
}

// setOwnerReference looks up the "nvidia-operator-validator" daemonset in
// namespaceFlag and replaces the owner references of pod with the daemonset's
// own owner references. The lookup error is returned unmodified; on failure
// pod is left untouched.
func setOwnerReference(ctx context.Context, kubeClient kubernetes.Interface, pod *corev1.Pod) error {
	// The validator daemonset's owner is the ClusterPolicy (per the original
	// author's note), so fetch the daemonset to learn who owns it.
	daemonSets := kubeClient.AppsV1().DaemonSets(namespaceFlag)
	ds, err := daemonSets.Get(ctx, "nvidia-operator-validator", meta_v1.GetOptions{})
	if err != nil {
		return err
	}

	// Give the plugin workload validation pod the same owners so that it is
	// cleaned up together with them.
	owners := ds.ObjectMeta.OwnerReferences
	pod.SetOwnerReferences(owners)
	return nil
}

// setTolerations looks up the "nvidia-operator-validator" daemonset in
// namespaceFlag and assigns the tolerations from its pod template to pod.
// The lookup error is returned unmodified; on failure pod is left untouched.
func setTolerations(ctx context.Context, kubeClient kubernetes.Interface, pod *corev1.Pod) error {
	// Fetch the validator daemonset whose tolerations are the source of truth.
	daemonSets := kubeClient.AppsV1().DaemonSets(namespaceFlag)
	ds, err := daemonSets.Get(ctx, "nvidia-operator-validator", meta_v1.GetOptions{})
	if err != nil {
		return err
	}

	// Individual validator pods use the same tolerations as the daemonset's
	// pod template.
	templateSpec := ds.Spec.Template.Spec
	pod.Spec.Tolerations = templateSpec.Tolerations
	return nil
}

// waits for the pod to be created
func waitForPod(ctx context.Context, kubeClient kubernetes.Interface, name string, namespace string) error {
for i := 0; i < podCreationWaitRetries; i++ {
Expand Down Expand Up @@ -1397,19 +1370,16 @@ func (c *CUDA) runWorkload() error {
pod.Spec.RuntimeClassName = &runtimeClass
}

// update owner reference
err = setOwnerReference(ctx, c.kubeClient, pod)
validatorDaemonset, err := c.kubeClient.AppsV1().DaemonSets(namespaceFlag).Get(ctx, "nvidia-operator-validator", meta_v1.GetOptions{})
if err != nil {
return fmt.Errorf("unable to set owner reference for validator pod: %s", err)
return fmt.Errorf("unable to retrieve the operator validator daemonset: %w", err)
}

// update owner reference
pod.SetOwnerReferences(validatorDaemonset.ObjectMeta.OwnerReferences)
// set pod tolerations
err = setTolerations(ctx, c.kubeClient, pod)
if err != nil {
return fmt.Errorf("unable to set tolerations for validator pod: %s", err)
}

// update podSpec with node name so it will just run on current node
pod.Spec.Tolerations = validatorDaemonset.Spec.Template.Spec.Tolerations
// update podSpec with node name, so it will just run on current node
pod.Spec.NodeName = nodeNameFlag

opts := meta_v1.ListOptions{LabelSelector: labels.Set{"app": cudaValidatorLabelValue}.AsSelector().String(),
Expand Down

0 comments on commit 02f5872

Please sign in to comment.