implement task for configuring the nodes

Signed-off-by: Dmitry Shmulevich <[email protected]>
NVIDIA · May 20, 2024 · b1bf77c · b1bf77c
1 parent 8be8b2b
commit b1bf77c
Show file tree

Hide file tree

Showing 9 changed files with 395 additions and 24 deletions.
diff --git a/charts/virtual-nodes/values-example.yaml b/charts/virtual-nodes/values-example.yaml
@@ -14,7 +14,7 @@
 
 nodes:
 - type: dgxa100.80g
-  count: 1
+  count: 2
   annotations: {}
   labels:
     nvidia.com/gpu.count: "8"
@@ -29,7 +29,7 @@ nodes:
     status: "False"
     type: KernelDeadlock
 - type: dgxh100.80g
-  count: 1
+  count: 2
   annotations: {}
   labels:
     nvidia.com/gpu.count: "8"

diff --git a/demos/basic.demo b/demos/basic.demo
@@ -15,10 +15,6 @@ kubectl apply -f charts/overrides/kwok/pod-complete.yml
 kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/chaos/pod-init-container-running-failed.yaml
 kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/chaos/pod-container-running-failed.yaml
 
-# Add virtual nodes to the cluster
-helm install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values-example.yaml
-kubectl get nodes
-
 # Build Knavigator
 make build
 
@@ -28,6 +24,9 @@ make build
 # Show the job running
 kubectl get job
 
+# Show the pods running
+kubectl get pod -o wide
+
 # Clean up
 kubectl delete job job1
 

diff --git a/docs/deployment.md b/docs/deployment.md
@@ -29,7 +29,7 @@ Some of the tested frameworks are:
 - [Jobset](https://github.com/kubernetes-sigs/jobset?tab=readme-ov-file#installation)
 - [Kueue](https://kueue.sigs.k8s.io/docs/installation/)
 
-## KWOK Integration
+## KWOK integration
 
 Knavigator integrates with KWOK to simulate large clusters with hundreds or thousands of virtual nodes. This allows for the execution of experiments in a resource-efficient manner, without the need to run actual user workloads. The integration is facilitated through the API Server, which communicates with KWOK to manage the virtual nodes.
 
@@ -52,21 +52,33 @@ kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/ch
 kubectl apply -f charts/overrides/kwok/pod-complete.yml
 ```
 
-For configuring virtual nodes, you need to provide the `values.yaml` file to define the type and quantity of nodes you wish to create. You also have the option to enhance node configurations by adding annotations, labels, and conditions. For guidance, refer to the [values-example.yaml](../charts/virtual-nodes/values-example.yaml) file.
+## Setting up virtual nodes
 
-Currently, the system includes the following node types:
-- [dgxa100.40g](https://docs.nvidia.com/dgx/dgxa100-user-guide/introduction-to-dgxa100.html#hardware-overview)
+There are two ways to set up virtual nodes in the cluster, both of which require [Helm v3](https://helm.sh/docs/intro/install/) to be installed on your machine.
+
+### 1. Using the `helm` command
+
+Run the `helm install` command and provide the `values.yaml` file that specifies the types and quantities of nodes you wish to create. For example, see the [values-example.yaml](../charts/virtual-nodes/values-example.yaml) file.
+This example includes configurations for the following node types:
 - [dgxa100.80g](https://docs.nvidia.com/dgx/dgxa100-user-guide/introduction-to-dgxa100.html#hardware-overview)
 - [dgxh100.80g](https://docs.nvidia.com/dgx/dgxh100-user-guide/introduction-to-dgxh100.html#hardware-overview)
 - cpu.x86
 
-If you need to introduce additional node types, update the values file used for node configuration with the node information (such as type and count), and include a parameters section in the [nodes.yaml](../charts/virtual-nodes/templates/nodes.yaml) file.
-
-To deploy the nodes in `values-example.yaml`, use the Helm command:
+To deploy the nodes defined in `values-example.yaml`, use the following command:
 ```bash
-helm install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values-example.yaml
+helm upgrade --install virtual-nodes charts/virtual-nodes -f charts/virtual-nodes/values-example.yaml
 ```
 
+### 2. Using the Task Specification
+
+Set up virtual nodes within the `Configure` task in the task specification file. For an example, refer to [test-custom-resource.yml](../resources/tests/test-custom-resource.yml#L11-L19).
+
+### Enhancing Node Configurations
+
+In both methods, you can enhance node configurations by adding annotations, labels, and conditions.
+
+To introduce additional node types, update the `values.yaml` file or the `Configure` task used for node configuration with the node information (such as type, count, etc.), and include a parameters section in the [nodes.yaml](../charts/virtual-nodes/templates/nodes.yaml) file.
+
 > :warning: **Warning:** Ensure you deploy virtual nodes as the final step before launching `knavigator`. If you deploy any components after virtual nodes are created, the pods for these components might be assigned to virtual nodes, which could disrupt their functionality.
 
 ## Running Knavigator

diff --git a/pkg/engine/configure_task.go b/pkg/engine/configure_task.go
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package engine
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+	"sync"
+	"time"
+
+	"github.com/go-logr/logr"
+	"gopkg.in/yaml.v3"
+	"k8s.io/client-go/kubernetes"
+
+	"github.com/NVIDIA/knavigator/pkg/config"
+)
+
+// ConfigureTask is the engine task that configures the cluster. In this file
+// its only action is installing or upgrading virtual nodes through helm.
+type ConfigureTask struct {
+	BaseTask
+	configureTaskParams
+
+	// client is the Kubernetes clientset passed to newConfigureTask, which
+	// refuses to build the task when it is nil.
+	client *kubernetes.Clientset
+}
+
+// configureTaskParams holds the ConfigureTask parameters decoded from the
+// task's YAML parameter map by validate.
+type configureTaskParams struct {
+	// Nodes lists the virtual node groups to set up; when it is empty,
+	// updateVirtualNodes returns without invoking helm.
+	Nodes   []virtualNode `yaml:"nodes"`
+	// Timeout bounds the whole Exec call; validate rejects a zero value.
+	Timeout time.Duration `yaml:"timeout"`
+}
+
+// virtualNode describes one group of virtual nodes. It is decoded from the
+// task's YAML parameters and re-encoded as JSON by nodes2json, which passes
+// the list to helm as the "nodes" value.
+type virtualNode struct {
+	// Type is the node type name (the example values file uses names such as
+	// "dgxa100.80g").
+	Type        string              `yaml:"type" json:"type"`
+	// Count is the number of nodes of this type.
+	Count       int                 `yaml:"count" json:"count"`
+	// Annotations, Labels and Conditions are optional and are omitted from
+	// the encoded output when empty.
+	Annotations map[string]string   `yaml:"annotations,omitempty" json:"annotations,omitempty"`
+	Labels      map[string]string   `yaml:"labels,omitempty" json:"labels,omitempty"`
+	Conditions  []map[string]string `yaml:"conditions,omitempty" json:"conditions,omitempty"`
+}
+
+// newConfigureTask builds a ConfigureTask from the task configuration cfg.
+// It returns an error when client is nil or when cfg.Params is rejected by
+// validate.
+func newConfigureTask(log logr.Logger, client *kubernetes.Clientset, cfg *config.Task) (*ConfigureTask, error) {
+	if client == nil {
+		return nil, fmt.Errorf("%s/%s: Kubernetes client is not set", cfg.Type, cfg.ID)
+	}
+
+	// Populate the embedded BaseTask through its promoted fields.
+	task := &ConfigureTask{client: client}
+	task.log = log
+	task.taskType = TaskConfigure
+	task.taskID = cfg.ID
+
+	err := task.validate(cfg.Params)
+	if err != nil {
+		return nil, err
+	}
+	return task, nil
+}
+
+// validate decodes the raw task parameters into the embedded
+// configureTaskParams and checks them.
+//
+// params is round-tripped through YAML so that the struct's yaml tags drive
+// the decoding. An error is returned when the parameters cannot be encoded or
+// decoded (the underlying cause is wrapped and reachable via errors.Is/As),
+// when 'timeout' is missing (zero), or when 'timeout' is negative.
+func (task *ConfigureTask) validate(params map[string]interface{}) error {
+	data, err := yaml.Marshal(params)
+	if err != nil {
+		return fmt.Errorf("failed to parse parameters in %s task %s: %w", task.taskType, task.taskID, err)
+	}
+	if err = yaml.Unmarshal(data, &task.configureTaskParams); err != nil {
+		return fmt.Errorf("failed to parse parameters in %s task %s: %w", task.taskType, task.taskID, err)
+	}
+
+	if task.Timeout == 0 {
+		return fmt.Errorf("%s: missing parameter 'timeout'", task.ID())
+	}
+	// A negative timeout would give Exec an already-expired context, so every
+	// helm invocation would fail immediately; reject it up front instead.
+	if task.Timeout < 0 {
+		return fmt.Errorf("%s: invalid parameter 'timeout': %s", task.ID(), task.Timeout)
+	}
+
+	return nil
+}
+
+// Exec implements the Runnable interface. It runs the configuration steps
+// under a context limited by the task's Timeout, logs every error a step
+// reports, and returns the last one (nil when all steps succeed).
+func (task *ConfigureTask) Exec(ctx context.Context) (err error) {
+	ctx, cancel := context.WithTimeout(ctx, task.Timeout)
+	defer cancel()
+
+	var pending sync.WaitGroup
+	failures := make(chan error)
+
+	// Worker: set up the virtual nodes and report a failure, if any.
+	pending.Add(1)
+	go func() {
+		defer pending.Done()
+		if e := task.updateVirtualNodes(ctx); e != nil {
+			failures <- e
+		}
+	}()
+
+	// Closer: end the collection loop below once every worker is done.
+	go func() {
+		pending.Wait()
+		close(failures)
+	}()
+
+	for failure := range failures {
+		if failure == nil {
+			continue
+		}
+		task.log.Error(failure, "configuration error")
+		err = failure
+	}
+
+	return err
+}
+
+// updateVirtualNodes installs or upgrades the "virtual-nodes" helm release so
+// that it matches task.Nodes. It is a no-op when no nodes are configured.
+// The helm repository is (re-)added first, then the chart is upgraded with
+// the node list passed via --set-json; the first failing step's error is
+// returned.
+func (task *ConfigureTask) updateVirtualNodes(ctx context.Context) error {
+	if len(task.Nodes) == 0 {
+		return nil
+	}
+
+	nodeExpr, err := nodes2json(task.Nodes)
+	if err != nil {
+		return err
+	}
+
+	const helm = "helm"
+
+	// Step 1: make sure the chart repository is registered and current.
+	repoArgs := []string{"repo", "add", "--force-update", "knavigator", "https://nvidia.github.io/knavigator/helm-charts"}
+	task.log.V(4).Info("Updating helm repo")
+	if err := runCommand(ctx, task.log, helm, repoArgs); err != nil {
+		return err
+	}
+
+	// Step 2: install or upgrade the release with the requested nodes.
+	upgradeArgs := []string{
+		"upgrade", "--install", "virtual-nodes", "knavigator/virtual-nodes",
+		"--wait", "--set-json", nodeExpr,
+	}
+	fullCmd := append([]string{helm}, upgradeArgs...)
+	task.log.V(4).Info("Updating nodes", "cmd", fullCmd)
+
+	return runCommand(ctx, task.log, helm, upgradeArgs)
+}
+
+// nodes2json encodes nodes as JSON and returns the "nodes=<json>" expression
+// expected by helm's --set-json flag.
+func nodes2json(nodes []virtualNode) (string, error) {
+	encoded, err := json.Marshal(nodes)
+	if err != nil {
+		return "", err
+	}
+
+	return "nodes=" + string(encoded), nil
+}
+
+// runCommand executes exe with args under ctx, capturing stdout and stderr.
+// On success the captured stdout is logged at verbosity 4 and nil is
+// returned. On failure the error is logged together with both captured
+// streams and then returned unchanged.
+func runCommand(ctx context.Context, log logr.Logger, exe string, args []string) error {
+	var outBuf, errBuf bytes.Buffer
+
+	cmd := exec.CommandContext(ctx, exe, args...)
+	cmd.Stdout, cmd.Stderr = &outBuf, &errBuf
+
+	err := cmd.Run()
+	if err == nil {
+		log.V(4).Info(outBuf.String())
+		return nil
+	}
+
+	log.Error(err, "failed to run command",
+		"stdout", outBuf.String(), "stderr", errBuf.String())
+	return err
+}