NVIDIA · henryh2 · Oct 31, 2024 · Oct 30, 2024 · Oct 30, 2024 · Oct 30, 2024
@@ -49,6 +49,16 @@ http:
   # ssl: enables HTTPS protocol if set to `true` (optional).
   ssl: false
 
+# provider: the provider that topograph will use (optional)
+# Valid options include "aws", "oci", "gcp", "cw", "baremetal" or "test".
+# Can be overridden if the provider is specified in a topology request to topograph
+provider: "aws"
+
+# engine: the engine that topograph will use (optional)
+# Valid options include "slurm", "k8s", or "test".
+# Can be overridden if the engine is specified in a topology request to topograph
+engine: "slurm"
+
 # request_aggregation_delay: defines the delay before processing a request (required).
 # Topograph aggregates multiple sequential requests within this delay into a single request,
 # processing only if no new requests arrive during the specified duration.
@@ -109,10 +119,12 @@ Topograph offers three endpoints for interacting with the service. Below are the
 - **URL:** `http://<server>:<port>/v1/generate`
 - **Description:** This endpoint is used to request a new cluster topology.
 - **Payload:** The payload is a JSON object that includes the following fields:
-  - **provider name**: (mandatory) A string specifying the Service Provider, such as `aws`, `oci`, `gcp`, `cw`, `baremetal` or `test`.
+  - **provider name**: (optional) A string specifying the Service Provider, such as `aws`, `oci`, `gcp`, `cw`, `baremetal` or `test`. This parameter will override the provider set in the topograph config.
   - **provider credentials**: (optional) A key-value map with provider-specific parameters for authentication.
-  - **engine name**: (mandatory) A string specifying the topology output, either `slurm` or `k8s`.
-  - **engine parameters**: A key-value map with engine-specific parameters.
+  - **provider parameters**: (optional) A key-value map with parameters that are used for provider simulation with toposim.
+    - **model_path**: (optional) A string parameter that points to the model file to use for simulating topology.
+  - **engine name**: (optional) A string specifying the topology output, either `slurm`, `k8s`, or `test`. This parameter will override the engine set in the topograph config.
+  - **engine parameters**: (optional) A key-value map with engine-specific parameters.
     - **slurm parameters**:
       - **topology_config_path**: (optional) A string specifying the file path for the topology configuration. If omitted, the topology config content is returned in the HTTP response.
       - **plugin**: (optional) A string specifying topology plugin. Default topology/tree.
@@ -133,6 +145,10 @@ Topograph offers three endpoints for interacting with the service. Below are the
     "creds": {
       "access_key_id": "id",
       "secret_access_key": "secret"
+    },
+    "params": {
+      "use_simulation": "false",
+      "model_path": ""
     }
   },
   "engine": {

@@ -3,9 +3,16 @@ http:
   port: 49021
   ssl: false
 
+# Set provider and engine for topograph to use
+# provider: "test"
+# engine: "test"
+
 # waiting period before processing a request
 request_aggregation_delay: 15s
 
+# URL of an external gRPC service for request processing (optional)
+# forward_service_url:
+
 # number of results per API call (optional)
 page_size: 100
 

@@ -37,8 +37,10 @@ curl http://localhost:49021/healthz
 ```
 
 #### Using Toposim
-To test the service on a simulated cluster, first add the following line to `/etc/topograph/topograph-config.yaml` so that any topology requests are forwarded to toposim.
+To test the service on a simulated cluster, first add the following lines to `/etc/topograph/topograph-config.yaml` so that topograph runs the topology in simulation and forwards any topology requests to toposim.
 ```bash
+provider: "test"
+engine: "test"
 forward_service_url: dns:localhost:49025
 ```
 Then run the topograph service as normal.
@@ -48,15 +50,15 @@ You must then start the toposim service as such, setting the path to the test mo
 /usr/local/bin/topograph -m /usr/local/bin/tests/models/<cluster-model>.yaml
 ```
 
-You can then verify the topology results via simulation by querying topograph using the `test` provider and engine, and specifying the test model path as a parameter to the provider.
+You can then verify the topology results via simulation by querying topograph, and specifying the test model path as a parameter to the provider.
 If you want to view the tree topology, then use the command:
 ```bash
-id=$(curl -s -X POST -H "Content-Type: application/json" -d '{"provider":{"name":"test", "params":{"model_path":"/usr/local/bin/topograph/tests/models/<cluster-model>.yaml"}},"engine":{"name":"test"}}' http://localhost:49021/v1/generate)
+id=$(curl -s -X POST -H "Content-Type: application/json" -d '{"provider":{"params":{"model_path":"/usr/local/bin/tests/models/<cluster-model>.yaml"}}}' http://localhost:49021/v1/generate)
 ```
 
 And if you want to view the block topology (with specified block sizes), use the command:
 ```bash
-id=$(curl -s -X POST -H "Content-Type: application/json" -d '{"provider":{"name":"test", "params":{"model_path":"/usr/local/bin/topograph/tests/models/<cluster-model>.yaml"}},"engine":{"name":"test", "params":{"plugin":"topology/block", "block_sizes": <block-sizes>}}}' http://localhost:49021/v1/generate)
+id=$(curl -s -X POST -H "Content-Type: application/json" -d '{"provider":{"params":{"model_path":"/usr/local/bin/tests/models/<cluster-model>.yaml"}},"engine":{"params":{"plugin":"topology/block", "block_sizes": "4,8"}}}' http://localhost:49021/v1/generate)
 ```
 
 You can query the results of either topology request with:

@@ -25,12 +25,15 @@ import (
 	"gopkg.in/yaml.v3"
 	"k8s.io/klog/v2"
 
+	"github.com/NVIDIA/topograph/pkg/common"
 	"github.com/NVIDIA/topograph/pkg/utils"
 )
 
 type Config struct {
 	HTTP                    Endpoint          `yaml:"http"`
 	RequestAggregationDelay time.Duration     `yaml:"request_aggregation_delay"`
+	Provider                string            `yaml:"provider,omitempty"`
+	Engine                  string            `yaml:"engine,omitempty"`
 	PageSize                int               `yaml:"page_size,omitempty"`
 	SSL                     *SSL              `yaml:"ssl,omitempty"`
 	CredsPath               *string           `yaml:"credentials_path,omitempty"`
@@ -76,6 +79,20 @@ func (cfg *Config) validate() error {
 		return fmt.Errorf("port is not set")
 	}
 
+	switch cfg.Provider {
+	case common.ProviderAWS, common.ProviderOCI, common.ProviderGCP, common.ProviderCW, common.ProviderBM, common.ProviderTest, "":
+		//nop
+	default:
+		return fmt.Errorf("unsupported provider %s", cfg.Provider)
+	}
+
+	switch cfg.Engine {
+	case common.EngineK8S, common.EngineSLURM, common.EngineTest, "":
+		//nop
+	default:
+		return fmt.Errorf("unsupported engine %s", cfg.Engine)
+	}
+
 	if cfg.RequestAggregationDelay == 0 {
 		return fmt.Errorf("request_aggregation_delay is not set")
 	}

@@ -50,7 +50,7 @@ func (eng *K8sEngine) GenerateOutput(ctx context.Context, tree *common.Vertex, p
 		return nil, err
 	}
 	buf := &bytes.Buffer{}
-	err := translate.ToSLURM(buf, tree)
+	err := translate.ToGraph(buf, tree)
 	if err != nil {
 		return nil, err
 	}

@@ -21,6 +21,7 @@ import (
 	"fmt"
 	"testing"
 
+	"github.com/NVIDIA/topograph/pkg/common"
 	"github.com/NVIDIA/topograph/pkg/translate"
 	"github.com/stretchr/testify/require"
 )
@@ -49,7 +50,7 @@ func TestApplyNodeLabels(t *testing.T) {
 		"Node306": {"topology.kubernetes.io/network-level-1": "xf946c4acef2d5939", "topology.kubernetes.io/network-level-2": "S1"},
 	}
 
-	err := NewTopologyLabeler().ApplyNodeLabels(context.TODO(), root, labeler)
+	err := NewTopologyLabeler().ApplyNodeLabels(context.TODO(), root.Vertices[common.ValTopologyTree], labeler)
 	require.NoError(t, err)
 	require.Equal(t, data, labeler.data)
 }
@@ -92,7 +92,7 @@ func GenerateOutput(ctx context.Context, tree *common.Vertex, params map[string]
 		tree.Metadata[common.KeyBlockSizes] = blockSize
 	}
 
-	err := translate.ToSLURM(buf, tree)
+	err := translate.ToGraph(buf, tree)
 	if err != nil {
 		return nil, err
 	}

@@ -21,6 +21,8 @@ import (
 	"fmt"
 	"net/http"
 
+	"k8s.io/klog/v2"
+
 	"github.com/NVIDIA/topograph/pkg/common"
 	"github.com/NVIDIA/topograph/pkg/models"
 	"github.com/NVIDIA/topograph/pkg/providers/aws"
@@ -69,21 +71,16 @@ type testProvider struct {
 func GetTestProvider(params map[string]string) (*testProvider, error) {
 	p := &testProvider{}
 
-	var modelPath string
-	if len(params) != 0 {
-		modelPath = params[common.KeyModelPath]
-	}
-
-	if len(modelPath) == 0 {
+	if path, ok := params[common.KeyModelPath]; !ok || len(path) == 0 {
 		p.tree, p.instance2node = translate.GetTreeTestSet(false)
 	} else {
-		model, err := models.NewModelFromFile(modelPath)
+		klog.InfoS("Using simulated topology", "model path", params[common.KeyModelPath])
+		model, err := models.NewModelFromFile(params[common.KeyModelPath])
 		if err != nil {
 			return nil, err // Wrapped by models.NewModelFromFile
 		}
 		p.tree, p.instance2node = model.ToTree()
 	}
-
 	return p, nil
 }
 

@@ -145,6 +145,7 @@ func (model *Model) ToTree() (*common.Vertex, map[string]string) {
 	nodeVertexMap := make(map[string]*common.Vertex)
 	swVertexMap := make(map[string]*common.Vertex)
 	swRootMap := make(map[string]bool)
+	blockVertexMap := make(map[string]*common.Vertex)
 
 	// Create all the vertices for each node
 	for k, v := range model.Nodes {
@@ -158,6 +159,14 @@ func (model *Model) ToTree() (*common.Vertex, map[string]string) {
 		swRootMap[sw.Name] = true
 	}
 
+	// Initializes all the block vertices
+	for _, cb := range model.CapacityBlocks {
+		blockVertexMap[cb.Name] = &common.Vertex{ID: cb.Name, Vertices: make(map[string]*common.Vertex)}
+		for _, node := range cb.Nodes {
+			blockVertexMap[cb.Name].Vertices[node] = nodeVertexMap[node]
+		}
+	}
+
 	// Connect all the switches to their sub-switches and sub-nodes
 	for _, sw := range model.Switches {
 		for _, subsw := range sw.Switches {
@@ -177,11 +186,18 @@ func (model *Model) ToTree() (*common.Vertex, map[string]string) {
 	}
 
 	// Connects all root vertices to the hidden root
-	root := &common.Vertex{Vertices: make(map[string]*common.Vertex)}
+	treeRoot := &common.Vertex{Vertices: make(map[string]*common.Vertex)}
 	for k, v := range swRootMap {
 		if v {
-			root.Vertices[k] = swVertexMap[k]
+			treeRoot.Vertices[k] = swVertexMap[k]
 		}
 	}
+	blockRoot := &common.Vertex{Vertices: make(map[string]*common.Vertex)}
+	for k, v := range blockVertexMap {
+		blockRoot.Vertices[k] = v
+	}
+	root := &common.Vertex{
+		Vertices: map[string]*common.Vertex{common.ValTopologyBlock: blockRoot, common.ValTopologyTree: treeRoot},
+	}
 	return root, instance2node
 }