From d469ac1ce15a343b292d37dd2a4dd4aea93e4e40 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Mon, 21 Oct 2024 11:04:40 -0700 Subject: [PATCH] add support for SLURM block topology format Signed-off-by: Dmitry Shmulevich --- pkg/common/const.go | 6 ++++ pkg/common/types_test.go | 4 +-- pkg/engines/k8s/labeler_test.go | 2 +- pkg/engines/slurm/slurm.go | 17 ++++++--- pkg/factory/provider.go | 2 +- pkg/providers/baremetal/mnnvl.go | 11 +++--- pkg/translate/output.go | 59 +++++++++++++++++++++++++++++++- pkg/translate/output_test.go | 34 +++++++++++++++--- 8 files changed, 115 insertions(+), 20 deletions(-) diff --git a/pkg/common/const.go b/pkg/common/const.go index 3f42221..35475fa 100644 --- a/pkg/common/const.go +++ b/pkg/common/const.go @@ -24,6 +24,7 @@ const ( ProviderBM = "baremetal" ProviderTest = "test" + KeyEngine = "engine" EngineSLURM = "slurm" EngineK8S = "k8s" EngineTest = "test" @@ -32,5 +33,10 @@ const ( KeyTopoConfigPath = "topology_config_path" KeyTopoConfigmapName = "topology_configmap_name" KeyTopoConfigmapNamespace = "topology_configmap_namespace" + KeyBlockSizes = "block_sizes" KeySkipReload = "skip_reload" + + KeyPlugin = "plugin" + ValTopologyTree = "topology/tree" + ValTopologyBlock = "topology/block" ) diff --git a/pkg/common/types_test.go b/pkg/common/types_test.go index d858c3f..672bc64 100644 --- a/pkg/common/types_test.go +++ b/pkg/common/types_test.go @@ -98,8 +98,8 @@ func TestPayload(t *testing.T) { Engine: engine{ Name: "slurm", Params: map[string]string{ - "plugin": "topology/block", - "block_sizes": "30,120", + KeyPlugin: ValTopologyBlock, + KeyBlockSizes: "30,120", }, }, Nodes: []ComputeInstances{ diff --git a/pkg/engines/k8s/labeler_test.go b/pkg/engines/k8s/labeler_test.go index b9d250d..8e0f909 100644 --- a/pkg/engines/k8s/labeler_test.go +++ b/pkg/engines/k8s/labeler_test.go @@ -38,7 +38,7 @@ func (l *testLabeler) AddNodeLabels(_ context.Context, nodeName string, labels m } func TestApplyNodeLabels(t *testing.T) { - root, _ := translate.GetTestSet(true) + root, _ := translate.GetTreeTestSet(true) labeler := &testLabeler{data: make(map[string]map[string]string)} data := map[string]map[string]string{ "Node201": {"topology.kubernetes.io/network-level-1": "S2", "topology.kubernetes.io/network-level-2": "S1"}, diff --git a/pkg/engines/slurm/slurm.go b/pkg/engines/slurm/slurm.go index e4b3f4d..f2ce32e 100644 --- a/pkg/engines/slurm/slurm.go +++ b/pkg/engines/slurm/slurm.go @@ -30,14 +30,12 @@ import ( "github.com/NVIDIA/topograph/pkg/utils" ) -const ( - TopoTreeHeader = ` +const TopologyHeader = ` ############################################################### # Slurm's network topology configuration file for use with the -# topology/tree plugin +# %s plugin ############################################################### ` -) type SlurmEngine struct{} @@ -76,7 +74,16 @@ func GenerateOutput(ctx context.Context, tree *common.Vertex, params map[string] path := params[common.KeyTopoConfigPath] if len(path) != 0 { - buf.WriteString(TopoTreeHeader) + var plugin string + if len(tree.Metadata) != 0 { + plugin = tree.Metadata[common.KeyPlugin] + } + if len(plugin) == 0 { + plugin = common.ValTopologyTree + } + if _, err := buf.WriteString(fmt.Sprintf(TopologyHeader, plugin)); err != nil { + return nil, err + } } err := translate.ToSLURM(buf, tree) diff --git a/pkg/factory/provider.go b/pkg/factory/provider.go index b86b8f0..474ecda 100644 --- a/pkg/factory/provider.go +++ b/pkg/factory/provider.go @@ -67,7 +67,7 @@ type testProvider struct { func GetTestProvider() *testProvider { p := &testProvider{} - p.tree, p.instance2node = translate.GetTestSet(false) + p.tree, p.instance2node = translate.GetTreeTestSet(false) return p } diff --git a/pkg/providers/baremetal/mnnvl.go b/pkg/providers/baremetal/mnnvl.go index 14dc84a..1061b5b 100644 --- a/pkg/providers/baremetal/mnnvl.go +++ b/pkg/providers/baremetal/mnnvl.go @@ -4,10 +4,11 @@ import ( "bufio" "context" "fmt" - "github.com/NVIDIA/topograph/pkg/common" - "github.com/NVIDIA/topograph/pkg/utils" "strconv" "strings" + + "github.com/NVIDIA/topograph/pkg/common" + "github.com/NVIDIA/topograph/pkg/utils" ) // domain contains map of each domainID(clusterUUID) -> list of nodeNames in that domain @@ -84,9 +85,9 @@ func toGraph(domainMap map[string]domain) *common.Vertex { root.Vertices[domainName] = tree } // add root metadata - root.Metadata["engine"] = "slurm" - root.Metadata["plugin"] = "topology/block" - root.Metadata["blocksize"] = strconv.Itoa(blockSize) + root.Metadata[common.KeyEngine] = common.EngineSLURM + root.Metadata[common.KeyPlugin] = common.ValTopologyBlock + root.Metadata[common.KeyBlockSizes] = strconv.Itoa(blockSize) return root } diff --git a/pkg/translate/output.go b/pkg/translate/output.go index 6df889a..8649aa6 100644 --- a/pkg/translate/output.go +++ b/pkg/translate/output.go @@ -27,6 +27,28 @@ import ( ) func ToSLURM(wr io.Writer, root *common.Vertex) error { + if len(root.Metadata) != 0 && root.Metadata[common.KeyPlugin] == common.ValTopologyBlock { + return toBlockSLURM(wr, root, root.Metadata[common.KeyBlockSizes]) + } + return toTreeSLURM(wr, root) +} + +func toBlockSLURM(wr io.Writer, root *common.Vertex, blocksizes string) error { + for _, block := range root.Vertices { + nodes := make([]string, 0, len(block.Vertices)) + for _, node := range block.Vertices { + nodes = append(nodes, node.Name) + } + _, err := wr.Write([]byte(fmt.Sprintf("BlockName=%s Nodes=%s\n", block.ID, strings.Join(compress(nodes), ",")))) + if err != nil { + return err + } + } + _, err := wr.Write([]byte(fmt.Sprintf("BlockSizes=%s\n", blocksizes))) + return err +} + +func toTreeSLURM(wr io.Writer, root *common.Vertex) error { visited := make(map[string]bool) leaves := make(map[string][]string) parents := []*common.Vertex{} @@ -197,7 +219,7 @@ func split(input string) (string, string) { return input[:i], input[i:] } -func GetTestSet(testForLongLabelName bool) (*common.Vertex, map[string]string) { +func GetTreeTestSet(testForLongLabelName bool) (*common.Vertex, map[string]string) { var s3name string if testForLongLabelName { s3name = "S3very-very-long-id-to-check-label-value-limits-of-63-characters" @@ -236,3 +258,38 @@ func GetTestSet(testForLongLabelName bool) (*common.Vertex, map[string]string) { return root, instance2node } + +func GetBlockTestSet() (*common.Vertex, map[string]string) { + instance2node := map[string]string{ + "I14": "Node104", "I15": "Node105", "I16": "Node106", + "I21": "Node201", "I22": "Node202", "I25": "Node205", + } + + n14 := &common.Vertex{ID: "I14", Name: "Node104"} + n15 := &common.Vertex{ID: "I15", Name: "Node105"} + n16 := &common.Vertex{ID: "I16", Name: "Node106"} + + n21 := &common.Vertex{ID: "I21", Name: "Node201"} + n22 := &common.Vertex{ID: "I22", Name: "Node202"} + n25 := &common.Vertex{ID: "I25", Name: "Node205"} + + block1 := &common.Vertex{ + ID: "B1", + Vertices: map[string]*common.Vertex{"I14": n14, "I15": n15, "I16": n16}, + } + block2 := &common.Vertex{ + ID: "B2", + Vertices: map[string]*common.Vertex{"I21": n21, "I22": n22, "I25": n25}, + } + + root := &common.Vertex{ + Vertices: map[string]*common.Vertex{"B1": block1, "B2": block2}, + Metadata: map[string]string{ + common.KeyEngine: common.EngineSLURM, + common.KeyPlugin: common.ValTopologyBlock, + common.KeyBlockSizes: "8", + }, + } + + return root, instance2node +} diff --git a/pkg/translate/output_test.go b/pkg/translate/output_test.go index 389e7e0..1f4b36f 100644 --- a/pkg/translate/output_test.go +++ b/pkg/translate/output_test.go @@ -25,15 +25,26 @@ import ( ) const ( - testConfig1 = `SwitchName=S1 Switches=S[2-3] + testTreeConfig1 = `SwitchName=S1 Switches=S[2-3] SwitchName=S2 Nodes=Node[201-202],Node205 SwitchName=S3 Nodes=Node[304-306] ` - testConfig2 = `SwitchName=S1 Switches=S[2-3] + testTreeConfig2 = `SwitchName=S1 Switches=S[2-3] SwitchName=S3 Nodes=Node[304-306] SwitchName=S2 Nodes=Node[201-202],Node205 ` + + testBlockConfig1 = `BlockName=B1 Nodes=Node[104-106] +BlockName=B2 Nodes=Node[201-202],Node205 +BlockSizes=8 +` + + testBlockConfig2 = `BlockName=B2 Nodes=Node[201-202],Node205 +BlockName=B1 Nodes=Node[104-106] +BlockSizes=8 +` + shortNameExpectedResult = `# switch.3.1=hpcislandid-1 SwitchName=switch.3.1 Switches=switch.2.[1-2] # switch.2.1=network-block-1 @@ -47,13 +58,26 @@ SwitchName=switch.1.2 Nodes=node-2 ` ) -func TestToSLURM(t *testing.T) { - v, _ := GetTestSet(false) +func TestToTreeSLURM(t *testing.T) { + v, _ := GetTreeTestSet(false) + buf := &bytes.Buffer{} + err := ToSLURM(buf, v) + require.NoError(t, err) + switch buf.String() { + case testTreeConfig1, testTreeConfig2: + // nop + default: + t.Errorf("unexpected result %s", buf.String()) + } +} + +func TestToBlockSLURM(t *testing.T) { + v, _ := GetBlockTestSet() buf := &bytes.Buffer{} err := ToSLURM(buf, v) require.NoError(t, err) switch buf.String() { - case testConfig1, testConfig2: + case testBlockConfig1, testBlockConfig2: // nop default: t.Errorf("unexpected result %s", buf.String())