From a2e0233ec90350192a11ec2bc55407e953258669 Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Thu, 31 Oct 2024 10:20:40 -0700 Subject: [PATCH] add Prometheus metrics for nodes with missing topology (#21) Signed-off-by: Dmitry Shmulevich --- pkg/common/const.go | 1 + pkg/ib/ib.go | 6 +++--- pkg/metrics/metrics.go | 26 +++++++++++++++++++++----- pkg/providers/aws/instance_topology.go | 8 +++++--- pkg/providers/baremetal/mnnvl.go | 14 +++++++------- pkg/providers/gcp/instance_topology.go | 2 +- pkg/providers/gcp/provider.go | 2 +- pkg/providers/oci/instance_topology.go | 8 +++++--- pkg/providers/oci/metrics.go | 4 ++-- 9 files changed, 46 insertions(+), 25 deletions(-) diff --git a/pkg/common/const.go b/pkg/common/const.go index 762ccbc..118cf38 100644 --- a/pkg/common/const.go +++ b/pkg/common/const.go @@ -40,4 +40,5 @@ const ( KeyPlugin = "plugin" ValTopologyTree = "topology/tree" ValTopologyBlock = "topology/block" + NoTopology = "no-topology" ) diff --git a/pkg/ib/ib.go b/pkg/ib/ib.go index 054f75a..1152be5 100644 --- a/pkg/ib/ib.go +++ b/pkg/ib/ib.go @@ -65,10 +65,10 @@ func GenerateTopologyConfig(data []byte) (*common.Vertex, error) { } seen = make(map[int]map[string]*Switch) root.simplify(root.getHeight()) - return root.toSlurm() + return root.toGraph() } -func (sw *Switch) toSlurm() (*common.Vertex, error) { +func (sw *Switch) toGraph() (*common.Vertex, error) { vertex := &common.Vertex{ Vertices: make(map[string]*common.Vertex), } @@ -82,7 +82,7 @@ func (sw *Switch) toSlurm() (*common.Vertex, error) { } } else { for id, child := range sw.Children { - v, err := child.toSlurm() + v, err := child.toGraph() if err != nil { return nil, err } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 9158c06..138fb77 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -26,25 +26,37 @@ import ( var ( httpRequestsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "topogen_requests_total", - Help: "Total number of topology generator requests.", + Name: "requests_total", + Help: "Total number of topology generation requests.", + Subsystem: "topograph", }, []string{"provider", "engine", "status"}, ) httpRequestDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Name: "topogen_request_duration_seconds", - Help: "Topology generator request duration in seconds.", - Buckets: prometheus.DefBuckets, + Name: "request_duration_seconds", + Help: "Topology generator request duration in seconds.", + Subsystem: "topograph", + Buckets: prometheus.DefBuckets, }, []string{"provider", "engine", "status"}, ) + + missingTopologyNodes = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "missing_topology", + Help: "Total number of nodes with missing topology information.", + Subsystem: "topograph", + }, + []string{"provider"}, + ) ) func init() { prometheus.MustRegister(httpRequestsTotal) prometheus.MustRegister(httpRequestDuration) + prometheus.MustRegister(missingTopologyNodes) } func Add(provider, engine string, code int, duration time.Duration) { @@ -52,3 +64,7 @@ func Add(provider, engine string, code int, duration time.Duration) { httpRequestsTotal.WithLabelValues(provider, engine, status).Inc() httpRequestDuration.WithLabelValues(provider, engine, status).Observe(duration.Seconds()) } + +func SetMissingTopology(provider string, count int) { + missingTopologyNodes.WithLabelValues(provider).Set(float64(count)) +} diff --git a/pkg/providers/aws/instance_topology.go b/pkg/providers/aws/instance_topology.go index d911da6..a3c333e 100644 --- a/pkg/providers/aws/instance_topology.go +++ b/pkg/providers/aws/instance_topology.go @@ -28,6 +28,7 @@ import ( "k8s.io/klog/v2" "github.com/NVIDIA/topograph/pkg/common" + "github.com/NVIDIA/topograph/pkg/metrics" ) var defaultPageSize int32 = 100 @@ -173,9 +174,10 @@ func toGraph(topology []types.InstanceTopology, cis []common.ComputeInstances) ( } if len(i2n) != 0 { - klog.V(4).Infof("Adding unclaimed nodes %v", i2n) + klog.V(4).Infof("Adding nodes w/o topology: %v", i2n) + metrics.SetMissingTopology(common.ProviderAWS, len(i2n)) sw := &common.Vertex{ - ID: "cpu-nodes", + ID: common.NoTopology, Vertices: make(map[string]*common.Vertex), } for instanceID, nodeName := range i2n { @@ -184,7 +186,7 @@ func toGraph(topology []types.InstanceTopology, cis []common.ComputeInstances) ( ID: instanceID, } } - forest["cpu-nodes"] = sw + forest[common.NoTopology] = sw } root := &common.Vertex{ diff --git a/pkg/providers/baremetal/mnnvl.go b/pkg/providers/baremetal/mnnvl.go index d71a7fb..405b1d0 100644 --- a/pkg/providers/baremetal/mnnvl.go +++ b/pkg/providers/baremetal/mnnvl.go @@ -50,7 +50,7 @@ func getIbTree(ctx context.Context, nodes []string) (*common.Vertex, error) { args := []string{"-h"} stdout, err := utils.Exec(ctx, "sinfo", args, nil) if err != nil { - return nil, fmt.Errorf("Exec error in sinfo\n") + return nil, fmt.Errorf("exec error in sinfo: %v", err) } scanner := bufio.NewScanner(stdout) @@ -76,7 +76,7 @@ func getIbTree(ctx context.Context, nodes []string) (*common.Vertex, error) { args := []string{"-N", "-R", "ssh", "-w", node, "sudo ibnetdiscover"} stdout, err := utils.Exec(ctx, "pdsh", args, nil) if err != nil { - return nil, fmt.Errorf("Exec error while pdsh IB command\n") + return nil, fmt.Errorf("exec error while pdsh IB command: %v", err) } if strings.Contains(stdout.String(), "Topology file:") { _, hca, _ := ib.ParseIbnetdiscoverFile(stdout.Bytes()) @@ -86,7 +86,7 @@ func getIbTree(ctx context.Context, nodes []string) (*common.Vertex, error) { partitionVisitedMap[pName] = true ibRoot, err := ib.GenerateTopologyConfig(stdout.Bytes()) if err != nil { - return nil, fmt.Errorf("IB GenerateTopologyConfig failed: %v\n", err) + return nil, fmt.Errorf("IB GenerateTopologyConfig failed: %v", err) } ibCount++ ibKey := ibPrefix + strconv.Itoa(ibCount) @@ -158,7 +158,7 @@ func getClusterOutput(ctx context.Context, domainMap map[string]domain, nodes [] args := []string{"-R", "ssh", "-w", strings.Join(nodes, ","), cmd} stdout, err := utils.Exec(ctx, "pdsh", args, nil) if err != nil { - return fmt.Errorf("Exec error while pdsh\n") + return fmt.Errorf("exec error while pdsh: %v", err) } scanner := bufio.NewScanner(stdout) @@ -176,7 +176,7 @@ func getClusterOutput(ctx context.Context, domainMap map[string]domain, nodes [] nodeMap[nodeName] = true } if err := scanner.Err(); err != nil { - return fmt.Errorf("Scanner error while reading pdsh output\n") + return fmt.Errorf("scanner error while reading pdsh output: %v", err) } return nil } @@ -211,12 +211,12 @@ func generateTopologyConfig(ctx context.Context, cis []common.ComputeInstances) nodes := getNodeList(cis) err := getClusterOutput(ctx, domainMap, nodes, "nvidia-smi -q | grep ClusterUUID") if err != nil { - return nil, fmt.Errorf("getClusterOutput failed: %v\n", err) + return nil, fmt.Errorf("getClusterOutput failed: %v", err) } // get ibnetdiscover output from all unvisited nodes treeRoot, err := getIbTree(ctx, nodes) if err != nil { - return nil, fmt.Errorf("getIbTree failed: %v\n", err) + return nil, fmt.Errorf("getIbTree failed: %v", err) } return toGraph(domainMap, treeRoot), nil } diff --git a/pkg/providers/gcp/instance_topology.go b/pkg/providers/gcp/instance_topology.go index 4d960fc..d4d6b14 100644 --- a/pkg/providers/gcp/instance_topology.go +++ b/pkg/providers/gcp/instance_topology.go @@ -112,7 +112,7 @@ func GenerateInstanceTopology(ctx context.Context, _ interface{}, instanceToNode return instanceTopology, nil } -func (cfg *InstanceTopology) toSLURM() (*common.Vertex, error) { +func (cfg *InstanceTopology) toGraph() (*common.Vertex, error) { forest := make(map[string]*common.Vertex) nodes := make(map[string]*common.Vertex) diff --git a/pkg/providers/gcp/provider.go b/pkg/providers/gcp/provider.go index f56d845..3947d15 100644 --- a/pkg/providers/gcp/provider.go +++ b/pkg/providers/gcp/provider.go @@ -76,5 +76,5 @@ func (p *Provider) GenerateTopologyConfig(ctx context.Context, creds interface{} return nil, err } - return cfg.toSLURM() + return cfg.toGraph() } diff --git a/pkg/providers/oci/instance_topology.go b/pkg/providers/oci/instance_topology.go index 0812a1d..d3a6c2f 100644 --- a/pkg/providers/oci/instance_topology.go +++ b/pkg/providers/oci/instance_topology.go @@ -29,6 +29,7 @@ import ( "k8s.io/klog/v2" "github.com/NVIDIA/topograph/pkg/common" + "github.com/NVIDIA/topograph/pkg/metrics" ) type level int @@ -206,9 +207,10 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []c } if len(instanceToNodeMap) != 0 { - klog.V(4).Infof("Adding unclaimed nodes %v", instanceToNodeMap) + klog.V(4).Infof("Adding nodes w/o topology: %v", instanceToNodeMap) + metrics.SetMissingTopology(common.ProviderOCI, len(instanceToNodeMap)) sw := &common.Vertex{ - ID: "cpu-nodes", + ID: common.NoTopology, Vertices: make(map[string]*common.Vertex), } for instanceID, nodeName := range instanceToNodeMap { @@ -217,7 +219,7 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []c ID: instanceID, } } - forest["cpu-nodes"] = sw + forest[common.NoTopology] = sw } root := &common.Vertex{ diff --git a/pkg/providers/oci/metrics.go b/pkg/providers/oci/metrics.go index 0d37ce0..1b234a1 100644 --- a/pkg/providers/oci/metrics.go +++ b/pkg/providers/oci/metrics.go @@ -24,7 +24,7 @@ var requestLatency = prometheus.NewSummaryVec( prometheus.SummaryOpts{ Name: "request_latency", Help: "Latency of requests", - Subsystem: "topogen_oci", + Subsystem: "topograph_oci", Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, }, []string{"method", "status"}, @@ -34,7 +34,7 @@ var missingAncestor = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "topogen_missing_ancestor_oci", Help: "Missing ancestor nodes", - Subsystem: "topogen_oci", + Subsystem: "topograph_oci", }, []string{"ancestor_level", "node_name"}, )