Skip to content

Commit

Permalink
add Prometheus metrics for nodes with missing topology (#21)
Browse files (browse the repository at this point in the history)
Signed-off-by: Dmitry Shmulevich <[email protected]>
  • Loading branch information
dmitsh authored Oct 31, 2024
1 parent 86913e0 commit a2e0233
Show file tree
Hide file tree
Showing 9 changed files with 46 additions and 25 deletions.
1 change: 1 addition & 0 deletions pkg/common/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@ const (
KeyPlugin = "plugin"
ValTopologyTree = "topology/tree"
ValTopologyBlock = "topology/block"
NoTopology = "no-topology"
)
6 changes: 3 additions & 3 deletions pkg/ib/ib.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,10 @@ func GenerateTopologyConfig(data []byte) (*common.Vertex, error) {
}
seen = make(map[int]map[string]*Switch)
root.simplify(root.getHeight())
return root.toSlurm()
return root.toGraph()
}

func (sw *Switch) toSlurm() (*common.Vertex, error) {
func (sw *Switch) toGraph() (*common.Vertex, error) {
vertex := &common.Vertex{
Vertices: make(map[string]*common.Vertex),
}
Expand All @@ -82,7 +82,7 @@ func (sw *Switch) toSlurm() (*common.Vertex, error) {
}
} else {
for id, child := range sw.Children {
v, err := child.toSlurm()
v, err := child.toGraph()
if err != nil {
return nil, err
}
Expand Down
26 changes: 21 additions & 5 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,29 +26,45 @@ import (
var (
httpRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "topogen_requests_total",
Help: "Total number of topology generator requests.",
Name: "requests_total",
Help: "Total number of topology generation requests.",
Subsystem: "topograph",
},
[]string{"provider", "engine", "status"},
)

httpRequestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "topogen_request_duration_seconds",
Help: "Topology generator request duration in seconds.",
Buckets: prometheus.DefBuckets,
Name: "request_duration_seconds",
Help: "Topology generator request duration in seconds.",
Subsystem: "topograph",
Buckets: prometheus.DefBuckets,
},
[]string{"provider", "engine", "status"},
)

missingTopologyNodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "missing_topology",
Help: "Total number of nodes with missing topology information.",
Subsystem: "topograph",
},
[]string{"provider"},
)
)

// init registers every collector declared in this package with the default
// Prometheus registry. MustRegister panics if a registration fails, so a
// misconfigured metric surfaces at program start.
func init() {
	prometheus.MustRegister(
		httpRequestsTotal,
		httpRequestDuration,
		missingTopologyNodes,
	)
}

// Add records one completed topology generation request.
//
// It increments the request counter and observes the request duration, in
// seconds, on the duration histogram. Both samples carry the same label set:
// the provider name, the engine name, and the status code rendered as a
// decimal string.
func Add(provider, engine string, code int, duration time.Duration) {
	// Build the label values once so both collectors are guaranteed to
	// receive an identical label set.
	labels := []string{provider, engine, fmt.Sprintf("%d", code)}

	httpRequestsTotal.WithLabelValues(labels...).Inc()
	httpRequestDuration.WithLabelValues(labels...).Observe(duration.Seconds())
}

// SetMissingTopology sets the missing-topology gauge for the given provider
// to count, the number of nodes with missing topology information. Each call
// overwrites the previous value for that provider.
func SetMissingTopology(provider string, count int) {
	gauge := missingTopologyNodes.WithLabelValues(provider)
	gauge.Set(float64(count))
}
8 changes: 5 additions & 3 deletions pkg/providers/aws/instance_topology.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"k8s.io/klog/v2"

"github.com/NVIDIA/topograph/pkg/common"
"github.com/NVIDIA/topograph/pkg/metrics"
)

var defaultPageSize int32 = 100
Expand Down Expand Up @@ -173,9 +174,10 @@ func toGraph(topology []types.InstanceTopology, cis []common.ComputeInstances) (
}

if len(i2n) != 0 {
klog.V(4).Infof("Adding unclaimed nodes %v", i2n)
klog.V(4).Infof("Adding nodes w/o topology: %v", i2n)
metrics.SetMissingTopology(common.ProviderAWS, len(i2n))
sw := &common.Vertex{
ID: "cpu-nodes",
ID: common.NoTopology,
Vertices: make(map[string]*common.Vertex),
}
for instanceID, nodeName := range i2n {
Expand All @@ -184,7 +186,7 @@ func toGraph(topology []types.InstanceTopology, cis []common.ComputeInstances) (
ID: instanceID,
}
}
forest["cpu-nodes"] = sw
forest[common.NoTopology] = sw
}

root := &common.Vertex{
Expand Down
14 changes: 7 additions & 7 deletions pkg/providers/baremetal/mnnvl.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func getIbTree(ctx context.Context, nodes []string) (*common.Vertex, error) {
args := []string{"-h"}
stdout, err := utils.Exec(ctx, "sinfo", args, nil)
if err != nil {
return nil, fmt.Errorf("Exec error in sinfo\n")
return nil, fmt.Errorf("exec error in sinfo: %v", err)
}

scanner := bufio.NewScanner(stdout)
Expand All @@ -76,7 +76,7 @@ func getIbTree(ctx context.Context, nodes []string) (*common.Vertex, error) {
args := []string{"-N", "-R", "ssh", "-w", node, "sudo ibnetdiscover"}
stdout, err := utils.Exec(ctx, "pdsh", args, nil)
if err != nil {
return nil, fmt.Errorf("Exec error while pdsh IB command\n")
return nil, fmt.Errorf("exec error while pdsh IB command: %v", err)
}
if strings.Contains(stdout.String(), "Topology file:") {
_, hca, _ := ib.ParseIbnetdiscoverFile(stdout.Bytes())
Expand All @@ -86,7 +86,7 @@ func getIbTree(ctx context.Context, nodes []string) (*common.Vertex, error) {
partitionVisitedMap[pName] = true
ibRoot, err := ib.GenerateTopologyConfig(stdout.Bytes())
if err != nil {
return nil, fmt.Errorf("IB GenerateTopologyConfig failed: %v\n", err)
return nil, fmt.Errorf("IB GenerateTopologyConfig failed: %v", err)
}
ibCount++
ibKey := ibPrefix + strconv.Itoa(ibCount)
Expand Down Expand Up @@ -158,7 +158,7 @@ func getClusterOutput(ctx context.Context, domainMap map[string]domain, nodes []
args := []string{"-R", "ssh", "-w", strings.Join(nodes, ","), cmd}
stdout, err := utils.Exec(ctx, "pdsh", args, nil)
if err != nil {
return fmt.Errorf("Exec error while pdsh\n")
return fmt.Errorf("exec error while pdsh: %v", err)
}

scanner := bufio.NewScanner(stdout)
Expand All @@ -176,7 +176,7 @@ func getClusterOutput(ctx context.Context, domainMap map[string]domain, nodes []
nodeMap[nodeName] = true
}
if err := scanner.Err(); err != nil {
return fmt.Errorf("Scanner error while reading pdsh output\n")
return fmt.Errorf("scanner error while reading pdsh output: %v", err)
}
return nil
}
Expand Down Expand Up @@ -211,12 +211,12 @@ func generateTopologyConfig(ctx context.Context, cis []common.ComputeInstances)
nodes := getNodeList(cis)
err := getClusterOutput(ctx, domainMap, nodes, "nvidia-smi -q | grep ClusterUUID")
if err != nil {
return nil, fmt.Errorf("getClusterOutput failed: %v\n", err)
return nil, fmt.Errorf("getClusterOutput failed: %v", err)
}
// get ibnetdiscover output from all unvisited nodes
treeRoot, err := getIbTree(ctx, nodes)
if err != nil {
return nil, fmt.Errorf("getIbTree failed: %v\n", err)
return nil, fmt.Errorf("getIbTree failed: %v", err)
}
return toGraph(domainMap, treeRoot), nil
}
2 changes: 1 addition & 1 deletion pkg/providers/gcp/instance_topology.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ func GenerateInstanceTopology(ctx context.Context, _ interface{}, instanceToNode
return instanceTopology, nil
}

func (cfg *InstanceTopology) toSLURM() (*common.Vertex, error) {
func (cfg *InstanceTopology) toGraph() (*common.Vertex, error) {
forest := make(map[string]*common.Vertex)
nodes := make(map[string]*common.Vertex)

Expand Down
2 changes: 1 addition & 1 deletion pkg/providers/gcp/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,5 +76,5 @@ func (p *Provider) GenerateTopologyConfig(ctx context.Context, creds interface{}
return nil, err
}

return cfg.toSLURM()
return cfg.toGraph()
}
8 changes: 5 additions & 3 deletions pkg/providers/oci/instance_topology.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"k8s.io/klog/v2"

"github.com/NVIDIA/topograph/pkg/common"
"github.com/NVIDIA/topograph/pkg/metrics"
)

type level int
Expand Down Expand Up @@ -206,9 +207,10 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []c
}

if len(instanceToNodeMap) != 0 {
klog.V(4).Infof("Adding unclaimed nodes %v", instanceToNodeMap)
klog.V(4).Infof("Adding nodes w/o topology: %v", instanceToNodeMap)
metrics.SetMissingTopology(common.ProviderOCI, len(instanceToNodeMap))
sw := &common.Vertex{
ID: "cpu-nodes",
ID: common.NoTopology,
Vertices: make(map[string]*common.Vertex),
}
for instanceID, nodeName := range instanceToNodeMap {
Expand All @@ -217,7 +219,7 @@ func toGraph(bareMetalHostSummaries []*core.ComputeBareMetalHostSummary, cis []c
ID: instanceID,
}
}
forest["cpu-nodes"] = sw
forest[common.NoTopology] = sw
}

root := &common.Vertex{
Expand Down
4 changes: 2 additions & 2 deletions pkg/providers/oci/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ var requestLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "request_latency",
Help: "Latency of requests",
Subsystem: "topogen_oci",
Subsystem: "topograph_oci",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"method", "status"},
Expand All @@ -34,7 +34,7 @@ var missingAncestor = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "topogen_missing_ancestor_oci",
Help: "Missing ancestor nodes",
Subsystem: "topogen_oci",
Subsystem: "topograph_oci",
},
[]string{"ancestor_level", "node_name"},
)
Expand Down

0 comments on commit a2e0233

Please sign in to comment.