Skip to content

Commit

Permalink
Merge pull request #4686 from oasisprotocol/ptrus/feature/node-status…
Browse files Browse the repository at this point in the history
…-metrics

go/worker/registration: add node status metrics
  • Loading branch information
ptrus authored Apr 25, 2022
2 parents 5d07b3f + 3fd0d49 commit 38286f0
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 0 deletions.
1 change: 1 addition & 0 deletions .changelog/4686.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
go/worker/registration: add node status metrics
4 changes: 4 additions & 0 deletions docs/oasis-node/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ oasis_worker_executor_liveness_live_rounds | Gauge | Number of live rounds in la
oasis_worker_executor_liveness_total_rounds | Gauge | Number of total rounds in last epoch. | runtime | [worker/common/committee](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/common/committee/node.go)
oasis_worker_failed_round_count | Counter | Number of failed roothash rounds. | runtime | [worker/common/committee](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/common/committee/node.go)
oasis_worker_node_registered | Gauge | Is oasis node registered (binary). | | [worker/registration](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/registration/worker.go)
oasis_worker_node_registration_eligible | Gauge | Is oasis node eligible for registration (binary). | | [worker/registration](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/registration/worker.go)
oasis_worker_node_status_frozen | Gauge | Is oasis node frozen (binary). | | [worker/registration](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/registration/worker.go)
oasis_worker_node_status_runtime_faults | Gauge | Number of runtime faults. | runtime | [worker/registration](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/registration/worker.go)
oasis_worker_node_status_runtime_suspended | Gauge | Runtime node suspension status (binary). | runtime | [worker/registration](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/registration/worker.go)
oasis_worker_processed_block_count | Counter | Number of processed roothash blocks. | runtime | [worker/common/committee](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/common/committee/node.go)
oasis_worker_processed_event_count | Counter | Number of processed roothash events. | runtime | [worker/common/committee](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/common/committee/node.go)
oasis_worker_storage_commit_latency | Summary | Latency of storage commit calls (state + outputs) (seconds). | runtime | [worker/compute/executor/committee](https://github.com/oasisprotocol/oasis-core/tree/master/go/worker/compute/executor/committee/node.go)
Expand Down
118 changes: 118 additions & 0 deletions go/worker/registration/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
consensus "github.com/oasisprotocol/oasis-core/go/consensus/api"
control "github.com/oasisprotocol/oasis-core/go/control/api"
"github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common/flags"
cmmetrics "github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common/metrics"
registry "github.com/oasisprotocol/oasis-core/go/registry/api"
runtimeRegistry "github.com/oasisprotocol/oasis-core/go/runtime/registry"
sentryClient "github.com/oasisprotocol/oasis-core/go/sentry/client"
Expand All @@ -53,6 +54,8 @@ const (
// CfgRegistrationRotateCerts sets the number of epochs that a node's TLS
// certificate should be valid for.
CfgRegistrationRotateCerts = "worker.registration.rotate_certs"

periodicMetricsInterval = 60 * time.Second
)

var (
Expand All @@ -69,9 +72,39 @@ var (
Help: "Is oasis node registered (binary).",
},
)
workerNodeStatusFrozen = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "oasis_worker_node_status_frozen",
Help: "Is oasis node frozen (binary).",
},
)
workerNodeRegistrationEligible = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "oasis_worker_node_registration_eligible",
Help: "Is oasis node eligible for registration (binary).",
},
)
workerNodeStatusFaults = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_node_status_runtime_faults",
Help: "Number of runtime faults.",
},
[]string{"runtime"},
)
workerNodeRuntimeSuspended = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_node_status_runtime_suspended",
Help: "Runtime node suspension status (binary).",
},
[]string{"runtime"},
)

nodeCollectors = []prometheus.Collector{
workerNodeRegistered,
workerNodeStatusFrozen,
workerNodeRegistrationEligible,
workerNodeStatusFaults,
workerNodeRuntimeSuspended,
}

metricsOnce sync.Once
Expand Down Expand Up @@ -585,6 +618,88 @@ Loop:
}
}

// metricsWorker periodically refreshes the node status Prometheus gauges.
//
// It first blocks until w.initialRegCh becomes readable. After that, once per
// periodicMetricsInterval, it queries the latest epoch and the registration
// status and updates the frozen, eligibility and per-runtime fault/suspension
// gauges. The worker returns as soon as w.stopCh or w.ctx.Done() fires. A
// failed query only skips that refresh; the gauges keep their previous values.
func (w *Worker) metricsWorker() {
	w.logger.Info("delaying metrics worker start until initial registration")
	select {
	case <-w.initialRegCh:
	case <-w.stopCh:
		return
	case <-w.ctx.Done():
		return
	}

	w.logger.Debug("starting metrics worker")

	// asGauge maps a boolean onto the 0/1 encoding used by the binary gauges.
	asGauge := func(on bool) float64 {
		if on {
			return 1
		}
		return 0
	}

	// refresh performs a single metrics update pass.
	refresh := func() {
		epoch, err := w.beacon.GetEpoch(w.ctx, consensus.HeightLatest)
		if err != nil {
			w.logger.Warn("unable to query epoch", "err", err)
			return
		}
		regStatus, err := w.GetRegistrationStatus(w.ctx)
		if err != nil {
			w.logger.Warn("unable to get registration status", "err", err)
			return
		}
		ns := regStatus.NodeStatus
		if ns == nil {
			w.logger.Debug("skipping node status metrics, empty node status")
			return
		}

		// Frozen metric.
		workerNodeStatusFrozen.Set(asGauge(ns.IsFrozen()))

		// Eligibility metric, derived from ElectionEligibleAfter: a zero value
		// is always reported as ineligible, otherwise the node is reported as
		// eligible only once the current epoch is strictly past that value.
		eligibleAfter := ns.ElectionEligibleAfter
		workerNodeRegistrationEligible.Set(asGauge(eligibleAfter != 0 && eligibleAfter < epoch))

		// Per-runtime metrics, labeled by runtime ID.
		for _, rt := range w.runtimeRegistry.Runtimes() {
			id := rt.ID()
			label := id.String()

			faults := ns.Faults[id]
			if faults == nil {
				// No fault record for this runtime: report zero faults and
				// not suspended.
				workerNodeRuntimeSuspended.WithLabelValues(label).Set(0)
				workerNodeStatusFaults.WithLabelValues(label).Set(0)
				continue
			}

			workerNodeStatusFaults.WithLabelValues(label).Set(float64(faults.Failures))
			workerNodeRuntimeSuspended.WithLabelValues(label).Set(asGauge(faults.IsSuspended(epoch)))
		}
	}

	ticker := time.NewTicker(periodicMetricsInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			refresh()
		case <-w.stopCh:
			return
		case <-w.ctx.Done():
			return
		}
	}
}

func (w *Worker) doNodeRegistration() {
defer func() {
close(w.quitCh)
Expand Down Expand Up @@ -1182,6 +1297,9 @@ func (w *Worker) Start() error {
}

go w.doNodeRegistration()
if cmmetrics.Enabled() {
go w.metricsWorker()
}

return nil
}
Expand Down

0 comments on commit 38286f0

Please sign in to comment.