Skip to content

Commit

Permalink
metricsconfig: Define health metrics group
Browse files Browse the repository at this point in the history
The idea is that in the future this group will have constrained cardinality and
will be enabled by default (in contrast to another group of potentially
high-cardinality "debug" metrics). This commit only refactors the existing metrics
initialization code to use the new framework. The health metrics group contains
all metrics that were documented in the "health metrics" section, but in the
future some of them will likely be moved to another group.

Signed-off-by: Anna Kapuscinska <[email protected]>
  • Loading branch information
lambdanis committed Jun 28, 2024
1 parent b7091f6 commit d0ebc78
Show file tree
Hide file tree
Showing 19 changed files with 234 additions and 160 deletions.
2 changes: 1 addition & 1 deletion cmd/tetragon-metrics-docs/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ func main() {
func initMetrics(target string, reg *prometheus.Registry, _ *slog.Logger) error {
switch target {
case "health":
metricsconfig.InitHealthMetricsForDocs(reg)
metricsconfig.EnableHealthMetrics(reg).InitForDocs()
case "resources":
metricsconfig.InitResourcesMetricsForDocs(reg)
case "events":
Expand Down
9 changes: 5 additions & 4 deletions pkg/exporter/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package exporter
import (
"io"

"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand All @@ -30,10 +31,10 @@ var (
})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(eventsExportedTotal)
registry.MustRegister(eventsExportedBytesTotal)
registry.MustRegister(eventsExportTimestamp)
func RegisterMetrics(group metrics.Group) {
group.MustRegister(eventsExportedTotal)
group.MustRegister(eventsExportedBytesTotal)
group.MustRegister(eventsExportTimestamp)
}

func newExportedBytesCounterWriter(w io.Writer, c prometheus.Counter) io.Writer {
Expand Down
15 changes: 10 additions & 5 deletions pkg/grpc/tracing/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package tracing

import (
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand All @@ -17,16 +18,20 @@ var (
}, []string{"count"})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(LoaderStats)
func RegisterMetrics(group metrics.Group) {
group.MustRegister(LoaderStats)
}

// TODO:
// 1. Define metrics using functions from pkg/metrics
// 2. Move initialization code to metrics definitions if needed or remove it
// if not needed
// 3. Use label values defined as metrics.ConstrainedLabel
func InitMetrics() {
// Initialize metrics with labels
for _, ty := range LoaderTypeStrings {
LoaderStats.WithLabelValues(ty).Add(0)
}

// NOTES:
// * Rename process_loader_stats metric (to e.g. process_loader_events_total) and count label (to e.g. event)?
}

type LoaderType int
Expand Down
5 changes: 3 additions & 2 deletions pkg/metrics/cgroupratemetrics/cgroupratemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package cgroupratemetrics

import (
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand Down Expand Up @@ -41,8 +42,8 @@ var (
}, []string{"type"})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(CgroupRateTotal)
func RegisterMetrics(group metrics.Group) {
group.MustRegister(CgroupRateTotal)
}

// Get a new handle on an ErrorTotal metric for an ErrorType
Expand Down
21 changes: 11 additions & 10 deletions pkg/metrics/errormetrics/errormetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"fmt"

"github.com/cilium/tetragon/pkg/api/ops"
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand Down Expand Up @@ -84,10 +85,17 @@ var (
}, []string{"opcode", "error_type"})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(ErrorTotal)
registry.MustRegister(HandlerErrors)
func RegisterMetrics(group metrics.Group) {
group.MustRegister(ErrorTotal)
group.MustRegister(HandlerErrors)
}

// TODO:
// 1. Define metrics using functions from pkg/metrics
// 2. Move initialization code to metrics definitions if needed or remove it
// if not needed
// 3. Use label values defined as metrics.ConstrainedLabel
func InitMetrics() {
// Initialize metrics with labels
for er := range errorTypeLabelValues {
GetErrorTotal(er).Add(0)
Expand All @@ -100,13 +108,6 @@ func InitMetrics(registry *prometheus.Registry) {
// NB: We initialize only ops.MsgOpUndef here, but unknown_opcode can occur for any opcode
// that is not explicitly handled.
GetHandlerErrors(ops.MsgOpUndef, HandlePerfUnknownOp).Add(0)

// NOTES:
// * op, msg_op, opcode - standardize on a label (+ add human-readable label)
// * error, error_type, type - standardize on a label
// * Delete errors_total{type="handler_error"} - it duplicates handler_errors_total
// * Consider further splitting errors_total
// * Rename handler_errors_total to event_handler_errors_total?
}

// Get a new handle on an ErrorTotal metric for an ErrorType
Expand Down
27 changes: 15 additions & 12 deletions pkg/metrics/eventcachemetrics/eventcachemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package eventcachemetrics

import (
"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand Down Expand Up @@ -79,14 +80,21 @@ var (
}, []string{"event_type"})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(processInfoErrors)
registry.MustRegister(podInfoErrors)
registry.MustRegister(EventCacheCount)
registry.MustRegister(eventCacheErrorsTotal)
registry.MustRegister(eventCacheRetriesTotal)
registry.MustRegister(parentInfoErrors)
func RegisterMetrics(group metrics.Group) {
group.MustRegister(processInfoErrors)
group.MustRegister(podInfoErrors)
group.MustRegister(EventCacheCount)
group.MustRegister(eventCacheErrorsTotal)
group.MustRegister(eventCacheRetriesTotal)
group.MustRegister(parentInfoErrors)
}

// TODO:
// 1. Define metrics using functions from pkg/metrics
// 2. Move initialization code to metrics definitions if needed or remove it
// if not needed
// 3. Use label values defined as metrics.ConstrainedLabel
func InitMetrics() {
// Initialize metrics with labels
for en := range cacheEntryTypeLabelValues {
EventCacheRetries(en).Add(0)
Expand All @@ -101,11 +109,6 @@ func InitMetrics(registry *prometheus.Registry) {
}
}
}

// NOTES:
// * error, error_type, type - standardize on a label
// * event, event_type, type - standardize on a label
// * Consider merging event cache errors metrics into one with error, event, entry labels
}

// Get a new handle on a processInfoErrors metric for an eventType
Expand Down
19 changes: 11 additions & 8 deletions pkg/metrics/eventmetrics/eventmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,19 +53,22 @@ var (
}, []string{"policy", "hook"})
)

func InitHealthMetrics(registry *prometheus.Registry) {
registry.MustRegister(FlagCount)
registry.MustRegister(NotifyOverflowedEvents)
// custom collectors are registered independently
func RegisterHealthMetrics(group metrics.Group) {
group.MustRegister(FlagCount)
group.MustRegister(NotifyOverflowedEvents)
group.MustRegisterWithInit(NewBPFCollector())
}

// TODO:
// 1. Define metrics using functions from pkg/metrics
// 2. Move initialization code to metrics definitions if needed or remove it
// if not needed
// 3. Use label values defined as metrics.ConstrainedLabel
func InitHealthMetrics() {
// Initialize metrics with labels
for _, v := range exec.FlagStrings {
FlagCount.WithLabelValues(v).Add(0)
}

// NOTES:
// * op, msg_op, opcode - standardize on a label (+ add human-readable label)
// * event, event_type, type - standardize on a label
}

func InitEventsMetrics(registry *prometheus.Registry) {
Expand Down
21 changes: 11 additions & 10 deletions pkg/metrics/kprobemetrics/kprobemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package kprobemetrics

import (
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand Down Expand Up @@ -45,18 +46,18 @@ var (
})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(MergeErrors)
registry.MustRegister(MergeOkTotal)
registry.MustRegister(MergePushed)

// NOTES:
// * Consider merging ok and errors into one with status label
func RegisterMetrics(group metrics.Group) {
group.MustRegister(MergeErrors)
group.MustRegister(MergeOkTotal)
group.MustRegister(MergePushed)
}

func InitMetricsForDocs(registry *prometheus.Registry) {
InitMetrics(registry)

// TODO:
// 1. Define metrics using functions from pkg/metrics
// 2. Move initialization code to metrics definitions if needed or remove it
// if not needed
// 3. Use label values defined as metrics.ConstrainedLabel
func InitMetricsForDocs() {
// Initialize metrics with example labels
for _, curr := range mergeErrorTypeLabelValues {
for _, prev := range mergeErrorTypeLabelValues {
Expand Down
18 changes: 11 additions & 7 deletions pkg/metrics/opcodemetrics/opcodemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"fmt"

"github.com/cilium/tetragon/pkg/api/ops"
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand All @@ -28,21 +29,24 @@ var (
}, []string{"op"})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(MsgOpsCount)
registry.MustRegister(LatencyStats)
func RegisterMetrics(group metrics.Group) {
group.MustRegister(MsgOpsCount)
group.MustRegister(LatencyStats)
}

// TODO:
// 1. Define metrics using functions from pkg/metrics
// 2. Move initialization code to metrics definitions if needed or remove it
// if not needed
// 3. Use label values defined as metrics.ConstrainedLabel
func InitMetrics() {
// Initialize all metrics
for opcode := range ops.OpCodeStrings {
if opcode != ops.MsgOpUndef && opcode != ops.MsgOpTest {
GetOpTotal(opcode).Add(0)
LatencyStats.WithLabelValues(fmt.Sprint(int32(opcode)))
}
}

// NOTES:
// * op, msg_op, opcode - standardize on a label (+ add human-readable label)
// * Rename handling_latency to handler_latency_microseconds?
}

// Get a new handle on a msgOpsCount metric for an OpCode
Expand Down
16 changes: 10 additions & 6 deletions pkg/metrics/policyfiltermetrics/policyfiltermetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package policyfiltermetrics

import (
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand Down Expand Up @@ -80,9 +81,16 @@ var (
})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(PolicyFilterOpMetrics, PolicyFilterHookContainerNameMissingMetrics)
func RegisterMetrics(group metrics.Group) {
group.MustRegister(PolicyFilterOpMetrics, PolicyFilterHookContainerNameMissingMetrics)
}

// TODO:
// 1. Define metrics using functions from pkg/metrics
// 2. Move initialization code to metrics definitions if needed or remove it
// if not needed
// 3. Use label values defined as metrics.ConstrainedLabel
func InitMetrics() {
// Initialize metrics with labels
for _, subsys := range subsysLabelValues {
for _, op := range operationLabelValues {
Expand All @@ -93,10 +101,6 @@ func InitMetrics(registry *prometheus.Registry) {
}
}
}

// NOTES:
// * Don't confuse op in policyfilter_metrics_total with ops.OpCode
// * Rename policyfilter_metrics_total to get rid of _metrics?
}

func OpInc(subsys Subsys, op Operation, err string) {
Expand Down
5 changes: 3 additions & 2 deletions pkg/metrics/ratelimitmetrics/ratelimitmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package ratelimitmetrics

import (
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand All @@ -17,6 +18,6 @@ var (
})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(RateLimitDropped)
func RegisterMetrics(group metrics.Group) {
group.MustRegister(RateLimitDropped)
}
9 changes: 5 additions & 4 deletions pkg/metrics/ringbufmetrics/ringbufmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package ringbufmetrics

import (
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand All @@ -29,8 +30,8 @@ var (
})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(PerfEventReceived)
registry.MustRegister(PerfEventLost)
registry.MustRegister(PerfEventErrors)
func RegisterMetrics(group metrics.Group) {
group.MustRegister(PerfEventReceived)
group.MustRegister(PerfEventLost)
group.MustRegister(PerfEventErrors)
}
7 changes: 4 additions & 3 deletions pkg/metrics/ringbufqueuemetrics/ringbufqueuemetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package ringbufqueuemetrics

import (
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)
Expand All @@ -23,7 +24,7 @@ var (
})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(Received)
registry.MustRegister(Lost)
func RegisterMetrics(group metrics.Group) {
group.MustRegister(Received)
group.MustRegister(Lost)
}
Loading

0 comments on commit d0ebc78

Please sign in to comment.