From d06c4f532b68cc9cd929ef18be9ef21b29418e97 Mon Sep 17 00:00:00 2001 From: Richard Towers Date: Fri, 25 Aug 2023 13:50:53 +0000 Subject: [PATCH 1/4] Add a summary metric for route reload durations I'm super curious about how long these take, as we're loading around 1M routes from the database every time the routes reload (over 1M in draft, just under in live). Al reckons about 20s, based on the logs, but it would be good to know for sure. This adds a summary metric to which will allow us to calculate median / 90th / 95th / 99th percentile durations. I've also added labels to the count / duration metrics so we can tell which ones are successes and failures. If you don't put success / failure labels on your duration metrics they can get all mucked up by quick failures and slow successes, which you can't distinguish between. Prometheus summaries / histograms[0] are a bit hard to wrap one's head around, but I think summary is the right choice here. Key factors: 1) With Histograms, you have to specify the timings of the buckets you care about up front (and we don't know how long these reloads take, so that's hard) 2) Summaries let you specify which quantiles you want up front, with the calculation happening "on the client side" (i.e. inside router, before things go to prometheus), which is more expensive at observation time 3) We're not making many observations for this metric, because we only reload routes once every few seconds (max), so the cost of calculating the summary on the client side should be small. The Objectives map sets the quantiles we care about, and an amount of error. In this case, by setting `0.5: 0.01` I'm saying "bucket things so I get a quantile that's between 0.49 and 0.51", and by setting `0.99: 0.005` I'm saying "bucket things so I get a quantile that's between 0.985 and 0.995". They're not exact for performance reasons.[1] [0] - https://prometheus.io/docs/practices/histograms/ [1] - https://grafana.com/blog/2022/03/01/how-summary-metrics-work-in-prometheus/#limiting-the-error-an-upper-bound-for-delta --- lib/metrics.go | 18 +++++++++++++++++- lib/router.go | 10 +++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/lib/metrics.go b/lib/metrics.go index 36b0addc..d507c18d 100644 --- a/lib/metrics.go +++ b/lib/metrics.go @@ -15,11 +15,26 @@ var ( []string{"host"}, ) - routeReloadCountMetric = prometheus.NewCounter( + routeReloadCountMetric = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "router_route_reload_total", Help: "Total number of attempts to reload the routing table", }, + []string{}, + ) + + routeReloadDurationMetric = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Name: "router_route_reload_duration_seconds", + Help: "Histogram of route reload durations in seconds", + Objectives: map[float64]float64{ + 0.5: 0.01, + 0.9: 0.01, + 0.95: 0.01, + 0.99: 0.005, + }, + }, + []string{}, ) routeReloadErrorCountMetric = prometheus.NewCounter( @@ -41,6 +56,7 @@ func registerMetrics(r prometheus.Registerer) { r.MustRegister( internalServerErrorCountMetric, routeReloadCountMetric, + routeReloadDurationMetric, routeReloadErrorCountMetric, routesCountMetric, ) diff --git a/lib/router.go b/lib/router.go index 03e56545..4f2f15f1 100644 --- a/lib/router.go +++ b/lib/router.go @@ -5,6 +5,7 @@ import ( "net/http" "net/url" "os" + "strconv" "sync" "time" @@ -213,11 +214,11 @@ type mongoDatabase interface { // create a new proxy mux, load applications (backends) and routes into it, and // then flip the "mux" pointer in the Router. func (rt *Router) reloadRoutes(db *mgo.Database, currentOptime bson.MongoTimestamp) { + startTime := time.Now() defer func() { - // increment this metric regardless of whether the route reload succeeded - routeReloadCountMetric.Inc() - + success := true if r := recover(); r != nil { + success = false logWarn("router: recovered from panic in reloadRoutes:", r) logInfo("router: original routes have not been modified") errorMessage := fmt.Sprintf("panic: %v", r) @@ -228,6 +229,9 @@ func (rt *Router) reloadRoutes(db *mgo.Database, currentOptime bson.MongoTimesta } else { rt.mongoReadToOptime = currentOptime } + labels := prometheus.Labels{"success": strconv.FormatBool(success)} + routeReloadCountMetric.With(labels).Inc() + routeReloadDurationMetric.With(labels).Observe(time.Since(startTime).Seconds()) }() logInfo("router: reloading routes") From 076e1200702aa06b383d6eaccf360d8b2742f62c Mon Sep 17 00:00:00 2001 From: Richard Towers Date: Fri, 25 Aug 2023 14:38:52 +0000 Subject: [PATCH 2/4] Add missing labels to route reload metrics --- lib/metrics.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/metrics.go b/lib/metrics.go index d507c18d..cea5eb39 100644 --- a/lib/metrics.go +++ b/lib/metrics.go @@ -20,7 +20,7 @@ var ( Name: "router_route_reload_total", Help: "Total number of attempts to reload the routing table", }, - []string{}, + []string{"success"}, ) routeReloadDurationMetric = prometheus.NewSummaryVec( @@ -34,7 +34,7 @@ var ( 0.99: 0.005, }, }, - []string{}, + []string{"success"}, ) routeReloadErrorCountMetric = prometheus.NewCounter( From 7783d1433cb70038644ffbe96397b4c490a933f3 Mon Sep 17 00:00:00 2001 From: Richard Towers Date: Tue, 29 Aug 2023 16:40:04 +0000 Subject: [PATCH 3/4] Remove reloadCount duplicated by reloadDuration Prometheus histograms and summaries include a `_count` metric, so there's no need to keep the count separately ourselves once we have this. Example: # HELP router_route_reload_duration_seconds Histogram of route reload durations in seconds # TYPE router_route_reload_duration_seconds summary router_route_reload_duration_seconds{success="true",quantile="0.5"} 0.000344233 router_route_reload_duration_seconds{success="true",quantile="0.9"} 0.00038086 router_route_reload_duration_seconds{success="true",quantile="0.95"} 0.000488091 router_route_reload_duration_seconds{success="true",quantile="0.99"} 0.000839465 router_route_reload_duration_seconds_sum{success="true"} 0.015965703999999997 router_route_reload_duration_seconds_count{success="true"} 44 --- lib/metrics.go | 17 ----------------- lib/router.go | 3 --- 2 files changed, 20 deletions(-) diff --git a/lib/metrics.go b/lib/metrics.go index cea5eb39..6c2480cb 100644 --- a/lib/metrics.go +++ b/lib/metrics.go @@ -15,14 +15,6 @@ var ( []string{"host"}, ) - routeReloadCountMetric = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "router_route_reload_total", - Help: "Total number of attempts to reload the routing table", - }, - []string{"success"}, - ) - routeReloadDurationMetric = prometheus.NewSummaryVec( prometheus.SummaryOpts{ Name: "router_route_reload_duration_seconds", @@ -37,13 +29,6 @@ var ( []string{"success"}, ) - routeReloadErrorCountMetric = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "router_route_reload_error_total", - Help: "Number of failed attempts to reload the routing table", - }, - ) - routesCountMetric = prometheus.NewGauge( prometheus.GaugeOpts{ Name: "router_routes_loaded", @@ -55,9 +40,7 @@ var ( func registerMetrics(r prometheus.Registerer) { r.MustRegister( internalServerErrorCountMetric, - routeReloadCountMetric, routeReloadDurationMetric, - routeReloadErrorCountMetric, routesCountMetric, ) handlers.RegisterMetrics(r) diff --git a/lib/router.go b/lib/router.go index 4f2f15f1..31fb1c35 100644 --- a/lib/router.go +++ b/lib/router.go @@ -224,13 +224,10 @@ func (rt *Router) reloadRoutes(db *mgo.Database, currentOptime bson.MongoTimesta errorMessage := fmt.Sprintf("panic: %v", r) err := logger.RecoveredError{ErrorMessage: errorMessage} logger.NotifySentry(logger.ReportableError{Error: err}) - - routeReloadErrorCountMetric.Inc() } else { rt.mongoReadToOptime = currentOptime } labels := prometheus.Labels{"success": strconv.FormatBool(success)} - routeReloadCountMetric.With(labels).Inc() routeReloadDurationMetric.With(labels).Observe(time.Since(startTime).Seconds()) }() From 624060b0c0e8ab7a7a4da17da7a8da5881b62db2 Mon Sep 17 00:00:00 2001 From: Richard Towers Date: Tue, 29 Aug 2023 17:02:34 +0000 Subject: [PATCH 4/4] Use prometheus.Timer instead of timing manually This is a little more idiomatic / elegant than doing the timing manually --- lib/router.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/router.go b/lib/router.go index 31fb1c35..7029440d 100644 --- a/lib/router.go +++ b/lib/router.go @@ -214,9 +214,13 @@ type mongoDatabase interface { // create a new proxy mux, load applications (backends) and routes into it, and // then flip the "mux" pointer in the Router. func (rt *Router) reloadRoutes(db *mgo.Database, currentOptime bson.MongoTimestamp) { - startTime := time.Now() + var success bool + timer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) { + labels := prometheus.Labels{"success": strconv.FormatBool(success)} + routeReloadDurationMetric.With(labels).Observe(v) + })) defer func() { - success := true + success = true if r := recover(); r != nil { success = false logWarn("router: recovered from panic in reloadRoutes:", r) @@ -227,8 +231,7 @@ func (rt *Router) reloadRoutes(db *mgo.Database, currentOptime bson.MongoTimesta } else { rt.mongoReadToOptime = currentOptime } - labels := prometheus.Labels{"success": strconv.FormatBool(success)} - routeReloadDurationMetric.With(labels).Observe(time.Since(startTime).Seconds()) + timer.ObserveDuration() }() logInfo("router: reloading routes")