From d06c4f532b68cc9cd929ef18be9ef21b29418e97 Mon Sep 17 00:00:00 2001
From: Richard Towers <richard.towers@digital.cabinet-office.gov.uk>
Date: Fri, 25 Aug 2023 13:50:53 +0000
Subject: [PATCH 1/4] Add a summary metric for route reload durations

I'm super curious about how long these take, as we're loading around 1M routes
from the database every time the routes reload (over 1M in draft, just under in
live). Al reckons about 20s, based on the logs, but it would be good to know for
sure.

This adds a summary metric to which will allow us to calculate median / 90th /
95th / 99th percentile durations.

I've also added labels to the count / duration metrics so we can tell which ones
are successes and failures. If you don't put success / failure labels on your
duration metrics they can get all mucked up by quick failures and slow
successes, which you can't distinguish between.

Prometheus summaries / histograms[0] are a bit hard to wrap one's head around,
but I think summary is the right choice here. Key factors:

1) With Histograms, you have to specify the timings of the buckets you care
  about up front (and we don't know how long these reloads take, so that's hard)
2) Summaries let you specify which quantiles you want up front, with the
  calculation happening "on the client side" (i.e. inside router, before things go
  to prometheus), which is more expensive at observation time
3) We're not making many observations for this metric, because we only reload routes
  once every few seconds (max), so the cost of calculating the summary on the client
  side should be small.

The Objectives map sets the quantiles we care about, and an amount of error. In
this case, by setting `0.5: 0.01` I'm saying "bucket things so I get a quantile
that's between 0.49 and 0.51", and by setting `0.99: 0.005` I'm saying "bucket
things so I get a quantile that's between 0.985 and 0.995". They're not exact
for performance reasons.[1]

[0] - https://prometheus.io/docs/practices/histograms/
[1] - https://grafana.com/blog/2022/03/01/how-summary-metrics-work-in-prometheus/#limiting-the-error-an-upper-bound-for-delta
---
 lib/metrics.go | 18 +++++++++++++++++-
 lib/router.go  | 10 +++++++---
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/lib/metrics.go b/lib/metrics.go
index 36b0addc..d507c18d 100644
--- a/lib/metrics.go
+++ b/lib/metrics.go
@@ -15,11 +15,26 @@ var (
 		[]string{"host"},
 	)
 
-	routeReloadCountMetric = prometheus.NewCounter(
+	routeReloadCountMetric = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Name: "router_route_reload_total",
 			Help: "Total number of attempts to reload the routing table",
 		},
+		[]string{},
+	)
+
+	routeReloadDurationMetric = prometheus.NewSummaryVec(
+		prometheus.SummaryOpts{
+			Name: "router_route_reload_duration_seconds",
+			Help: "Histogram of route reload durations in seconds",
+			Objectives: map[float64]float64{
+				0.5:  0.01,
+				0.9:  0.01,
+				0.95: 0.01,
+				0.99: 0.005,
+			},
+		},
+		[]string{},
 	)
 
 	routeReloadErrorCountMetric = prometheus.NewCounter(
@@ -41,6 +56,7 @@ func registerMetrics(r prometheus.Registerer) {
 	r.MustRegister(
 		internalServerErrorCountMetric,
 		routeReloadCountMetric,
+		routeReloadDurationMetric,
 		routeReloadErrorCountMetric,
 		routesCountMetric,
 	)
diff --git a/lib/router.go b/lib/router.go
index 03e56545..4f2f15f1 100644
--- a/lib/router.go
+++ b/lib/router.go
@@ -5,6 +5,7 @@ import (
 	"net/http"
 	"net/url"
 	"os"
+	"strconv"
 	"sync"
 	"time"
 
@@ -213,11 +214,11 @@ type mongoDatabase interface {
 // create a new proxy mux, load applications (backends) and routes into it, and
 // then flip the "mux" pointer in the Router.
 func (rt *Router) reloadRoutes(db *mgo.Database, currentOptime bson.MongoTimestamp) {
+	startTime := time.Now()
 	defer func() {
-		// increment this metric regardless of whether the route reload succeeded
-		routeReloadCountMetric.Inc()
-
+		success := true
 		if r := recover(); r != nil {
+			success = false
 			logWarn("router: recovered from panic in reloadRoutes:", r)
 			logInfo("router: original routes have not been modified")
 			errorMessage := fmt.Sprintf("panic: %v", r)
@@ -228,6 +229,9 @@ func (rt *Router) reloadRoutes(db *mgo.Database, currentOptime bson.MongoTimesta
 		} else {
 			rt.mongoReadToOptime = currentOptime
 		}
+		labels := prometheus.Labels{"success": strconv.FormatBool(success)}
+		routeReloadCountMetric.With(labels).Inc()
+		routeReloadDurationMetric.With(labels).Observe(time.Since(startTime).Seconds())
 	}()
 
 	logInfo("router: reloading routes")

From 076e1200702aa06b383d6eaccf360d8b2742f62c Mon Sep 17 00:00:00 2001
From: Richard Towers <richard.towers@digital.cabinet-office.gov.uk>
Date: Fri, 25 Aug 2023 14:38:52 +0000
Subject: [PATCH 2/4] Add missing labels to route reload metrics

---
 lib/metrics.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/metrics.go b/lib/metrics.go
index d507c18d..cea5eb39 100644
--- a/lib/metrics.go
+++ b/lib/metrics.go
@@ -20,7 +20,7 @@ var (
 			Name: "router_route_reload_total",
 			Help: "Total number of attempts to reload the routing table",
 		},
-		[]string{},
+		[]string{"success"},
 	)
 
 	routeReloadDurationMetric = prometheus.NewSummaryVec(
@@ -34,7 +34,7 @@ var (
 				0.99: 0.005,
 			},
 		},
-		[]string{},
+		[]string{"success"},
 	)
 
 	routeReloadErrorCountMetric = prometheus.NewCounter(

From 7783d1433cb70038644ffbe96397b4c490a933f3 Mon Sep 17 00:00:00 2001
From: Richard Towers <richard.towers@digital.cabinet-office.gov.uk>
Date: Tue, 29 Aug 2023 16:40:04 +0000
Subject: [PATCH 3/4] Remove reloadCount duplicated by reloadDuration

Prometheus histograms and summaries include a `_count` metric, so there's no
need to keep the count separately ourselves once we have this.

Example:

  # HELP router_route_reload_duration_seconds Histogram of route reload durations in seconds
  # TYPE router_route_reload_duration_seconds summary
  router_route_reload_duration_seconds{success="true",quantile="0.5"} 0.000344233
  router_route_reload_duration_seconds{success="true",quantile="0.9"} 0.00038086
  router_route_reload_duration_seconds{success="true",quantile="0.95"} 0.000488091
  router_route_reload_duration_seconds{success="true",quantile="0.99"} 0.000839465
  router_route_reload_duration_seconds_sum{success="true"} 0.015965703999999997
  router_route_reload_duration_seconds_count{success="true"} 44
---
 lib/metrics.go | 17 -----------------
 lib/router.go  |  3 ---
 2 files changed, 20 deletions(-)

diff --git a/lib/metrics.go b/lib/metrics.go
index cea5eb39..6c2480cb 100644
--- a/lib/metrics.go
+++ b/lib/metrics.go
@@ -15,14 +15,6 @@ var (
 		[]string{"host"},
 	)
 
-	routeReloadCountMetric = prometheus.NewCounterVec(
-		prometheus.CounterOpts{
-			Name: "router_route_reload_total",
-			Help: "Total number of attempts to reload the routing table",
-		},
-		[]string{"success"},
-	)
-
 	routeReloadDurationMetric = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
 			Name: "router_route_reload_duration_seconds",
@@ -37,13 +29,6 @@ var (
 		[]string{"success"},
 	)
 
-	routeReloadErrorCountMetric = prometheus.NewCounter(
-		prometheus.CounterOpts{
-			Name: "router_route_reload_error_total",
-			Help: "Number of failed attempts to reload the routing table",
-		},
-	)
-
 	routesCountMetric = prometheus.NewGauge(
 		prometheus.GaugeOpts{
 			Name: "router_routes_loaded",
@@ -55,9 +40,7 @@ var (
 func registerMetrics(r prometheus.Registerer) {
 	r.MustRegister(
 		internalServerErrorCountMetric,
-		routeReloadCountMetric,
 		routeReloadDurationMetric,
-		routeReloadErrorCountMetric,
 		routesCountMetric,
 	)
 	handlers.RegisterMetrics(r)
diff --git a/lib/router.go b/lib/router.go
index 4f2f15f1..31fb1c35 100644
--- a/lib/router.go
+++ b/lib/router.go
@@ -224,13 +224,10 @@ func (rt *Router) reloadRoutes(db *mgo.Database, currentOptime bson.MongoTimesta
 			errorMessage := fmt.Sprintf("panic: %v", r)
 			err := logger.RecoveredError{ErrorMessage: errorMessage}
 			logger.NotifySentry(logger.ReportableError{Error: err})
-
-			routeReloadErrorCountMetric.Inc()
 		} else {
 			rt.mongoReadToOptime = currentOptime
 		}
 		labels := prometheus.Labels{"success": strconv.FormatBool(success)}
-		routeReloadCountMetric.With(labels).Inc()
 		routeReloadDurationMetric.With(labels).Observe(time.Since(startTime).Seconds())
 	}()
 

From 624060b0c0e8ab7a7a4da17da7a8da5881b62db2 Mon Sep 17 00:00:00 2001
From: Richard Towers <richard.towers@digital.cabinet-office.gov.uk>
Date: Tue, 29 Aug 2023 17:02:34 +0000
Subject: [PATCH 4/4] Use prometheus.Timer instead of timing manually

This is a little more idiomatic / elegant than doing the timing manually
---
 lib/router.go | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/lib/router.go b/lib/router.go
index 31fb1c35..7029440d 100644
--- a/lib/router.go
+++ b/lib/router.go
@@ -214,9 +214,13 @@ type mongoDatabase interface {
 // create a new proxy mux, load applications (backends) and routes into it, and
 // then flip the "mux" pointer in the Router.
 func (rt *Router) reloadRoutes(db *mgo.Database, currentOptime bson.MongoTimestamp) {
-	startTime := time.Now()
+	var success bool
+	timer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) {
+		labels := prometheus.Labels{"success": strconv.FormatBool(success)}
+		routeReloadDurationMetric.With(labels).Observe(v)
+	}))
 	defer func() {
-		success := true
+		success = true
 		if r := recover(); r != nil {
 			success = false
 			logWarn("router: recovered from panic in reloadRoutes:", r)
@@ -227,8 +231,7 @@ func (rt *Router) reloadRoutes(db *mgo.Database, currentOptime bson.MongoTimesta
 		} else {
 			rt.mongoReadToOptime = currentOptime
 		}
-		labels := prometheus.Labels{"success": strconv.FormatBool(success)}
-		routeReloadDurationMetric.With(labels).Observe(time.Since(startTime).Seconds())
+		timer.ObserveDuration()
 	}()
 
 	logInfo("router: reloading routes")