This repository has been archived by the owner on Feb 27, 2023. It is now read-only.

Merge pull request #164 from alexbrand/histograms
monitoring: use histograms for api latency and cycle time metrics
stevesloka authored Jun 28, 2018
2 parents 4caead0 + 3787d0c commit e2fa4aa
Showing 3 changed files with 236 additions and 45 deletions.
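
This change replaces the API-latency and cycle-duration gauges with Prometheus histograms, so the Grafana dashboard below can plot 50th and 99th percentiles via histogram_quantile instead of a single averaged value per backend. A minimal sketch of the new pattern, assuming the standard client_golang API; the metric name, help text, and bucket boundaries mirror the definitions in discovery/pkg/metrics/metrics.go further down, while the helper function and label values here are illustrative only:

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Histogram with explicit millisecond buckets; the largest bucket is 2 minutes.
var apiLatency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "gimbal_discoverer_api_latency_milliseconds",
		Help:    "The milliseconds it takes for requests to return from a remote discoverer api",
		Buckets: []float64{20, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 20000, 50000, 120000},
	},
	[]string{"backendname", "backendtype", "path"},
)

// recordAPILatency is an illustrative helper: Observe() increments the matching
// bucket counters, whereas the old GaugeVec.Set() kept only the most recent
// value and could not answer percentile queries.
func recordAPILatency(backendName, backendType, path string, d time.Duration) {
	apiLatency.WithLabelValues(backendName, backendType, path).Observe(float64(d.Nanoseconds()) / 1e6)
}

func main() {
	prometheus.MustRegister(apiLatency)
	recordAPILatency("openstack-cluster", "openstack", "/v2.0/lbaas/loadbalancers", 42*time.Millisecond)
}
```

The dashboard panels below consume the resulting _bucket series with histogram_quantile over a 5-minute rate window, grouped by le and kubernetes_pod_name.
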
221 changes: 205 additions & 16 deletions deployment/grafana/02-grafana-configmap.yaml
@@ -61,7 +61,7 @@ data:
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1528906914777,
"iteration": 1529959091609,
"links": [],
"panels": [
{
@@ -1224,10 +1224,10 @@ data:
"x": 6,
"y": 20
},
"id": 13,
"id": 25,
"legend": {
"alignAsTable": false,
"avg": false,
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
@@ -1250,18 +1250,24 @@
"steppedLine": false,
"targets": [
{
"expr": "avg(gimbal_discoverer_api_latency_ms{backendname=~\"$Backend\"}) by (backendname)",
"expr": "histogram_quantile(0.5, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*loadbalancers.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
"legendFormat": "{{backendname}}",
"legendFormat": "{{kubernetes_pod_name}} 50%",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*loadbalancers.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 99%",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Openstack API Latency",
"title": "Openstack API Latency: Load Balancers Endpoint",
"tooltip": {
"shared": true,
"sort": 0,
@@ -1300,18 +1306,196 @@ data:
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"description": "Time to process all items within a cluster",
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 12,
"y": 20
},
"id": 13,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.5, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*pools.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 50%",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*pools.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 99%",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Openstack API Latency: Pools Endpoint",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "ms",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 18,
"y": 20
},
"id": 26,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.5, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*listeners.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 50%",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(gimbal_discoverer_api_latency_milliseconds_bucket{path=~\".*listeners.*\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 99%",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Openstack API Latency: Listeners",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "ms",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"description": "Time to process all items within a cluster",
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 27
},
"id": 11,
"legend": {
"alignAsTable": true,
"avg": false,
"avg": true,
"current": true,
"max": false,
"min": false,
@@ -1333,13 +1517,18 @@
"steppedLine": false,
"targets": [
{
"expr": "gimbal_discoverer_cycle_duration_ms{backendname=~\"$Backend\"} ",
"expr": "histogram_quantile(0.5, sum(rate(gimbal_discoverer_cycle_duration_seconds_bucket{backendname=~\"$Backend\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{backendname}}",
"legendFormat": "{{kubernetes_pod_name}} 50%",
"refId": "A"
},
{
"expr": "histogram_quantile(0.99, sum(rate(gimbal_discoverer_cycle_duration_seconds_bucket{backendname=~\"$Backend\"}[5m])) by (le, kubernetes_pod_name))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{kubernetes_pod_name}} 99%",
"refId": "B"
}
],
"thresholds": [],
@@ -1361,7 +1550,7 @@
},
"yaxes": [
{
"format": "ms",
"format": "s",
"label": null,
"logBase": 1,
"max": null,
@@ -1459,7 +1648,7 @@
"timezone": "",
"title": "Gimbal Discovery",
"uid": "ex4WqmZmk",
"version": 21
"version": 2
}
envoy.json: |
{
56 changes: 29 additions & 27 deletions discovery/pkg/metrics/metrics.go
@@ -30,21 +30,21 @@ type DiscovererMetrics struct {
}

const (
ServiceEventTimestampGauge = "gimbal_service_event_timestamp"
EndpointsEventTimestampGauge = "gimbal_endpoints_event_timestamp"
ServiceErrorTotalCounter = "gimbal_service_error_total"
EndpointsErrorTotalCounter = "gimbal_endpoints_error_total"
QueueSizeGauge = "gimbal_queuesize"
DiscovererAPILatencyMSGauge = "gimbal_discoverer_api_latency_ms"
DiscovererCycleDurationMSGauge = "gimbal_discoverer_cycle_duration_ms"
DiscovererErrorTotal = "gimbal_discoverer_error_total"
DiscovererUpstreamServicesGauge = "gimbal_discoverer_upstream_services_total"
DiscovererReplicatedServicesGauge = "gimbal_discoverer_replicated_services_total"
DiscovererInvalidServicesGauge = "gimbal_discoverer_invalid_services_total"
DiscovererUpstreamEndpointsGauge = "gimbal_discoverer_upstream_endpoints_total"
DiscovererReplicatedEndpointsGauge = "gimbal_discoverer_replicated_endpoints_total"
DiscovererInvalidEndpointsGauge = "gimbal_discoverer_invalid_endpoints_total"
DiscovererInfoGauge = "gimbal_discoverer_info"
ServiceEventTimestampGauge = "gimbal_service_event_timestamp"
EndpointsEventTimestampGauge = "gimbal_endpoints_event_timestamp"
ServiceErrorTotalCounter = "gimbal_service_error_total"
EndpointsErrorTotalCounter = "gimbal_endpoints_error_total"
QueueSizeGauge = "gimbal_queuesize"
DiscovererAPILatencyMsHistogram = "gimbal_discoverer_api_latency_milliseconds"
DiscovererCycleDurationSecondsHistogram = "gimbal_discoverer_cycle_duration_seconds"
DiscovererErrorTotal = "gimbal_discoverer_error_total"
DiscovererUpstreamServicesGauge = "gimbal_discoverer_upstream_services_total"
DiscovererReplicatedServicesGauge = "gimbal_discoverer_replicated_services_total"
DiscovererInvalidServicesGauge = "gimbal_discoverer_invalid_services_total"
DiscovererUpstreamEndpointsGauge = "gimbal_discoverer_upstream_endpoints_total"
DiscovererReplicatedEndpointsGauge = "gimbal_discoverer_replicated_endpoints_total"
DiscovererInvalidEndpointsGauge = "gimbal_discoverer_invalid_endpoints_total"
DiscovererInfoGauge = "gimbal_discoverer_info"
)

// NewMetrics returns a map of Prometheus metrics
@@ -89,17 +89,19 @@ func NewMetrics(BackendType, BackendName string) DiscovererMetrics {
},
[]string{"backendname", "backendtype"},
),
DiscovererAPILatencyMSGauge: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: DiscovererAPILatencyMSGauge,
Help: "The milliseconds it takes for requests to return from a remote discoverer api",
DiscovererAPILatencyMsHistogram: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: DiscovererAPILatencyMsHistogram,
Help: "The milliseconds it takes for requests to return from a remote discoverer api",
Buckets: []float64{20, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 20000, 50000, 120000}, // milliseconds. largest bucket is 2 minutes.
},
[]string{"backendname", "backendtype", "path"},
),
DiscovererCycleDurationMSGauge: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: DiscovererCycleDurationMSGauge,
Help: "The milliseconds it takes for all objects to be synced from a remote discoverer api",
DiscovererCycleDurationSecondsHistogram: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: DiscovererCycleDurationSecondsHistogram,
Help: "The seconds it takes for all objects to be synced from a remote backend",
Buckets: prometheus.LinearBuckets(60, 60, 10), // seconds. 10 buckets, each 60 wide; largest bucket is 10 minutes.
},
[]string{"backendname", "backendtype"},
),
@@ -228,17 +230,17 @@ func (d *DiscovererMetrics) QueueSizeGaugeMetric(size int) {

// CycleDurationMetric formats a cycle duration histogram prometheus metric
func (d *DiscovererMetrics) CycleDurationMetric(duration time.Duration) {
m, ok := d.Metrics[DiscovererCycleDurationMSGauge].(*prometheus.GaugeVec)
m, ok := d.Metrics[DiscovererCycleDurationSecondsHistogram].(*prometheus.HistogramVec)
if ok {
m.WithLabelValues(d.BackendName, d.BackendType).Set(math.Floor(duration.Seconds() * 1e3))
m.WithLabelValues(d.BackendName, d.BackendType).Observe(math.Floor(duration.Seconds()))
}
}

// APILatencyMetric formats an API latency histogram prometheus metric
func (d *DiscovererMetrics) APILatencyMetric(path string, duration time.Duration) {
m, ok := d.Metrics[DiscovererAPILatencyMSGauge].(*prometheus.GaugeVec)
m, ok := d.Metrics[DiscovererAPILatencyMsHistogram].(*prometheus.HistogramVec)
if ok {
m.WithLabelValues(d.BackendName, d.BackendType, path).Set(math.Floor(duration.Seconds() * 1e3))
m.WithLabelValues(d.BackendName, d.BackendType, path).Observe(math.Floor(duration.Seconds() * 1e3))
}
}

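
The cycle-time metric follows the same pattern but is recorded in seconds with linear buckets. As a quick reference, and assuming client_golang's LinearBuckets helper behaves as documented, the layout chosen above resolves to ten 60-second-wide buckets spanning 1 to 10 minutes:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// LinearBuckets(start, width, count): first upper bound at 60s,
	// each subsequent bound 60s higher, ten buckets in total.
	fmt.Println(prometheus.LinearBuckets(60, 60, 10))
	// Prints: [60 120 180 240 300 360 420 480 540 600]
}
```

Cycles faster than a minute all land in the first bucket, so quantile estimates below 60 seconds are coarse.
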
4 changes: 2 additions & 2 deletions docs/monitoring.md
@@ -92,11 +92,11 @@ Detailed documentation on stats within Envoy is available on their site: https:/
- **gimbal_queuesize (gauge):** Number of items in process queue with the following labels:
- backendname
- backendtype
- **gimbal_discoverer_api_latency_ms (gauge):** The milliseconds it takes for requests to return from a remote discoverer api (for example Openstack)
- **gimbal_discoverer_api_latency_milliseconds (histogram):** The milliseconds it takes for requests to return from a remote discoverer api (for example OpenStack)
- backendname
- backendtype
- path: API request path
- **gimbal_discoverer_cycle_duration_ms (gauge):** The milliseconds it takes for all objects to be synced from a remote discoverer api (for example Openstack)
- **gimbal_discoverer_cycle_duration_seconds (histogram):** The seconds it takes for all objects to be synced from a remote backend (for example OpenStack)
- backendname
- backendtype
- **gimbal_discoverer_api_error_total (counter):** Number of errors that have occurred when accessing the OpenStack API
