Skip to content

Commit

Permalink
Merge 3f383a9 into bd6918e
Browse files Browse the repository at this point in the history
  • Loading branch information
cbartz authored Jan 30, 2024
2 parents bd6918e + 3f383a9 commit 8005413
Show file tree
Hide file tree
Showing 7 changed files with 785 additions and 0 deletions.
85 changes: 85 additions & 0 deletions grafana_alert_rules/crashed_runner.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
{
"id": 1,
"uid": "S_cgecKSz",
"orgID": 1,
"folderUID": "8kORy5KSk",
"ruleGroup": "github-runner-failure",
"title": "Crashed runner",
"condition": "B",
"data": [
{
"refId": "A",
"queryType": "range",
"relativeTimeRange": {
"from": 3600,
"to": 0
},
"datasourceUid": "P30805665297C0350",
"model": {
"editorMode": "code",
"expr": "(sum_over_time({filename=\"/var/log/github-runner-metrics.log\"} | json event=\"event\", crashed_runners=\"crashed_runners\" | event = `reconciliation` | unwrap crashed_runners [$__range]))",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"queryType": "range",
"refId": "A"
}
},
{
"refId": "B",
"queryType": "",
"relativeTimeRange": {
"from": 3600,
"to": 0
},
"datasourceUid": "-100",
"model": {
"conditions": [
{
"evaluator": {
"params": [
0
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
}
],
"updated": "2024-01-04T13:01:42Z",
"noDataState": "NoData",
"execErrState": "Error",
"for": "0s",
"annotations": {
"summary": "One or more runners in unit {{ $labels.juju_unit }} crashed within the last hour."
},
"labels": {
"severity": "high",
"type": "runner-failure"
}
}
132 changes: 132 additions & 0 deletions grafana_alert_rules/job_queue.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{
"id": 4,
"uid": "TJxNipKSz",
"orgID": 1,
"folderUID": "8kORy5KSk",
"ruleGroup": "github-runner-capacity",
"title": "Job Queue duration too high",
"condition": "C",
"data": [
{
"refId": "A",
"queryType": "range",
"relativeTimeRange": {
"from": 3600,
"to": 0
},
"datasourceUid": "P30805665297C0350",
"model": {
"editorMode": "code",
"expr": "quantile_over_time(0.5, {filename=\"/var/log/github-runner-metrics.log\"} | json event=\"event\",duration=\"queue_duration\",flavor=\"flavor\" | __error__=\"\" | event=\"runner_start\" | unwrap duration[1h]) by(flavor)",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"queryType": "range",
"refId": "A"
}
},
{
"refId": "B",
"queryType": "",
"relativeTimeRange": {
"from": 3600,
"to": 0
},
"datasourceUid": "-100",
"model": {
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
},
{
"refId": "C",
"queryType": "",
"relativeTimeRange": {
"from": 600,
"to": 0
},
"datasourceUid": "-100",
"model": {
"conditions": [
{
"evaluator": {
"params": [
0,
0
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"B"
]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"datasource": {
"name": "Expression",
"type": "__expr__",
"uid": "__expr__"
},
"expression": "$B \u003e 1800",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "C",
"type": "math"
}
}
],
"updated": "2024-01-05T07:20:12Z",
"noDataState": "OK",
"execErrState": "Error",
"for": "1h",
"annotations": {
"description": "Job queue duration is higher than 30 minutes for half of the runners with flavor {{$labels.flavor}}",
"summary": "Job queue duration is too high for flavor {{$labels.flavor}}"
},
"labels": {
"severity": "high",
"type": "runner-capacity"
}
}
134 changes: 134 additions & 0 deletions grafana_alert_rules/long_running_jobs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
{
"id": 11,
"uid": "WwkgEWtSk",
"orgID": 1,
"folderUID": "8kORy5KSk",
"ruleGroup": "github-runner-failure",
"title": "Long-running jobs",
"condition": "C",
"data": [
{
"refId": "A",
"queryType": "instant",
"relativeTimeRange": {
"from": 7200,
"to": 0
},
"datasourceUid": "P30805665297C0350",
"model": {
"editorMode": "code",
"expr": "count by (flavor)(sum by(instance, flavor)(max_over_time({filename=\"/var/log/github-runner-metrics.log\"} | json event=\"event\",idle_runners=\"idle_runners\",flavor=\"flavor\" | event=\"reconciliation\" | unwrap idle_runners[$__range])) == 0 ) / count by (flavor)(sum by(instance, flavor)(count_over_time({filename=\"/var/log/github-runner-metrics.log\"} | json event=\"event\",idle_runners=\"idle_runners\",flavor=\"flavor\" | event=\"reconciliation\" [$__range])))",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"queryType": "instant",
"refId": "A"
}
},
{
"refId": "B",
"queryType": "",
"relativeTimeRange": {
"from": 7200,
"to": 0
},
"datasourceUid": "-100",
"model": {
"conditions": [
{
"evaluator": {
"params": [
0,
0
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"datasource": {
"name": "Expression",
"type": "__expr__",
"uid": "__expr__"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
},
{
"refId": "C",
"queryType": "",
"relativeTimeRange": {
"from": 600,
"to": 0
},
"datasourceUid": "-100",
"model": {
"conditions": [
{
"evaluator": {
"params": [
0,
0
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"B"
]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"datasource": {
"name": "Expression",
"type": "__expr__",
"uid": "__expr__"
},
"expression": "$B \u003e 0.3",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "C",
"type": "math"
}
}
],
"updated": "2024-01-30T15:27:05Z",
"noDataState": "OK",
"execErrState": "Error",
"for": "2h",
"annotations": {
"description": "More than 30% of flavor {{$labels.flavor}} units had no idle runners for 2 hours.",
"summary": "Too many long-running jobs for flavor {{$labels.flavor}}"
},
"labels": {
"severity": "high",
"type": "runner-failure"
}
}
Loading

0 comments on commit 8005413

Please sign in to comment.