From 00ccb5b26cc1698c94c778cec3ec8123007409d8 Mon Sep 17 00:00:00 2001
From: Eve Fritz
Date: Mon, 7 Oct 2024 11:49:48 +0200
Subject: [PATCH] feat(RDS): create RDSLowDiskSpaceCount to display instances
 with less than 20% free disk space

Add a warning-level alert that counts, per AWS account and region, the RDS
instances whose free storage is between 10% (inclusive) and 20% (exclusive)
of their allocated storage. Below 10%, RDSDiskSpaceLimit applies instead.

The unit test feeds one instance with 3GB free out of 20GB allocated (15%)
for the whole 15m evaluation window.

---
 .../prometheus_tests/RDSLowDiskSpaceCount.yml | 26 +++++++++++++++++++
 charts/prometheus-rds-alerts/values.yaml      |  9 +++++++
 content/runbooks/rds/RDSLowDiskSpaceCount.md  | 23 ++++++++++++++++
 3 files changed, 58 insertions(+)
 create mode 100644 charts/prometheus-rds-alerts/prometheus_tests/RDSLowDiskSpaceCount.yml
 create mode 100644 content/runbooks/rds/RDSLowDiskSpaceCount.md

diff --git a/charts/prometheus-rds-alerts/prometheus_tests/RDSLowDiskSpaceCount.yml b/charts/prometheus-rds-alerts/prometheus_tests/RDSLowDiskSpaceCount.yml
new file mode 100644
index 0000000..c8d77c7
--- /dev/null
+++ b/charts/prometheus-rds-alerts/prometheus_tests/RDSLowDiskSpaceCount.yml
@@ -0,0 +1,26 @@
+rule_files:
+  - rules.yml
+
+evaluation_interval: 1m
+
+tests:
+
+  - name: RDSLowDiskSpaceCount
+    interval: 1m
+    input_series:
+      - series: 'rds_free_storage_bytes{aws_account_id="111111111111",aws_region="eu-west-3",dbidentifier="db1"}'
+        values: '3221225472x15' # 3GB free, i.e. 15% of the 20GB allocated; covers 0m..15m
+      - series: 'rds_allocated_storage_bytes{aws_account_id="111111111111",aws_region="eu-west-3",dbidentifier="db1"}'
+        values: '21474836480x15' # 20GB
+    alert_rule_test:
+      - alertname: RDSLowDiskSpaceCount
+        eval_time: 15m
+        exp_alerts:
+          - exp_labels:
+              aws_account_id: "111111111111"
+              aws_region: eu-west-3
+              severity: warning
+            exp_annotations:
+              description: "One or more RDS instances has <20% free disk space"
+              summary: "Less than 20% free disk space on at least one instance"
+              runbook_url: "https://qonto.github.io/database-monitoring-framework/0.0.0/runbooks/rds/RDSLowDiskSpaceCount"
diff --git a/charts/prometheus-rds-alerts/values.yaml b/charts/prometheus-rds-alerts/values.yaml
index 22adfae..ebdba60 100644
--- a/charts/prometheus-rds-alerts/values.yaml
+++ b/charts/prometheus-rds-alerts/values.yaml
@@ -38,6 +38,15 @@ rules:
       summary: "{{ $labels.instance }} is reporting errors"
       description: "{{ $labels.instance }} is reporting {{ $value }} errors per minute"
 
+  RDSLowDiskSpaceCount:
+    expr: count(10 <= max by (aws_account_id, aws_region, dbidentifier) (rds_free_storage_bytes{} * 100 / rds_allocated_storage_bytes{}) < 20) by (aws_account_id,aws_region) >= 1
+    for: 15m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Less than 20% free disk space on at least one instance"
+      description: 'One or more RDS instances has <20% free disk space'
+
   RDSDiskSpaceLimit:
     expr: max by (aws_account_id, aws_region, dbidentifier) (rds_free_storage_bytes{} * 100 / rds_allocated_storage_bytes{}) < 10
     for: 15m
diff --git a/content/runbooks/rds/RDSLowDiskSpaceCount.md b/content/runbooks/rds/RDSLowDiskSpaceCount.md
new file mode 100644
index 0000000..b74f872
--- /dev/null
+++ b/content/runbooks/rds/RDSLowDiskSpaceCount.md
@@ -0,0 +1,23 @@
+---
+title: Free disk space is under 20 percent for at least one instance
+---
+
+# RDSLowDiskSpaceCount
+
+## Meaning
+
+Alert is triggered when at least one RDS instance is under the threshold on storage left.
+
+## Impact
+
+The PostgreSQL instance(s) might stop to prevent data corruption if no more disk space is available.
+
+## Diagnosis
+
+1. Find the list of affected instances in Prometheus with:
+
+    ```promql
+    max by (aws_account_id, aws_region, dbidentifier) (rds_free_storage_bytes{} * 100 / rds_allocated_storage_bytes{}) < 20
+    ```
+
+1. Refer to [RDSDiskSpaceLimit](RDSDiskSpaceLimit.md) for each of them as it's the same alert just firing a bit earlier.