Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add DWPD to Hardware Overview dashboard #621

Merged
merged 5 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -637,8 +637,8 @@
"overrides": []
},
"gridPos": {
"h": 12,
"w": 20,
"h": 13,
"w": 9,
"x": 0,
"y": 17
},
Expand Down Expand Up @@ -674,6 +674,95 @@
],
"title": "Disk Temperatures",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "The data written to the disk in the last 24h period divided by the physical capacity of the disk",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 13,
"w": 10,
"x": 9,
"y": 17
},
"id": 9,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
  "datasource": {
    "type": "prometheus",
    "uid": "${datasource}"
  },
  "editorMode": "code",
  "expr": "increase(nvme_data_units_written_total{instance=~\"$node\"}[24h])*512000 / nvme_physical_size_bytes{instance=~\"$node\"}",
  "legendFormat": "{{instance}} - {{device}}",
  "range": true,
  "refId": "A"
}
],
"title": "DWPD",
"type": "timeseries"
}
],
"refresh": false,
Expand Down
18 changes: 17 additions & 1 deletion etc/kayobe/kolla/config/prometheus/smart.rules
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,20 @@ groups:
summary: "SMART monitor reports bad disk on (instance {{ $labels.instance }})"
description: "{{ $labels.instance }} is reporting unhealthy for the disk at {{ $labels.disk }}. Disk serial number is: {{ $labels.serial_number }}"

{% endraw %}
# DWPD alerts: bytes written over the window (data units * 512000) divided by
# the drive's physical size, then by the number of days in the window, gives
# the average drive writes per day. increase() is used rather than delta()
# because nvme_data_units_written_total is a counter; increase() accounts for
# counter resets, whereas delta() is intended for gauges.
- alert: DWPDTooHigh
  # 30-day average above 1 DWPD.
  expr: (increase(nvme_data_units_written_total[30d])*512000 / nvme_physical_size_bytes) / 30 > 1
  labels:
    severity: alert
  annotations:
    summary: "High 30-Day Average DWPD for {{ $labels.instance }}"
    description: "The 30-Day average for Drive Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds 1 DWPD"

- alert: DWPDTooHighWarning
  # 7-day average above 1 DWPD; lower severity than the 30-day rule.
  expr: (increase(nvme_data_units_written_total[7d])*512000 / nvme_physical_size_bytes) / 7 > 1
  labels:
    severity: warning
  annotations:
    summary: "High 7-Day Average DWPD for {{ $labels.instance }}"
    description: "The 7-day average for Drive Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds 1 DWPD"

{% endraw %}
10 changes: 10 additions & 0 deletions releasenotes/notes/dwpd-6b9fb0c8d6d3a570.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
features:
- |
Adds a panel in the Hardware Overview dashboard to show DWPD (Drive Writes
Per Day) for NVMe drives. This is calculated by dividing the total bytes
written in the past 24 hours by the drive capacity. This is currently only
supported on NVMe drives.
- |
Adds a warning alert that fires when the 7-day average exceeds 1 DWPD, and
a higher-severity alert that fires when the 30-day average exceeds 1 DWPD.