diff --git a/config.yaml b/config.yaml index 60924863..a273a583 100644 --- a/config.yaml +++ b/config.yaml @@ -21,6 +21,12 @@ options: description: | Channel to install the DCGM snap if the hardware has NVIDIA GPU. By default, it will install from latest/stable + smartctl-exporter-snap-channel: + type: string + default: "latest/stable" + description: | + Channel to install the Smartctl exporter snap if the hardware has smart disk. By default, it will install + from latest/stable. exporter-log-level: type: string default: "INFO" diff --git a/src/charm.py b/src/charm.py index 62ea2a9a..11900a41 100755 --- a/src/charm.py +++ b/src/charm.py @@ -12,7 +12,7 @@ from ops.framework import EventBase, StoredState from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus -from hw_tools import HWTool, HWToolHelper, detect_available_tools +from hw_tools import HWTool, HWToolHelper, detect_available_tools, remove_legacy_smartctl_exporter from service import BaseExporter, DCGMExporter, ExporterError, HardwareExporter, SmartCtlExporter logger = logging.getLogger(__name__) @@ -81,10 +81,10 @@ def exporters(self) -> List[BaseExporter]: ) if stored_tools & SmartCtlExporter.hw_tools(): - exporters.append(SmartCtlExporter(self.charm_dir, self.model.config)) + exporters.append(SmartCtlExporter(self.model.config)) if stored_tools & DCGMExporter.hw_tools(): - exporters.append(DCGMExporter(self.model.config)) + exporters.append(DCGMExporter(self.charm_dir, self.model.config)) return exporters @@ -97,6 +97,8 @@ def get_stored_tools(self) -> Set[HWTool]: if not self._stored.stored_tools: # type: ignore[truthy-function] available_tools = detect_available_tools() # type: ignore[unreachable] self._stored.stored_tools = {tool.value for tool in available_tools} + if "smartctl" in self._stored.stored_tools: # type: ignore[operator] + self._stored.stored_tools.remove("smartctl") # type: ignore[attr-defined] return {HWTool(value) for value in self._stored.stored_tools} # type: 
ignore[attr-defined] def _on_redetect_hardware(self, event: ops.ActionEvent) -> None: @@ -130,6 +132,8 @@ def _on_install_or_upgrade(self, event: EventBase) -> None: """Install or upgrade charm.""" self.model.unit.status = MaintenanceStatus("Installing resources...") + remove_legacy_smartctl_exporter() + stored_tools = self.get_stored_tools() msg: str diff --git a/src/config.py b/src/config.py index 47bcd17b..a5fe935b 100644 --- a/src/config.py +++ b/src/config.py @@ -36,20 +36,6 @@ class HardwareExporterSettings(ExporterSettings): # pylint: disable = too-few-p HARDWARE_EXPORTER_SETTINGS = HardwareExporterSettings() -class SmartCtlExporterSettings(ExporterSettings): # pylint: disable = too-few-public-methods - """Constant settings for SmartCtl Exporter.""" - - name: str = "smartctl-exporter" - config_path: Path = Path(f"/etc/{name}-config.yaml") - service_path: Path = Path(f"/etc/systemd/system/{name}.service") - config_template: str = f"{name}-config.yaml.j2" - service_template: str = f"{name}.service.j2" - crash_msg: str = "SmartCtl exporter crashed unexpectedly, please refer to systemd logs..." - - -SMARTCTL_EXPORTER_SETTINGS = SmartCtlExporterSettings() - - class SystemVendor(str, Enum): """Different hardware system vendor.""" @@ -77,8 +63,7 @@ class HWTool(str, Enum): IPMI_SEL = "ipmi_sel" IPMI_SENSOR = "ipmi_sensor" REDFISH = "redfish" - SMARTCTL = "smartctl" - SMARTCTL_EXPORTER = "smartctl_exporter" + SMARTCTL_EXPORTER = "smartctl-exporter" DCGM = "dcgm" diff --git a/src/gpu_metrics/dcgm_metrics.csv b/src/gpu_metrics/dcgm_metrics.csv new file mode 100644 index 00000000..5fb0a4d8 --- /dev/null +++ b/src/gpu_metrics/dcgm_metrics.csv @@ -0,0 +1,160 @@ +############################################################################### +# [ WARNING ] +# Configuration file maintained by Juju. Local changes may be overwritten. 
+############################################################################### + +# Selected metrics for dcgm-exporter +# Default metric list https://github.com/NVIDIA/dcgm-exporter/blob/main/etc/default-counters.csv + +# Format +# If line starts with a '#' it is considered a comment +# Boolean values decode to - 1 = enabled 0 = disabled +# DCGM FIELD, Prometheus metric type, help message + + + + +# DEFAULT METRICS +# Clocks +DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). +DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). + +# Temperature +DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). +DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + +# Power +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). +DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). + +# PCIE +DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. +DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. +DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + +# Utilization (the sample period varies depending on the product) +DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). +DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). +DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). +DCGM_FI_DEV_DEC_UTIL, gauge, Decoder utilization (in %). + +# Errors and violations +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. + +# Memory usage +DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB). +DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB). 
+ +# NVLink +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes + +# VGPU License status +DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + +# Remapped rows +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + +# Static configuration information and features +DCGM_FI_DRIVER_VERSION, label, Driver Version + + + + +# CUSTOM METRICS +# Clocks +DCGM_FI_DEV_VIDEO_CLOCK, gauge, Video encoder/decoder clock (in MHz). + +# Temperature +DCGM_FI_DEV_FAN_SPEED, gauge, Fan speed (in 0-100%) + +# Power +DCGM_FI_DEV_POWER_USAGE_INSTANT, gauge, Current instantaneous power usage (in W). + +# Errors and violations +DCGM_FI_DEV_CLOCK_THROTTLE_REASONS, counter, Throttling reasons bitmask +DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). +DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). +DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). +DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). +DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). +DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). + +# Memory usage +DCGM_FI_DEV_FB_RESERVED, gauge, Frame buffer memory reserved (in MB). +DCGM_FI_DEV_FB_USED_PERCENT, gauge, Frame buffer percentage used (in 0-100%) - Used/(Total - Reserved) + +# ECC +DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. +DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. 
+DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. +DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + +# Retired pages +DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. +DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. +DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + +# NVLink +DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. +DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. +DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. +DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. + +# VGPU +DCGM_FI_DEV_VGPU_UTILIZATIONS, gauge, vGPUs utilization + +# Bar +DCGM_FI_DEV_BAR1_USED, gauge, Used BAR1 (in MB) +DCGM_FI_DEV_BAR1_FREE, gauge, Free BAR1 (in MB) + +# DCP metrics +DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. +DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. +DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. +DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. +DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active. +DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active. +DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active. +DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. 
+DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. + +# Static configuration information and features +DCGM_FI_NVML_VERSION, label, NVML Version +DCGM_FI_DEV_BRAND, label, Device Brand +DCGM_FI_DEV_SERIAL, label, Device Serial Number +DCGM_FI_DEV_NAME, label, Device Name +DCGM_FI_DEV_MINOR_NUMBER, label, Device node minor (/dev/nvidia#) +DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY, label, Cuda compute capability for the device (The major version is the upper 32 bits and the minor version is the lower 32 bits) +DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version +DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version +DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version +DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version +DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device + +DCGM_FI_DEV_COMPUTE_MODE, label, Compute mode +DCGM_FI_DEV_PERSISTENCE_MODE, label, Persistence mode (1 or 0) +DCGM_FI_DEV_CC_MODE, label, ConfidentialCompute/AmpereProtectedMemory status (1 or 0) +DCGM_FI_DEV_ECC_CURRENT, label, Current ECC mode +DCGM_FI_DEV_VIRTUAL_MODE, label, Virtualization mode +DCGM_FI_DEV_AUTOBOOST, label, Auto-boost enabled + +DCGM_FI_DEV_BAR1_TOTAL, label, Total BAR1 (in MB) + +DCGM_FI_DEV_MAX_SM_CLOCK, label, Maximum supported SM clock +DCGM_FI_DEV_MAX_MEM_CLOCK, label, Maximum supported Memory clock + +DCGM_FI_DEV_GPU_MAX_OP_TEMP, label, Maximum operating temperature +DCGM_FI_DEV_SLOWDOWN_TEMP, label, Slowdown temperature +DCGM_FI_DEV_SHUTDOWN_TEMP, label, Shutdown temperature + +DCGM_FI_DEV_POWER_MGMT_LIMIT, label, Current Power limit +DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN, label, Minimum Power limit +DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX, label, Maximum Power limit +DCGM_FI_DEV_ENFORCED_POWER_LIMIT, label, Effective Power limit that the driver enforces after taking into account all limiters + +DCGM_FI_DEV_FB_TOTAL, label, Total
Frame buffer (in MB) + +DCGM_FI_DEV_COUNT, label, Number of devices on the node diff --git a/src/grafana_dashboards/GPU.json b/src/grafana_dashboards/GPU.json new file mode 100644 index 00000000..74a2fbde --- /dev/null +++ b/src/grafana_dashboards/GPU.json @@ -0,0 +1,1046 @@ +{ + "annotations": { + "list": [ + { + "$$hashKey": "object:192", + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "This dashboard is to display the metrics for NVIDIA and AMD GPUs", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 12239, + "graphTooltip": 0, + "id": 228, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 18, + "x": 0, + "y": 0 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": 
true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"}", + "instant": false, + "interval": "", + "legendFormat": "NVIDIA GPU: {{gpu}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "label_replace(node_hwmon_temp_celsius{instance=~\"^$instance.*\", chip=~\"$amd_gpu\"} * on(chip) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}, \"gpu\", \"$1\", \"card\", \"card([0-9]+)\")\n", + "hide": false, + "interval": "", + "legendFormat": "AMD GPU: {{gpu}}", + "range": true, + "refId": "B" + } + ], + "title": "GPU Temperature", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 83 + }, + { + "color": "red", + "value": 87 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 14, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"})", + "interval": "", + "legendFormat": "NVIDIA GPUs", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr":
"avg(node_hwmon_temp_celsius{instance=~\"^$instance.*\", chip=~\"$amd_gpu\"})", + "hide": false, + "legendFormat": "AMD GPUs", + "range": true, + "refId": "B" + } + ], + "title": "GPU Avg. Temp", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 18, + "x": 0, + "y": 8 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"}", + "interval": "", + "legendFormat": "NVIDIA GPU: {{gpu}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "label_replace(node_hwmon_power_average_watt{agent_hostname=~\"$instance\", chip=~\"$amd_gpu\"} * on(chip) group_right() 
node_drm_card_info{chip=~\"$amd_gpu\"}, \"gpu\", \"$1\", \"card\", \"card([0-9]+)\")", + "hide": false, + "legendFormat": "AMD GPU: {{gpu}}", + "range": true, + "refId": "B" + } + ], + "title": "GPU Power Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 2400, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1800 + }, + { + "color": "red", + "value": 2200 + } + ] + }, + "unit": "watt" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "C" + }, + "properties": [ + { + "id": "displayName", + "value": "Total" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 16, + "links": [], + "options": { + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "sum" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"})", + "instant": true, + "interval": "", + "legendFormat": "NVIDIA GPUs", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node_hwmon_power_average_watt{instance=~\"^$instance.*\", chip=~\"$amd_gpu\"})", + "hide": false, + "instant": true, + "legendFormat": "AMD GPUs", + "range": false, + "refId": "B" + }, + { + "datasource": { + "name": "Expression", + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "($A+$B)", + "hide": false, + "refId": "C", + "type": "math" + } + ], + "title": "GPU Total Power", + "type": "gauge" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"}", + "interval": "", + "legendFormat": "NVIDIA GPU: {{gpu}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "label_replace(node_drm_gpu_busy_percent{instance=~\"^$instance.*\"} * on(card) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}, \"gpu\", \"$1\", \"card\", \"card([0-9]+)\")\n", + "hide": false, + "legendFormat": "AMD GPU: {{gpu}}", + "range": true, + "refId": "B" + } + ], + "title": "GPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_FAN_SPEED{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"}", + "interval": "", + "legendFormat": "NVIDIA GPU: {{gpu}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "label_replace(\n (\n node_hwmon_fan_rpm{instance=~\"^$instance.*\", chip=~\"$amd_gpu\"} * on(chip) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}\n ) /\n (\n node_hwmon_fan_max_rpm{instance=~\"^$instance.*\", chip=~\"$amd_gpu\"} * on(chip) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}\n ) * 100,\n \"gpu\", \"$1\", \"card\", \"card([0-9]+)\"\n)", + "hide": 
false, + "legendFormat": "AMD GPU: {{gpu}}", + "range": true, + "refId": "B" + } + ], + "title": "GPU Fan Speed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 2, + "interval": "", + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_MEM_CLOCK{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"} * 1000000", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "NVIDIA GPU: {{gpu}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "label_replace(node_hwmon_freq_freq_mhz{agent_hostname=~\"^$instance.*\", chip=~\"$amd_gpu\", sensor=\"sclk\"} * on(chip) group_right() 
node_drm_card_info{chip=~\"$amd_gpu\"} * 1000000, \"gpu\", \"$1\", \"card\", \"card([0-9]+)\")", + "hide": false, + "legendFormat": "AMD GPU: {{gpu}}", + "range": true, + "refId": "B" + } + ], + "title": "GPU Memory Clocks", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{Hostname=~\"$instance\", gpu=~\"$nvidia_gpu\"}", + "interval": "", + "legendFormat": "NVIDIA GPU: {{gpu}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "label_replace(\n (\n node_drm_memory_vram_used_bytes{instance=~\"^$instance.*\"} * 
on(card) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}\n ) \n / \n (\n node_drm_memory_vram_size_bytes{instance=~\"^$instance.*\"} * on(card) group_right() node_drm_card_info{chip=~\"$amd_gpu\"}\n ) * 100,\n \"gpu\", \"$1\", \"card\", \"card([0-9]+)\"\n)", + "hide": false, + "legendFormat": "AMD GPU: {{gpu}}", + "range": true, + "refId": "B" + } + ], + "title": "GPU Memory Utilization", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "juju_cos_f8e88307-f0df-40a4-88f1-745e6ab57e8e_prometheus_0", + "value": "juju_cos_f8e88307-f0df-40a4-88f1-745e6ab57e8e_prometheus_0" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "uid": "$datasource" + }, + "definition": "label_values(node_hwmon_chip_names,agent_hostname)", + "hide": 0, + "includeAll": true, + "label": "Host", + "multi": true, + "name": "instance", + "options": [], + "query": { + "query": "label_values(node_hwmon_chip_names,agent_hostname)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "uid": "$datasource" + }, + "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", + "hide": 2, + "includeAll": true, + "label": "NVIDIA GPU", + "multi": true, + "name": "nvidia_gpu", + "options": [], + "query": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + 
"sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "uid": "$datasource" + }, + "definition": "label_values(node_drm_card_info,chip)", + "hide": 2, + "includeAll": true, + "label": "AMD GPU", + "multi": true, + "name": "amd_gpu", + "options": [], + "query": { + "query": "label_values(node_drm_card_info,chip)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "GPU Dashboard", + "uid": "Oxed_c6Wzv", + "version": 1, + "weekStart": "" +} diff --git a/src/hw_tools.py b/src/hw_tools.py index 76e883a8..a3ec7bee 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -3,21 +3,19 @@ Define strategy for install, remove and verifier for different hardware. 
""" -import io import logging import os import shutil import stat import subprocess -import tarfile from abc import ABCMeta, abstractmethod -from http import HTTPStatus from pathlib import Path from typing import Dict, List, Set, Tuple import requests import urllib3 from charms.operator_libs_linux.v0 import apt +from charms.operator_libs_linux.v1 import systemd from charms.operator_libs_linux.v2 import snap from ops.model import ModelError, Resources @@ -63,14 +61,6 @@ def __init__(self, tool: HWTool, path: Path): self.message = f"Tool: {tool} path: {path} size is zero" -class ResourceInstallationError(Exception): - """Exception raised when a hardware tool installation fails.""" - - def __init__(self, tool: HWTool): - """Init.""" - super().__init__(f"Installation failed for tool: {tool}") - - def copy_to_snap_common_bin(source: Path, filename: str) -> None: """Copy file to $SNAP_COMMON/bin folder.""" Path(f"{SNAP_COMMON}/bin").mkdir(parents=False, exist_ok=True) @@ -239,6 +229,16 @@ def __init__(self, channel: str) -> None: self.channel = channel +class SmartCtlExporterStrategy(SnapStrategy): + """SmartCtl strategy class.""" + + _name = HWTool.SMARTCTL_EXPORTER + + def __init__(self, channel: str) -> None: + """Init.""" + self.channel = channel + + class StorCLIStrategy(TPRStrategyABC): """Strategy to install storcli.""" @@ -444,75 +444,6 @@ def check(self) -> bool: return True -class SmartCtlStrategy(APTStrategyABC): - """Strategy for installing ipmi.""" - - pkg = "smartmontools" - _name = HWTool.SMARTCTL - - def install(self) -> None: - apt_helpers.add_pkg_with_candidate_version(self.pkg) - - def remove(self) -> None: - # Skip removing because this may cause dependency error - # for other services on the same machine. 
- logger.info("%s skip removing %s", self._name, self.pkg) - - def check(self) -> bool: - """Check package status.""" - return check_deb_pkg_installed(self.pkg) - - -class SmartCtlExporterStrategy(StrategyABC): # pylint: disable=R0903 - """Install smartctl exporter binary.""" - - _name = HWTool.SMARTCTL_EXPORTER - - _resource_dir = Path("/opt/SmartCtlExporter/") - _release = ( - "https://github.com/prometheus-community/" - "smartctl_exporter/releases/download/v0.12.0/smartctl_exporter-0.12.0.linux-amd64.tar.gz" - ) - _exporter_name = "smartctl_exporter" - _exporter_path = Path(_resource_dir / "smartctl_exporter") - - def install(self) -> None: - """Install exporter binary from internet.""" - logger.debug("Installing SmartCtlExporter") - self._resource_dir.mkdir(parents=True, exist_ok=True) - - resp = requests.get(self._release, timeout=60) - if resp.status_code != HTTPStatus.OK: - logger.error("Failed to download smartctl exporter binary.") - raise ResourceInstallationError(self._name) - - success = False - fileobj = io.BytesIO(resp.content) - with tarfile.open(fileobj=fileobj, mode="r:gz") as tar: - for member in tar.getmembers(): - if member.name.endswith(self._exporter_name): - with open(self._exporter_path, "wb") as outfile: - member_file = tar.extractfile(member) - if member_file: - outfile.write(member_file.read()) - success = True - if success: - make_executable(self._exporter_path) - if not success: - logger.error("Failed to install SmartCtlExporter binary.") - raise ResourceInstallationError(self._name) - - def remove(self) -> None: - """Remove downloaded exporter binary.""" - logger.debug("Remove SmartCtlExporter") - shutil.rmtree(self._resource_dir) - - def check(self) -> bool: - """Check package status.""" - logger.debug("Check SmartCtlExporter resources") - return self._exporter_path.is_file() - - def _raid_hw_verifier_hwinfo() -> Set[HWTool]: """Verify if a supported RAID card exists on the machine using the hwinfo command.""" hwinfo_output = 
hwinfo("storage") @@ -650,7 +581,7 @@ def bmc_hw_verifier() -> Set[HWTool]: def disk_hw_verifier() -> Set[HWTool]: """Verify if the disk exists on the machine.""" - return {HWTool.SMARTCTL} if lshw(class_filter="disk") else set() + return {HWTool.SMARTCTL_EXPORTER} if lshw(class_filter="disk") else set() def nvidia_gpu_verifier() -> Set[HWTool]: @@ -664,6 +595,25 @@ def detect_available_tools() -> Set[HWTool]: return raid_hw_verifier() | bmc_hw_verifier() | disk_hw_verifier() | nvidia_gpu_verifier() +def remove_legacy_smartctl_exporter() -> None: + """Remove any legacy tool from older revision. + + Workaround for migrating legacy smartctl exporter to snap package. + """ + name = "smartctl-exporter" + smartctl_exporter = Path("/opt/SmartCtlExporter/") + smartctl_exporter_config_path = Path(f"/etc/{name}-config.yaml") + smartctl_exporter_service_path = Path(f"/etc/systemd/system/{name}.service") + if smartctl_exporter_service_path.exists(): + systemd.service_stop(name) + systemd.service_disable(name) + smartctl_exporter_service_path.unlink() + if smartctl_exporter_config_path.exists(): + smartctl_exporter_config_path.unlink() + if smartctl_exporter.exists(): + shutil.rmtree("/opt/SmartCtlExporter/") + + class HWToolHelper: """Helper to install vendor's or hardware related tools.""" @@ -680,7 +630,6 @@ def strategies(self) -> List[StrategyABC]: IPMIDCMIStrategy(), IPMISENSORStrategy(), RedFishStrategy(), - SmartCtlStrategy(), ] def fetch_tools( # pylint: disable=W0102 diff --git a/src/prometheus_alert_rules/dcgm.yaml b/src/prometheus_alert_rules/dcgm.yaml new file mode 100644 index 00000000..97e43cd2 --- /dev/null +++ b/src/prometheus_alert_rules/dcgm.yaml @@ -0,0 +1,93 @@ +# The alerts use DCGM_FI_DEV_CLOCK_THROTTLE_REASONS metric to detect throttling events on NVIDIA GPUs, +# which is a bitmask of throttle reasons found here: https://docs.nvidia.com/datacenter/dcgm/2.1/dcgm-api/group__dcgmFieldConstants.html.
+# The 8 least significant bits are used for the alerts, with each bit representing a different throttle reason. + +groups: +- name: NVIDIA DCGM Throttling Alerts + rules: + - alert: GPUPowerBrakeThrottle + # isolate the least significant 8 bits with % 256 + # check whether bit 7 (starts from bit 0) has been set with the >= bool 128 comparison + expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 256 >= 128 + for: 5m + labels: + severity: warning + annotations: + summary: GPU Hardware Power Brake Slowdown throttling detected. (instance {{ $labels.Hostname }}) + description: | + HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} + This is an indicator of: + - External Power Brake Assertion being triggered (e.g. by the system power supply) + LABELS = {{ $labels }} + - alert: GPUThermalHWThrottle + # isolate the least significant 7 bits with % 128 + # check whether bit 6 (starts from bit 0) has been set with the >= bool 64 comparison + expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 128 >= 64 + for: 5m + labels: + severity: warning + annotations: + summary: GPU Hardware Thermal throttling detected. (instance {{ $labels.Hostname }}) + description: | + HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} + This is an indicator of: + - Temperature being too high + LABELS = {{ $labels }} + - alert: GPUThermalSWThrottle + # isolate the least significant 6 bits with % 64 + # check whether bit 5 (starts from bit 0) has been set with the >= bool 32 comparison + expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 64 >= 32 + for: 5m + labels: + severity: warning + annotations: + summary: GPU Software Thermal throttling detected. 
(instance {{ $labels.Hostname }}) + description: | + SW Thermal Slowdown is engaged on NVIDIA GPU: {{ $labels.gpu }} + This is an indicator of: + - Current GPU temperature above the GPU Max Operating Temperature + - Current memory temperature above the Memory Max Operating Temperature + LABELS = {{ $labels }} + - alert: GPUSyncBoostThrottle + # isolate the least significant 5 bits with % 32 + # check whether bit 4 (starts from bit 0) has been set with the >= bool 16 comparison + expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 32 >= 16 + for: 5m + labels: + severity: warning + annotations: + summary: GPU Sync Boost throttling detected. (instance {{ $labels.Hostname }}) + description: | + This NVIDIA GPU: {{ $labels.gpu }} has been added to a Sync boost group with nvidia-smi or DCGM in order to maximize performance per watt. + All GPUs in the sync boost group will boost to the minimum possible clocks across the entire group. + Look at the throttle reasons for other GPUs in the system to see why those GPUs are holding this one at lower clocks. + LABELS = {{ $labels }} + - alert: GPUSlowdownThrottle + # isolate the least significant 4 bits with % 16 + # check whether bit 3 (starts from bit 0) has been set with the >= bool 8 comparison + expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 16 >= 8 + for: 5m + labels: + severity: warning + annotations: + summary: GPU Hardware Slowdown throttling detected. (instance {{ $labels.Hostname }}) + description: | + HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} + This is an indicator of: + - Temperature being too high + - External Power Brake Assertion is triggered (e.g. 
by the system power supply) + - Power draw is too high and Fast Trigger protection is reducing the clocks + - May be also reported during PState or clock change + LABELS = {{ $labels }} + - alert: GPUPowerThrottle + # isolate the least significant 3 bits with % 8 + # check whether bit 2 (starts from bit 0) has been set with the >= bool 4 comparison + expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 8 >= 4 + for: 5m + labels: + severity: warning + annotations: + summary: GPU Software Power throttling detected. (instance {{ $labels.Hostname }}) + description: | + SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: {{ $labels.gpu }} + LABELS = {{ $labels }} diff --git a/src/service.py b/src/service.py index 154ce865..d71f4e9e 100644 --- a/src/service.py +++ b/src/service.py @@ -1,6 +1,7 @@ """Exporter service helper.""" import os +import shutil from abc import ABC, abstractmethod from logging import getLogger from pathlib import Path @@ -17,7 +18,6 @@ from config import ( HARDWARE_EXPORTER_COLLECTOR_MAPPING, HARDWARE_EXPORTER_SETTINGS, - SMARTCTL_EXPORTER_SETTINGS, ExporterSettings, HWTool, ) @@ -309,67 +309,6 @@ def remove_file(path: Path) -> bool: return success -class SmartCtlExporter(RenderableExporter): - """A class representing the smartctl exporter and the metric endpoints.""" - - required_config: bool = False - - def __init__(self, charm_dir: Path, config: ConfigData) -> None: - """Initialize the Hardware Exporter class.""" - super().__init__(charm_dir, config, SMARTCTL_EXPORTER_SETTINGS) - - self.port = int(config["smartctl-exporter-port"]) - self.collect_timeout = int(config["collect-timeout"]) - self.log_level = str(config["exporter-log-level"]) - self.strategy = SmartCtlExporterStrategy() - - def render_service(self) -> bool: - """Render required files for service.""" - service_rendered = self._render_service( - { - "PORT": str(self.port), - "LEVEL": self.log_level, - } - ) - return service_rendered - - def configure(self) -> 
bool: - """Override base configure to render the service file. - - This is because smartctl_exporter doesn't support providing config file. - The config options need to be provided as flags while exectuting - smartctl_exporter. So, the service file must be re-rendered when a config - value is changed. - """ - service_rendered = self.render_service() - if service_rendered: - systemd.daemon_reload() - return service_rendered - - @staticmethod - def hw_tools() -> Set[HWTool]: - """Return hardware tools to watch.""" - return {HWTool.SMARTCTL} - - def install_resources(self) -> bool: - restart = False - if self.check_active(): - systemd.service_stop(self.exporter_name) - restart = True - self.strategy.install() - if restart: - systemd.service_restart(self.exporter_name) - logger.debug("Finish install resources for %s", self.exporter_name) - return True - - def resources_exist(self) -> bool: - return self.strategy.check() - - def remove_resources(self) -> bool: - self.strategy.remove() - return True - - class SnapExporter(BaseExporter): """A class representing a snap exporter.""" @@ -381,7 +320,11 @@ class SnapExporter(BaseExporter): def __init__(self, config: ConfigData): """Init.""" self.config = config - self.snap_client = snap.SnapCache()[self.strategy.snap] + + @property + def snap_client(self) -> snap.Snap: + """Return the snap client.""" + return snap.SnapCache()[self.strategy.snap] @staticmethod def hw_tools() -> Set[HWTool]: @@ -395,7 +338,6 @@ def install(self) -> bool: """ try: self.strategy.install() - # dcgm-exporter is disabled by default self.enable_and_start() return self.snap_client.present is True except Exception: # pylint: disable=broad-except @@ -429,6 +371,18 @@ def restart(self) -> None: """Restart the exporter daemon.""" self.snap_client.restart(reload=True) + def set(self, snap_config: dict) -> bool: + """Set config options for the snap service. + + Return true if successfully updated snap config, otherwise false. 
+ """ + try: + self.snap_client.set(snap_config, typed=True) + except snap.SnapError as err: + logger.error("Failed to update snap configs %s: %s", self.strategy.snap, err) + return False + return True + def check_health(self) -> bool: """Check if all services are active. @@ -449,18 +403,66 @@ class DCGMExporter(SnapExporter): exporter_name: str = "dcgm" port: int = 9400 + snap_common: Path = Path("/var/snap/dcgm/common/") + metric_config: str = "dcgm-exporter-metrics-file" - def __init__(self, config: ConfigData): + def __init__(self, charm_dir: Path, config: ConfigData): """Init.""" self.strategy = DCGMExporterStrategy(str(config["dcgm-snap-channel"])) + self.charm_dir = charm_dir + self.metrics_file = self.charm_dir / "src/gpu_metrics/dcgm_metrics.csv" + self.metric_config_value = self.metrics_file.name super().__init__(config) + def install(self) -> bool: + """Install the DCGM exporter and configure custom metrics.""" + if not super().install(): + return False + + logger.info("Creating a custom metrics file and configuring the DCGM snap to use it") + try: + shutil.copy(self.metrics_file, self.snap_common) + self.snap_client.set({self.metric_config: self.metric_config_value}) + self.snap_client.restart(reload=True) + except Exception as err: # pylint: disable=broad-except + logger.error("Failed to configure custom DCGM metrics: %s", err) + return False + + return True + @staticmethod def hw_tools() -> Set[HWTool]: """Return hardware tools to watch.""" return {HWTool.DCGM} +class SmartCtlExporter(SnapExporter): + """A class representing the smartctl exporter and the metric endpoints.""" + + exporter_name: str = "smartctl-exporter" + + def __init__(self, config: ConfigData) -> None: + """Initialize the SmartctlExporter class.""" + self.port = int(config["smartctl-exporter-port"]) + self.log_level = str(config["exporter-log-level"]) + self.strategy = SmartCtlExporterStrategy(str(config["smartctl-exporter-snap-channel"])) + super().__init__(config) + + @staticmethod 
+ def hw_tools() -> Set[HWTool]: + """Return hardware tools to watch.""" + return {HWTool.SMARTCTL_EXPORTER} + + def configure(self) -> bool: + """Set the necessary exporter configurations or change snap channel.""" + return super().configure() and self.set( + { + "log.level": self.log_level.lower(), + "web.listen-address": f":{self.port}", + } + ) + + class HardwareExporter(RenderableExporter): """A class representing the hardware exporter and the metric endpoints.""" diff --git a/templates/smartctl-exporter.service.j2 b/templates/smartctl-exporter.service.j2 deleted file mode 100644 index ef9fb374..00000000 --- a/templates/smartctl-exporter.service.j2 +++ /dev/null @@ -1,20 +0,0 @@ -[Unit] -Description=smartctl exporter service -After=network-online.target - -[Service] -Type=simple -PIDFile=/run/smartctl_exporter.pid -ExecStart=/opt/SmartCtlExporter/smartctl_exporter --web.listen-address=:{{ PORT }} -User=root -Group=root -SyslogIdentifier=smartctl_exporter -SyslogLevel={{ LEVEL }} -Restart=on-failure -RemainAfterExit=no -RestartSec=100ms -StandardOutput=journal -StandardError=journal - -[Install] -WantedBy=multi-user.target diff --git a/tests/unit/test_alert_rules/test_dcgm.yaml b/tests/unit/test_alert_rules/test_dcgm.yaml new file mode 100644 index 00000000..3e860486 --- /dev/null +++ b/tests/unit/test_alert_rules/test_dcgm.yaml @@ -0,0 +1,397 @@ +rule_files: + - ../../../src/prometheus_alert_rules/dcgm.yaml + +evaluation_interval: 1m + +tests: +# HW Power Brake Throttle active +- interval: 1m + input_series: + - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-0", gpu="0"}' + values: '128' + alert_rule_test: + - eval_time: 5m + alertname: GPUPowerBrakeThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-0 + gpu: 0 + severity: warning + exp_annotations: + summary: GPU Hardware Power Brake Slowdown throttling detected. 
(instance ubuntu-0) + description: | + HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0 + This is an indicator of: + - External Power Brake Assertion being triggered (e.g. by the system power supply) + LABELS = map[Hostname:ubuntu-0 gpu:0] + - eval_time: 5m + alertname: GPUThermalHWThrottle + exp_alerts: # alerts shouldn't fire since bit 6 isn't set + - eval_time: 5m + alertname: GPUThermalSWThrottle + exp_alerts: # alerts shouldn't fire since bit 5 isn't set + - eval_time: 5m + alertname: GPUSyncBoostThrottle + exp_alerts: # alerts shouldn't fire since bit 4 isn't set + - eval_time: 5m + alertname: GPUSlowdownThrottle + exp_alerts: # alerts shouldn't fire since bit 3 isn't set + - eval_time: 5m + alertname: GPUPowerThrottle + exp_alerts: # alerts shouldn't fire since bit 2 isn't set + +# HW Thermal Throttle active +- interval: 1m + input_series: + - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-0", gpu="1"}' + values: '64' + alert_rule_test: + - eval_time: 5m + alertname: GPUThermalHWThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-0 + gpu: 1 + severity: warning + exp_annotations: + summary: GPU Hardware Thermal throttling detected. 
(instance ubuntu-0) + description: | + HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 1 + This is an indicator of: + - Temperature being too high + LABELS = map[Hostname:ubuntu-0 gpu:1] + - eval_time: 5m + alertname: GPUPowerBrakeThrottle + exp_alerts: # alerts shouldn't fire since bit 7 isn't set + - eval_time: 5m + alertname: GPUThermalSWThrottle + exp_alerts: # alerts shouldn't fire since bit 5 isn't set + - eval_time: 5m + alertname: GPUSyncBoostThrottle + exp_alerts: # alerts shouldn't fire since bit 4 isn't set + - eval_time: 5m + alertname: GPUSlowdownThrottle + exp_alerts: # alerts shouldn't fire since bit 3 isn't set + - eval_time: 5m + alertname: GPUPowerThrottle + exp_alerts: # alerts shouldn't fire since bit 2 isn't set + +# SW Thermal Throttle active +- interval: 1m + input_series: + - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-1", gpu="0"}' + values: '32' + alert_rule_test: + - eval_time: 5m + alertname: GPUThermalSWThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-1 + gpu: 0 + severity: warning + exp_annotations: + summary: GPU Software Thermal throttling detected. 
(instance ubuntu-1) + description: | + SW Thermal Slowdown is engaged on NVIDIA GPU: 0 + This is an indicator of: + - Current GPU temperature above the GPU Max Operating Temperature + - Current memory temperature above the Memory Max Operating Temperature + LABELS = map[Hostname:ubuntu-1 gpu:0] + - eval_time: 5m + alertname: GPUPowerBrakeThrottle + exp_alerts: # alerts shouldn't fire since bit 7 isn't set + - eval_time: 5m + alertname: GPUThermalHWThrottle + exp_alerts: # alerts shouldn't fire since bit 6 isn't set + - eval_time: 5m + alertname: GPUSyncBoostThrottle + exp_alerts: # alerts shouldn't fire since bit 4 isn't set + - eval_time: 5m + alertname: GPUSlowdownThrottle + exp_alerts: # alerts shouldn't fire since bit 3 isn't set + - eval_time: 5m + alertname: GPUPowerThrottle + exp_alerts: # alerts shouldn't fire since bit 2 isn't set + +# Sync Boost Throttle active +- interval: 1m + input_series: + - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-1", gpu="1"}' + values: '16' + alert_rule_test: + - eval_time: 5m + alertname: GPUSyncBoostThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-1 + gpu: 1 + severity: warning + exp_annotations: + summary: GPU Sync Boost throttling detected. (instance ubuntu-1) + description: | + This NVIDIA GPU: 1 has been added to a Sync boost group with nvidia-smi or DCGM in order to maximize performance per watt. + All GPUs in the sync boost group will boost to the minimum possible clocks across the entire group. + Look at the throttle reasons for other GPUs in the system to see why those GPUs are holding this one at lower clocks. 
+ LABELS = map[Hostname:ubuntu-1 gpu:1] + - eval_time: 5m + alertname: GPUPowerBrakeThrottle + exp_alerts: # alerts shouldn't fire since bit 7 isn't set + - eval_time: 5m + alertname: GPUThermalHWThrottle + exp_alerts: # alerts shouldn't fire since bit 6 isn't set + - eval_time: 5m + alertname: GPUThermalSWThrottle + exp_alerts: # alerts shouldn't fire since bit 5 isn't set + - eval_time: 5m + alertname: GPUSlowdownThrottle + exp_alerts: # alerts shouldn't fire since bit 3 isn't set + - eval_time: 5m + alertname: GPUPowerThrottle + exp_alerts: # alerts shouldn't fire since bit 2 isn't set + +# HW Slowdown Throttle active +- interval: 1m + input_series: + - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-2", gpu="0"}' + values: '8' + alert_rule_test: + - eval_time: 5m + alertname: GPUSlowdownThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-2 + gpu: 0 + severity: warning + exp_annotations: + summary: GPU Hardware Slowdown throttling detected. (instance ubuntu-2) + description: | + HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0 + This is an indicator of: + - Temperature being too high + - External Power Brake Assertion is triggered (e.g. 
by the system power supply) + - Power draw is too high and Fast Trigger protection is reducing the clocks + - May be also reported during PState or clock change + LABELS = map[Hostname:ubuntu-2 gpu:0] + - eval_time: 5m + alertname: GPUPowerBrakeThrottle + exp_alerts: # alerts shouldn't fire since bit 7 isn't set + - eval_time: 5m + alertname: GPUThermalHWThrottle + exp_alerts: # alerts shouldn't fire since bit 6 isn't set + - eval_time: 5m + alertname: GPUThermalSWThrottle + exp_alerts: # alerts shouldn't fire since bit 5 isn't set + - eval_time: 5m + alertname: GPUSyncBoostThrottle + exp_alerts: # alerts shouldn't fire since bit 4 isn't set + - eval_time: 5m + alertname: GPUPowerThrottle + exp_alerts: # alerts shouldn't fire since bit 2 isn't set + +# SW Power Throttle active +- interval: 1m + input_series: + - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-2", gpu="1"}' + values: '4' + alert_rule_test: + - eval_time: 5m + alertname: GPUPowerThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-2 + gpu: 1 + severity: warning + exp_annotations: + summary: GPU Software Power throttling detected.
(instance ubuntu-2) + description: | + SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: 1 + LABELS = map[Hostname:ubuntu-2 gpu:1] + - eval_time: 5m + alertname: GPUPowerBrakeThrottle + exp_alerts: # alerts shouldn't fire since bit 7 isn't set + - eval_time: 5m + alertname: GPUThermalHWThrottle + exp_alerts: # alerts shouldn't fire since bit 6 isn't set + - eval_time: 5m + alertname: GPUThermalSWThrottle + exp_alerts: # alerts shouldn't fire since bit 5 isn't set + - eval_time: 5m + alertname: GPUSyncBoostThrottle + exp_alerts: # alerts shouldn't fire since bit 4 isn't set + - eval_time: 5m + alertname: GPUSlowdownThrottle + exp_alerts: # alerts shouldn't fire since bit 3 isn't set + +# No throttling +- interval: 1m + input_series: + - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-0", gpu="0"}' + values: '1' + alert_rule_test: + - eval_time: 5m + alertname: GPUPowerBrakeThrottle + exp_alerts: # alerts shouldn't fire since bit 7 isn't set + - eval_time: 5m + alertname: GPUThermalHWThrottle + exp_alerts: # alerts shouldn't fire since bit 6 isn't set + - eval_time: 5m + alertname: GPUThermalSWThrottle + exp_alerts: # alerts shouldn't fire since bit 5 isn't set + - eval_time: 5m + alertname: GPUSyncBoostThrottle + exp_alerts: # alerts shouldn't fire since bit 4 isn't set + - eval_time: 5m + alertname: GPUSlowdownThrottle + exp_alerts: # alerts shouldn't fire since bit 3 isn't set + - eval_time: 5m + alertname: GPUPowerThrottle + exp_alerts: # alerts shouldn't fire since bit 2 isn't set + +# All throttling reasons active +- interval: 1m + input_series: + - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-3", gpu="2"}' + values: '511' + alert_rule_test: + - eval_time: 5m + alertname: GPUPowerBrakeThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-3 + gpu: 2 + severity: warning + exp_annotations: + summary: GPU Hardware Power Brake Slowdown throttling detected. 
(instance ubuntu-3) + description: | + HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 2 + This is an indicator of: + - External Power Brake Assertion being triggered (e.g. by the system power supply) + LABELS = map[Hostname:ubuntu-3 gpu:2] + - eval_time: 5m + alertname: GPUThermalHWThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-3 + gpu: 2 + severity: warning + exp_annotations: + summary: GPU Hardware Thermal throttling detected. (instance ubuntu-3) + description: | + HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 2 + This is an indicator of: + - Temperature being too high + LABELS = map[Hostname:ubuntu-3 gpu:2] + - eval_time: 5m + alertname: GPUThermalSWThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-3 + gpu: 2 + severity: warning + exp_annotations: + summary: GPU Software Thermal throttling detected. (instance ubuntu-3) + description: | + SW Thermal Slowdown is engaged on NVIDIA GPU: 2 + This is an indicator of: + - Current GPU temperature above the GPU Max Operating Temperature + - Current memory temperature above the Memory Max Operating Temperature + LABELS = map[Hostname:ubuntu-3 gpu:2] + - eval_time: 5m + alertname: GPUSyncBoostThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-3 + gpu: 2 + severity: warning + exp_annotations: + summary: GPU Sync Boost throttling detected. (instance ubuntu-3) + description: | + This NVIDIA GPU: 2 has been added to a Sync boost group with nvidia-smi or DCGM in order to maximize performance per watt. + All GPUs in the sync boost group will boost to the minimum possible clocks across the entire group. + Look at the throttle reasons for other GPUs in the system to see why those GPUs are holding this one at lower clocks. 
+ LABELS = map[Hostname:ubuntu-3 gpu:2] + - eval_time: 5m + alertname: GPUSlowdownThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-3 + gpu: 2 + severity: warning + exp_annotations: + summary: GPU Hardware Slowdown throttling detected. (instance ubuntu-3) + description: | + HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 2 + This is an indicator of: + - Temperature being too high + - External Power Brake Assertion is triggered (e.g. by the system power supply) + - Power draw is too high and Fast Trigger protection is reducing the clocks + - May be also reported during PState or clock change + LABELS = map[Hostname:ubuntu-3 gpu:2] + - eval_time: 5m + alertname: GPUPowerThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-3 + gpu: 2 + severity: warning + exp_annotations: + summary: GPU Software Power throttling detected. (instance ubuntu-3) + description: | + SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: 2 + LABELS = map[Hostname:ubuntu-3 gpu:2] + +# Multiple throttling reasons +- interval: 1m + input_series: + - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-0", gpu="0"}' + values: '196' + alert_rule_test: + - eval_time: 5m + alertname: GPUPowerBrakeThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-0 + gpu: 0 + severity: warning + exp_annotations: + summary: GPU Hardware Power Brake Slowdown throttling detected. (instance ubuntu-0) + description: | + HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0 + This is an indicator of: + - External Power Brake Assertion being triggered (e.g. by the system power supply) + LABELS = map[Hostname:ubuntu-0 gpu:0] + - eval_time: 5m + alertname: GPUThermalHWThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-0 + gpu: 0 + severity: warning + exp_annotations: + summary: GPU Hardware Thermal throttling detected. 
(instance ubuntu-0) + description: | + HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0 + This is an indicator of: + - Temperature being too high + LABELS = map[Hostname:ubuntu-0 gpu:0] + - eval_time: 5m + alertname: GPUPowerThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-0 + gpu: 0 + severity: warning + exp_annotations: + summary: GPU Software Power throttling detected. (instance ubuntu-0) + description: | + SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: 0 + LABELS = map[Hostname:ubuntu-0 gpu:0] + - eval_time: 5m + alertname: GPUThermalSWThrottle + exp_alerts: # alerts shouldn't fire since bit 5 isn't set + - eval_time: 5m + alertname: GPUSyncBoostThrottle + exp_alerts: # alerts shouldn't fire since bit 4 isn't set + - eval_time: 5m + alertname: GPUSlowdownThrottle + exp_alerts: # alerts shouldn't fire since bit 3 isn't set diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 68bc3890..bb643713 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -15,13 +15,7 @@ import charm from charm import ExporterError, HardwareObserverCharm from config import HWTool -from service import ( - HARDWARE_EXPORTER_SETTINGS, - SMARTCTL_EXPORTER_SETTINGS, - DCGMExporter, - HardwareExporter, - SmartCtlExporter, -) +from service import HARDWARE_EXPORTER_SETTINGS, DCGMExporter, HardwareExporter, SmartCtlExporter class TestCharm(unittest.TestCase): @@ -72,12 +66,12 @@ def test_harness(self) -> None: ), ( "Enable two exporters", - {HWTool.IPMI_SEL, HWTool.SMARTCTL}, + {HWTool.IPMI_SEL, HWTool.SMARTCTL_EXPORTER}, {"hardware-exporter", "smartctl-exporter"}, ), ( "Enable all exporters", - {HWTool.IPMI_SEL, HWTool.SMARTCTL, HWTool.DCGM}, + {HWTool.IPMI_SEL, HWTool.SMARTCTL_EXPORTER, HWTool.DCGM}, {"hardware-exporter", "smartctl-exporter", "dcgm"}, ), ] @@ -98,7 +92,7 @@ def test_exporters( mock_hw_exporter.return_value = hw_exporter smart_exporter = 
mock.MagicMock() - smart_exporter.exporter_name = SMARTCTL_EXPORTER_SETTINGS.name + smart_exporter.exporter_name = SmartCtlExporter.exporter_name mock_smart_exporter.hw_tools.return_value = SmartCtlExporter.hw_tools() mock_smart_exporter.return_value = smart_exporter @@ -117,7 +111,7 @@ def test_exporters( ( "happy case", "install", - {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL}, + {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL_EXPORTER}, (True, ""), [mock.MagicMock(), mock.MagicMock()], [True, True], @@ -125,7 +119,7 @@ def test_exporters( ( "happy case", "upgrade", - {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL}, + {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL_EXPORTER}, (True, ""), [mock.MagicMock(), mock.MagicMock()], [True, True], @@ -133,7 +127,7 @@ def test_exporters( ( "missing resource", "install", - {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL}, + {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL_EXPORTER}, (False, "miss something"), [mock.MagicMock(), mock.MagicMock()], [True, True], @@ -141,7 +135,7 @@ def test_exporters( ( "missing resource", "upgrade", - {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL}, + {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL_EXPORTER}, (False, "miss something"), [mock.MagicMock(), mock.MagicMock()], [True, True], @@ -149,7 +143,7 @@ def test_exporters( ( "Exporter install fail", "install", - {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL}, + {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL_EXPORTER}, (True, ""), [mock.MagicMock(), mock.MagicMock()], [False, True], @@ -157,7 +151,7 @@ def test_exporters( ( "Exporter install fail", "upgrade", - {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL}, + {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL_EXPORTER}, (True, ""), [mock.MagicMock(), mock.MagicMock()], [False, True], @@ -227,14 +221,14 @@ def test_remove(self): self.harness.charm.get_stored_tools.return_value = { HWTool.IPMI_SENSOR, 
HWTool.IPMI_SEL, - HWTool.SMARTCTL, + HWTool.SMARTCTL_EXPORTER, } self.harness.charm.on.remove.emit() self.harness.charm.hw_tool_helper.remove.assert_called_with( self.harness.charm.model.resources, - {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL}, + {HWTool.IPMI_SENSOR, HWTool.IPMI_SEL, HWTool.SMARTCTL_EXPORTER}, ) for mock_exporter in mock_exporters: mock_exporter.uninstall.assert_called() @@ -792,3 +786,8 @@ def test_validate_configs( self.harness.begin() result = self.harness.charm.validate_configs() self.assertEqual(result, expect) + + def test_get_stored_tools_remove_legacy_smartctl(self): + self.harness.begin() + self.harness.charm._stored.stored_tools = {"smartctl"} + assert self.harness.charm.get_stored_tools() == set() diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index 4e51fba8..0def3264 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -1,8 +1,6 @@ import stat import subprocess -import tempfile import unittest -from http import HTTPStatus from pathlib import Path from unittest import mock @@ -31,11 +29,8 @@ PercCLIStrategy, ResourceChecksumError, ResourceFileSizeZeroError, - ResourceInstallationError, SAS2IRCUStrategy, SAS3IRCUStrategy, - SmartCtlExporterStrategy, - SmartCtlStrategy, SnapStrategy, SSACLIStrategy, StorCLIStrategy, @@ -55,6 +50,7 @@ raid_hw_verifier, redfish_available, remove_deb, + remove_legacy_smartctl_exporter, symlink, ) from keys import HP_KEYS @@ -729,126 +725,6 @@ def test_remove(self, mock_apt): mock_apt.remove_package.assert_not_called() -class TestSmartCtlStrategy(unittest.TestCase): - @mock.patch("apt_helpers.get_candidate_version") - @mock.patch("apt_helpers.apt") - def test_install(self, mock_apt, mock_candidate_version): - strategy = SmartCtlStrategy() - mock_candidate_version.return_value = "some-candidate-version" - strategy.install() - - mock_apt.add_package.assert_called_with( - "smartmontools", version="some-candidate-version", update_cache=False - ) - - 
@mock.patch("hw_tools.apt") - def test_remove(self, mock_apt): - strategy = SmartCtlStrategy() - strategy.remove() - - mock_apt.remove_package.assert_not_called() - - @mock.patch("hw_tools.check_deb_pkg_installed") - def test_check(self, mock_check_deb_method): - strategy = SmartCtlStrategy() - strategy.check() - - mock_check_deb_method.assert_called_with("smartmontools") - - -class TestSmartCtlExporterStrategy(unittest.TestCase): - def setUp(self): - self.temp_dir = tempfile.TemporaryDirectory() - self.tmp_path = Path(self.temp_dir.name) - - def tearDown(self): - self.temp_dir.cleanup() - - @mock.patch("hw_tools.requests.get") - @mock.patch("hw_tools.tarfile.open") - @mock.patch("hw_tools.make_executable") - def test_install_success( - self, - mock_make_executable, - mock_tar_open, - mock_requests_get, - ): - strategy = SmartCtlExporterStrategy() - strategy._resource_dir = self.tmp_path - strategy._exporter_path = self.tmp_path / "smartctl_exporter" - - mock_response = mock.MagicMock(status_code=HTTPStatus.OK) - mock_response.content = b"dummy content" - mock_requests_get.return_value = mock_response - mock_member = mock.MagicMock(name="member") - mock_member.name = "smartctl_exporter" - mock_member_file = mock.MagicMock() - mock_member_file.read.return_value = b"dummy content" - mock_tar_open.return_value.__enter__.return_value.getmembers.return_value = [mock_member] - mock_tar_open.return_value.__enter__.return_value.extractfile.return_value = ( - mock_member_file # noqa: E501 - ) - - strategy.install() - - mock_requests_get.assert_called_with(strategy._release, timeout=60) - # mock_tar_open.assert_called_with(fileobj=BytesIO(b"dummy content"), mode="r:gz") - mock_make_executable.assert_called_with(strategy._exporter_path) - self.assertTrue(strategy._resource_dir.exists()) - - @mock.patch("hw_tools.requests.get") - def test_install_download_failure(self, mock_requests_get): - strategy = SmartCtlExporterStrategy() - strategy._resource_dir = self.tmp_path - 
strategy._exporter_path = self.tmp_path / "smartctl_exporter" - - mock_response = mock.MagicMock(status_code=HTTPStatus.NOT_FOUND) - mock_requests_get.return_value = mock_response - - with self.assertRaises(ResourceInstallationError): - strategy.install() - - @mock.patch("hw_tools.requests.get") - @mock.patch("hw_tools.tarfile.open") - def test_install_parse_failure(self, mock_tar_open, mock_requests_get): - strategy = SmartCtlExporterStrategy() - strategy._resource_dir = self.tmp_path - strategy._exporter_path = self.tmp_path / "smartctl_exporter" - - mock_response = mock.MagicMock(status_code=HTTPStatus.OK) - mock_response.content = b"dummy content" - mock_requests_get.return_value = mock_response - mock_member = mock.MagicMock(name="member") - mock_member.name = "random name" - mock_member_file = mock.MagicMock() - mock_member_file.read.return_value = b"dummy content" - mock_tar_open.return_value.__enter__.return_value.getmembers.return_value = [mock_member] - mock_tar_open.return_value.__enter__.return_value.extractfile.return_value = ( - mock_member_file # noqa: E501 - ) - - with self.assertRaises(ResourceInstallationError): - strategy.install() - - @mock.patch("hw_tools.shutil.rmtree") - def test_remove(self, mock_shutil_rmtree): - strategy = SmartCtlExporterStrategy() - - strategy.remove() - - mock_shutil_rmtree.assert_called_with(strategy._resource_dir) - - def test_check(self): - strategy = SmartCtlExporterStrategy() - strategy._exporter_path = mock.MagicMock() - strategy._exporter_path.is_file.return_value = True - - result = strategy.check() - self.assertTrue(result) - - strategy._exporter_path.is_file.assert_called() - - @mock.patch("hw_tools.disk_hw_verifier", return_value={7, 8, 9}) @mock.patch("hw_tools.bmc_hw_verifier", return_value={1, 2, 3}) @mock.patch("hw_tools.raid_hw_verifier", return_value={4, 5, 6}) @@ -982,7 +858,7 @@ class TestDiskHWVerifier(unittest.TestCase): @mock.patch("hw_tools.lshw", return_value=[True]) def test_disk_available(self, 
mock_lshw): tools = disk_hw_verifier() - self.assertEqual(tools, {HWTool.SMARTCTL}) + self.assertEqual(tools, {HWTool.SMARTCTL_EXPORTER}) @mock.patch("hw_tools.lshw", return_value=[]) def test_disk_not_available(self, mock_lshw): @@ -1265,3 +1141,36 @@ def test_snap_strategy_check(snap_exporter, mock_snap_lib, services, expected): mock_snap_lib.SnapCache.return_value = {"my-snap": mock_snap_client} assert snap_exporter.check() is expected + + +@mock.patch("hw_tools.Path.unlink") +@mock.patch("hw_tools.Path.exists") +@mock.patch("hw_tools.shutil") +@mock.patch("hw_tools.systemd") +def test_remove_legacy_smartctl_exporter_exist( + mock_systemd, mock_shutil, mock_path_exists, mock_path_unlink +): + mock_path_exists.return_value = True + remove_legacy_smartctl_exporter() + + mock_systemd.service_stop.assert_called_once() + mock_systemd.service_disable.assert_called_once() + mock_path_unlink.assert_called() + assert mock_path_unlink.call_count == 2 + mock_shutil.rmtree.assert_called_once() + + +@mock.patch("hw_tools.Path.unlink") +@mock.patch("hw_tools.Path.exists") +@mock.patch("hw_tools.shutil") +@mock.patch("hw_tools.systemd") +def test_remove_legacy_smartctl_exporter_not_exists( + mock_systemd, mock_shutil, mock_path_exists, mock_path_unlink +): + mock_path_exists.return_value = False + remove_legacy_smartctl_exporter() + + mock_systemd.service_stop.assert_not_called() + mock_systemd.service_disable.assert_not_called() + mock_path_unlink.assert_not_called() + mock_shutil.rmtree.assert_not_called() diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index 312afb46..a0fb8835 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -8,6 +8,7 @@ import pytest import yaml +from charms.operator_libs_linux.v2 import snap from parameterized import parameterized from redfish.rest.v1 import InvalidCredentialsError @@ -714,103 +715,65 @@ def test_hw_tools(self): ) -class TestSmartMetricExporter(unittest.TestCase): - """Test 
SmartCtlExporter's methods.""" +class TestDCGMSnapExporter(unittest.TestCase): + """Test DCGM Snap exporter's methods.""" def setUp(self) -> None: """Set up harness for each test case.""" - systemd_lib_patcher = mock.patch.object(service, "systemd") - self.mock_systemd = systemd_lib_patcher.start() - self.addCleanup(systemd_lib_patcher.stop) + snap_lib_patcher = mock.patch.object(service, "snap") + shutil_lib_patcher = mock.patch.object(service, "shutil") + self.mock_snap = snap_lib_patcher.start() + self.mock_shutil = shutil_lib_patcher.start() + self.addCleanup(snap_lib_patcher.stop) + self.addCleanup(shutil_lib_patcher.stop) search_path = pathlib.Path(f"{__file__}/../../..").resolve() self.mock_config = { - "smartctl-exporter-port": 10201, - "collect-timeout": 10, - "exporter-log-level": "INFO", + "dcgm-snap-channel": "latest/stable", } - self.exporter = service.SmartCtlExporter(search_path, self.mock_config) - - def test_render_service(self): - """Test render service.""" - self.exporter._render_service = mock.MagicMock() - self.exporter._render_service.return_value = "some result" - - result = self.exporter.render_service() - self.assertEqual(result, "some result") - self.exporter._render_service.assert_called_with( - { - "PORT": str(self.exporter.port), - "LEVEL": self.exporter.log_level, - } - ) - - @parameterized.expand( - [ - (True,), - (False,), - ] - ) - def test_set_config(self, service_render_success): - """Test render config.""" - self.exporter.render_service = mock.MagicMock() - self.exporter.render_service.return_value = service_render_success + self.exporter = service.DCGMExporter(search_path, self.mock_config) + self.exporter.strategy = mock.MagicMock() - result = self.exporter.configure() - self.assertEqual(result, service_render_success) + def test_exporter_name(self): + self.assertEqual(self.exporter.exporter_name, "dcgm") def test_hw_tools(self): - self.assertEqual(self.exporter.hw_tools(), {HWTool.SMARTCTL}) + 
self.assertEqual(self.exporter.hw_tools(), {HWTool.DCGM}) - @mock.patch("service.systemd", return_value=mock.MagicMock()) - def test_install_resource_restart(self, mock_systemd): - self.exporter.strategy = mock.MagicMock() - self.exporter.check_active = mock.MagicMock() - self.exporter.check_active.return_value = True + def test_install_failed(self): + self.exporter.snap_client.present = False - self.exporter.install_resources() + exporter_install_ok = self.exporter.install() self.exporter.strategy.install.assert_called() - self.exporter.check_active.assert_called() - mock_systemd.service_stop.assert_called_with(self.exporter.exporter_name) - mock_systemd.service_restart.assert_called_with(self.exporter.exporter_name) + self.mock_shutil.copy.assert_not_called() + self.assertFalse(exporter_install_ok) - @mock.patch("service.systemd", return_value=mock.MagicMock()) - def test_install_resource_no_restart(self, mock_systemd): - self.exporter.strategy = mock.MagicMock() - self.exporter.check_active = mock.MagicMock() - self.exporter.check_active.return_value = False + def test_install_success(self): + self.exporter.snap_client.present = True - self.exporter.install_resources() + exporter_install_ok = self.exporter.install() self.exporter.strategy.install.assert_called() - self.exporter.check_active.assert_called() - mock_systemd.service_stop.assert_not_called() - mock_systemd.service_restart.assert_not_called() - - def test_resource_exists(self): - self.exporter.strategy = mock.MagicMock() - - self.exporter.resources_exist() - self.exporter.strategy.check.assert_called() - - def test_resources_exist(self): - self.exporter.strategy = mock.MagicMock() - self.exporter.strategy.check.return_value = "some result" - - result = self.exporter.resources_exist() - - self.assertEqual(result, "some result") - self.exporter.strategy.check.assert_called() + self.mock_shutil.copy.assert_called_with( + self.exporter.metrics_file, self.exporter.snap_common + ) + 
self.exporter.snap_client.set.assert_called_with( + {self.exporter.metric_config: self.exporter.metric_config_value} + ) + self.exporter.snap_client.restart.assert_called_with(reload=True) + self.assertTrue(exporter_install_ok) - def test_resource_remove(self): - self.exporter.strategy = mock.MagicMock() + def test_install_metrics_copy_fail(self): + self.exporter.snap_client.present = True + self.mock_shutil.copy.side_effect = FileNotFoundError - result = self.exporter.remove_resources() - self.assertEqual(result, True) + exporter_install_ok = self.exporter.install() - self.exporter.strategy.remove.assert_called() + self.exporter.strategy.install.assert_called() + self.exporter.snap_client.restart.assert_not_called() + self.assertFalse(exporter_install_ok) class TestWriteToFile(unittest.TestCase): @@ -957,6 +920,19 @@ def test_snap_exporter_restart(snap_exporter): snap_exporter.snap_client.restart.assert_called_once_with(reload=True) +def test_snap_exporter_set(snap_exporter): + snap_config = {} + assert snap_exporter.set(snap_config) is True + snap_exporter.snap_client.set.assert_called_once_with(snap_config, typed=True) + + +def test_snap_exporter_set_failed(snap_exporter): + snap_config = {} + snap_exporter.snap_client.set.side_effect = snap.SnapError() + assert snap_exporter.set(snap_config) is False + snap_exporter.snap_client.set.assert_called_once_with(snap_config, typed=True) + + def test_snap_exporter_check_health(snap_exporter): snap_exporter.check_health() snap_exporter.strategy.check.assert_called_once() @@ -971,14 +947,22 @@ def test_snap_exporter_configure(mock_install, snap_exporter, install_result, ex mock_install.assert_called_once() -def test_dcgm_exporter(): +@pytest.mark.parametrize("result, expected_result", [(True, True), (False, False)]) +@mock.patch("service.SnapExporter.install") +@mock.patch("service.SnapExporter.set") +def test_smartctl_exporter_configure(mock_set, mock_install, result, expected_result): mock_config = { - 
"dcgm-snap-channel": "latest/stable", + "smartctl-exporter-port": "10000", + "exporter-log-level": "info", + "smartctl-exporter-snap-channel": "latest/stable", } - exporter = service.DCGMExporter(mock_config) - assert exporter.exporter_name == "dcgm" - assert exporter.hw_tools() == {HWTool.DCGM} + mock_set.return_value = result + mock_install.return_value = result + exporter = service.SmartCtlExporter(mock_config) + assert exporter.exporter_name == "smartctl-exporter" + assert exporter.hw_tools() == {HWTool.SMARTCTL_EXPORTER} + assert exporter.configure() is expected_result if __name__ == "__main__":