diff --git a/assets/grafana/celeborn-dashboard.json b/assets/grafana/celeborn-dashboard.json index fb1f005777c..a32e02befe2 100644 --- a/assets/grafana/celeborn-dashboard.json +++ b/assets/grafana/celeborn-dashboard.json @@ -2951,11 +2951,305 @@ ], "title": "metrics_IsDecommissioningWorker_Value", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "mean size of partition files in bytes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 110 + }, + "id": 235, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "metrics_PartitionFileSizeBytes_Mean{role=\"Worker\", instance=~\"${instance}\"}", + "hide": false, + "instant": false, + "legendFormat": "${baseLegend}", + "range": true, + "refId": "A" + } + ], + "title": "metrics_PartitionFileSizeBytes_Mean", + "type": 
"timeseries" } ], "title": "Worker", "type": "row" }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "p99 size of partition files in bytes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 70 + }, + "id": 236, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "metrics_PartitionFileSizeBytes_P99{role=\"Worker\", instance=~\"${instance}\"}", + "hide": false, + "instant": false, + "legendFormat": "${baseLegend}", + "range": true, + "refId": "A" + } + ], + "title": "metrics_PartitionFileSizeBytes_P99", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "max size of partition files in bytes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 110 + }, + "id": 237, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "metrics_PartitionFileSizeBytes_max{role=\"Worker\", instance=~\"${instance}\"}", + "hide": false, + "instant": false, + "legendFormat": "${baseLegend}", + "range": true, + "refId": "A" + } + ], + "title": "metrics_PartitionFileSizeBytes_MAX", + "type": "timeseries" + }, { "collapsed": true, "gridPos": { @@ -5091,7 +5385,7 @@ "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "__auto", + "legendFormat": "${baseLegend}", "range": true, "refId": "A", "useBackend": false @@ -5191,7 +5485,7 @@ "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "__auto", + "legendFormat": "${baseLegend}", "range": true, "refId": "A", "useBackend": false @@ -5291,7 +5585,7 @@ "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "__auto", + 
"legendFormat": "${baseLegend}", "range": true, "refId": "A", "useBackend": false @@ -5390,7 +5684,7 @@ "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "__auto", + "legendFormat": "${baseLegend}", "range": true, "refId": "A", "useBackend": false @@ -5490,7 +5784,7 @@ "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "__auto", + "legendFormat": "${baseLegend}", "range": true, "refId": "A", "useBackend": false @@ -5590,7 +5884,7 @@ "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, - "legendFormat": "__auto", + "legendFormat": "${baseLegend}", "range": true, "refId": "A", "useBackend": false diff --git a/docs/monitoring.md b/docs/monitoring.md index 55367291285..871e4e28bd4 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -206,6 +206,7 @@ These metrics are exposed by Celeborn worker. | PausePushDataAndReplicateTime | The time for a worker to stop receiving pushData from clients and other workers because of back pressure. | | PausePushData | The count for a worker to stop receiving pushData from clients because of back pressure. | | PausePushDataAndReplicate | The count for a worker to stop receiving pushData from clients and other workers because of back pressure. | + | PartitionFileSizeBytes | The size in bytes of partition files committed in the current worker. | | TakeBufferTime | The time for a worker to take out a buffer from a disk flusher. | | FlushDataTime | The time for a worker to write a buffer which is 256KB by default to storage. | | CommitFilesTime | The time for a worker to flush buffers and close files related to specified shuffle. 
| diff --git a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionDataWriter.java b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionDataWriter.java index c711752fbf7..8eae732a11c 100644 --- a/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionDataWriter.java +++ b/worker/src/main/java/org/apache/celeborn/service/deploy/worker/storage/PartitionDataWriter.java @@ -549,8 +549,10 @@ protected synchronized long close( } } if (diskFileInfo != null) { + source.updateHistogram(WorkerSource.PARTITION_FILE_SIZE(), diskFileInfo.getFileLength()); return diskFileInfo.getFileLength(); } else { + source.updateHistogram(WorkerSource.PARTITION_FILE_SIZE(), memoryFileInfo.getFileLength()); return memoryFileInfo.getFileLength(); } } diff --git a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala index 8275e3595b9..c2d64ef6c8d 100644 --- a/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala +++ b/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala @@ -84,6 +84,7 @@ class WorkerSource(conf: CelebornConf) extends AbstractSource(conf, Role.WORKER) addTimer(CLEAN_EXPIRED_SHUFFLE_KEYS_TIME) addHistogram(FETCH_CHUNK_TRANSFER_SIZE) + addHistogram(PARTITION_FILE_SIZE) def getCounterCount(metricsName: String): Long = { val metricNameWithLabel = metricNameWithCustomizedLabels(metricsName, Map.empty) @@ -214,6 +215,7 @@ object WorkerSource { val DEVICE_OS_TOTAL_CAPACITY = "DeviceOSTotalBytes" val DEVICE_CELEBORN_FREE_CAPACITY = "DeviceCelebornFreeBytes" val DEVICE_CELEBORN_TOTAL_CAPACITY = "DeviceCelebornTotalBytes" + val PARTITION_FILE_SIZE = "PartitionFileSizeBytes" // congestion control val POTENTIAL_CONSUME_SPEED = "PotentialConsumeSpeed"