Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CELEBORN-1817] add committed file size metrics #3047

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
306 changes: 300 additions & 6 deletions assets/grafana/celeborn-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -2951,11 +2951,305 @@
],
"title": "metrics_IsDecommissioningWorker_Value",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "size of partition files in bytes",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 110
},
"id": 235,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "metrics_PartitionFileSizeBytes_Mean{role=\"Worker\", instance=~\"${instance}\"}",
"hide": false,
"instant": false,
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
"title": "metrics_ PartitionFileSizeBytes_Mean",
"type": "timeseries"
}
],
"title": "Worker",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "p99 size of partition files in size",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 70
},
"id": 236,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "metrics_PartitionFileSizeBytes_P99{role=\"Worker\", instance=~\"${instance}\"}",
"hide": false,
"instant": false,
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
"title": "metrics_ PartitionFileSizeBytes_P99",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "max size of partition files in size",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 110
},
"id": 237,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "metrics_PartitionFileSizeBytes_max{role=\"Worker\", instance=~\"${instance}\"}",
"hide": false,
"instant": false,
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A"
}
],
"title": "metrics_ PartitionFileSizeBytes_MAX",
"type": "timeseries"
},
{
"collapsed": true,
"gridPos": {
Expand Down Expand Up @@ -5091,7 +5385,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -5191,7 +5485,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -5291,7 +5585,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -5390,7 +5684,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -5490,7 +5784,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down Expand Up @@ -5590,7 +5884,7 @@
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "__auto",
"legendFormat": "${baseLegend}",
"range": true,
"refId": "A",
"useBackend": false
Expand Down
1 change: 1 addition & 0 deletions docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ These metrics are exposed by Celeborn worker.
| PausePushDataAndReplicateTime | The time for a worker to stop receiving pushData from clients and other workers because of back pressure. |
| PausePushData | The count for a worker to stop receiving pushData from clients because of back pressure. |
| PausePushDataAndReplicate | The count for a worker to stop receiving pushData from clients and other workers because of back pressure. |
| PartitionFileSizeBytes | The size of the partition files committed by the current worker. |
| TakeBufferTime | The time for a worker to take out a buffer from a disk flusher. |
| FlushDataTime | The time for a worker to write a buffer which is 256KB by default to storage. |
| CommitFilesTime | The time for a worker to flush buffers and close files related to specified shuffle. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -549,8 +549,10 @@ protected synchronized long close(
}
}
if (diskFileInfo != null) {
source.updateHistogram(WorkerSource.PARTITION_FILE_SIZE(), diskFileInfo.getFileLength());
return diskFileInfo.getFileLength();
} else {
source.updateHistogram(WorkerSource.PARTITION_FILE_SIZE(), memoryFileInfo.getFileLength());
return memoryFileInfo.getFileLength();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ class WorkerSource(conf: CelebornConf) extends AbstractSource(conf, Role.WORKER)
addTimer(CLEAN_EXPIRED_SHUFFLE_KEYS_TIME)

addHistogram(FETCH_CHUNK_TRANSFER_SIZE)
addHistogram(PARTITION_FILE_SIZE)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a histogram, you can add the mean, max, 99th, and 95th values into the grafana dashboard.


def getCounterCount(metricsName: String): Long = {
val metricNameWithLabel = metricNameWithCustomizedLabels(metricsName, Map.empty)
Expand Down Expand Up @@ -214,6 +215,7 @@ object WorkerSource {
val DEVICE_OS_TOTAL_CAPACITY = "DeviceOSTotalBytes"
val DEVICE_CELEBORN_FREE_CAPACITY = "DeviceCelebornFreeBytes"
val DEVICE_CELEBORN_TOTAL_CAPACITY = "DeviceCelebornTotalBytes"
val PARTITION_FILE_SIZE = "PartitionFileSizeBytes"

// congestion control
val POTENTIAL_CONSUME_SPEED = "PotentialConsumeSpeed"
Expand Down
Loading