From 3997898900c3ac58f93faf5e914860229c212741 Mon Sep 17 00:00:00 2001
From: Vitor Guidi <vitorguidi@gmail.com>
Date: Tue, 17 Dec 2024 00:21:48 -0300
Subject: [PATCH 1/6] Revert #4499 (#4512)

There is a clear way to partition ErrorType enums in the criteria
proposed, reverting this one.

Part of #4271
---
 .../_internal/bot/tasks/utasks/__init__.py    | 64 ++-----------------
 1 file changed, 6 insertions(+), 58 deletions(-)

diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
index 1dea6b20ed5..c9879799962 100644
--- a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
+++ b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
@@ -84,50 +84,6 @@ def __init__(self, subtask: _Subtask):
     self._subtask = subtask
     self._labels = None
     self.utask_main_failure = None
-    self._utask_success_conditions = [
-        uworker_msg_pb2.ErrorType.NO_ERROR,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.MINIMIZE_UNREPRODUCIBLE_CRASH,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED,  # pylint: disable=no-member
-    ]
-    self._utask_maybe_retry_conditions = [
-        uworker_msg_pb2.ErrorType.ANALYZE_BUILD_SETUP,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISIONS_LIST,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.TESTCASE_SETUP,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.MINIMIZE_SETUP,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.FUZZ_DATA_BUNDLE_SETUP_FAILURE,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZ_TARGET_SELECTED,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.PROGRESSION_NO_CRASH,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.PROGRESSION_TIMEOUT,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_SETUP_ERROR,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.REGRESSION_BUILD_SETUP_ERROR,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.REGRESSION_TIMEOUT_ERROR,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.SYMBOLIZE_BUILD_SETUP_ERROR,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED_IN_MAIN_FILE_PHASE,  # pylint: disable=no-member
-    ]
-    self._utask_failure_conditions = [
-        uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISION_INDEX,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.UNHANDLED,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.VARIANT_BUILD_SETUP,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.FUZZ_BUILD_SETUP_FAILURE,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZER,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.PROGRESSION_REVISION_LIST_ERROR,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_NOT_FOUND,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.PROGRESSION_BAD_BUILD,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.REGRESSION_REVISION_LIST_ERROR,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.REGRESSION_BUILD_NOT_FOUND,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.REGRESSION_BAD_BUILD_ERROR,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_FAILED,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.CORPUS_PRUNING_FUZZER_SETUP_FAILED,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.CORPUS_PRUNING_ERROR,  # pylint: disable=no-member
-        uworker_msg_pb2.ErrorType.FUZZ_BAD_BUILD,  # pylint: disable=no-member
-    ]
 
     if subtask == _Subtask.PREPROCESS:
       self._preprocess_start_time_ns = self.start_time_ns
@@ -169,18 +125,6 @@ def set_task_details(self,
       # Ensure we always have a value after this method returns.
       assert self._preprocess_start_time_ns is not None
 
-  def _infer_uworker_main_outcome(self, exc_type, uworker_error):
-    '''Infers, on a best effort basis, whether an uworker output implies
-      success or failure. If an unequivocal response is not possible,
-      classifies as maybe_retry.'''
-    if exc_type or uworker_error in self._utask_failure_conditions:
-      outcome = 'error'
-    elif uworker_error in self._utask_maybe_retry_conditions:
-      outcome = 'maybe_retry'
-    else:
-      outcome = 'success'
-    return outcome
-
   def __exit__(self, _exc_type, _exc_value, _traceback):
     # Ignore exception details, let Python continue unwinding the stack.
 
@@ -201,8 +145,7 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
     # The only case where a task might fail without throwing, is in
     # utask_main, by returning an ErrorType proto which indicates
     # failure.
-    outcome = self._infer_uworker_main_outcome(_exc_type,
-                                               self.utask_main_failure)
+    outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
     monitoring_metrics.TASK_OUTCOME_COUNT.increment({
         **self._labels, 'outcome': outcome
     })
@@ -223,6 +166,11 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
     monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
         trimmed_labels)
 
+    if error_condition != 'UNHANDLED_EXCEPTION':
+      task = self._labels['task']
+      subtask = self._labels['subtask']
+      logs.info(f'Task {task}, at subtask {subtask}, finished successfully.')
+
 
 def ensure_uworker_env_type_safety(uworker_env):
   """Converts all values in |uworker_env| to str types.

From b16161ea74a9c289bc13fcf91eb4243be74ebe99 Mon Sep 17 00:00:00 2001
From: Vitor Guidi <vitorguidi@gmail.com>
Date: Mon, 23 Dec 2024 08:34:15 -0300
Subject: [PATCH 2/6] [Monitoring] Partition UTask outcomes correctly into
 success and error (#4516)

### Motivation

#4458 implemented an error rate for utasks, only considering exceptions.
In #4499 , outcomes were split between success, failure and maybe_retry
conditions. There we learned that the volume of retryable outcomes is
negligible, so it makes sense to count them as failures.

Listing out all the success conditions under _MetricRecorder is not
desirable. However, we are consciously taking this technical debt so we
can deliver #4271 .

A refactor of uworker main will be later performed, so we can split the
success and failure conditions, both of which are mixed in
uworker_output.ErrorType.

Reference for tech debt acknowledgement: #4517
---
 .../_internal/bot/tasks/utasks/__init__.py    | 31 +++++++++++++------
 .../_internal/metrics/monitoring_metrics.py   |  4 +--
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
index c9879799962..b8d8f0f6634 100644
--- a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
+++ b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
@@ -84,6 +84,17 @@ def __init__(self, subtask: _Subtask):
     self._subtask = subtask
     self._labels = None
     self.utask_main_failure = None
+    self._utask_success_conditions = [
+        None,  # This can be a successful return value in, ie, fuzz task
+        uworker_msg_pb2.ErrorType.NO_ERROR,  # pylint: disable=no-member
+        uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH,  # pylint: disable=no-member
+        uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX,  # pylint: disable=no-member
+        uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH,  # pylint: disable=no-member
+        uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE,  # pylint: disable=no-member
+        uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY,  # pylint: disable=no-member
+        uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE,  # pylint: disable=no-member
+        uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED,  # pylint: disable=no-member
+    ]
 
     if subtask == _Subtask.PREPROCESS:
       self._preprocess_start_time_ns = self.start_time_ns
@@ -125,6 +136,12 @@ def set_task_details(self,
       # Ensure we always have a value after this method returns.
       assert self._preprocess_start_time_ns is not None
 
+  def _infer_uworker_main_outcome(self, exc_type, uworker_error) -> bool:
+    """Returns True if task succeeded, False otherwise."""
+    if exc_type or uworker_error not in self._utask_success_conditions:
+      return False
+    return True
+
   def __exit__(self, _exc_type, _exc_value, _traceback):
     # Ignore exception details, let Python continue unwinding the stack.
 
@@ -145,11 +162,12 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
     # The only case where a task might fail without throwing, is in
     # utask_main, by returning an ErrorType proto which indicates
     # failure.
-    outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
+    task_succeeded = self._infer_uworker_main_outcome(_exc_type,
+                                                      self.utask_main_failure)
     monitoring_metrics.TASK_OUTCOME_COUNT.increment({
-        **self._labels, 'outcome': outcome
+        **self._labels, 'task_succeeded': task_succeeded
     })
-    if outcome == "success":
+    if task_succeeded:
       error_condition = 'N/A'
     elif _exc_type:
       error_condition = 'UNHANDLED_EXCEPTION'
@@ -161,16 +179,11 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
     # labels limit recommended by gcp.
     trimmed_labels = self._labels
     del trimmed_labels['job']
-    trimmed_labels['outcome'] = outcome
+    trimmed_labels['task_succeeded'] = task_succeeded
     trimmed_labels['error_condition'] = error_condition
     monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
         trimmed_labels)
 
-    if error_condition != 'UNHANDLED_EXCEPTION':
-      task = self._labels['task']
-      subtask = self._labels['subtask']
-      logs.info(f'Task {task}, at subtask {subtask}, finished successfully.')
-
 
 def ensure_uworker_env_type_safety(uworker_env):
   """Converts all values in |uworker_env| to str types.
diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
index cac65fd4ec9..cb54c9da545 100644
--- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
+++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
@@ -263,7 +263,7 @@
         monitor.StringField('subtask'),
         monitor.StringField('mode'),
         monitor.StringField('platform'),
-        monitor.StringField('outcome'),
+        monitor.BooleanField('task_succeeded'),
     ])
 
 TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric(
@@ -274,7 +274,7 @@
         monitor.StringField('subtask'),
         monitor.StringField('mode'),
         monitor.StringField('platform'),
-        monitor.StringField('outcome'),
+        monitor.BooleanField('task_succeeded'),
         monitor.StringField('error_condition'),
     ])
 

From f4dc5ca81845d6677ce5b165e05ded9782f6e5a4 Mon Sep 17 00:00:00 2001
From: Vitor Guidi <vitorguidi@gmail.com>
Date: Mon, 23 Dec 2024 15:11:06 -0300
Subject: [PATCH 3/6] [Monitoring] Enrich UNTRIAGED_TESTCASE_AGE metric to
 track testcases stuck in analyze (#4547)

### Motivation

We currently have no way to tell if analyze task was successfully
executed. The TESTCASE_UPLOAD_TRIAGE_DURATION metric from #4364 would
only track duration for tasks that did finish.

An analyze_pending field is added to the Testcase entity in datastore,
which is set to False by default, to True for manually uploaded
testcases, and to False once analyze task postprocess runs.

It also increments the UNTRIAGED_TESTCASE_AGE metric from #4381 with a
status label, so we can know at what step the testcase is stuck, thus
allowing us to alert if analyze is taking longer to finish than
expected.

The alert itself could be, for instance, P50 age of untriaged testcase
(status=analyze_pending) > 3h.

Also, this retroactively addresses comments from #4481:

* Fixes docstring for emit_testcase_triage_duration_metric
* Removes assertions
* Renames TESTCASE_UPLOAD_TRIAGE_DURATION to TESTCASE_TRIAGE_DURATION,
since it now accounts for fuzzer generated testcases
* Use a boolean "from_fuzzer" field, instead of "origin" string, in
TESTCASE_TRIAGE_DURATION
---
 .../bot/tasks/utasks/analyze_task.py          |  2 +
 .../_internal/common/testcase_utils.py        | 40 ++++------------
 src/clusterfuzz/_internal/cron/triage.py      | 46 +++++++++++--------
 .../_internal/datastore/data_handler.py       |  3 +-
 .../_internal/datastore/data_types.py         |  4 ++
 .../_internal/metrics/monitoring_metrics.py   |  9 ++--
 6 files changed, 49 insertions(+), 55 deletions(-)

diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
index 785974f7353..1f0b7e8f58b 100644
--- a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
+++ b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
@@ -552,6 +552,8 @@ def _update_testcase(output):
   if analyze_task_output.platform_id:
     testcase.platform_id = analyze_task_output.platform_id
 
+  testcase.analyze_pending = False
+
   testcase.put()
 
 
diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py
index 73fe9edfd4d..4314dcd55fa 100644
--- a/src/clusterfuzz/_internal/common/testcase_utils.py
+++ b/src/clusterfuzz/_internal/common/testcase_utils.py
@@ -31,59 +31,39 @@
 
 
 def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
-  '''Finds out if a testcase is fuzzer generated or manually uploaded,
-      and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.'''
-  testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
-  if not testcase_upload_metadata:
-    logs.warning(f'No upload metadata found for testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
-    return
-  if not testcase_upload_metadata.timestamp:
-    logs.warning(
-        f'No timestamp for testcase {testcase_upload_metadata.testcase_id},'
-        ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
-    return
-  assert step in [
-      'analyze_launched', 'analyze_completed', 'minimize_completed',
-      'regression_completed', 'impact_completed', 'issue_updated'
-  ]
-
+  """Finds out if a testcase is fuzzer generated or manually uploaded,
+      and emits the TESTCASE_TRIAGE_DURATION metric."""
   testcase = data_handler.get_testcase_by_id(testcase_id)
 
   if not testcase:
     logs.warning(f'No testcase found with id {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
   if not testcase.job_type:
     logs.warning(f'No job_type associated to testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
   from_fuzzer = not get_testcase_upload_metadata(testcase_id)
 
-  assert step in [
-      'analyze_launched', 'analyze_completed', 'minimize_completed',
-      'regression_completed', 'impact_completed', 'issue_updated'
-  ]
-
   if not testcase.get_age_in_seconds():
     logs.warning(f'No timestamp associated to testcase {testcase_id},'
-                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+                 ' failed to emit TESTCASE_TRIAGE_DURATION metric.')
     return
 
-  testcase_age_in_hours = testcase.get_age_in_seconds() / 3600
+  testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60)
 
-  logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase '
+  logs.info('Emiting TESTCASE_TRIAGE_DURATION metric for testcase '
             f'{testcase_id} (age = {testcase_age_in_hours} hours.) '
-            'in step {step}.')
+            f'in step {step}, from_fuzzer: {from_fuzzer}.')
 
-  monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
+  monitoring_metrics.TESTCASE_TRIAGE_DURATION.add(
       testcase_age_in_hours,
       labels={
           'job': testcase.job_type,
           'step': step,
-          'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded'
+          'from_fuzzer': from_fuzzer
       })
 
 
diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py
index 1f3b3759227..ccfb03e8ae0 100644
--- a/src/clusterfuzz/_internal/cron/triage.py
+++ b/src/clusterfuzz/_internal/cron/triage.py
@@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker):
 
 def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase):
   testcase_age = testcase.get_age_in_seconds()
+  if not testcase_age:
+    return
   monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add(
       testcase_age,
       labels={
@@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric():
         })
 
 
-def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase):
+PENDING_ANALYZE = 'pending_analyze'
+PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
+PENDING_PROGRESSION = 'pending_progression'
+PENDING_GROUPING = 'pending_grouping'
+PENDING_FILING = 'pending_filing'
+
+
+def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase,
+                                        step: str):
   """Emmits a metric to track age of untriaged testcases."""
-  if not testcase.timestamp:
+  if not testcase.get_age_in_seconds():
     return
 
   logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} '
-            f'(age = {testcase.get_age_in_seconds()})')
+            f'(age = {testcase.get_age_in_seconds()}), step = {step}')
   monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add(
       testcase.get_age_in_seconds() / 3600,
       labels={
           'job': testcase.job_type,
           'platform': testcase.platform,
+          'step': step,
       })
 
 
-PENDING_CRITICAL_TASKS = 'pending_critical_tasks'
-PENDING_PROGRESSION = 'pending_progression'
-PENDING_GROUPING = 'pending_grouping'
-PENDING_FILING = 'pending_filing'
-
-
 def main():
   """Files bugs."""
   try:
@@ -409,7 +414,7 @@ def main():
     if testcase.get_metadata('progression_pending'):
       _set_testcase_stuck_state(testcase, True)
       logs.info(f'Skipping testcase {testcase_id}, progression pending')
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION)
       _increment_untriaged_testcase_count(testcase.job_type,
                                           PENDING_PROGRESSION)
       continue
@@ -432,10 +437,12 @@ def main():
     # Require that all tasks like minimizaton, regression testing, etc have
     # finished.
     if not critical_tasks_completed:
-      _emit_untriaged_testcase_age_metric(testcase)
+      status = PENDING_CRITICAL_TASKS
+      if testcase.analyze_pending:
+        status = PENDING_ANALYZE
+      _emit_untriaged_testcase_age_metric(testcase, status)
       _set_testcase_stuck_state(testcase, True)
-      _increment_untriaged_testcase_count(testcase.job_type,
-                                          PENDING_CRITICAL_TASKS)
+      _increment_untriaged_testcase_count(testcase.job_type, status)
       logs.info(
           f'Skipping testcase {testcase_id}, critical tasks still pending.')
       continue
@@ -452,7 +459,7 @@ def main():
     # metadata works well.
     if not testcase.group_id and not dates.time_has_expired(
         testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
       _set_testcase_stuck_state(testcase, True)
       _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
       logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
@@ -460,7 +467,7 @@ def main():
 
     if not testcase.get_metadata('ran_grouper'):
       # Testcase should be considered by the grouper first before filing.
-      _emit_untriaged_testcase_age_metric(testcase)
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
       _set_testcase_stuck_state(testcase, True)
       _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
       logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
@@ -490,16 +497,15 @@ def main():
     # Clean up old triage messages that would be not applicable now.
     testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)
 
-    # A testcase is untriaged, until immediately before a bug is opened
-    _emit_untriaged_testcase_age_metric(testcase)
-    _set_testcase_stuck_state(testcase, False)
-    _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
-
     # File the bug first and then create filed bug metadata.
     if not _file_issue(testcase, issue_tracker, throttler):
+      _emit_untriaged_testcase_age_metric(testcase, PENDING_FILING)
+      _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
       logs.info(f'Issue filing failed for testcase id {testcase_id}')
       continue
 
+    _set_testcase_stuck_state(testcase, False)
+
     _create_filed_bug_metadata(testcase)
     issue_filer.notify_issue_update(testcase, 'new')
 
diff --git a/src/clusterfuzz/_internal/datastore/data_handler.py b/src/clusterfuzz/_internal/datastore/data_handler.py
index abe8d5c677d..6b378609c4d 100644
--- a/src/clusterfuzz/_internal/datastore/data_handler.py
+++ b/src/clusterfuzz/_internal/datastore/data_handler.py
@@ -921,7 +921,7 @@ def critical_tasks_completed(testcase):
     return testcase.minimized_keys and testcase.regression
 
   return bool(testcase.minimized_keys and testcase.regression and
-              testcase.is_impact_set_flag)
+              testcase.is_impact_set_flag and not testcase.analyze_pending)
 
 
 # ------------------------------------------------------------------------------
@@ -1379,6 +1379,7 @@ def create_user_uploaded_testcase(key,
 
   testcase.timestamp = utils.utcnow()
   testcase.created = testcase.timestamp
+  testcase.analyze_pending = True
   testcase.uploader_email = uploader_email
   testcase.put()
 
diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py
index 8aa009ad36d..cbc63c7f4ba 100644
--- a/src/clusterfuzz/_internal/datastore/data_types.py
+++ b/src/clusterfuzz/_internal/datastore/data_types.py
@@ -583,6 +583,10 @@ class Testcase(Model):
   # Tracks if a testcase is stuck during triage.
   stuck_in_triage = ndb.BooleanProperty(default=False)
 
+  # Tracks if analyze task is pending.
+  # Defaults to false, since most testcases are fuzzer produced.
+  analyze_pending = ndb.BooleanProperty(default=False)
+
   def is_chromium(self):
     return self.project_name in ('chromium', 'chromium-testing')
 
diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
index cb54c9da545..dfd92e100e4 100644
--- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
+++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
@@ -231,17 +231,17 @@
     ],
 )
 
-TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
-    'uploaded_testcase_analysis/triage_duration_secs',
+TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
+    'testcase_analysis/triage_duration_hours',
     description=('Time elapsed between testcase upload and completion'
-                 ' of relevant tasks in the testcase upload lifecycle.'
+                 ' of relevant tasks in the testcase lifecycle.'
                  ' Origin can be either from a fuzzer, or a manual'
                  ' upload. Measured in hours.'),
     bucketer=monitor.GeometricBucketer(),
     field_spec=[
         monitor.StringField('step'),
         monitor.StringField('job'),
-        monitor.StringField('origin'),
+        monitor.BooleanField('from_fuzzer'),
     ],
 )
 
@@ -365,6 +365,7 @@
     field_spec=[
         monitor.StringField('job'),
         monitor.StringField('platform'),
+        monitor.StringField('step'),
     ])
 
 UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric(

From 6add668ba4c316695d51133dc2518058af4f908d Mon Sep 17 00:00:00 2001
From: Vitor Guidi <vitorguidi@gmail.com>
Date: Thu, 26 Dec 2024 15:42:49 -0300
Subject: [PATCH 4/6] [Monitoring] Create a dashboard for overall clusterfuzz
 health (#4497)

### Motivation

As a final step for the monitoring project, this PR creates a dashboard
for overall system health.

The reasoning behind it is to have:
* Business level metrics (fuzzing hours, generated testcases, issues
filed, issues closed, testcases for which processing is pending)
* Testcase metrics (untriaged testcase age and count)
* SQS metrics (queue size, and published messages, per topic)
* Datastore/GCS metrics (number of requests, error rate, and latencies)
* Utask level metrics (duration, number of executions, error rate,
latency)

These are sufficient to apply the [RED
methodology](https://grafana.com/blog/2018/08/02/the-red-method-how-to-instrument-your-services/)
(rate, error and duration), provide high level metrics we can alert on,
and aid in troubleshooting outages with a well defined methodology.

There were two options to commit this to version control: terraform, or
butler definitions. The first was chosen, since it is the preffered long
term solution, and it is also simpler to implement, since it supports
copy pasting the JSON definition from GCP.

### Attention points

This should be automatically imported from main.tf, so it (should be)
sufficient to just place the .tf file in the same folder, and have
butler deploy handle the terraform apply step.

### How to review

Head over to go/cf-chrome-health-beta, internally. It is not expected
that the actual dashboard definition is reviewed, it is just a dump of
what was manually created in GCP.

Part of #4271
---
 infra/terraform/monitoring.tf | 1624 +++++++++++++++++++++++++++++++++
 1 file changed, 1624 insertions(+)
 create mode 100644 infra/terraform/monitoring.tf

diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf
new file mode 100644
index 00000000000..a725e4ea075
--- /dev/null
+++ b/infra/terraform/monitoring.tf
@@ -0,0 +1,1624 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" {
+  dashboard_id = "clusterfuzz_sli_dashboard"
+  display_name = "ClusterFuzz General Health"
+  dashboard_json = <<JSON
+{
+  "displayName": "Clusterfuzz Relability Metrics",
+  "mosaicLayout": {
+    "columns": 48,
+    "tiles": [
+      {
+        "yPos": 76,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "utask_main execution count",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 108,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"preprocess\"}[${__interval}]))\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "preprocess execution count",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 140,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"postprocess\"}[${__interval}]))\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "postprocess execution count",
+          "id": ""
+        }
+      },
+      {
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "Business level metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 72,
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "Task metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 172,
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "Testcase metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 4,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"custom.googleapis.com/task/fuzz/job/total_time\" resource.type=\"gce_instance\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": []
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Fuzzing hours",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 4,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "topk(10, sum by (job)(rate(custom_googleapis_com:task_fuzz_job_total_time{monitored_resource=\"gce_instance\"}[${__interval}])))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Fuzzing hours (top 10 jobs)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 4,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "topk(10, sum by (fuzzer)(rate(custom_googleapis_com:task_fuzz_fuzzer_total_time{monitored_resource=\"gce_instance\"}[${__interval}])))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Fuzzing hours (top 10 fuzzers)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 20,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum(increase(custom_googleapis_com:issues_filing{monitored_resource=\"gce_instance\",status=\"success\"}[24h]))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Issues filed in a 24h window",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 20,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum(increase(custom_googleapis_com:issues_closing_success{monitored_resource=\"gce_instance\"}[24h]))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Issues closed in a 24h window",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 192,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "title": "Untriaged testcase age (p50 - hours)",
+          "xyChart": {
+            "chartOptions": {
+              "mode": "COLOR"
+            },
+            "dataSets": [
+              {
+                "minAlignmentPeriod": "60s",
+                "plotType": "LINE",
+                "targetAxis": "Y1",
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "crossSeriesReducer": "REDUCE_PERCENTILE_50",
+                      "groupByFields": [
+                        "metric.label.\"step\""
+                      ],
+                      "perSeriesAligner": "ALIGN_DELTA"
+                    },
+                    "filter": "metric.type=\"custom.googleapis.com/issues/untriaged_testcase_age\" resource.type=\"gce_instance\""
+                  }
+                }
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            }
+          }
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 192,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "title": "Untriaged testcase age (p95 - hours)",
+          "xyChart": {
+            "chartOptions": {
+              "mode": "COLOR"
+            },
+            "dataSets": [
+              {
+                "minAlignmentPeriod": "60s",
+                "plotType": "LINE",
+                "targetAxis": "Y1",
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "crossSeriesReducer": "REDUCE_PERCENTILE_95",
+                      "groupByFields": [
+                        "metric.label.\"step\""
+                      ],
+                      "perSeriesAligner": "ALIGN_DELTA"
+                    },
+                    "filter": "metric.type=\"custom.googleapis.com/issues/untriaged_testcase_age\" resource.type=\"gce_instance\""
+                  }
+                }
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            }
+          }
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 192,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "title": "Untriaged testcase age (p99 - hours)",
+          "xyChart": {
+            "chartOptions": {
+              "mode": "COLOR"
+            },
+            "dataSets": [
+              {
+                "minAlignmentPeriod": "60s",
+                "plotType": "LINE",
+                "targetAxis": "Y1",
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "crossSeriesReducer": "REDUCE_PERCENTILE_99",
+                      "groupByFields": [
+                        "metric.label.\"step\""
+                      ],
+                      "perSeriesAligner": "ALIGN_DELTA"
+                    },
+                    "filter": "metric.type=\"custom.googleapis.com/issues/untriaged_testcase_age\" resource.type=\"gce_instance\""
+                  }
+                }
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            }
+          }
+        }
+      },
+      {
+        "yPos": 36,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum(increase(custom_googleapis_com:task_fuzz_fuzzer_known_crash_count{monitored_resource=\"gce_instance\"}[24h]))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Known crash counts over the last 24h",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 36,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum(increase(custom_googleapis_com:task_fuzz_fuzzer_new_crash_count{monitored_resource=\"gce_instance\"}[24h]))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "New crash counts over the last 24h",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 124,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"preprocess\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p50 preprocess duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 92,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"uworker_main\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p50 utask_main duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 156,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"postprocess\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p50 postprocess duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 124,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"preprocess\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p95 preprocess duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 92,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"uworker_main\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p95 utask_main duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 156,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"postprocess\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p95 postprocess duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 176,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, step) (\n    increase(custom_googleapis_com:testcase_analysis_triage_duration_hours_bucket{\n      monitored_resource=\"gce_instance\",\n    }[1h])\n  )\n)\n\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Testcase triage duration (p50 - hours) - by step",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 176,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.90,\n  sum by (le, step) (\n    increase(custom_googleapis_com:testcase_analysis_triage_duration_hours_bucket{\n      monitored_resource=\"gce_instance\",\n    }[1h])\n  )\n)\n\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Testcase triage duration (p90 - hours) - by step",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 176,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, step) (\n    increase(custom_googleapis_com:testcase_analysis_triage_duration_hours_bucket{\n      monitored_resource=\"gce_instance\",\n    }[1h])\n  )\n)\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Testcase triage duration (p95 - hours) - by step",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 224,
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "GCS metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 228,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"storage.googleapis.com/api/request_count\" resource.type=\"gcs_bucket\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "metric.label.\"method\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "GCS - Request count by method",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 228,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "100 * (sum by (method)(rate(storage_googleapis_com:api_request_count{monitored_resource=\"gcs_bucket\", response_code!=\"OK\"}[${__interval}])) \n/\nsum by (method)(rate(storage_googleapis_com:api_request_count{monitored_resource=\"gcs_bucket\"}[${__interval}])))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "GCS - Error rate by method",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 228,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"storage.googleapis.com/api/request_count\" resource.type=\"gcs_bucket\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "metric.label.\"response_code\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "GCS Bucket - Request count by response code",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 248,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"datastore.googleapis.com/api/request_count\" resource.type=\"datastore_request\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "metric.label.\"api_method\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Request count by method",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 248,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": " 100 * sum by (api_method)(rate(datastore_googleapis_com:api_request_count{monitored_resource=\"datastore_request\", response_code!='OK'}[${__interval}])) /\nsum by (api_method)(rate(datastore_googleapis_com:api_request_count{monitored_resource=\"datastore_request\"}[${__interval}]))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Error rate by method",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 264,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.50,sum by (api_method,le)(increase(firestore_googleapis_com:api_request_latencies_bucket{monitored_resource=\"firestore.googleapis.com/Database\"}[${__interval}])))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Latency by method (p50)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 248,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"datastore.googleapis.com/api/request_count\" resource.type=\"datastore_request\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "metric.label.\"response_code\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Request count by response code",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 264,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.90,sum by (api_method,le)(increase(firestore_googleapis_com:api_request_latencies_bucket{monitored_resource=\"firestore.googleapis.com/Database\"}[${__interval}])))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Latency by method (p90)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 264,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.95,sum by (api_method,le)(increase(firestore_googleapis_com:api_request_latencies_bucket{monitored_resource=\"firestore.googleapis.com/Database\"}[${__interval}])))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Latency by method (p95)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 244,
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "Datastore metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 52,
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "PubSub metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 56,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"pubsub.googleapis.com/subscription/num_undelivered_messages\" resource.type=\"pubsub_subscription\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_MEAN",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "metadata.system_labels.\"topic_id\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "PubSub - Unacked messages (per topic)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 56,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"pubsub.googleapis.com/topic/send_request_count\" resource.type=\"pubsub_topic\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "resource.label.\"topic_id\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "PubSub - Sent messages (per topic)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 108,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"preprocess\", error_condition=\"UNHANDLED_EXCEPTION\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"preprocess\"}[${__interval}]))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "preprocess unhandled exception rate (by task)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 76,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"uworker_main\", error_condition=\"UNHANDLED_EXCEPTION\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "uworker_main unhandled exception rate (by task)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 140,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\", subtask=\"postprocess\", error_condition=\"UNHANDLED_EXCEPTION\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\", subtask=\"postprocess\"}[${__interval}]))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "postprocess unhandled exception rate (by task)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 208,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "title": "Untriaged testcase count (by status)",
+          "xyChart": {
+            "chartOptions": {
+              "mode": "COLOR"
+            },
+            "dataSets": [
+              {
+                "plotType": "LINE",
+                "targetAxis": "Y1",
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum by (status)(last_over_time((custom_googleapis_com:issues_untriaged_testcase_count{monitored_resource=\"gce_instance\"}[1h])))\n",
+                  "unitOverride": ""
+                }
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            }
+          }
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 76,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\", task_succeeded=\"false\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "uworker_main overall failure rate (by task) - CAN BE DRILLED BY JOB",
+          "id": ""
+        }
+      }
+    ]
+  },
+  "dashboardFilters": [],
+  "labels": {}
+}
+JSON
+}
\ No newline at end of file

From 7e09fe3b4657183cc992ae5864b18e4deae4775d Mon Sep 17 00:00:00 2001
From: Vitor Guidi <vitorguidi@gmail.com>
Date: Thu, 26 Dec 2024 15:53:11 -0300
Subject: [PATCH 5/6] Revert #4497 (#4560)

Reverting #4497 due to terraform apply erroring out, will reland once it
is fixed.
---
 infra/terraform/monitoring.tf | 1624 ---------------------------------
 1 file changed, 1624 deletions(-)
 delete mode 100644 infra/terraform/monitoring.tf

diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf
deleted file mode 100644
index a725e4ea075..00000000000
--- a/infra/terraform/monitoring.tf
+++ /dev/null
@@ -1,1624 +0,0 @@
-# Copyright 2024 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" {
-  dashboard_id = "clusterfuzz_sli_dashboard"
-  display_name = "ClusterFuzz General Health"
-  dashboard_json = <<JSON
-{
-  "displayName": "Clusterfuzz Relability Metrics",
-  "mosaicLayout": {
-    "columns": 48,
-    "tiles": [
-      {
-        "yPos": 76,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))\n",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "utask_main execution count",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 108,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"preprocess\"}[${__interval}]))\n",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "preprocess execution count",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 140,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"postprocess\"}[${__interval}]))\n",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "postprocess execution count",
-          "id": ""
-        }
-      },
-      {
-        "width": 48,
-        "height": 4,
-        "widget": {
-          "title": "Business level metrics",
-          "text": {
-            "content": "",
-            "format": "MARKDOWN",
-            "style": {
-              "backgroundColor": "#FFFFFF",
-              "textColor": "#212121",
-              "horizontalAlignment": "H_CENTER",
-              "verticalAlignment": "V_TOP",
-              "padding": "P_EXTRA_SMALL",
-              "fontSize": "FS_LARGE",
-              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
-            }
-          },
-          "id": ""
-        }
-      },
-      {
-        "yPos": 72,
-        "width": 48,
-        "height": 4,
-        "widget": {
-          "title": "Task metrics",
-          "text": {
-            "content": "",
-            "format": "MARKDOWN",
-            "style": {
-              "backgroundColor": "#FFFFFF",
-              "textColor": "#212121",
-              "horizontalAlignment": "H_CENTER",
-              "verticalAlignment": "V_TOP",
-              "padding": "P_EXTRA_SMALL",
-              "fontSize": "FS_LARGE",
-              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
-            }
-          },
-          "id": ""
-        }
-      },
-      {
-        "yPos": 172,
-        "width": 48,
-        "height": 4,
-        "widget": {
-          "title": "Testcase metrics",
-          "text": {
-            "content": "",
-            "format": "MARKDOWN",
-            "style": {
-              "backgroundColor": "#FFFFFF",
-              "textColor": "#212121",
-              "horizontalAlignment": "H_CENTER",
-              "verticalAlignment": "V_TOP",
-              "padding": "P_EXTRA_SMALL",
-              "fontSize": "FS_LARGE",
-              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
-            }
-          },
-          "id": ""
-        }
-      },
-      {
-        "yPos": 4,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "timeSeriesFilter": {
-                    "filter": "metric.type=\"custom.googleapis.com/task/fuzz/job/total_time\" resource.type=\"gce_instance\"",
-                    "aggregation": {
-                      "alignmentPeriod": "60s",
-                      "perSeriesAligner": "ALIGN_RATE",
-                      "crossSeriesReducer": "REDUCE_SUM",
-                      "groupByFields": []
-                    }
-                  },
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "minAlignmentPeriod": "60s",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Fuzzing hours",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 4,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "topk(10, sum by (job)(rate(custom_googleapis_com:task_fuzz_job_total_time{monitored_resource=\"gce_instance\"}[${__interval}])))",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Fuzzing hours (top 10 jobs)",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 32,
-        "yPos": 4,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "topk(10, sum by (fuzzer)(rate(custom_googleapis_com:task_fuzz_fuzzer_total_time{monitored_resource=\"gce_instance\"}[${__interval}])))",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Fuzzing hours (top 10 fuzzers)",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 20,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "sum(increase(custom_googleapis_com:issues_filing{monitored_resource=\"gce_instance\",status=\"success\"}[24h]))",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Issues filed in a 24h window",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 20,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "sum(increase(custom_googleapis_com:issues_closing_success{monitored_resource=\"gce_instance\"}[24h]))",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Issues closed in a 24h window",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 192,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "title": "Untriaged testcase age (p50 - hours)",
-          "xyChart": {
-            "chartOptions": {
-              "mode": "COLOR"
-            },
-            "dataSets": [
-              {
-                "minAlignmentPeriod": "60s",
-                "plotType": "LINE",
-                "targetAxis": "Y1",
-                "timeSeriesQuery": {
-                  "timeSeriesFilter": {
-                    "aggregation": {
-                      "alignmentPeriod": "60s",
-                      "crossSeriesReducer": "REDUCE_PERCENTILE_50",
-                      "groupByFields": [
-                        "metric.label.\"step\""
-                      ],
-                      "perSeriesAligner": "ALIGN_DELTA"
-                    },
-                    "filter": "metric.type=\"custom.googleapis.com/issues/untriaged_testcase_age\" resource.type=\"gce_instance\""
-                  }
-                }
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            }
-          }
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 192,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "title": "Untriaged testcase age (p95 - hours)",
-          "xyChart": {
-            "chartOptions": {
-              "mode": "COLOR"
-            },
-            "dataSets": [
-              {
-                "minAlignmentPeriod": "60s",
-                "plotType": "LINE",
-                "targetAxis": "Y1",
-                "timeSeriesQuery": {
-                  "timeSeriesFilter": {
-                    "aggregation": {
-                      "alignmentPeriod": "60s",
-                      "crossSeriesReducer": "REDUCE_PERCENTILE_95",
-                      "groupByFields": [
-                        "metric.label.\"step\""
-                      ],
-                      "perSeriesAligner": "ALIGN_DELTA"
-                    },
-                    "filter": "metric.type=\"custom.googleapis.com/issues/untriaged_testcase_age\" resource.type=\"gce_instance\""
-                  }
-                }
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            }
-          }
-        }
-      },
-      {
-        "xPos": 32,
-        "yPos": 192,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "title": "Untriaged testcase age (p99 - hours)",
-          "xyChart": {
-            "chartOptions": {
-              "mode": "COLOR"
-            },
-            "dataSets": [
-              {
-                "minAlignmentPeriod": "60s",
-                "plotType": "LINE",
-                "targetAxis": "Y1",
-                "timeSeriesQuery": {
-                  "timeSeriesFilter": {
-                    "aggregation": {
-                      "alignmentPeriod": "60s",
-                      "crossSeriesReducer": "REDUCE_PERCENTILE_99",
-                      "groupByFields": [
-                        "metric.label.\"step\""
-                      ],
-                      "perSeriesAligner": "ALIGN_DELTA"
-                    },
-                    "filter": "metric.type=\"custom.googleapis.com/issues/untriaged_testcase_age\" resource.type=\"gce_instance\""
-                  }
-                }
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            }
-          }
-        }
-      },
-      {
-        "yPos": 36,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "sum(increase(custom_googleapis_com:task_fuzz_fuzzer_known_crash_count{monitored_resource=\"gce_instance\"}[24h]))",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Known crash counts over the last 24h",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 36,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "sum(increase(custom_googleapis_com:task_fuzz_fuzzer_new_crash_count{monitored_resource=\"gce_instance\"}[24h]))",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "New crash counts over the last 24h",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 124,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"preprocess\",\n    }[1h])\n  )\n)",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "p50 preprocess duration by task (seconds)",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 92,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"uworker_main\",\n    }[1h])\n  )\n)",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "p50 utask_main duration by task (seconds)",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 156,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"postprocess\",\n    }[1h])\n  )\n)",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "p50 postprocess duration by task (seconds)",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 124,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"preprocess\",\n    }[1h])\n  )\n)",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "p95 preprocess duration by task (seconds)",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 92,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"uworker_main\",\n    }[1h])\n  )\n)",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "p95 utask_main duration by task (seconds)",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 156,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"postprocess\",\n    }[1h])\n  )\n)",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "p95 postprocess duration by task (seconds)",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 176,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, step) (\n    increase(custom_googleapis_com:testcase_analysis_triage_duration_hours_bucket{\n      monitored_resource=\"gce_instance\",\n    }[1h])\n  )\n)\n\n",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Testcase triage duration (p50 - hours) - by step",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 176,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.90,\n  sum by (le, step) (\n    increase(custom_googleapis_com:testcase_analysis_triage_duration_hours_bucket{\n      monitored_resource=\"gce_instance\",\n    }[1h])\n  )\n)\n\n",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Testcase triage duration (p90 - hours) - by step",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 32,
-        "yPos": 176,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, step) (\n    increase(custom_googleapis_com:testcase_analysis_triage_duration_hours_bucket{\n      monitored_resource=\"gce_instance\",\n    }[1h])\n  )\n)\n",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Testcase triage duration (p95 - hours) - by step",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 224,
-        "width": 48,
-        "height": 4,
-        "widget": {
-          "title": "GCS metrics",
-          "text": {
-            "content": "",
-            "format": "MARKDOWN",
-            "style": {
-              "backgroundColor": "#FFFFFF",
-              "textColor": "#212121",
-              "horizontalAlignment": "H_CENTER",
-              "verticalAlignment": "V_TOP",
-              "padding": "P_EXTRA_SMALL",
-              "fontSize": "FS_LARGE",
-              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
-            }
-          },
-          "id": ""
-        }
-      },
-      {
-        "yPos": 228,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "timeSeriesFilter": {
-                    "filter": "metric.type=\"storage.googleapis.com/api/request_count\" resource.type=\"gcs_bucket\"",
-                    "aggregation": {
-                      "alignmentPeriod": "60s",
-                      "perSeriesAligner": "ALIGN_RATE",
-                      "crossSeriesReducer": "REDUCE_SUM",
-                      "groupByFields": [
-                        "metric.label.\"method\""
-                      ]
-                    }
-                  },
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "minAlignmentPeriod": "60s",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "GCS - Request count by method",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 228,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "100 * (sum by (method)(rate(storage_googleapis_com:api_request_count{monitored_resource=\"gcs_bucket\", response_code!=\"OK\"}[${__interval}])) \n/\nsum by (method)(rate(storage_googleapis_com:api_request_count{monitored_resource=\"gcs_bucket\"}[${__interval}])))",
-                  "unitOverride": "%",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "GCS - Error rate by method",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 32,
-        "yPos": 228,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "timeSeriesFilter": {
-                    "filter": "metric.type=\"storage.googleapis.com/api/request_count\" resource.type=\"gcs_bucket\"",
-                    "aggregation": {
-                      "alignmentPeriod": "60s",
-                      "perSeriesAligner": "ALIGN_RATE",
-                      "crossSeriesReducer": "REDUCE_SUM",
-                      "groupByFields": [
-                        "metric.label.\"response_code\""
-                      ]
-                    }
-                  },
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "minAlignmentPeriod": "60s",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "GCS Bucket - Request count by response code",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 248,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "timeSeriesFilter": {
-                    "filter": "metric.type=\"datastore.googleapis.com/api/request_count\" resource.type=\"datastore_request\"",
-                    "aggregation": {
-                      "alignmentPeriod": "60s",
-                      "perSeriesAligner": "ALIGN_RATE",
-                      "crossSeriesReducer": "REDUCE_SUM",
-                      "groupByFields": [
-                        "metric.label.\"api_method\""
-                      ]
-                    }
-                  },
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "minAlignmentPeriod": "60s",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Datastore - Request count by method",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 248,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": " 100 * sum by (api_method)(rate(datastore_googleapis_com:api_request_count{monitored_resource=\"datastore_request\", response_code!='OK'}[${__interval}])) /\nsum by (api_method)(rate(datastore_googleapis_com:api_request_count{monitored_resource=\"datastore_request\"}[${__interval}]))",
-                  "unitOverride": "%",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Datastore - Error rate by method",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 264,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.50,sum by (api_method,le)(increase(firestore_googleapis_com:api_request_latencies_bucket{monitored_resource=\"firestore.googleapis.com/Database\"}[${__interval}])))",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Datastore - Latency by method (p50)",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 32,
-        "yPos": 248,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "timeSeriesFilter": {
-                    "filter": "metric.type=\"datastore.googleapis.com/api/request_count\" resource.type=\"datastore_request\"",
-                    "aggregation": {
-                      "alignmentPeriod": "60s",
-                      "perSeriesAligner": "ALIGN_RATE",
-                      "crossSeriesReducer": "REDUCE_SUM",
-                      "groupByFields": [
-                        "metric.label.\"response_code\""
-                      ]
-                    }
-                  },
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "minAlignmentPeriod": "60s",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Datastore - Request count by response code",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 264,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.90,sum by (api_method,le)(increase(firestore_googleapis_com:api_request_latencies_bucket{monitored_resource=\"firestore.googleapis.com/Database\"}[${__interval}])))",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Datastore - Latency by method (p90)",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 32,
-        "yPos": 264,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "histogram_quantile(0.95,sum by (api_method,le)(increase(firestore_googleapis_com:api_request_latencies_bucket{monitored_resource=\"firestore.googleapis.com/Database\"}[${__interval}])))",
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "Datastore - Latency by method (p95)",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 244,
-        "width": 48,
-        "height": 4,
-        "widget": {
-          "title": "Datastore metrics",
-          "text": {
-            "content": "",
-            "format": "MARKDOWN",
-            "style": {
-              "backgroundColor": "#FFFFFF",
-              "textColor": "#212121",
-              "horizontalAlignment": "H_CENTER",
-              "verticalAlignment": "V_TOP",
-              "padding": "P_EXTRA_SMALL",
-              "fontSize": "FS_LARGE",
-              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
-            }
-          },
-          "id": ""
-        }
-      },
-      {
-        "yPos": 52,
-        "width": 48,
-        "height": 4,
-        "widget": {
-          "title": "PubSub metrics",
-          "text": {
-            "content": "",
-            "format": "MARKDOWN",
-            "style": {
-              "backgroundColor": "#FFFFFF",
-              "textColor": "#212121",
-              "horizontalAlignment": "H_CENTER",
-              "verticalAlignment": "V_TOP",
-              "padding": "P_EXTRA_SMALL",
-              "fontSize": "FS_LARGE",
-              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
-            }
-          },
-          "id": ""
-        }
-      },
-      {
-        "yPos": 56,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "timeSeriesFilter": {
-                    "filter": "metric.type=\"pubsub.googleapis.com/subscription/num_undelivered_messages\" resource.type=\"pubsub_subscription\"",
-                    "aggregation": {
-                      "alignmentPeriod": "60s",
-                      "perSeriesAligner": "ALIGN_MEAN",
-                      "crossSeriesReducer": "REDUCE_SUM",
-                      "groupByFields": [
-                        "metadata.system_labels.\"topic_id\""
-                      ]
-                    }
-                  },
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "minAlignmentPeriod": "60s",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "PubSub - Unacked messages (per topic)",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 56,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "timeSeriesFilter": {
-                    "filter": "metric.type=\"pubsub.googleapis.com/topic/send_request_count\" resource.type=\"pubsub_topic\"",
-                    "aggregation": {
-                      "alignmentPeriod": "60s",
-                      "perSeriesAligner": "ALIGN_RATE",
-                      "crossSeriesReducer": "REDUCE_SUM",
-                      "groupByFields": [
-                        "resource.label.\"topic_id\""
-                      ]
-                    }
-                  },
-                  "unitOverride": "",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "minAlignmentPeriod": "60s",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "PubSub - Sent messages (per topic)",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 108,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"preprocess\", error_condition=\"UNHANDLED_EXCEPTION\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"preprocess\"}[${__interval}]))",
-                  "unitOverride": "%",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "preprocess unhandled exception rate (by task)",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 32,
-        "yPos": 76,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"uworker_main\", error_condition=\"UNHANDLED_EXCEPTION\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))",
-                  "unitOverride": "%",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "uworker_main unhandled exception rate (by task)",
-          "id": ""
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 140,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\", subtask=\"postprocess\", error_condition=\"UNHANDLED_EXCEPTION\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type{monitored_resource=\"gce_instance\", subtask=\"postprocess\"}[${__interval}]))",
-                  "unitOverride": "%",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "postprocess unhandled exception rate (by task)",
-          "id": ""
-        }
-      },
-      {
-        "yPos": 208,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "title": "Untriaged testcase count (by status)",
-          "xyChart": {
-            "chartOptions": {
-              "mode": "COLOR"
-            },
-            "dataSets": [
-              {
-                "plotType": "LINE",
-                "targetAxis": "Y1",
-                "timeSeriesQuery": {
-                  "prometheusQuery": "sum by (status)(last_over_time((custom_googleapis_com:issues_untriaged_testcase_count{monitored_resource=\"gce_instance\"}[1h])))\n",
-                  "unitOverride": ""
-                }
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            }
-          }
-        }
-      },
-      {
-        "xPos": 16,
-        "yPos": 76,
-        "width": 16,
-        "height": 16,
-        "widget": {
-          "xyChart": {
-            "dataSets": [
-              {
-                "timeSeriesQuery": {
-                  "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\", task_succeeded=\"false\"}[${__interval}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"}[${__interval}]))",
-                  "unitOverride": "%",
-                  "outputFullDuration": false
-                },
-                "plotType": "LINE",
-                "legendTemplate": "",
-                "targetAxis": "Y1",
-                "dimensions": [],
-                "measures": [],
-                "breakdowns": []
-              }
-            ],
-            "thresholds": [],
-            "yAxis": {
-              "label": "",
-              "scale": "LINEAR"
-            },
-            "chartOptions": {
-              "mode": "COLOR",
-              "showLegend": false,
-              "displayHorizontal": false
-            }
-          },
-          "title": "uworker_main overall failure rate (by task) - CAN BE DRILLED BY JOB",
-          "id": ""
-        }
-      }
-    ]
-  },
-  "dashboardFilters": [],
-  "labels": {}
-}
-JSON
-}
\ No newline at end of file

From 9a0f0a664f4d8c2a0c0c09ce9930de58d1ba9649 Mon Sep 17 00:00:00 2001
From: Vitor Guidi <vitorguidi@gmail.com>
Date: Thu, 26 Dec 2024 17:43:02 -0300
Subject: [PATCH 6/6] [Monitoring] Restore dashboard terraform definition
 (#4561)

This brings back #4497 . The issue was that, in promql definitions, the
'{}' characters were lacking a double escape (\\\\{ and \\\\}).
Also, the only parameter to the terraform object should be
dashboard_json.

Testing: terraform validate


![image](https://github.com/user-attachments/assets/f67434d0-c8e4-4bc0-be38-27188c98ea49)

Part of #4271
---
 infra/terraform/monitoring.tf | 1622 +++++++++++++++++++++++++++++++++
 1 file changed, 1622 insertions(+)
 create mode 100644 infra/terraform/monitoring.tf

diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf
new file mode 100644
index 00000000000..7d9a540d76d
--- /dev/null
+++ b/infra/terraform/monitoring.tf
@@ -0,0 +1,1622 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" {
+  dashboard_json = <<JSON
+{
+  "displayName": "Clusterfuzz Relability Metrics",
+  "mosaicLayout": {
+    "columns": 48,
+    "tiles": [
+      {
+        "yPos": 76,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome\\{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"\\}[$\\{__interval\\}]))\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "utask_main execution count",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 108,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome\\{monitored_resource=\"gce_instance\",subtask=\"preprocess\"\\}[$\\{__interval\\}]))\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "preprocess execution count",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 140,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome\\{monitored_resource=\"gce_instance\",subtask=\"postprocess\"\\}[$\\{__interval\\}]))\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "postprocess execution count",
+          "id": ""
+        }
+      },
+      {
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "Business level metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 72,
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "Task metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 172,
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "Testcase metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 4,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"custom.googleapis.com/task/fuzz/job/total_time\" resource.type=\"gce_instance\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": []
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Fuzzing hours",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 4,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "topk(10, sum by (job)(rate(custom_googleapis_com:task_fuzz_job_total_time\\{monitored_resource=\"gce_instance\"\\}[$\\{__interval\\}])))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Fuzzing hours (top 10 jobs)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 4,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "topk(10, sum by (fuzzer)(rate(custom_googleapis_com:task_fuzz_fuzzer_total_time\\{monitored_resource=\"gce_instance\"\\}[$\\{__interval\\}])))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Fuzzing hours (top 10 fuzzers)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 20,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum(increase(custom_googleapis_com:issues_filing{monitored_resource=\"gce_instance\",status=\"success\"}[24h]))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Issues filed in a 24h window",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 20,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum(increase(custom_googleapis_com:issues_closing_success{monitored_resource=\"gce_instance\"}[24h]))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Issues closed in a 24h window",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 192,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "title": "Untriaged testcase age (p50 - hours)",
+          "xyChart": {
+            "chartOptions": {
+              "mode": "COLOR"
+            },
+            "dataSets": [
+              {
+                "minAlignmentPeriod": "60s",
+                "plotType": "LINE",
+                "targetAxis": "Y1",
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "crossSeriesReducer": "REDUCE_PERCENTILE_50",
+                      "groupByFields": [
+                        "metric.label.\"step\""
+                      ],
+                      "perSeriesAligner": "ALIGN_DELTA"
+                    },
+                    "filter": "metric.type=\"custom.googleapis.com/issues/untriaged_testcase_age\" resource.type=\"gce_instance\""
+                  }
+                }
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            }
+          }
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 192,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "title": "Untriaged testcase age (p95 - hours)",
+          "xyChart": {
+            "chartOptions": {
+              "mode": "COLOR"
+            },
+            "dataSets": [
+              {
+                "minAlignmentPeriod": "60s",
+                "plotType": "LINE",
+                "targetAxis": "Y1",
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "crossSeriesReducer": "REDUCE_PERCENTILE_95",
+                      "groupByFields": [
+                        "metric.label.\"step\""
+                      ],
+                      "perSeriesAligner": "ALIGN_DELTA"
+                    },
+                    "filter": "metric.type=\"custom.googleapis.com/issues/untriaged_testcase_age\" resource.type=\"gce_instance\""
+                  }
+                }
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            }
+          }
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 192,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "title": "Untriaged testcase age (p99 - hours)",
+          "xyChart": {
+            "chartOptions": {
+              "mode": "COLOR"
+            },
+            "dataSets": [
+              {
+                "minAlignmentPeriod": "60s",
+                "plotType": "LINE",
+                "targetAxis": "Y1",
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "crossSeriesReducer": "REDUCE_PERCENTILE_99",
+                      "groupByFields": [
+                        "metric.label.\"step\""
+                      ],
+                      "perSeriesAligner": "ALIGN_DELTA"
+                    },
+                    "filter": "metric.type=\"custom.googleapis.com/issues/untriaged_testcase_age\" resource.type=\"gce_instance\""
+                  }
+                }
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            }
+          }
+        }
+      },
+      {
+        "yPos": 36,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum(increase(custom_googleapis_com:task_fuzz_fuzzer_known_crash_count{monitored_resource=\"gce_instance\"}[24h]))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Known crash counts over the last 24h",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 36,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum(increase(custom_googleapis_com:task_fuzz_fuzzer_new_crash_count{monitored_resource=\"gce_instance\"}[24h]))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "New crash counts over the last 24h",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 124,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"preprocess\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p50 preprocess duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 92,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"uworker_main\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p50 utask_main duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 156,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"postprocess\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p50 postprocess duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 124,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"preprocess\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p95 preprocess duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 92,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"uworker_main\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p95 utask_main duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 156,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, task) (\n    increase(custom_googleapis_com:utask_subtask_duration_secs_bucket{\n      monitored_resource=\"gce_instance\",\n      subtask=\"postprocess\",\n    }[1h])\n  )\n)",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "p95 postprocess duration by task (seconds)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 176,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.50,\n  sum by (le, step) (\n    increase(custom_googleapis_com:testcase_analysis_triage_duration_hours_bucket{\n      monitored_resource=\"gce_instance\",\n    }[1h])\n  )\n)\n\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Testcase triage duration (p50 - hours) - by step",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 176,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.90,\n  sum by (le, step) (\n    increase(custom_googleapis_com:testcase_analysis_triage_duration_hours_bucket{\n      monitored_resource=\"gce_instance\",\n    }[1h])\n  )\n)\n\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Testcase triage duration (p90 - hours) - by step",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 176,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.95,\n  sum by (le, step) (\n    increase(custom_googleapis_com:testcase_analysis_triage_duration_hours_bucket{\n      monitored_resource=\"gce_instance\",\n    }[1h])\n  )\n)\n",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Testcase triage duration (p95 - hours) - by step",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 224,
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "GCS metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 228,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"storage.googleapis.com/api/request_count\" resource.type=\"gcs_bucket\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "metric.label.\"method\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "GCS - Request count by method",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 228,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "100 * (sum by (method)(rate(storage_googleapis_com:api_request_count\\{monitored_resource=\"gcs_bucket\", response_code!=\"OK\"\\}[$\\{__interval\\}])) \n/\nsum by (method)(rate(storage_googleapis_com:api_request_count\\{monitored_resource=\"gcs_bucket\"\\}[$\\{__interval\\}])))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "GCS - Error rate by method",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 228,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"storage.googleapis.com/api/request_count\" resource.type=\"gcs_bucket\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "metric.label.\"response_code\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "GCS Bucket - Request count by response code",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 248,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"datastore.googleapis.com/api/request_count\" resource.type=\"datastore_request\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "metric.label.\"api_method\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Request count by method",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 248,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": " 100 * sum by (api_method)(rate(datastore_googleapis_com:api_request_count\\{monitored_resource=\"datastore_request\", response_code!='OK'\\}[$\\{__interval\\}])) /\nsum by (api_method)(rate(datastore_googleapis_com:api_request_count\\{monitored_resource=\"datastore_request\"\\}[$\\{__interval\\}]))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Error rate by method",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 264,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.50,sum by (api_method,le)(increase(firestore_googleapis_com:api_request_latencies_bucket\\{monitored_resource=\"firestore.googleapis.com/Database\"\\}[$\\{__interval\\}])))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Latency by method (p50)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 248,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"datastore.googleapis.com/api/request_count\" resource.type=\"datastore_request\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "metric.label.\"response_code\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Request count by response code",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 264,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.90,sum by (api_method,le)(increase(firestore_googleapis_com:api_request_latencies_bucket\\{monitored_resource=\"firestore.googleapis.com/Database\"\\}[$\\{__interval\\}])))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Latency by method (p90)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 264,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "histogram_quantile(0.95,sum by (api_method,le)(increase(firestore_googleapis_com:api_request_latencies_bucket\\{monitored_resource=\"firestore.googleapis.com/Database\"\\}[$\\{__interval\\}])))",
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "Datastore - Latency by method (p95)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 244,
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "Datastore metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 52,
+        "width": 48,
+        "height": 4,
+        "widget": {
+          "title": "PubSub metrics",
+          "text": {
+            "content": "",
+            "format": "MARKDOWN",
+            "style": {
+              "backgroundColor": "#FFFFFF",
+              "textColor": "#212121",
+              "horizontalAlignment": "H_CENTER",
+              "verticalAlignment": "V_TOP",
+              "padding": "P_EXTRA_SMALL",
+              "fontSize": "FS_LARGE",
+              "pointerLocation": "POINTER_LOCATION_UNSPECIFIED"
+            }
+          },
+          "id": ""
+        }
+      },
+      {
+        "yPos": 56,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"pubsub.googleapis.com/subscription/num_undelivered_messages\" resource.type=\"pubsub_subscription\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_MEAN",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "metadata.system_labels.\"topic_id\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "PubSub - Unacked messages (per topic)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 56,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "timeSeriesFilter": {
+                    "filter": "metric.type=\"pubsub.googleapis.com/topic/send_request_count\" resource.type=\"pubsub_topic\"",
+                    "aggregation": {
+                      "alignmentPeriod": "60s",
+                      "perSeriesAligner": "ALIGN_RATE",
+                      "crossSeriesReducer": "REDUCE_SUM",
+                      "groupByFields": [
+                        "resource.label.\"topic_id\""
+                      ]
+                    }
+                  },
+                  "unitOverride": "",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "minAlignmentPeriod": "60s",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "PubSub - Sent messages (per topic)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 108,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type\\{monitored_resource=\"gce_instance\",subtask=\"preprocess\", error_condition=\"UNHANDLED_EXCEPTION\"\\}[$\\{__interval\\}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type\\{monitored_resource=\"gce_instance\",subtask=\"preprocess\"\\}[$\\{__interval\\}]))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "preprocess unhandled exception rate (by task)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 32,
+        "yPos": 76,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type\\{monitored_resource=\"gce_instance\",subtask=\"uworker_main\", error_condition=\"UNHANDLED_EXCEPTION\"\\}[$\\{__interval\\}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type\\{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"\\}[$\\{__interval\\}]))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "uworker_main unhandled exception rate (by task)",
+          "id": ""
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 140,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type\\{monitored_resource=\"gce_instance\", subtask=\"postprocess\", error_condition=\"UNHANDLED_EXCEPTION\"\\}[$\\{__interval\\}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome_by_error_type\\{monitored_resource=\"gce_instance\", subtask=\"postprocess\"\\}[$\\{__interval\\}]))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "postprocess unhandled exception rate (by task)",
+          "id": ""
+        }
+      },
+      {
+        "yPos": 208,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "title": "Untriaged testcase count (by status)",
+          "xyChart": {
+            "chartOptions": {
+              "mode": "COLOR"
+            },
+            "dataSets": [
+              {
+                "plotType": "LINE",
+                "targetAxis": "Y1",
+                "timeSeriesQuery": {
+                  "prometheusQuery": "sum by (status)(last_over_time((custom_googleapis_com:issues_untriaged_testcase_count{monitored_resource=\"gce_instance\"}[1h])))\n",
+                  "unitOverride": ""
+                }
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            }
+          }
+        }
+      },
+      {
+        "xPos": 16,
+        "yPos": 76,
+        "width": 16,
+        "height": 16,
+        "widget": {
+          "xyChart": {
+            "dataSets": [
+              {
+                "timeSeriesQuery": {
+                  "prometheusQuery": "100 * sum by (task)(rate(custom_googleapis_com:task_outcome\\{monitored_resource=\"gce_instance\",subtask=\"uworker_main\", task_succeeded=\"false\"\\}[$\\{__interval\\}]))\n/ sum by (task)(rate(custom_googleapis_com:task_outcome\\{monitored_resource=\"gce_instance\",subtask=\"uworker_main\"\\}[$\\{__interval\\}]))",
+                  "unitOverride": "%",
+                  "outputFullDuration": false
+                },
+                "plotType": "LINE",
+                "legendTemplate": "",
+                "targetAxis": "Y1",
+                "dimensions": [],
+                "measures": [],
+                "breakdowns": []
+              }
+            ],
+            "thresholds": [],
+            "yAxis": {
+              "label": "",
+              "scale": "LINEAR"
+            },
+            "chartOptions": {
+              "mode": "COLOR",
+              "showLegend": false,
+              "displayHorizontal": false
+            }
+          },
+          "title": "uworker_main overall failure rate (by task) - CAN BE DRILLED BY JOB",
+          "id": ""
+        }
+      }
+    ]
+  },
+  "dashboardFilters": [],
+  "labels": {}
+}
+JSON
+}
\ No newline at end of file