Skip to content

Commit

Permalink
[Monitoring] Partition UTask outcomes correctly into success and error (
Browse files Browse the repository at this point in the history
#4516)

### Motivation

#4458 implemented an error rate for utasks that only considered exceptions.
In #4499, outcomes were split into success, failure and maybe_retry
conditions. From that we learned that the volume of retryable outcomes is
negligible, so it makes sense to count them as failures.

Listing out all the success conditions under _MetricRecorder is not
desirable. However, we are consciously taking on this technical debt so we
can deliver #4271.

A refactor of uworker main will be performed later, so that we can split the
success and failure conditions, both of which are currently mixed in
uworker_output.ErrorType.

Reference for tech debt acknowledgement: #4517
  • Loading branch information
vitorguidi committed Dec 27, 2024
1 parent 3997898 commit b16161e
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 11 deletions.
31 changes: 22 additions & 9 deletions src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,17 @@ def __init__(self, subtask: _Subtask):
self._subtask = subtask
self._labels = None
self.utask_main_failure = None
self._utask_success_conditions = [
None, # This can be a successful return value in, ie, fuzz task
uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member
uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member
]

if subtask == _Subtask.PREPROCESS:
self._preprocess_start_time_ns = self.start_time_ns
Expand Down Expand Up @@ -125,6 +136,12 @@ def set_task_details(self,
# Ensure we always have a value after this method returns.
assert self._preprocess_start_time_ns is not None

def _infer_uworker_main_outcome(self, exc_type, uworker_error) -> bool:
  """Classifies a finished task as succeeded or failed.

  Args:
    exc_type: Exception type forwarded from __exit__; any truthy value means
      the task raised.
    uworker_error: Error type recorded for utask_main (may be None).

  Returns:
    True only when no exception type was supplied and |uworker_error| is one
    of the entries in self._utask_success_conditions; False otherwise.
  """
  # An exception always counts as a failure, regardless of the error type.
  if exc_type:
    return False
  # Otherwise the outcome is decided by the success-conditions list alone.
  return uworker_error in self._utask_success_conditions

def __exit__(self, _exc_type, _exc_value, _traceback):
# Ignore exception details, let Python continue unwinding the stack.

Expand All @@ -145,11 +162,12 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
# The only case where a task might fail without throwing, is in
# utask_main, by returning an ErrorType proto which indicates
# failure.
outcome = 'error' if _exc_type or self.utask_main_failure else 'success'
task_succeeded = self._infer_uworker_main_outcome(_exc_type,
self.utask_main_failure)
monitoring_metrics.TASK_OUTCOME_COUNT.increment({
**self._labels, 'outcome': outcome
**self._labels, 'task_succeeded': task_succeeded
})
if outcome == "success":
if task_succeeded:
error_condition = 'N/A'
elif _exc_type:
error_condition = 'UNHANDLED_EXCEPTION'
Expand All @@ -161,16 +179,11 @@ def __exit__(self, _exc_type, _exc_value, _traceback):
# labels limit recommended by gcp.
trimmed_labels = self._labels
del trimmed_labels['job']
trimmed_labels['outcome'] = outcome
trimmed_labels['task_succeeded'] = task_succeeded
trimmed_labels['error_condition'] = error_condition
monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment(
trimmed_labels)

if error_condition != 'UNHANDLED_EXCEPTION':
task = self._labels['task']
subtask = self._labels['subtask']
logs.info(f'Task {task}, at subtask {subtask}, finished successfully.')


def ensure_uworker_env_type_safety(uworker_env):
"""Converts all values in |uworker_env| to str types.
Expand Down
4 changes: 2 additions & 2 deletions src/clusterfuzz/_internal/metrics/monitoring_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@
monitor.StringField('subtask'),
monitor.StringField('mode'),
monitor.StringField('platform'),
monitor.StringField('outcome'),
monitor.BooleanField('task_succeeded'),
])

TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric(
Expand All @@ -274,7 +274,7 @@
monitor.StringField('subtask'),
monitor.StringField('mode'),
monitor.StringField('platform'),
monitor.StringField('outcome'),
monitor.BooleanField('task_succeeded'),
monitor.StringField('error_condition'),
])

Expand Down

0 comments on commit b16161e

Please sign in to comment.