From 3997898900c3ac58f93faf5e914860229c212741 Mon Sep 17 00:00:00 2001 From: Vitor Guidi Date: Tue, 17 Dec 2024 00:21:48 -0300 Subject: [PATCH 1/6] Revert #4499 (#4512) There is a clear way to partition ErrorType enums in the criteria proposed, reverting this one. Part of #4271 --- .../_internal/bot/tasks/utasks/__init__.py | 64 ++----------------- 1 file changed, 6 insertions(+), 58 deletions(-) diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py index 1dea6b20ed5..c9879799962 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py @@ -84,50 +84,6 @@ def __init__(self, subtask: _Subtask): self._subtask = subtask self._labels = None self.utask_main_failure = None - self._utask_success_conditions = [ - uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.MINIMIZE_UNREPRODUCIBLE_CRASH, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member - ] - self._utask_maybe_retry_conditions = [ - uworker_msg_pb2.ErrorType.ANALYZE_BUILD_SETUP, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISIONS_LIST, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.TESTCASE_SETUP, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.MINIMIZE_SETUP, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.FUZZ_DATA_BUNDLE_SETUP_FAILURE, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZ_TARGET_SELECTED, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.PROGRESSION_NO_CRASH, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.PROGRESSION_TIMEOUT, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_SETUP_ERROR, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.REGRESSION_BUILD_SETUP_ERROR, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.REGRESSION_TIMEOUT_ERROR, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.SYMBOLIZE_BUILD_SETUP_ERROR, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.MINIMIZE_DEADLINE_EXCEEDED_IN_MAIN_FILE_PHASE, # pylint: disable=no-member - ] - self._utask_failure_conditions = [ - uworker_msg_pb2.ErrorType.ANALYZE_NO_REVISION_INDEX, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.UNHANDLED, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.VARIANT_BUILD_SETUP, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.FUZZ_BUILD_SETUP_FAILURE, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.FUZZ_NO_FUZZER, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.PROGRESSION_REVISION_LIST_ERROR, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.PROGRESSION_BUILD_NOT_FOUND, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.PROGRESSION_BAD_BUILD, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.REGRESSION_REVISION_LIST_ERROR, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.REGRESSION_BUILD_NOT_FOUND, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.REGRESSION_BAD_BUILD_ERROR, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_FAILED, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.CORPUS_PRUNING_FUZZER_SETUP_FAILED, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.CORPUS_PRUNING_ERROR, # pylint: disable=no-member - uworker_msg_pb2.ErrorType.FUZZ_BAD_BUILD, # pylint: disable=no-member - ] if subtask == _Subtask.PREPROCESS: self._preprocess_start_time_ns = self.start_time_ns @@ -169,18 +125,6 @@ def set_task_details(self, # Ensure we always have a value after this method returns. assert self._preprocess_start_time_ns is not None - def _infer_uworker_main_outcome(self, exc_type, uworker_error): - '''Infers, on a best effort basis, whether an uworker output implies - success or failure. If an unequivocal response is not possible, - classifies as maybe_retry.''' - if exc_type or uworker_error in self._utask_failure_conditions: - outcome = 'error' - elif uworker_error in self._utask_maybe_retry_conditions: - outcome = 'maybe_retry' - else: - outcome = 'success' - return outcome - def __exit__(self, _exc_type, _exc_value, _traceback): # Ignore exception details, let Python continue unwinding the stack. @@ -201,8 +145,7 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # The only case where a task might fail without throwing, is in # utask_main, by returning an ErrorType proto which indicates # failure. - outcome = self._infer_uworker_main_outcome(_exc_type, - self.utask_main_failure) + outcome = 'error' if _exc_type or self.utask_main_failure else 'success' monitoring_metrics.TASK_OUTCOME_COUNT.increment({ **self._labels, 'outcome': outcome }) @@ -223,6 +166,11 @@ def __exit__(self, _exc_type, _exc_value, _traceback): monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment( trimmed_labels) + if error_condition != 'UNHANDLED_EXCEPTION': + task = self._labels['task'] + subtask = self._labels['subtask'] + logs.info(f'Task {task}, at subtask {subtask}, finished successfully.') + def ensure_uworker_env_type_safety(uworker_env): """Converts all values in |uworker_env| to str types. From b16161ea74a9c289bc13fcf91eb4243be74ebe99 Mon Sep 17 00:00:00 2001 From: Vitor Guidi Date: Mon, 23 Dec 2024 08:34:15 -0300 Subject: [PATCH 2/6] [Monitoring] Partition UTask outcomes correctly into success and error (#4516) ### Motivation #4458 implemented an error rate for utasks, only considering exceptions. In #4499 , outcomes were split between success, failure and maybe_retry conditions. There we learned that the volume of retryable outcomes is negligible, so it makes sense to count them as failures. Listing out all the success conditions under _MetricRecorder is not desirable. However, we are consciously taking this technical debt so we can deliver #4271 . A refactor of uworker main will be later performed, so we can split the success and failure conditions, both of which are mixed in uworker_output.ErrorType. Reference for tech debt acknowledgement: #4517 --- .../_internal/bot/tasks/utasks/__init__.py | 31 +++++++++++++------ .../_internal/metrics/monitoring_metrics.py | 4 +-- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py index c9879799962..b8d8f0f6634 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py @@ -84,6 +84,17 @@ def __init__(self, subtask: _Subtask): self._subtask = subtask self._labels = None self.utask_main_failure = None + self._utask_success_conditions = [ + None, # This can be a successful return value in, ie, fuzz task + uworker_msg_pb2.ErrorType.NO_ERROR, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.ANALYZE_NO_CRASH, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.PROGRESSION_BAD_STATE_MIN_MAX, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_NO_CRASH, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.REGRESSION_LOW_CONFIDENCE_IN_REGRESSION_RANGE, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.MINIMIZE_CRASH_TOO_FLAKY, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.LIBFUZZER_MINIMIZATION_UNREPRODUCIBLE, # pylint: disable=no-member + uworker_msg_pb2.ErrorType.ANALYZE_CLOSE_INVALID_UPLOADED, # pylint: disable=no-member + ] if subtask == _Subtask.PREPROCESS: self._preprocess_start_time_ns = self.start_time_ns @@ -125,6 +136,12 @@ def set_task_details(self, # Ensure we always have a value after this method returns. assert self._preprocess_start_time_ns is not None + def _infer_uworker_main_outcome(self, exc_type, uworker_error) -> bool: + """Returns True if task succeeded, False otherwise.""" + if exc_type or uworker_error not in self._utask_success_conditions: + return False + return True + def __exit__(self, _exc_type, _exc_value, _traceback): # Ignore exception details, let Python continue unwinding the stack. @@ -145,11 +162,12 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # The only case where a task might fail without throwing, is in # utask_main, by returning an ErrorType proto which indicates # failure. - outcome = 'error' if _exc_type or self.utask_main_failure else 'success' + task_succeeded = self._infer_uworker_main_outcome(_exc_type, + self.utask_main_failure) monitoring_metrics.TASK_OUTCOME_COUNT.increment({ - **self._labels, 'outcome': outcome + **self._labels, 'task_succeeded': task_succeeded }) - if outcome == "success": + if task_succeeded: error_condition = 'N/A' elif _exc_type: error_condition = 'UNHANDLED_EXCEPTION' @@ -161,16 +179,11 @@ def __exit__(self, _exc_type, _exc_value, _traceback): # labels limit recommended by gcp. trimmed_labels = self._labels del trimmed_labels['job'] - trimmed_labels['outcome'] = outcome + trimmed_labels['task_succeeded'] = task_succeeded trimmed_labels['error_condition'] = error_condition monitoring_metrics.TASK_OUTCOME_COUNT_BY_ERROR_TYPE.increment( trimmed_labels) - if error_condition != 'UNHANDLED_EXCEPTION': - task = self._labels['task'] - subtask = self._labels['subtask'] - logs.info(f'Task {task}, at subtask {subtask}, finished successfully.') - def ensure_uworker_env_type_safety(uworker_env): """Converts all values in |uworker_env| to str types. diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index cac65fd4ec9..cb54c9da545 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -263,7 +263,7 @@ monitor.StringField('subtask'), monitor.StringField('mode'), monitor.StringField('platform'), - monitor.StringField('outcome'), + monitor.BooleanField('task_succeeded'), ]) TASK_OUTCOME_COUNT_BY_ERROR_TYPE = monitor.CounterMetric( @@ -274,7 +274,7 @@ monitor.StringField('subtask'), monitor.StringField('mode'), monitor.StringField('platform'), - monitor.StringField('outcome'), + monitor.BooleanField('task_succeeded'), monitor.StringField('error_condition'), ]) From f4dc5ca81845d6677ce5b165e05ded9782f6e5a4 Mon Sep 17 00:00:00 2001 From: Vitor Guidi Date: Mon, 23 Dec 2024 15:11:06 -0300 Subject: [PATCH 3/6] [Monitoring] Enrich UNTRIAGED_TESTCASE_AGE metric to track testcases stuck in analyze (#4547) ### Motivation We currently have no way to tell if analyze task was successfully executed. The TESTCASE_UPLOAD_TRIAGE_DURATION metric from #4364 would only track duration for tasks that did finish. An analyze_pending field is added to the Testcase entity in datastore, which is set to False by default, to True for manually uploaded testcases, and to False once analyze task postprocess runs. It also increments the UNTRIAGED_TESTCASE_AGE metric from #4381 with a status label, so we can know at what step the testcase is stuck, thus allowing us to alert if analyze is taking longer to finish than expected. The alert itself could be, for instance, P50 age of untriaged testcase (status=analyze_pending) > 3h. Also, this retroactively addresses comments from #4481: * Fixes docstring for emit_testcase_triage_duration_metric * Removes assertions * Renames TESTCASE_UPLOAD_TRIAGE_DURATION to TESTCASE_TRIAGE_DURATION, since it now accounts for fuzzer generated testcases * Use a boolean "from_fuzzer" field, instead of "origin" string, in TESTCASE_TRIAGE_DURATION --- .../bot/tasks/utasks/analyze_task.py | 2 + .../_internal/common/testcase_utils.py | 40 ++++------------ src/clusterfuzz/_internal/cron/triage.py | 46 +++++++++++-------- .../_internal/datastore/data_handler.py | 3 +- .../_internal/datastore/data_types.py | 4 ++ .../_internal/metrics/monitoring_metrics.py | 9 ++-- 6 files changed, 49 insertions(+), 55 deletions(-) diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py index 785974f7353..1f0b7e8f58b 100644 --- a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py +++ b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py @@ -552,6 +552,8 @@ def _update_testcase(output): if analyze_task_output.platform_id: testcase.platform_id = analyze_task_output.platform_id + testcase.analyze_pending = False + testcase.put() diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py index 73fe9edfd4d..4314dcd55fa 100644 --- a/src/clusterfuzz/_internal/common/testcase_utils.py +++ b/src/clusterfuzz/_internal/common/testcase_utils.py @@ -31,59 +31,39 @@ def emit_testcase_triage_duration_metric(testcase_id: int, step: str): - '''Finds out if a testcase is fuzzer generated or manually uploaded, - and emits the TESTCASE_UPLOAD_TRIAGE_DURATION metric.''' - testcase_upload_metadata = get_testcase_upload_metadata(testcase_id) - if not testcase_upload_metadata: - logs.warning(f'No upload metadata found for testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') - return - if not testcase_upload_metadata.timestamp: - logs.warning( - f'No timestamp for testcase {testcase_upload_metadata.testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') - return - assert step in [ - 'analyze_launched', 'analyze_completed', 'minimize_completed', - 'regression_completed', 'impact_completed', 'issue_updated' - ] - + """Finds out if a testcase is fuzzer generated or manually uploaded, + and emits the TESTCASE_TRIAGE_DURATION metric.""" testcase = data_handler.get_testcase_by_id(testcase_id) if not testcase: logs.warning(f'No testcase found with id {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return if not testcase.job_type: logs.warning(f'No job_type associated to testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return from_fuzzer = not get_testcase_upload_metadata(testcase_id) - assert step in [ - 'analyze_launched', 'analyze_completed', 'minimize_completed', - 'regression_completed', 'impact_completed', 'issue_updated' - ] - if not testcase.get_age_in_seconds(): logs.warning(f'No timestamp associated to testcase {testcase_id},' - ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.') + ' failed to emit TESTCASE_TRIAGE_DURATION metric.') return - testcase_age_in_hours = testcase.get_age_in_seconds() / 3600 + testcase_age_in_hours = testcase.get_age_in_seconds() / (60 * 60) - logs.info('Emiting TESTCASE_UPLOAD_TRIAGE_DURATION metric for testcase ' + logs.info('Emiting TESTCASE_TRIAGE_DURATION metric for testcase ' f'{testcase_id} (age = {testcase_age_in_hours} hours.) ' - 'in step {step}.') + f'in step {step}, from_fuzzer: {from_fuzzer}.') - monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add( + monitoring_metrics.TESTCASE_TRIAGE_DURATION.add( testcase_age_in_hours, labels={ 'job': testcase.job_type, 'step': step, - 'origin': 'fuzzer' if from_fuzzer else 'manually_uploaded' + 'from_fuzzer': from_fuzzer }) diff --git a/src/clusterfuzz/_internal/cron/triage.py b/src/clusterfuzz/_internal/cron/triage.py index 1f3b3759227..ccfb03e8ae0 100644 --- a/src/clusterfuzz/_internal/cron/triage.py +++ b/src/clusterfuzz/_internal/cron/triage.py @@ -256,6 +256,8 @@ def _check_and_update_similar_bug(testcase, issue_tracker): def _emit_bug_filing_from_testcase_elapsed_time_metric(testcase): testcase_age = testcase.get_age_in_seconds() + if not testcase_age: + return monitoring_metrics.BUG_FILING_FROM_TESTCASE_ELAPSED_TIME.add( testcase_age, labels={ @@ -336,27 +338,30 @@ def _emit_untriaged_testcase_count_metric(): }) -def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase): +PENDING_ANALYZE = 'pending_analyze' +PENDING_CRITICAL_TASKS = 'pending_critical_tasks' +PENDING_PROGRESSION = 'pending_progression' +PENDING_GROUPING = 'pending_grouping' +PENDING_FILING = 'pending_filing' + + +def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase, + step: str): """Emmits a metric to track age of untriaged testcases.""" - if not testcase.timestamp: + if not testcase.get_age_in_seconds(): return logs.info(f'Emiting UNTRIAGED_TESTCASE_AGE for testcase {testcase.key.id()} ' - f'(age = {testcase.get_age_in_seconds()})') + f'(age = {testcase.get_age_in_seconds()}), step = {step}') monitoring_metrics.UNTRIAGED_TESTCASE_AGE.add( testcase.get_age_in_seconds() / 3600, labels={ 'job': testcase.job_type, 'platform': testcase.platform, + 'step': step, }) -PENDING_CRITICAL_TASKS = 'pending_critical_tasks' -PENDING_PROGRESSION = 'pending_progression' -PENDING_GROUPING = 'pending_grouping' -PENDING_FILING = 'pending_filing' - - def main(): """Files bugs.""" try: @@ -409,7 +414,7 @@ def main(): if testcase.get_metadata('progression_pending'): _set_testcase_stuck_state(testcase, True) logs.info(f'Skipping testcase {testcase_id}, progression pending') - _emit_untriaged_testcase_age_metric(testcase) + _emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION) _increment_untriaged_testcase_count(testcase.job_type, PENDING_PROGRESSION) continue @@ -432,10 +437,12 @@ def main(): # Require that all tasks like minimizaton, regression testing, etc have # finished. if not critical_tasks_completed: - _emit_untriaged_testcase_age_metric(testcase) + status = PENDING_CRITICAL_TASKS + if testcase.analyze_pending: + status = PENDING_ANALYZE + _emit_untriaged_testcase_age_metric(testcase, status) _set_testcase_stuck_state(testcase, True) - _increment_untriaged_testcase_count(testcase.job_type, - PENDING_CRITICAL_TASKS) + _increment_untriaged_testcase_count(testcase.job_type, status) logs.info( f'Skipping testcase {testcase_id}, critical tasks still pending.') continue @@ -452,7 +459,7 @@ def main(): # metadata works well. if not testcase.group_id and not dates.time_has_expired( testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT): - _emit_untriaged_testcase_age_metric(testcase) + _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING) _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') @@ -460,7 +467,7 @@ def main(): if not testcase.get_metadata('ran_grouper'): # Testcase should be considered by the grouper first before filing. - _emit_untriaged_testcase_age_metric(testcase) + _emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING) _set_testcase_stuck_state(testcase, True) _increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING) logs.info(f'Skipping testcase {testcase_id}, pending grouping.') @@ -490,16 +497,15 @@ def main(): # Clean up old triage messages that would be not applicable now. testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False) - # A testcase is untriaged, until immediately before a bug is opened - _emit_untriaged_testcase_age_metric(testcase) - _set_testcase_stuck_state(testcase, False) - _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING) - # File the bug first and then create filed bug metadata. if not _file_issue(testcase, issue_tracker, throttler): + _emit_untriaged_testcase_age_metric(testcase, PENDING_FILING) + _increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING) logs.info(f'Issue filing failed for testcase id {testcase_id}') continue + _set_testcase_stuck_state(testcase, False) + _create_filed_bug_metadata(testcase) issue_filer.notify_issue_update(testcase, 'new') diff --git a/src/clusterfuzz/_internal/datastore/data_handler.py b/src/clusterfuzz/_internal/datastore/data_handler.py index abe8d5c677d..6b378609c4d 100644 --- a/src/clusterfuzz/_internal/datastore/data_handler.py +++ b/src/clusterfuzz/_internal/datastore/data_handler.py @@ -921,7 +921,7 @@ def critical_tasks_completed(testcase): return testcase.minimized_keys and testcase.regression return bool(testcase.minimized_keys and testcase.regression and - testcase.is_impact_set_flag) + testcase.is_impact_set_flag and not testcase.analyze_pending) # ------------------------------------------------------------------------------ @@ -1379,6 +1379,7 @@ def create_user_uploaded_testcase(key, testcase.timestamp = utils.utcnow() testcase.created = testcase.timestamp + testcase.analyze_pending = True testcase.uploader_email = uploader_email testcase.put() diff --git a/src/clusterfuzz/_internal/datastore/data_types.py b/src/clusterfuzz/_internal/datastore/data_types.py index 8aa009ad36d..cbc63c7f4ba 100644 --- a/src/clusterfuzz/_internal/datastore/data_types.py +++ b/src/clusterfuzz/_internal/datastore/data_types.py @@ -583,6 +583,10 @@ class Testcase(Model): # Tracks if a testcase is stuck during triage. stuck_in_triage = ndb.BooleanProperty(default=False) + # Tracks if analyze task is pending. + # Defaults to false, since most testcases are fuzzer produced. + analyze_pending = ndb.BooleanProperty(default=False) + def is_chromium(self): return self.project_name in ('chromium', 'chromium-testing') diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index cb54c9da545..dfd92e100e4 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -231,17 +231,17 @@ ], ) -TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric( - 'uploaded_testcase_analysis/triage_duration_secs', +TESTCASE_TRIAGE_DURATION = monitor.CumulativeDistributionMetric( + 'testcase_analysis/triage_duration_hours', description=('Time elapsed between testcase upload and completion' - ' of relevant tasks in the testcase upload lifecycle.' + ' of relevant tasks in the testcase lifecycle.' ' Origin can be either from a fuzzer, or a manual' ' upload. Measured in hours.'), bucketer=monitor.GeometricBucketer(), field_spec=[ monitor.StringField('step'), monitor.StringField('job'), - monitor.StringField('origin'), + monitor.BooleanField('from_fuzzer'), ], ) @@ -365,6 +365,7 @@ field_spec=[ monitor.StringField('job'), monitor.StringField('platform'), + monitor.StringField('step'), ]) UNTRIAGED_TESTCASE_COUNT = monitor.GaugeMetric( From 6add668ba4c316695d51133dc2518058af4f908d Mon Sep 17 00:00:00 2001 From: Vitor Guidi Date: Thu, 26 Dec 2024 15:42:49 -0300 Subject: [PATCH 4/6] [Monitoring] Create a dashboard for overall clusterfuzz health (#4497) ### Motivation As a final step for the monitoring project, this PR creates a dashboard for overall system health. The reasoning behind it is to have: * Business level metrics (fuzzing hours, generated testcases, issues filed, issues closed, testcases for which processing is pending) * Testcase metrics (untriaged testcase age and count) * SQS metrics (queue size, and published messages, per topic) * Datastore/GCS metrics (number of requests, error rate, and latencies) * Utask level metrics (duration, number of executions, error rate, latency) These are sufficient to apply the [RED methodology](https://grafana.com/blog/2018/08/02/the-red-method-how-to-instrument-your-services/) (rate, error and duration), provide high level metrics we can alert on, and aid in troubleshooting outages with a well defined methodology. There were two options to commit this to version control: terraform, or butler definitions. The first was chosen, since it is the preffered long term solution, and it is also simpler to implement, since it supports copy pasting the JSON definition from GCP. ### Attention points This should be automatically imported from main.tf, so it (should be) sufficient to just place the .tf file in the same folder, and have butler deploy handle the terraform apply step. ### How to review Head over to go/cf-chrome-health-beta, internally. It is not expected that the actual dashboard definition is reviewed, it is just a dump of what was manually created in GCP. Part of #4271 --- infra/terraform/monitoring.tf | 1624 +++++++++++++++++++++++++++++++++ 1 file changed, 1624 insertions(+) create mode 100644 infra/terraform/monitoring.tf diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf new file mode 100644 index 00000000000..a725e4ea075 --- /dev/null +++ b/infra/terraform/monitoring.tf @@ -0,0 +1,1624 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { + dashboard_id = "clusterfuzz_sli_dashboard" + display_name = "ClusterFuzz General Health" + dashboard_json = < Date: Thu, 26 Dec 2024 15:53:11 -0300 Subject: [PATCH 5/6] Revert #4497 (#4560) Reverting #4497 due to terraform apply erroring out, will reland once it is fixed. --- infra/terraform/monitoring.tf | 1624 --------------------------------- 1 file changed, 1624 deletions(-) delete mode 100644 infra/terraform/monitoring.tf diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf deleted file mode 100644 index a725e4ea075..00000000000 --- a/infra/terraform/monitoring.tf +++ /dev/null @@ -1,1624 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { - dashboard_id = "clusterfuzz_sli_dashboard" - display_name = "ClusterFuzz General Health" - dashboard_json = < Date: Thu, 26 Dec 2024 17:43:02 -0300 Subject: [PATCH 6/6] [Monitoring] Restore dashboard terraform definition (#4561) This brings back #4497 . The issue was that, in promql definitions, the '{}' characters were lacking a double escape (\\\\{ and \\\\}). Also, the only parameter to the terraform object should be dashboard_json. Testing: terraform validate ![image](https://github.com/user-attachments/assets/f67434d0-c8e4-4bc0-be38-27188c98ea49) Part of #4271 --- infra/terraform/monitoring.tf | 1622 +++++++++++++++++++++++++++++++++ 1 file changed, 1622 insertions(+) create mode 100644 infra/terraform/monitoring.tf diff --git a/infra/terraform/monitoring.tf b/infra/terraform/monitoring.tf new file mode 100644 index 00000000000..7d9a540d76d --- /dev/null +++ b/infra/terraform/monitoring.tf @@ -0,0 +1,1622 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +resource "google_monitoring_dashboard" "clusterfuzz_sli_dashboard" { + dashboard_json = <