diff --git a/tests/gamma/csm_observability_test.py b/tests/gamma/csm_observability_test.py index 24eb0d37..aa7d649c 100644 --- a/tests/gamma/csm_observability_test.py +++ b/tests/gamma/csm_observability_test.py @@ -19,6 +19,8 @@ from absl import flags from absl.testing import absltest +from google.api_core import exceptions as gapi_errors +from google.api_core import retry as gapi_retries from google.cloud import monitoring_v3 import yaml @@ -398,6 +400,27 @@ def query_metrics( A helper function to make the cloud monitoring API call to query metrics created by this test run. """ + + # Based on default retry settings for list_time_series method: + # https://github.com/googleapis/google-cloud-python/blob/google-cloud-monitoring-v2.18.0/packages/google-cloud-monitoring/google/cloud/monitoring_v3/services/metric_service/transports/base.py#L210-L218 + # Modified: predicate extended to retry on a wider range of error types. + retry_settings = gapi_retries.Retry( + initial=0.1, + maximum=30.0, + multiplier=1.3, + predicate=gapi_retries.if_exception_type( + # Retry on 5xx, not just 503 ServiceUnavailable. This also + # covers gRPC Unknown, DataLoss, and DeadlineExceeded statuses. + # 501 MethodNotImplemented is not excluded because the most likely + # reason we'd see this error is server misconfiguration, so we + # want to give it a chance to recover from this situation too. + gapi_errors.ServerError, + # Retry on 429/ResourceExhausted: recoverable rate limiting. + gapi_errors.TooManyRequests, + ), + deadline=90.0, + ) + results = {} for metric in metric_names: logger.info("Requesting list_time_series for metric %s", metric) @@ -406,6 +429,7 @@ def query_metrics( filter=build_query_fn(metric), interval=interval, view=monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, + retry=retry_settings, ) time_series = list(response)