Skip to content

Commit

Permalink
CSM O11y: improve the retry settings of list_time_series API call (#30)
Browse files Browse the repository at this point in the history
Extends the retry conditions for `metric_client.list_time_series` call
in csm_observability_test:

- Retry on HTTP 5xx + corresponding gRPC statuses. Previously only
retried on 503 ServiceUnavailable.
- Retry on HTTP 429 / gRPC ResourceExhausted. The error indicates the
request is rate limited, so we want to retry.

Ref [Default retry
settings](https://github.com/googleapis/google-cloud-python/blob/google-cloud-monitoring-v2.18.0/packages/google-cloud-monitoring/google/cloud/monitoring_v3/services/metric_service/transports/base.py#L210-L218).
  • Loading branch information
sergiitk authored Feb 2, 2024
1 parent d3d28f2 commit 49d4851
Showing 1 changed file with 24 additions and 0 deletions.
24 changes: 24 additions & 0 deletions tests/gamma/csm_observability_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

from absl import flags
from absl.testing import absltest
from google.api_core import exceptions as gapi_errors
from google.api_core import retry as gapi_retries
from google.cloud import monitoring_v3
import yaml

Expand Down Expand Up @@ -398,6 +400,27 @@ def query_metrics(
A helper function to make the cloud monitoring API call to query
metrics created by this test run.
"""

# Based on default retry settings for list_time_series method:
# https://github.com/googleapis/google-cloud-python/blob/google-cloud-monitoring-v2.18.0/packages/google-cloud-monitoring/google/cloud/monitoring_v3/services/metric_service/transports/base.py#L210-L218
# Modified: predicate extended to retry on a wider range of error types.
retry_settings = gapi_retries.Retry(
initial=0.1,
maximum=30.0,
multiplier=1.3,
predicate=gapi_retries.if_exception_type(
# Retry on 5xx, not just 503 ServiceUnavailable. This also
# covers gRPC Unknown, DataLoss, and DeadlineExceeded statuses.
# 501 MethodNotImplemented is not excluded because the most likely
# reason we'd see this error is server misconfiguration, so we
# want to give it a chance to recover from this situation too.
gapi_errors.ServerError,
# Retry on 429/ResourceExhausted: recoverable rate limiting.
gapi_errors.TooManyRequests,
),
deadline=90.0,
)

results = {}
for metric in metric_names:
logger.info("Requesting list_time_series for metric %s", metric)
Expand All @@ -406,6 +429,7 @@ def query_metrics(
filter=build_query_fn(metric),
interval=interval,
view=monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL,
retry=retry_settings,
)
time_series = list(response)

Expand Down

0 comments on commit 49d4851

Please sign in to comment.