From 7c5add3e7c8974fc737b55e1adaca07af7b2c0b1 Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Wed, 2 Oct 2024 16:33:46 +0200 Subject: [PATCH] Dropping RequestRetrying altogether Fix #987 --- minet/crawl/crawler.py | 15 ++++++++------- minet/executors.py | 8 ++++++++ minet/web.py | 20 +++----------------- 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/minet/crawl/crawler.py b/minet/crawl/crawler.py index e31ab92959..4113cbee6c 100644 --- a/minet/crawl/crawler.py +++ b/minet/crawl/crawler.py @@ -146,6 +146,11 @@ def __init__( "cancel_event": crawler.executor.cancel_event, } + if self.crawler.executor.retry_on_statuses is not None: + self.default_kwargs["raise_on_statuses"] = ( + self.crawler.executor.retry_on_statuses + ) + if use_pycurl: del self.default_kwargs["pool_manager"] self.default_kwargs["use_pycurl"] = True @@ -200,16 +205,12 @@ def __call__( # and the subsequent spider processing response = None - # NOTE: the function takes "url" and "raise_on_statuses" because of RequestRetrying quirks - def retryable_work( - url: str, raise_on_statuses=None - ) -> Optional[Tuple["Response", Any, Any]]: + # NOTE: the function takes "url" so that the executor may format the warning's epilog + def retryable_work(url: str) -> Optional[Tuple["Response", Any, Any]]: nonlocal response try: - response = request_fn( - url, raise_on_statuses=raise_on_statuses, **kwargs - ) + response = request_fn(url, **kwargs) except CancelledRequestError: return diff --git a/minet/executors.py b/minet/executors.py index 80b983a0c1..5e2a7f5f3d 100644 --- a/minet/executors.py +++ b/minet/executors.py @@ -18,6 +18,7 @@ Union, Tuple, Awaitable, + Container, Any, TYPE_CHECKING, Literal, @@ -365,6 +366,7 @@ def __init__( infer_redirection: bool = False, canonicalize: bool = False, known_encoding: Optional[str] = None, + raise_on_statuses: Optional[Container[int]] = None, callback: Optional[ Union[ Callable[[ItemType, str, Response], CallbackResultType], @@ -398,6 +400,7 @@ def __init__( "follow_js_relocation": follow_js_relocation, "infer_redirection": infer_redirection, "canonicalize": canonicalize, + "raise_on_statuses": raise_on_statuses, } if use_pycurl: @@ -490,6 +493,7 @@ def __init__( ): self.cancel_event = Event() self.local_context = threading.local() + self.retry_on_statuses = None if retry: @@ -505,6 +509,8 @@ def epilog(retry_state: RetryCallState) -> str: default_retryer_kwargs.update(retryer_kwargs or {}) + self.retry_on_statuses = default_retryer_kwargs.get("retry_on_statuses") + def init_local_context(): self.local_context.retryer = create_request_retryer( **default_retryer_kwargs @@ -615,6 +621,7 @@ def request( use_pycurl=use_pycurl, compressed=compressed, known_encoding=known_encoding, + raise_on_statuses=self.retry_on_statuses, callback=callback, ) @@ -723,6 +730,7 @@ def resolve( follow_js_relocation=follow_js_relocation, infer_redirection=infer_redirection, canonicalize=canonicalize, + raise_on_statuses=self.retry_on_statuses, callback=callback, ) diff --git a/minet/web.py b/minet/web.py index d61f40e71b..65a6f3cf9a 100644 --- a/minet/web.py +++ b/minet/web.py @@ -1228,20 +1228,6 @@ def __call__(self, retry_state: RetryCallState) -> float: return max(0, min(result, self.max)) -class RequestRetrying(Retrying): - def __init__( - self, *args, invalid_statuses: Optional[Container[int]] = None, **kwargs - ): - self._invalid_statuses = invalid_statuses - super().__init__(*args, **kwargs) - - def __call__(self, fn, *args, **kwargs): - if self._invalid_statuses is not None: - kwargs["raise_on_statuses"] = self._invalid_statuses - - return super().__call__(fn, *args, **kwargs) - - def create_request_retryer( min: float = 10, max: float = ONE_DAY, @@ -1253,7 +1239,7 @@ def create_request_retryer( predicate: Optional[Callable[[BaseException], bool]] = None, epilog: Optional[Callable[[RetryCallState], Optional[str]]] = None, cancel_event: Optional[Event] = None, -) -> RequestRetrying: +) -> Retrying: # By default we only retry network issues, such as Internet being cut off etc. retryable_exception_types = [ # urllib3 errors @@ -1339,7 +1325,7 @@ def status_predicate(exc: BaseException) -> bool: lambda _: not cancel_event.is_set() ) - return RequestRetrying(invalid_statuses=retry_on_statuses, **retrying_kwargs) + return Retrying(**retrying_kwargs) def retrying_method(attr="retryer"): @@ -1363,7 +1349,7 @@ def __init__(self, **kwargs): self.kwargs = kwargs self.local_context = threading.local() - def acquire(self) -> RequestRetrying: + def acquire(self) -> Retrying: retryer = getattr(self.local_context, "retryer", None) if retryer is None: