diff --git a/ftest/crawlers/echojs.py b/ftest/crawlers/echojs.py
index ee93e0390e..930e3c4d9b 100644
--- a/ftest/crawlers/echojs.py
+++ b/ftest/crawlers/echojs.py
@@ -63,7 +63,10 @@ def process(self, job: CrawlJob, response: Response) -> SpiderResult:
 
 class EchoJSCrawler(Crawler):
     def __init__(self, **kwargs):
-        super().__init__(EchoJSSpider(), **kwargs)
+        super().__init__(
+            EchoJSSpider(),
+            **kwargs,
+        )
 
 
 def factory(**crawler_kwargs):
diff --git a/minet/crawl/crawler.py b/minet/crawl/crawler.py
index 175c16e7b1..e31ab92959 100644
--- a/minet/crawl/crawler.py
+++ b/minet/crawl/crawler.py
@@ -26,6 +26,7 @@
 if TYPE_CHECKING:
     from playwright.async_api import BrowserContext
     from minet.browser import ThreadsafeBrowser
+    from minet.web import Response
 
 from os import makedirs
 from os.path import join
@@ -189,53 +190,73 @@ def __call__(
         if cancel_event.is_set():
             return
 
-        try:
-            retryer = getattr(self.local_context, "retryer", None)
-            request_fn = (
-                request
-                if self.crawler.browser is None
-                else self.crawler.browser.request
-            )
+        request_fn = (
+            request
+            if self.crawler.browser is None
+            else self.crawler.browser.request
+        )
+
+        # NOTE: we create an atomic unit of work that will retry both the request
+        # and the subsequent spider processing
+        response = None
+
+        # NOTE: the function takes "url" and "raise_on_statuses" because of RequestRetrying quirks
+        def retryable_work(
+            url: str, raise_on_statuses=None
+        ) -> Optional[Tuple["Response", Any, Any]]:
+            nonlocal response
+
+            try:
+                response = request_fn(
+                    url, raise_on_statuses=raise_on_statuses, **kwargs
+                )
+
+            except CancelledRequestError:
+                return
 
-            if retryer is not None:
-                response = retryer(request_fn, job.url, **kwargs)
+            if cancel_event.is_set():
+                return
+
+            spider_result = spider.process(job, response)
+
+            if spider_result is not None:
+                try:
+                    data, next_jobs = spider_result
+                except (ValueError, TypeError):
+                    raise TypeError(
+                        'Spider.process is expected to return either None or a 2-tuple containing data and next targets to enqueue. Got a "%s" instead.'
+                        % spider_result.__class__.__name__
+                    )
             else:
-                response = request_fn(job.url, **kwargs)
+                data = None
+                next_jobs = None
 
-            # If end url is different from job we add the url to visited cache
-            # NOTE: this is somewhat subject to race conditions but it should
-            # be benign and still be useful in some cases.
-            if self.crawler.url_cache is not None and job.url != response.end_url:
-                with self.crawler.enqueue_lock:
-                    self.crawler.url_cache.add(response.end_url)
+            return response, data, next_jobs
 
-        except CancelledRequestError:
-            return
+        retryer = getattr(self.local_context, "retryer", None)
+
+        try:
+            if retryer is None:
+                output = retryable_work(job.url)
+            else:
+                output = retryer(retryable_work, job.url)
 
         except EXPECTED_WEB_ERRORS as error:
             return ErroredCrawlResult(job, error), None
 
-        if cancel_event.is_set():
-            return
-
-        try:
-            spider_result = spider.process(job, response)
         except Exception as reason:
+            if response is None:
+                raise
+
             raise CrawlerSpiderProcessError(
                 reason=reason, job=job, response=response
             )
 
-        if spider_result is not None:
-            try:
-                data, next_jobs = spider_result
-            except (ValueError, TypeError):
-                raise TypeError(
-                    'Spider.process is expected to return either None or a 2-tuple containing data and next targets to enqueue. Got a "%s" instead.'
-                    % spider_result.__class__.__name__
-                )
-        else:
-            data = None
-            next_jobs = None
+        # Was cancelled?
+        if output is None:
+            return
+
+        response, data, next_jobs = output
 
         if cancel_event.is_set():
             return
diff --git a/minet/executors.py b/minet/executors.py
index 90a762d83b..80b983a0c1 100644
--- a/minet/executors.py
+++ b/minet/executors.py
@@ -535,7 +535,8 @@ def cancel(self) -> None:
 
     def shutdown(self, wait=True) -> None:
         self.cancel()
-        self.pool_manager.clear()
+        if hasattr(self, "pool_manager"):
+            self.pool_manager.clear()
         return super().shutdown(wait=wait)
 
     @overload
diff --git a/minet/web.py b/minet/web.py
index 7b58b39868..d61f40e71b 100644
--- a/minet/web.py
+++ b/minet/web.py
@@ -1236,7 +1236,7 @@ def __init__(
         super().__init__(*args, **kwargs)
 
     def __call__(self, fn, *args, **kwargs):
-        if self._invalid_statuses is not None and fn in (request, resolve):
+        if self._invalid_statuses is not None:
             kwargs["raise_on_statuses"] = self._invalid_statuses
 
         return super().__call__(fn, *args, **kwargs)