From 64275ddd8dc8b31b75627babb2d09d1636897658 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Wed, 22 Nov 2023 08:08:04 +0100
Subject: [PATCH 1/6] Add use_playwright parameter to the request

---
 app/models/crawl.py   | 1 +
 app/models/request.py | 2 ++
 app/models/website.py | 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/app/models/crawl.py b/app/models/crawl.py
index 8dcac6d..f89f1d0 100644
--- a/app/models/crawl.py
+++ b/app/models/crawl.py
@@ -12,6 +12,7 @@ class CrawlParameters(BaseModel):
     depth: int
     limit: int
+    use_playwright: bool


 class CrawlConfig(BaseModel):
diff --git a/app/models/request.py b/app/models/request.py
index 718ba4d..56a6392 100644
--- a/app/models/request.py
+++ b/app/models/request.py
@@ -11,6 +11,7 @@ class UpdateWebsiteRequest(BaseModel):
     depth: int | None = None
     limit: int | None = None
+    use_playwright: bool = True
     lighthouse: MetadataConfig | None = None
     technologies_and_trackers: MetadataConfig | None = None
     responsiveness: MetadataConfig | None = None
@@ -25,6 +26,7 @@ class CreateWebsiteRequest(BaseModel):
     url: str
     depth: int = Field(ge=0, default=2)
     limit: int = Field(ge=0, default=400)
+    use_playwright: bool = Field(default=True)
     lighthouse: MetadataConfig = Field(default=MetadataConfig())
     technologies_and_trackers: MetadataConfig = Field(
         default=MetadataConfig(enabled=False)
     )
diff --git a/app/models/website.py b/app/models/website.py
index 2675f1d..de052b1 100644
--- a/app/models/website.py
+++ b/app/models/website.py
@@ -16,6 +16,7 @@ class WebsiteModel(BaseModel):
     url: str
     depth: int
     limit: int
+    use_playwright: bool
     lighthouse: MetadataConfig
     technologies_and_trackers: MetadataConfig
     responsiveness: MetadataConfig
@@ -31,7 +32,7 @@ class WebsiteModel(BaseModel):
     def to_config(self) -> CrawlConfig:
         return CrawlConfig(
             url=self.url,
-            parameters=CrawlParameters(depth=self.depth, limit=self.limit),
+            parameters=CrawlParameters(depth=self.depth, limit=self.limit, use_playwright=self.use_playwright),
             metadata_config={
                 MetadataType.LIGHTHOUSE: self.lighthouse,
                 MetadataType.TECHNOLOGIES: self.technologies_and_trackers,

From 4d9d44f819606b0b9218216f1f95cd1321cf558b Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Mon, 27 Nov 2023 09:04:51 +0100
Subject: [PATCH 2/6] Playwright support

---
 Dockerfile                         |  5 +++--
 app/celery_broker/crawler_utils.py |  6 ++++++
 app/crawler/spider.py              | 31 ++++++++++++++++++++++++++++++
 requirements.txt                   |  5 ++++-
 4 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index f8c68ad..7c03d05 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11-alpine3.18
+FROM python:3.11

 ENV PYTHONUNBUFFERED 1
 ENV PYTHONDONTWRITEBYTECODE 1
@@ -8,7 +8,8 @@ WORKDIR /open-crawler

 COPY ./requirements.txt /open-crawler

-RUN pip install --no-cache-dir --upgrade -r requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt \
+    && playwright install --with-deps chromium

 COPY ./app/ /open-crawler/app
diff --git a/app/celery_broker/crawler_utils.py b/app/celery_broker/crawler_utils.py
index 6eedf51..6b31e6d 100644
--- a/app/celery_broker/crawler_utils.py
+++ b/app/celery_broker/crawler_utils.py
@@ -24,6 +24,12 @@ def init_crawler_settings(crawl_process: CrawlProcess):
         }
     )
     settings.update(custom_settings)
+    if crawl_process.config.parameters.use_playwright:
+        settings.set('DOWNLOAD_HANDLERS', {
+            'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
+            'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
+        })
+
     return settings
diff --git a/app/crawler/spider.py b/app/crawler/spider.py
index 1b52286..bab672a 100644
--- a/app/crawler/spider.py
+++ b/app/crawler/spider.py
@@ -1,5 +1,6 @@
 from urllib.parse import urlparse

+from scrapy import Request
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule

@@ -9,6 +10,7 @@ class MenesrSpider(CrawlSpider):
     name = "menesr"
     rules = (Rule(),)
+    use_playwright = False

     def __init__(self, crawl_process: CrawlProcess, *a, **kw):
         parsed_url = urlparse(crawl_process.config.url)
@@ -17,8 +19,37 @@ def __init__(self, crawl_process: CrawlProcess, *a, **kw):
         self.allowed_domains = [parsed_url.netloc]
         self.start_urls = [crawl_process.config.url]
         self.crawl_process = crawl_process
+        self.use_playwright = crawl_process.config.parameters.use_playwright
         super().__init__(*a, **kw)

+    def start_requests(self):
+        for url in self.start_urls:
+            if self.use_playwright:
+                yield Request(url, self.parse, meta={
+                    'depth': 0,  # Set the initial depth to 0
+                    "playwright": True,
+                    "playwright_page_methods": [
+                        ("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
+                    ]
+                })
+            else:
+                yield Request(url, self.parse)
+
+
+    def parse(self, response, **kwargs):
+        # Crawl the links in the response page and continue to crawl the next page
+        links = LinkExtractor().extract_links(response)
+        for link in links:
+            if self.use_playwright:
+                yield Request(link.url, self.parse, meta={
+                    "playwright": True,
+                    "playwright_page_methods": [
+                        ("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
+                    ]
+                })
+            else:
+                yield Request(link.url)
+

 if __name__ == "__main__":
     from scrapy.utils.project import get_project_settings
diff --git a/requirements.txt b/requirements.txt
index 9fb7dbb..6959573 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@ dnspython==2.4.2
 fastapi==0.103.1
 filelock==3.12.4
 gevent==23.7.0
-greenlet==2.0.2
+greenlet==3.0.1
 h11==0.14.0
 hyperlink==21.0.0
 idna==3.4
@@ -33,6 +33,7 @@ lxml==4.9.3
 minio==7.1.15
 packaging==23.1
 parsel==1.8.1
+playwright==1.40.0
 prompt-toolkit==3.0.39
 Protego==0.3.0
 pyasn1==0.5.0
@@ -41,6 +42,7 @@ pycparser==2.21
 pydantic==2.4.1
 pydantic_core==2.10.1
 PyDispatcher==2.0.7
+pyee==11.0.1
 pymongo==4.4.1
 pyOpenSSL==23.2.0
 python-dateutil==2.8.2
@@ -49,6 +51,7 @@ redis==4.6.0
 requests==2.31.0
 requests-file==1.5.1
 Scrapy==2.9.0
+scrapy-playwright==0.0.33
 service-identity==23.1.0
 six==1.16.0
 sniffio==1.3.0

From 5bf63b27e1edd2d984b238cea9685523ed859ba4 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Mon, 27 Nov 2023 15:10:33 +0100
Subject: [PATCH 3/6] Limit crawl to the allowed URL path

---
 app/crawler/spider.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/app/crawler/spider.py b/app/crawler/spider.py
index bab672a..4c6d1bb 100644
--- a/app/crawler/spider.py
+++ b/app/crawler/spider.py
@@ -11,34 +11,38 @@ class MenesrSpider(CrawlSpider):
     name = "menesr"
     rules = (Rule(),)
     use_playwright = False
+    allowed_url = None

     def __init__(self, crawl_process: CrawlProcess, *a, **kw):
         parsed_url = urlparse(crawl_process.config.url)
+        self.use_playwright = crawl_process.config.parameters.use_playwright
         if parsed_url.path:
-            self.rules = (Rule(LinkExtractor(allow=parsed_url.path)),)
+            self.allowed_url = parsed_url.path
         self.allowed_domains = [parsed_url.netloc]
         self.start_urls = [crawl_process.config.url]
         self.crawl_process = crawl_process
-        self.use_playwright = crawl_process.config.parameters.use_playwright
         super().__init__(*a, **kw)

+
     def start_requests(self):
         for url in self.start_urls:
            if self.use_playwright:
                 yield Request(url, self.parse, meta={
-                    'depth': 0,  # Set the initial depth to 0
+                    "depth": 0,  # Set the initial depth to 0
                     "playwright": True,
                     "playwright_page_methods": [
                         ("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
                     ]
                 })
             else:
-                yield Request(url, self.parse)
+                yield Request(url, self.parse, meta={
+                    "depth": 0,  # Set the initial depth to 0
+                })


     def parse(self, response, **kwargs):
         # Crawl the links in the response page and continue to crawl the next page
-        links = LinkExtractor().extract_links(response)
+        links = LinkExtractor(allow=self.allowed_url).extract_links(response)
         for link in links:
             if self.use_playwright:
                 yield Request(link.url, self.parse, meta={
@@ -48,7 +52,7 @@ def parse(self, response, **kwargs):
                     ]
                 })
             else:
-                yield Request(link.url)
+                yield Request(link.url, self.parse)


 if __name__ == "__main__":

From ed321878eb5c7b4e305429dbf77129e95ccd8324 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Mon, 27 Nov 2023 15:14:31 +0100
Subject: [PATCH 4/6] Repair unit tests

---
 tests/tests_models/test_crawl.py   | 10 +++++-----
 tests/tests_models/test_process.py |  4 ++--
 tests/tests_models/test_website.py |  5 +++++
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/tests/tests_models/test_crawl.py b/tests/tests_models/test_crawl.py
index 2f87f00..0a94ed6 100644
--- a/tests/tests_models/test_crawl.py
+++ b/tests/tests_models/test_crawl.py
@@ -12,7 +12,7 @@ class TestCrawlParametersConfig(unittest.TestCase):
     def test_instantiation(self):
-        params = CrawlParameters(depth=2, limit=400)
+        params = CrawlParameters(depth=2, limit=400, use_playwright=False)
         config = CrawlConfig(
             url="http://example.com",
             parameters=params,
@@ -29,7 +29,7 @@ class TestCrawlModel(unittest.TestCase):
     def test_default_values(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
@@ -43,7 +43,7 @@ def test_default_values(self):
     def test_enabled_metadata_property(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={
                 MetadataType.LIGHTHOUSE: MetadataConfig(),
                 MetadataType.TECHNOLOGIES: MetadataConfig(enabled=False),
@@ -57,7 +57,7 @@ def test_enabled_metadata_property(self):
     def test_init_tasks_method(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
@@ -73,7 +73,7 @@ class TestListCrawlResponse(unittest.TestCase):
     def test_instantiation(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
diff --git a/tests/tests_models/test_process.py b/tests/tests_models/test_process.py
index 5e5323f..1d8ec43 100644
--- a/tests/tests_models/test_process.py
+++ b/tests/tests_models/test_process.py
@@ -27,7 +27,7 @@ def test_from_model_classmethod(self):
             website_id="website_123",
             config=CrawlConfig(
                 url="http://example.com",
-                parameters=CrawlParameters(depth=2, limit=400),
+                parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
                metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
                headers={},
                tags=[],
@@ -40,7 +40,7 @@ def test_from_model_classmethod(self):
     def test_enabled_metadata_property(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
             metadata_config={
                 MetadataType.LIGHTHOUSE: MetadataConfig(),
                 MetadataType.TECHNOLOGIES: MetadataConfig(enabled=False),
diff --git a/tests/tests_models/test_website.py b/tests/tests_models/test_website.py
index 628a93e..23ba664 100644
--- a/tests/tests_models/test_website.py
+++ b/tests/tests_models/test_website.py
@@ -12,6 +12,7 @@ def test_default_values(self):
         url="http://example.com",
         depth=2,
         limit=400,
+        use_playwright=False,
         lighthouse=MetadataConfig(),
         technologies_and_trackers=MetadataConfig(),
         responsiveness=MetadataConfig(),
@@ -32,6 +33,7 @@ def test_to_config_method(self):
         url="http://example.com",
         depth=2,
         limit=400,
+        use_playwright=True,
         lighthouse=MetadataConfig(),
         technologies_and_trackers=MetadataConfig(),
         responsiveness=MetadataConfig(),
@@ -52,6 +54,7 @@ def test_refresh_next_crawl_date(self):
         url="http://example.com",
         depth=2,
         limit=400,
+        use_playwright=True,
         lighthouse=MetadataConfig(),
         technologies_and_trackers=MetadataConfig(),
         responsiveness=MetadataConfig(),
@@ -71,6 +74,7 @@ def test_instantiation(self):
         url="http://example1.com",
         depth=2,
         limit=400,
+        use_playwright=True,
         lighthouse=MetadataConfig(),
         technologies_and_trackers=MetadataConfig(),
         responsiveness=MetadataConfig(),
@@ -83,6 +87,7 @@ def test_instantiation(self):
         url="http://example2.com",
         depth=3,
         limit=500,
+        use_playwright=False,
         lighthouse=MetadataConfig(enabled=False),
         technologies_and_trackers=MetadataConfig(),
         responsiveness=MetadataConfig(),

From 69db6185c19df80c825eff65be34972f86d26747 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Mon, 27 Nov 2023 15:54:14 +0100
Subject: [PATCH 5/6] Add tests for start_requests and parse functions

---
 tests/tests_crawler/test_menesr.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/tests_crawler/test_menesr.py b/tests/tests_crawler/test_menesr.py
index 8d6bfb8..eea884f 100644
--- a/tests/tests_crawler/test_menesr.py
+++ b/tests/tests_crawler/test_menesr.py
@@ -2,6 +2,7 @@
 from unittest.mock import MagicMock
 from urllib.parse import urlparse

+from scrapy.http import HtmlResponse
 from scrapy.spiders import Rule

 from app.crawler.spider import MenesrSpider
@@ -37,6 +38,23 @@ def test_init_with_path(self):
     def test_name(self):
         self.assertEqual(MenesrSpider.name, "menesr")

+    def test_start_requests(self):
+        self.mock_crawl_process.config.url = "http://www.example.com/"
+        spider = MenesrSpider(self.mock_crawl_process)
+        request = next(spider.start_requests())
+        self.assertEqual(request.url, 'http://www.example.com/')
+        self.assertEqual(request.callback, spider.parse)
+
+    def test_parse(self):
+        self.mock_crawl_process.config.url = "http://example.com/"
+        spider = MenesrSpider(self.mock_crawl_process)
+        body = ('<a href="/recherche/lactualite-de-la-recherche">L\'actualité de la '
+                'recherche</a>').encode('utf-8')
+        response = HtmlResponse(url='http://www.example.com', body=body, encoding='utf-8')
+        result = next(spider.parse(response))
+        assert result.url == 'http://www.example.com/recherche/lactualite-de-la-recherche'
+

 if __name__ == "__main__":
     unittest.main()

From 337432392e087fa3d7b75427331142821d6cbc86 Mon Sep 17 00:00:00 2001
From: Yilei Pan
Date: Tue, 28 Nov 2023 09:36:02 +0100
Subject: [PATCH 6/6] MSPDT-72: Add retry mechanism

---
 app/services/carbon_calculator.py         | 5 +++--
 app/services/lighthouse_calculator.py     | 3 +++
 app/services/responsiveness_calculator.py | 6 +++---
 app/services/technologies_calculator.py   | 3 +++
 requirements.txt                          | 1 +
 5 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/app/services/carbon_calculator.py b/app/services/carbon_calculator.py
index 6ffd79c..7ab1848 100644
--- a/app/services/carbon_calculator.py
+++ b/app/services/carbon_calculator.py
@@ -1,6 +1,6 @@
-from typing import Any
-
 import requests
+from retry import retry
+from typing import Any


 class CarbonCalculatorError(Exception):
@@ -11,6 +11,7 @@ class CarbonCalculator:
     BASE_URL = "https://api.websitecarbon.com/site"
     TIMEOUT = 300  # 5 minutes timeout for the API request

+    @retry(CarbonCalculatorError, tries=3, delay=2, backoff=2)
     def get_carbon_footprint(self, url: str) -> dict[str, Any]:
         if not url:
             raise ValueError("URL cannot be empty.")
diff --git a/app/services/lighthouse_calculator.py b/app/services/lighthouse_calculator.py
index 4365a67..b682eb4 100644
--- a/app/services/lighthouse_calculator.py
+++ b/app/services/lighthouse_calculator.py
@@ -1,5 +1,6 @@
 import json
 import subprocess
+from retry import retry
 from typing import Any


@@ -8,6 +9,8 @@ class LighthouseError(Exception):


 class LighthouseCalculator:
+
+    @retry(LighthouseError, tries=3, delay=2, backoff=2)
     def get_lighthouse(self, url: str) -> dict[str, Any]:
         try:
             lighthouse_process = subprocess.run(
diff --git a/app/services/responsiveness_calculator.py b/app/services/responsiveness_calculator.py
index 3e72e7a..59fa420 100644
--- a/app/services/responsiveness_calculator.py
+++ b/app/services/responsiveness_calculator.py
@@ -1,8 +1,7 @@
-from typing import Any
-
 import requests
-
 from app.config import settings
+from retry import retry
+from typing import Any


 class ResponsivenessCalculatorError(Exception):
@@ -14,6 +13,7 @@ def __init__(self):
         self.base_url = "https://content-searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run"
         self._api_key = settings.GOOGLE_API_KEY

+    @retry(ResponsivenessCalculatorError, tries=3, delay=2, backoff=2)
     def get_responsiveness(self, url: str) -> dict[str, Any]:
         response = None
         try:
diff --git a/app/services/technologies_calculator.py b/app/services/technologies_calculator.py
index ca1b55c..b8c4224 100644
--- a/app/services/technologies_calculator.py
+++ b/app/services/technologies_calculator.py
@@ -1,5 +1,6 @@
 import json
 import subprocess
+from retry import retry
 from typing import Any


@@ -8,6 +9,8 @@ class TechnologiesError(Exception):


 class TechnologiesCalculator:
+
+    @retry(TechnologiesError, tries=3, delay=2, backoff=2)
     def get_technologies(self, url: str) -> list[dict[str, Any]]:
         try:
             technologies_process = subprocess.run(
diff --git a/requirements.txt b/requirements.txt
index 6959573..ca9784d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -68,3 +68,4 @@ watchfiles==0.19.0
 wcwidth==0.2.6
 zope.event==5.0
 zope.interface==6.0
+retry~=0.9.2
\ No newline at end of file
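
Note on the retry behaviour in patch 6: with @retry(SomeError, tries=3, delay=2, backoff=2) a decorated call is attempted at most three times, sleeping 2s after the first failure and 4s after the second (backoff doubles the delay each retry), and only the named exception type is retried; any other exception propagates immediately. Below is a minimal standalone sketch of these semantics, assuming the PyPI retry package pinned in requirements.txt (retry~=0.9.2); the fetch_footprint function is hypothetical and reuses the CarbonCalculatorError name purely for illustration.

# Standalone sketch of the retry semantics used by the calculator services
# in patch 6 (hypothetical function; assumes the PyPI `retry` package).
from retry import retry


class CarbonCalculatorError(Exception):
    pass


attempts = 0


@retry(CarbonCalculatorError, tries=3, delay=2, backoff=2)
def fetch_footprint() -> str:
    # Fails twice, then succeeds on the third and final attempt; the
    # decorator sleeps 2s after the first failure and 2*2=4s after the second.
    global attempts
    attempts += 1
    if attempts < 3:
        raise CarbonCalculatorError("transient API failure")
    return "ok"


if __name__ == "__main__":
    print(fetch_footprint(), "after", attempts, "attempts")  # ok after 3 attempts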