diff --git a/Dockerfile b/Dockerfile
index f8c68ad..7c03d05 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11-alpine3.18
+FROM python:3.11
 
 ENV PYTHONUNBUFFERED 1
 ENV PYTHONDONTWRITEBYTECODE 1
@@ -8,7 +8,8 @@ WORKDIR /open-crawler
 
 COPY ./requirements.txt /open-crawler
 
-RUN pip install --no-cache-dir --upgrade -r requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt \
+    && playwright install --with-deps chromium
 
 COPY ./app/ /open-crawler/app
diff --git a/app/celery_broker/crawler_utils.py b/app/celery_broker/crawler_utils.py
index 6eedf51..6b31e6d 100644
--- a/app/celery_broker/crawler_utils.py
+++ b/app/celery_broker/crawler_utils.py
@@ -24,6 +24,12 @@ def init_crawler_settings(crawl_process: CrawlProcess):
         }
     )
     settings.update(custom_settings)
+    if crawl_process.config.parameters.use_playwright:
+        settings.set('DOWNLOAD_HANDLERS', {
+            'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
+            'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
+        })
+
     return settings
diff --git a/app/crawler/spider.py b/app/crawler/spider.py
index 1b52286..4c6d1bb 100644
--- a/app/crawler/spider.py
+++ b/app/crawler/spider.py
@@ -1,5 +1,7 @@
 from urllib.parse import urlparse
 
+from scrapy import Request
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
+from scrapy_playwright.page import PageMethod
 
@@ -9,16 +11,49 @@ class MenesrSpider(CrawlSpider):
     name = "menesr"
     rules = (Rule(),)
+    use_playwright = False
+    allowed_url = None
 
     def __init__(self, crawl_process: CrawlProcess, *a, **kw):
         parsed_url = urlparse(crawl_process.config.url)
+        self.use_playwright = crawl_process.config.parameters.use_playwright
         if parsed_url.path:
-            self.rules = (Rule(LinkExtractor(allow=parsed_url.path)),)
+            self.allowed_url = parsed_url.path
         self.allowed_domains = [parsed_url.netloc]
         self.start_urls = [crawl_process.config.url]
         self.crawl_process = crawl_process
         super().__init__(*a, **kw)
+
+    def start_requests(self):
+        for url in self.start_urls:
+            if self.use_playwright:
+                yield Request(url, self.parse, meta={
+                    "depth": 0,  # Set the initial depth to 0
+                    "playwright": True,
+                    "playwright_page_methods": [
+                        PageMethod("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
+                    ]
+                })
+            else:
+                yield Request(url, self.parse, meta={
+                    "depth": 0,  # Set the initial depth to 0
+                })
+
+    def parse(self, response, **kwargs):
+        # Extract the links on this page and keep crawling from each of them
+        links = LinkExtractor(allow=self.allowed_url).extract_links(response)
+        for link in links:
+            if self.use_playwright:
+                yield Request(link.url, self.parse, meta={
+                    "playwright": True,
+                    "playwright_page_methods": [
+                        PageMethod("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
+                    ]
+                })
+            else:
+                yield Request(link.url, self.parse)
+
 
 if __name__ == "__main__":
     from scrapy.utils.project import get_project_settings
diff --git a/app/models/crawl.py b/app/models/crawl.py
index 8dcac6d..f89f1d0 100644
--- a/app/models/crawl.py
+++ b/app/models/crawl.py
@@ -12,6 +12,7 @@ class CrawlParameters(BaseModel):
     depth: int
     limit: int
+    use_playwright: bool
 
 
 class CrawlConfig(BaseModel):
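Note: scrapy-playwright only takes over requests whose meta sets "playwright": True, and its download handler executes `PageMethod` instances, not bare tuples (the spider above uses `PageMethod` for that reason). The handler also documents a requirement on Twisted's asyncio reactor, which this diff does not configure. A minimal standalone sketch of the moving parts; `ExampleSpider` and the `TWISTED_REACTOR` line are illustrative additions, not part of this change:

```python
# Minimal sketch of a scrapy-playwright crawl. TWISTED_REACTOR is not set
# anywhere in this diff; scrapy-playwright documents it as required.
from scrapy import Request, Spider
from scrapy_playwright.page import PageMethod

SETTINGS = {
    "DOWNLOAD_HANDLERS": {
        "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    },
    "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
}


class ExampleSpider(Spider):  # hypothetical spider, for illustration only
    name = "example"
    custom_settings = SETTINGS

    def start_requests(self):
        # Only requests flagged with "playwright": True go through the browser;
        # PageMethod("evaluate", ...) scrolls to trigger lazy-loaded content.
        yield Request(
            "https://example.com",
            meta={
                "playwright": True,
                "playwright_page_methods": [
                    PageMethod("evaluate", "window.scrollTo(0, document.body.scrollHeight)"),
                ],
            },
        )
```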
diff --git a/app/models/request.py b/app/models/request.py
index 718ba4d..56a6392 100644
--- a/app/models/request.py
+++ b/app/models/request.py
@@ -11,6 +11,7 @@ class UpdateWebsiteRequest(BaseModel):
     depth: int | None = None
     limit: int | None = None
+    use_playwright: bool = True
     lighthouse: MetadataConfig | None = None
     technologies_and_trackers: MetadataConfig | None = None
     responsiveness: MetadataConfig | None = None
@@ -25,6 +26,7 @@ class CreateWebsiteRequest(BaseModel):
     url: str
     depth: int = Field(ge=0, default=2)
     limit: int = Field(ge=0, default=400)
+    use_playwright: bool = Field(default=True)
     lighthouse: MetadataConfig = Field(default=MetadataConfig())
     technologies_and_trackers: MetadataConfig = Field(
         default=MetadataConfig(enabled=False)
diff --git a/app/models/website.py b/app/models/website.py
index 2675f1d..de052b1 100644
--- a/app/models/website.py
+++ b/app/models/website.py
@@ -16,6 +16,7 @@ class WebsiteModel(BaseModel):
     url: str
     depth: int
     limit: int
+    use_playwright: bool
     lighthouse: MetadataConfig
     technologies_and_trackers: MetadataConfig
     responsiveness: MetadataConfig
@@ -31,7 +32,7 @@ class WebsiteModel(BaseModel):
     def to_config(self) -> CrawlConfig:
         return CrawlConfig(
             url=self.url,
-            parameters=CrawlParameters(depth=self.depth, limit=self.limit),
+            parameters=CrawlParameters(depth=self.depth, limit=self.limit, use_playwright=self.use_playwright),
             metadata_config={
                 MetadataType.LIGHTHOUSE: self.lighthouse,
                 MetadataType.TECHNOLOGIES: self.technologies_and_trackers,
diff --git a/app/services/carbon_calculator.py b/app/services/carbon_calculator.py
index 6ffd79c..7ab1848 100644
--- a/app/services/carbon_calculator.py
+++ b/app/services/carbon_calculator.py
@@ -1,6 +1,6 @@
-from typing import Any
-
 import requests
+from retry import retry
+from typing import Any
 
 
 class CarbonCalculatorError(Exception):
@@ -11,6 +11,7 @@ class CarbonCalculator:
     BASE_URL = "https://api.websitecarbon.com/site"
     TIMEOUT = 300  # 5 minutes timeout for the API request
 
+    @retry(CarbonCalculatorError, tries=3, delay=2, backoff=2)
     def get_carbon_footprint(self, url: str) -> dict[str, Any]:
         if not url:
             raise ValueError("URL cannot be empty.")
diff --git a/app/services/lighthouse_calculator.py b/app/services/lighthouse_calculator.py
index 4365a67..b682eb4 100644
--- a/app/services/lighthouse_calculator.py
+++ b/app/services/lighthouse_calculator.py
@@ -1,5 +1,6 @@
 import json
 import subprocess
+from retry import retry
 from typing import Any
 
@@ -8,6 +9,8 @@ class LighthouseError(Exception):
 
 
 class LighthouseCalculator:
+
+    @retry(LighthouseError, tries=3, delay=2, backoff=2)
     def get_lighthouse(self, url: str) -> dict[str, Any]:
         try:
             lighthouse_process = subprocess.run(
diff --git a/app/services/responsiveness_calculator.py b/app/services/responsiveness_calculator.py
index 3e72e7a..59fa420 100644
--- a/app/services/responsiveness_calculator.py
+++ b/app/services/responsiveness_calculator.py
@@ -1,8 +1,7 @@
-from typing import Any
-
 import requests
-
 from app.config import settings
+from retry import retry
+from typing import Any
 
 
 class ResponsivenessCalculatorError(Exception):
@@ -14,6 +13,7 @@ def __init__(self):
         self.base_url = "https://content-searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run"
         self._api_key = settings.GOOGLE_API_KEY
 
+    @retry(ResponsivenessCalculatorError, tries=3, delay=2, backoff=2)
     def get_responsiveness(self, url: str) -> dict[str, Any]:
         response = None
         try:
diff --git a/app/services/technologies_calculator.py b/app/services/technologies_calculator.py
index ca1b55c..b8c4224 100644
--- a/app/services/technologies_calculator.py
+++ b/app/services/technologies_calculator.py
@@ -1,5 +1,6 @@
 import json
 import subprocess
+from retry import retry
 from typing import Any
 
@@ -8,6 +9,8 @@ class TechnologiesError(Exception):
 
 
 class TechnologiesCalculator:
+
+    @retry(TechnologiesError, tries=3, delay=2, backoff=2)
     def get_technologies(self, url: str) -> list[dict[str, Any]]:
         try:
             technologies_process = subprocess.run(
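Note: the `retry` package decorator retries only on the named exception type and sleeps `delay * backoff**n` between attempts, so `tries=3, delay=2, backoff=2` gives up to three attempts with 2 s, then 4 s pauses, before the last failure re-raises. A minimal sketch of that behaviour; `FetchError` and `flaky_fetch` are hypothetical stand-ins for the calculator classes above:

```python
# Sketch of the retry policy applied to the calculators: 3 attempts,
# sleeping 2 s then 4 s between them; other exception types propagate
# immediately. flaky_fetch always fails, to show the final re-raise.
from retry import retry


class FetchError(Exception):
    pass


@retry(FetchError, tries=3, delay=2, backoff=2)
def flaky_fetch(url: str) -> str:
    raise FetchError(f"could not fetch {url}")


try:
    flaky_fetch("https://example.com")
except FetchError:
    print("gave up after 3 attempts (~6 s spent sleeping)")
```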
diff --git a/requirements.txt b/requirements.txt
index 9fb7dbb..ca9784d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@ dnspython==2.4.2
 fastapi==0.103.1
 filelock==3.12.4
 gevent==23.7.0
-greenlet==2.0.2
+greenlet==3.0.1
 h11==0.14.0
 hyperlink==21.0.0
 idna==3.4
@@ -33,6 +33,7 @@ lxml==4.9.3
 minio==7.1.15
 packaging==23.1
 parsel==1.8.1
+playwright==1.40.0
 prompt-toolkit==3.0.39
 Protego==0.3.0
 pyasn1==0.5.0
@@ -41,6 +42,7 @@ pycparser==2.21
 pydantic==2.4.1
 pydantic_core==2.10.1
 PyDispatcher==2.0.7
+pyee==11.0.1
 pymongo==4.4.1
 pyOpenSSL==23.2.0
 python-dateutil==2.8.2
@@ -49,6 +51,7 @@ redis==4.6.0
 requests==2.31.0
 requests-file==1.5.1
 Scrapy==2.9.0
+scrapy-playwright==0.0.33
 service-identity==23.1.0
 six==1.16.0
 sniffio==1.3.0
@@ -65,3 +68,4 @@ watchfiles==0.19.0
 wcwidth==0.2.6
 zope.event==5.0
 zope.interface==6.0
+retry~=0.9.2
\ No newline at end of file
diff --git a/tests/tests_crawler/test_menesr.py b/tests/tests_crawler/test_menesr.py
index 8d6bfb8..eea884f 100644
--- a/tests/tests_crawler/test_menesr.py
+++ b/tests/tests_crawler/test_menesr.py
@@ -2,6 +2,7 @@
 from unittest.mock import MagicMock
 from urllib.parse import urlparse
 
+from scrapy.http import HtmlResponse
 from scrapy.spiders import Rule
 
 from app.crawler.spider import MenesrSpider
@@ -37,6 +38,22 @@ def test_init_with_path(self):
     def test_name(self):
         self.assertEqual(MenesrSpider.name, "menesr")
 
+    def test_start_requests(self):
+        self.mock_crawl_process.config.url = "http://www.example.com/"
+        spider = MenesrSpider(self.mock_crawl_process)
+        request = next(spider.start_requests())
+        self.assertEqual(request.url, 'http://www.example.com/')
+        self.assertEqual(request.callback, spider.parse)
+
+    def test_parse(self):
+        self.mock_crawl_process.config.url = "http://example.com/"
+        spider = MenesrSpider(self.mock_crawl_process)
+        body = ('<a href="/recherche/lactualite-de-la-recherche">L\'actualité de la '
+                'recherche</a>').encode('utf-8')
+        response = HtmlResponse(url='http://www.example.com', body=body, encoding='utf-8')
+        result = next(spider.parse(response))
+        assert result.url == 'http://www.example.com/recherche/lactualite-de-la-recherche'
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/tests_models/test_crawl.py b/tests/tests_models/test_crawl.py
index 2f87f00..0a94ed6 100644
--- a/tests/tests_models/test_crawl.py
+++ b/tests/tests_models/test_crawl.py
@@ -12,7 +12,7 @@ class TestCrawlParametersConfig(unittest.TestCase):
     def test_instantiation(self):
-        params = CrawlParameters(depth=2, limit=400)
+        params = CrawlParameters(depth=2, limit=400, use_playwright=False)
         config = CrawlConfig(
             url="http://example.com",
             parameters=params,
@@ -29,7 +29,7 @@ class TestCrawlModel(unittest.TestCase):
     def test_default_values(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
@@ -43,7 +43,7 @@ def test_default_values(self):
     def test_enabled_metadata_property(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={
                 MetadataType.LIGHTHOUSE: MetadataConfig(),
                 MetadataType.TECHNOLOGIES: MetadataConfig(enabled=False),
@@ -57,7 +57,7 @@ def test_enabled_metadata_property(self):
     def test_init_tasks_method(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
@@ -73,7 +73,7 @@ class TestListCrawlResponse(unittest.TestCase):
     def test_instantiation(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
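Note: with `CreateWebsiteRequest` gaining `use_playwright` (default `True`), API clients can opt out per website. A hypothetical sketch of the call; the endpoint path `/websites` and host are assumptions, only the field names come from the request model in this diff:

```python
# Hypothetical sketch of registering a website with the new flag via the API.
# The URL and endpoint path are assumptions; the payload fields mirror
# CreateWebsiteRequest (use_playwright defaults to True when omitted).
import requests

payload = {
    "url": "https://www.example.com/",
    "depth": 2,
    "limit": 400,
    "use_playwright": True,  # new field introduced by this change
}
response = requests.post("http://localhost:8000/websites", json=payload, timeout=30)
response.raise_for_status()
print(response.json())
```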
diff --git a/tests/tests_models/test_process.py b/tests/tests_models/test_process.py
index 5e5323f..1d8ec43 100644
--- a/tests/tests_models/test_process.py
+++ b/tests/tests_models/test_process.py
@@ -27,7 +27,7 @@ def test_from_model_classmethod(self):
             website_id="website_123",
             config=CrawlConfig(
                 url="http://example.com",
-                parameters=CrawlParameters(depth=2, limit=400),
+                parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
                 metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
                 headers={},
                 tags=[],
@@ -40,7 +40,7 @@ def test_from_model_classmethod(self):
     def test_enabled_metadata_property(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
             metadata_config={
                 MetadataType.LIGHTHOUSE: MetadataConfig(),
                 MetadataType.TECHNOLOGIES: MetadataConfig(enabled=False),
diff --git a/tests/tests_models/test_website.py b/tests/tests_models/test_website.py
index 628a93e..23ba664 100644
--- a/tests/tests_models/test_website.py
+++ b/tests/tests_models/test_website.py
@@ -12,6 +12,7 @@ def test_default_values(self):
             url="http://example.com",
             depth=2,
             limit=400,
+            use_playwright=False,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -32,6 +33,7 @@ def test_to_config_method(self):
             url="http://example.com",
             depth=2,
             limit=400,
+            use_playwright=True,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -52,6 +54,7 @@ def test_refresh_next_crawl_date(self):
             url="http://example.com",
             depth=2,
             limit=400,
+            use_playwright=True,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -71,6 +74,7 @@ def test_instantiation(self):
             url="http://example1.com",
             depth=2,
             limit=400,
+            use_playwright=True,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -83,6 +87,7 @@ def test_instantiation(self):
             url="http://example2.com",
             depth=3,
             limit=500,
+            use_playwright=False,
             lighthouse=MetadataConfig(enabled=False),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
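Note: taken together, the flag flows from the request models into `CrawlParameters`, and `init_crawler_settings` checks it before swapping in the Playwright download handlers. A minimal sketch mirroring the test fixtures above; the `app.models.metadata` import path for `MetadataConfig`/`MetadataType` is an assumption, the rest matches the models touched by this diff:

```python
# Sketch of the round trip the tests above exercise: the flag set on
# CrawlParameters is what init_crawler_settings reads when deciding
# whether to install the scrapy-playwright download handlers.
from app.models.crawl import CrawlConfig, CrawlParameters
from app.models.metadata import MetadataConfig, MetadataType  # assumed import path

params = CrawlParameters(depth=2, limit=400, use_playwright=True)
config = CrawlConfig(
    url="http://example.com",
    parameters=params,
    metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
    headers={},
    tags=[],
)
assert config.parameters.use_playwright
```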