Merge pull request #31 from dataesr/dev
Dev
folland87 authored Nov 28, 2023
2 parents abf87fc + 3374323 commit 54a0d40
Showing 15 changed files with 97 additions and 17 deletions.
Dockerfile (5 changes: 3 additions & 2 deletions)
@@ -1,4 +1,4 @@
-FROM python:3.11-alpine3.18
+FROM python:3.11

 ENV PYTHONUNBUFFERED 1
 ENV PYTHONDONTWRITEBYTECODE 1
@@ -8,7 +8,8 @@ WORKDIR /open-crawler

 COPY ./requirements.txt /open-crawler

-RUN pip install --no-cache-dir --upgrade -r requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt \
+    && playwright install --with-deps chromium

 COPY ./app/ /open-crawler/app

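The move off Alpine is consistent with Playwright's browser requirements: the bundled Chromium needs glibc, which Alpine's musl does not provide, hence the Debian-based python:3.11 image and the build-time browser install. A minimal smoke test for the resulting image, assuming Playwright's sync API (not part of the repository):

# smoke_test.py - hypothetical check that Chromium is usable in the image.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()  # uses the browser fetched by `playwright install`
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())  # "Example Domain" if rendering works
    browser.close()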
app/celery_broker/crawler_utils.py (6 changes: 6 additions & 0 deletions)
@@ -24,6 +24,12 @@ def init_crawler_settings(crawl_process: CrawlProcess):
         }
     )
     settings.update(custom_settings)
+    if crawl_process.config.parameters.use_playwright:
+        settings.set('DOWNLOAD_HANDLERS', {
+            'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
+            'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
+        })
+
     return settings


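scrapy-playwright routes requests through these handlers only when Twisted runs on its asyncio reactor; if the project does not already configure that elsewhere, a companion setting along these lines would be needed (an assumption, not part of this diff):

# Assumed companion setting; scrapy-playwright requires the asyncio reactor.
settings.set(
    'TWISTED_REACTOR',
    'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
)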
app/crawler/spider.py (37 changes: 36 additions & 1 deletion)
@@ -1,5 +1,6 @@
 from urllib.parse import urlparse

+from scrapy import Request
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
@@ -9,16 +10,50 @@
 class MenesrSpider(CrawlSpider):
     name = "menesr"
     rules = (Rule(),)
+    use_playwright = False
+    allowed_url = None

     def __init__(self, crawl_process: CrawlProcess, *a, **kw):
         parsed_url = urlparse(crawl_process.config.url)
+        self.use_playwright = crawl_process.config.parameters.use_playwright
         if parsed_url.path:
             self.rules = (Rule(LinkExtractor(allow=parsed_url.path)),)
+            self.allowed_url = parsed_url.path
         self.allowed_domains = [parsed_url.netloc]
         self.start_urls = [crawl_process.config.url]
         self.crawl_process = crawl_process
         super().__init__(*a, **kw)

+    def start_requests(self):
+        for url in self.start_urls:
+            if self.use_playwright:
+                yield Request(url, self.parse, meta={
+                    "depth": 0,  # Set the initial depth to 0
+                    "playwright": True,
+                    "playwright_page_methods": [
+                        ("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
+                    ]
+                })
+            else:
+                yield Request(url, self.parse, meta={
+                    "depth": 0,  # Set the initial depth to 0
+                })
+
+    def parse(self, response, **kwargs):
+        # Crawl the links in the response page and continue to crawl the next page
+        links = LinkExtractor(allow=self.allowed_url).extract_links(response)
+        for link in links:
+            if self.use_playwright:
+                yield Request(link.url, self.parse, meta={
+                    "playwright": True,
+                    "playwright_page_methods": [
+                        ("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
+                    ]
+                })
+            else:
+                yield Request(link.url, self.parse)


 if __name__ == "__main__":
     from scrapy.utils.project import get_project_settings
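Each Playwright request evaluates a scroll to the bottom of the page before the response is returned, so lazily loaded content is present in the HTML that Scrapy parses. scrapy-playwright's documented way to express such page actions is its PageMethod helper; a sketch of an equivalent request inside a spider callback (assuming the library's documented API):

from scrapy import Request
from scrapy_playwright.page import PageMethod

# Hypothetical rewrite of the meta dict above using PageMethod;
# this would live inside a spider callback such as start_requests().
def _scrolling_request(self, url):
    return Request(
        url,
        callback=self.parse,
        meta={
            "playwright": True,
            "playwright_page_methods": [
                PageMethod("evaluate",
                           "window.scrollTo(0, document.body.scrollHeight)"),
            ],
        },
    )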
app/models/crawl.py (1 change: 1 addition & 0 deletions)
@@ -12,6 +12,7 @@
 class CrawlParameters(BaseModel):
     depth: int
     limit: int
+    use_playwright: bool


 class CrawlConfig(BaseModel):
app/models/request.py (2 changes: 2 additions & 0 deletions)
@@ -11,6 +11,7 @@
 class UpdateWebsiteRequest(BaseModel):
     depth: int | None = None
     limit: int | None = None
+    use_playwright: bool = True
     lighthouse: MetadataConfig | None = None
     technologies_and_trackers: MetadataConfig | None = None
     responsiveness: MetadataConfig | None = None
@@ -25,6 +26,7 @@ class CreateWebsiteRequest(BaseModel):
     url: str
     depth: int = Field(ge=0, default=2)
     limit: int = Field(ge=0, default=400)
+    use_playwright: bool = Field(default=True)
     lighthouse: MetadataConfig = Field(default=MetadataConfig())
     technologies_and_trackers: MetadataConfig = Field(
         default=MetadataConfig(enabled=False)
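Unlike depth and limit, which stay None unless supplied, use_playwright defaults to True on UpdateWebsiteRequest, so an update that omits the field will presumably reset it to True. On creation the flag also defaults to True; a sketch of a payload that opts out (field values hypothetical):

from app.models.request import CreateWebsiteRequest

# Hypothetical creation payload; defaults shown explicitly for clarity.
payload = {
    "url": "https://www.example.com",
    "depth": 2,               # Field(ge=0, default=2)
    "limit": 400,             # Field(ge=0, default=400)
    "use_playwright": False,  # skip headless rendering for a static site
}
request = CreateWebsiteRequest(**payload)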
app/models/website.py (3 changes: 2 additions & 1 deletion)
@@ -16,6 +16,7 @@ class WebsiteModel(BaseModel):
     url: str
     depth: int
     limit: int
+    use_playwright: bool
     lighthouse: MetadataConfig
     technologies_and_trackers: MetadataConfig
     responsiveness: MetadataConfig
@@ -31,7 +32,7 @@ class WebsiteModel(BaseModel):
     def to_config(self) -> CrawlConfig:
         return CrawlConfig(
             url=self.url,
-            parameters=CrawlParameters(depth=self.depth, limit=self.limit),
+            parameters=CrawlParameters(depth=self.depth, limit=self.limit, use_playwright=self.use_playwright),
             metadata_config={
                 MetadataType.LIGHTHOUSE: self.lighthouse,
                 MetadataType.TECHNOLOGIES: self.technologies_and_trackers,
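With this field on WebsiteModel, the flag travels from the stored website settings through to_config() into CrawlParameters, and from there into the Scrapy settings assembled by init_crawler_settings() above. A sketch of that path (required metadata fields omitted for brevity):

# Sketch only; WebsiteModel has more required fields than shown here.
website = WebsiteModel(
    url="http://example.com",
    depth=2,
    limit=400,
    use_playwright=True,
    # lighthouse=..., technologies_and_trackers=..., etc.
)
config = website.to_config()  # CrawlParameters now carries use_playwright
# init_crawler_settings(crawl_process) will then register the
# scrapy-playwright download handlers for this crawl.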
app/services/carbon_calculator.py (5 changes: 3 additions & 2 deletions)
@@ -1,6 +1,6 @@
-from typing import Any
-
 import requests
+from retry import retry
+from typing import Any


 class CarbonCalculatorError(Exception):
@@ -11,6 +11,7 @@ class CarbonCalculator:
     BASE_URL = "https://api.websitecarbon.com/site"
     TIMEOUT = 300  # 5 minutes timeout for the API request

+    @retry(CarbonCalculatorError, tries=3, delay=2, backoff=2)
     def get_carbon_footprint(self, url: str) -> dict[str, Any]:
         if not url:
             raise ValueError("URL cannot be empty.")
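The same decorator pattern is applied to the lighthouse, responsiveness, and technologies services below. With tries=3, delay=2, backoff=2, the retry package re-invokes the method only when it raises the named exception, sleeping 2 s after the first failure and 4 s after the second before finally re-raising. A self-contained sketch of that behaviour (illustrative names only):

from retry import retry

class FlakyError(Exception):
    pass

attempts = 0

@retry(FlakyError, tries=3, delay=2, backoff=2)
def flaky_call():
    global attempts
    attempts += 1
    if attempts < 3:
        raise FlakyError("transient failure")  # retried after 2 s, then 4 s
    return "ok"

print(flaky_call())  # succeeds on the third attempt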
app/services/lighthouse_calculator.py (3 changes: 3 additions & 0 deletions)
@@ -1,5 +1,6 @@
 import json
 import subprocess
+from retry import retry
 from typing import Any


@@ -8,6 +9,8 @@ class LighthouseError(Exception):


 class LighthouseCalculator:
+
+    @retry(LighthouseError, tries=3, delay=2, backoff=2)
     def get_lighthouse(self, url: str) -> dict[str, Any]:
         try:
             lighthouse_process = subprocess.run(
app/services/responsiveness_calculator.py (6 changes: 3 additions & 3 deletions)
@@ -1,8 +1,7 @@
-from typing import Any
-
 import requests
-
 from app.config import settings
+from retry import retry
+from typing import Any


 class ResponsivenessCalculatorError(Exception):
@@ -14,6 +13,7 @@ def __init__(self):
         self.base_url = "https://content-searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run"
         self._api_key = settings.GOOGLE_API_KEY

+    @retry(ResponsivenessCalculatorError, tries=3, delay=2, backoff=2)
     def get_responsiveness(self, url: str) -> dict[str, Any]:
         response = None
         try:
app/services/technologies_calculator.py (3 changes: 3 additions & 0 deletions)
@@ -1,5 +1,6 @@
 import json
 import subprocess
+from retry import retry
 from typing import Any


@@ -8,6 +9,8 @@ class TechnologiesError(Exception):


 class TechnologiesCalculator:
+
+    @retry(TechnologiesError, tries=3, delay=2, backoff=2)
     def get_technologies(self, url: str) -> list[dict[str, Any]]:
         try:
             technologies_process = subprocess.run(
requirements.txt (6 changes: 5 additions & 1 deletion)
@@ -20,7 +20,7 @@ dnspython==2.4.2
 fastapi==0.103.1
 filelock==3.12.4
 gevent==23.7.0
-greenlet==2.0.2
+greenlet==3.0.1
 h11==0.14.0
 hyperlink==21.0.0
 idna==3.4
@@ -33,6 +33,7 @@ lxml==4.9.3
 minio==7.1.15
 packaging==23.1
 parsel==1.8.1
+playwright==1.40.0
 prompt-toolkit==3.0.39
 Protego==0.3.0
 pyasn1==0.5.0
@@ -41,6 +42,7 @@ pycparser==2.21
 pydantic==2.4.1
 pydantic_core==2.10.1
 PyDispatcher==2.0.7
+pyee==11.0.1
 pymongo==4.4.1
 pyOpenSSL==23.2.0
 python-dateutil==2.8.2
@@ -49,6 +51,7 @@ redis==4.6.0
 requests==2.31.0
 requests-file==1.5.1
 Scrapy==2.9.0
+scrapy-playwright==0.0.33
 service-identity==23.1.0
 six==1.16.0
 sniffio==1.3.0
@@ -65,3 +68,4 @@ watchfiles==0.19.0
 wcwidth==0.2.6
 zope.event==5.0
 zope.interface==6.0
+retry~=0.9.2
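The greenlet bump and the new pyee pin appear to track playwright 1.40.0's own dependency pins, while retry backs the @retry decorators added to the services above.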
tests/tests_crawler/test_menesr.py (18 changes: 18 additions & 0 deletions)
@@ -2,6 +2,7 @@
 from unittest.mock import MagicMock
 from urllib.parse import urlparse

+from scrapy.http import HtmlResponse
 from scrapy.spiders import Rule

 from app.crawler.spider import MenesrSpider
@@ -37,6 +38,23 @@ def test_init_with_path(self):
     def test_name(self):
         self.assertEqual(MenesrSpider.name, "menesr")

+    def test_start_requests(self):
+        self.mock_crawl_process.config.url = "http://www.example.com/"
+        spider = MenesrSpider(self.mock_crawl_process)
+        request = next(spider.start_requests())
+        self.assertEqual(request.url, 'http://www.example.com/')
+        self.assertEqual(request.callback, spider.parse)
+
+    def test_parse(self):
+        self.mock_crawl_process.config.url = "http://example.com/"
+        spider = MenesrSpider(self.mock_crawl_process)
+        body = ('<html><a href="/recherche/lactualite-de-la-recherche"><span>L\'actualité de la '
+                'recherche</span></a></html>').encode('utf-8')
+        response = HtmlResponse(url='http://www.example.com', body=body, encoding='utf-8')
+        result = next(spider.parse(response))
+        assert result.url == 'http://www.example.com/recherche/lactualite-de-la-recherche'
+        # Add assertions here to check the result


 if __name__ == "__main__":
     unittest.main()
tests/tests_models/test_crawl.py (10 changes: 5 additions & 5 deletions)
@@ -12,7 +12,7 @@
 class TestCrawlParametersConfig(unittest.TestCase):
     def test_instantiation(self):
-        params = CrawlParameters(depth=2, limit=400)
+        params = CrawlParameters(depth=2, limit=400, use_playwright=False)
         config = CrawlConfig(
             url="http://example.com",
             parameters=params,
@@ -29,7 +29,7 @@ class TestCrawlModel(unittest.TestCase):
     def test_default_values(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
@@ -43,7 +43,7 @@ def test_default_values(self):
     def test_enabled_metadata_property(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={
                 MetadataType.LIGHTHOUSE: MetadataConfig(),
                 MetadataType.TECHNOLOGIES: MetadataConfig(enabled=False),
@@ -57,7 +57,7 @@ def test_enabled_metadata_property(self):
     def test_init_tasks_method(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
@@ -73,7 +73,7 @@ class TestListCrawlResponse(unittest.TestCase):
     def test_instantiation(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
tests/tests_models/test_process.py (4 changes: 2 additions & 2 deletions)
@@ -27,7 +27,7 @@ def test_from_model_classmethod(self):
             website_id="website_123",
             config=CrawlConfig(
                 url="http://example.com",
-                parameters=CrawlParameters(depth=2, limit=400),
+                parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
                 metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
                 headers={},
                 tags=[],
@@ -40,7 +40,7 @@ def test_enabled_metadata_property(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
             metadata_config={
                 MetadataType.LIGHTHOUSE: MetadataConfig(),
                 MetadataType.TECHNOLOGIES: MetadataConfig(enabled=False),
tests/tests_models/test_website.py (5 changes: 5 additions & 0 deletions)
@@ -12,6 +12,7 @@ def test_default_values(self):
             url="http://example.com",
             depth=2,
             limit=400,
+            use_playwright=False,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -32,6 +33,7 @@ def test_to_config_method(self):
             url="http://example.com",
             depth=2,
             limit=400,
+            use_playwright=True,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -52,6 +54,7 @@ def test_refresh_next_crawl_date(self):
             url="http://example.com",
             depth=2,
             limit=400,
+            use_playwright=True,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -71,6 +74,7 @@ def test_instantiation(self):
             url="http://example1.com",
             depth=2,
             limit=400,
+            use_playwright=True,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -83,6 +87,7 @@ def test_instantiation(self):
             url="http://example2.com",
             depth=3,
             limit=500,
+            use_playwright=False,
             lighthouse=MetadataConfig(enabled=False),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
