Merge pull request #32 from dataesr/dev
Dev
folland87 authored Nov 30, 2023
2 parents fd92231 + fd6ab99 commit f3ff75c
Showing 7 changed files with 57 additions and 9 deletions.
8 changes: 7 additions & 1 deletion app/celery_broker/tasks.py
@@ -128,13 +128,19 @@ def upload_html(crawl: CrawlModel):
)
local_files_folder = f"/{settings.LOCAL_FILES_PATH.strip('/')}"

for file in crawl_files_path.rglob("*.[hj][ts][mo][ln]"):
file_counter = 0
for file in crawl_files_path.rglob("*.html"):
file_path = str(file)
file_name = file_path.removeprefix(local_files_folder).lstrip('/')
files.store_html_file(
object_name=file_name,
file_path=file_path,
content_type=assume_content_type(file_path),
)
file_counter += 1
os.remove(file)
shutil.rmtree(crawl_files_path, ignore_errors=True)

# If no HTML file was found, we consider the crawl failed
if file_counter == 0:
raise Exception("No HTML file was crawled for the website")
2 changes: 1 addition & 1 deletion app/crawler/middlewares.py
@@ -48,7 +48,7 @@ def from_crawler(cls, crawler):
def _format_file_path(self, response, spider) -> Path:
domain = spider.allowed_domains[0]
base_file_path = f"/{settings.LOCAL_FILES_PATH.strip('/')}/{spider.crawl_process.id}"
file_name = response.url.split(f"{domain}")[-1]
file_name = response.url.split(f"{domain}")[-1].lstrip('/')
if not file_name.endswith(".html"):
file_name = f"{file_name}.html"
if file_name == ".html":
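
The only functional change here is the added lstrip('/'), which keeps the generated object name relative instead of absolute. A quick illustration (URL and domain are made up):

domain = "www.example.com"
url = "https://www.example.com/recherche/actualites"

without_strip = url.split(domain)[-1]           # "/recherche/actualites"
with_strip = url.split(domain)[-1].lstrip('/')  # "recherche/actualites"
print(without_strip, with_strip)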
7 changes: 7 additions & 0 deletions app/crawler/settings.py
@@ -105,3 +105,10 @@
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleFifoDiskQueue"
SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.FifoMemoryQueue"

def should_abort_request(request):
return request.resource_type in ["stylesheet", "font"]


PLAYWRIGHT_ABORT_REQUEST = should_abort_request
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 60 * 1000 # 60 seconds
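
Both settings come from scrapy-playwright: PLAYWRIGHT_ABORT_REQUEST is called for each browser sub-request, so cancelling stylesheet and font downloads trims rendering time, while PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT caps page navigation at 60 seconds. A hedged sketch of a spider that would exercise them, assuming the scrapy-playwright download handlers are enabled elsewhere in the settings (the URL is a placeholder):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://www.example.com/"]

    def start_requests(self):
        for url in self.start_urls:
            # Requests flagged with playwright=True are rendered in the browser;
            # their stylesheet/font sub-requests are dropped by should_abort_request.
            yield scrapy.Request(url, meta={"playwright": True})

    def parse(self, response, **kwargs):
        yield {"url": response.url, "title": response.css("title::text").get()}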
15 changes: 13 additions & 2 deletions app/crawler/spider.py
@@ -6,18 +6,22 @@

from app.models.process import CrawlProcess


class MenesrSpider(CrawlSpider):
name = "menesr"
rules = (Rule(),)
use_playwright = False
allowed_url = None
page_count = 0
page_limit = 0
depth_limit = 0

def __init__(self, crawl_process: CrawlProcess, *a, **kw):
parsed_url = urlparse(crawl_process.config.url)
self.use_playwright = crawl_process.config.parameters.use_playwright
if parsed_url.path:
self.allowed_url = parsed_url.path
self.page_limit = crawl_process.config.parameters.limit
self.depth_limit = crawl_process.config.parameters.depth
self.allowed_domains = [parsed_url.netloc]
self.start_urls = [crawl_process.config.url]
self.crawl_process = crawl_process
@@ -39,9 +43,15 @@ def start_requests(self):
"depth": 0, # Set the initial depth to 0
})


def parse(self, response, **kwargs):
# Crawl the links in the response page and continue to crawl the next page
self.page_count += 1
# Retrieve the depth of the current request
depth = response.meta.get('depth', 0)
if depth > self.depth_limit or self.page_limit != 0 and self.page_count > self.page_limit:
self.crawler.engine.close_spider(self, 'page_or_depth_limit_reached')
return

links = LinkExtractor(allow=self.allowed_url).extract_links(response)
for link in links:
if self.use_playwright:
@@ -52,6 +62,7 @@ def parse(self, response, **kwargs):
]
})
else:
# we don't need to add depth because the native Scrapy crawler already does it
yield Request(link.url, self.parse)


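
Two things are worth noting in the new guard in parse(). First, `and` binds tighter than `or`, so the condition is equivalent to `depth > self.depth_limit or (self.page_limit != 0 and self.page_count > self.page_limit)`, meaning page_limit == 0 disables the page limit. Second, Scrapy's built-in CLOSESPIDER_PAGECOUNT and DEPTH_LIMIT settings cover similar ground, but the explicit check presumably keeps the limits tied to each CrawlProcess rather than to global settings. A small standalone sketch of the same condition with the grouping made explicit (limit values are arbitrary, matching the test further below):

def limit_reached(depth: int, page_count: int, depth_limit: int, page_limit: int) -> bool:
    """Return True when the crawl should stop; page_limit == 0 disables the page limit."""
    return depth > depth_limit or (page_limit != 0 and page_count > page_limit)


# depth_limit=1, page_limit=2, as in test_parse below.
assert limit_reached(depth=2, page_count=1, depth_limit=1, page_limit=2)      # too deep
assert limit_reached(depth=0, page_count=3, depth_limit=1, page_limit=2)      # too many pages
assert not limit_reached(depth=0, page_count=1, depth_limit=1, page_limit=2)  # still within limits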
21 changes: 20 additions & 1 deletion app/models/crawl.py
@@ -18,7 +18,26 @@ class CrawlParameters(BaseModel):
class CrawlConfig(BaseModel):
url: str
parameters: CrawlParameters
metadata_config: dict[MetadataType, MetadataConfig]
metadata_config: dict[MetadataType, MetadataConfig] = Field(default_factory=dict, examples=[
{
MetadataType.LIGHTHOUSE: {
"enabled": True,
"depth": 0
},
MetadataType.TECHNOLOGIES: {
"enabled": True,
"depth": 0
},
MetadataType.RESPONSIVENESS: {
"enabled": True,
"depth": 0
},
MetadataType.CARBON_FOOTPRINT: {
"enabled": True,
"depth": 0
}
}
])
headers: dict[str, Any]
tags: list[str]

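
Besides documenting the OpenAPI schema through examples, the new default_factory=dict makes metadata_config optional. A minimal sketch of constructing the model without it, assuming CrawlParameters exposes the depth, limit and use_playwright fields the spider reads (the values are illustrative):

from app.models.crawl import CrawlConfig, CrawlParameters

config = CrawlConfig(
    url="https://www.example.com/",
    parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
    headers={},
    tags=[],
)
print(config.metadata_config)  # {} — falls back to an empty dict instead of being required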
4 changes: 2 additions & 2 deletions app/models/request.py
@@ -11,7 +11,7 @@
class UpdateWebsiteRequest(BaseModel):
depth: int | None = None
limit: int | None = None
use_playwright: bool = True
use_playwright: bool = False
lighthouse: MetadataConfig | None = None
technologies_and_trackers: MetadataConfig | None = None
responsiveness: MetadataConfig | None = None
@@ -26,7 +26,7 @@ class CreateWebsiteRequest(BaseModel):
url: str
depth: int = Field(ge=0, default=2)
limit: int = Field(ge=0, default=400)
use_playwright: bool = Field(default=True)
use_playwright: bool = Field(default=False)
lighthouse: MetadataConfig = Field(default=MetadataConfig())
technologies_and_trackers: MetadataConfig = Field(
default=MetadataConfig(enabled=False)
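
With use_playwright now defaulting to False, Playwright rendering becomes opt-in for both request models. A tentative sketch of the new default in action, assuming the fields hidden by the truncated hunk all have defaults as well:

from app.models.request import CreateWebsiteRequest

req = CreateWebsiteRequest(url="https://www.example.com/")
print(req.depth, req.limit, req.use_playwright)  # 2 400 False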
9 changes: 7 additions & 2 deletions tests/tests_crawler/test_menesr.py
@@ -2,6 +2,7 @@
from unittest.mock import MagicMock
from urllib.parse import urlparse

from scrapy import Request
from scrapy.http import HtmlResponse
from scrapy.spiders import Rule

@@ -46,11 +47,15 @@ def test_start_requests(self):
self.assertEqual(request.callback, spider.parse)

def test_parse(self):
self.mock_crawl_process.config.url = "http://example.com/"
self.mock_crawl_process.config.url = "http://www.example.com/"
url = self.mock_crawl_process.config.url
spider = MenesrSpider(self.mock_crawl_process)
spider.depth_limit = 1
spider.page_limit = 2
request = Request(url)
body = ('<html><a href="/recherche/lactualite-de-la-recherche"><span>L\'actualité de la '
'recherche</span></a></html>').encode('utf-8')
response = HtmlResponse(url='http://www.example.com', body=body, encoding='utf-8')
response = HtmlResponse(url=url, body=body, encoding='utf-8', request=request)
result = next(spider.parse(response))
assert result.url == 'http://www.example.com/recherche/lactualite-de-la-recherche'
# Add assertions here to check the result
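
The key addition in the test is passing request= when building the HtmlResponse: response.meta is only available on responses tied to a request, and parse() now reads the crawl depth from it. A reduced sketch of that setup (URL and body are made up):

from scrapy import Request
from scrapy.http import HtmlResponse

request = Request("http://www.example.com/")
body = b'<html><a href="/page">page</a></html>'
response = HtmlResponse(
    url="http://www.example.com/",
    body=body,
    encoding="utf-8",
    request=request,  # binds the request so response.meta works
)
assert response.meta.get("depth", 0) == 0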
