diff --git a/app/celery_broker/tasks.py b/app/celery_broker/tasks.py
index d3d48e4..da9b90e 100644
--- a/app/celery_broker/tasks.py
+++ b/app/celery_broker/tasks.py
@@ -128,7 +128,8 @@ def upload_html(crawl: CrawlModel):
     )
 
     local_files_folder = f"/{settings.LOCAL_FILES_PATH.strip('/')}"
-    for file in crawl_files_path.rglob("*.[hj][ts][mo][ln]"):
+    file_counter = 0
+    for file in crawl_files_path.rglob("*.html"):
         file_path = str(file)
         file_name = file_path.removeprefix(local_files_folder).lstrip('/')
         files.store_html_file(
@@ -136,5 +137,10 @@ def upload_html(crawl: CrawlModel):
             file_path=file_path,
             content_type=assume_content_type(file_path),
         )
+        file_counter += 1
         os.remove(file)
     shutil.rmtree(crawl_files_path, ignore_errors=True)
+
+    # If there is no HTML file, we consider the crawl failed
+    if file_counter == 0:
+        raise Exception("No HTML file was crawled for the website")
diff --git a/app/crawler/middlewares.py b/app/crawler/middlewares.py
index 2df014b..7241e9b 100644
--- a/app/crawler/middlewares.py
+++ b/app/crawler/middlewares.py
@@ -48,7 +48,7 @@ def from_crawler(cls, crawler):
     def _format_file_path(self, response, spider) -> Path:
         domain = spider.allowed_domains[0]
         base_file_path = f"/{settings.LOCAL_FILES_PATH.strip('/')}/{spider.crawl_process.id}"
-        file_name = response.url.split(f"{domain}")[-1]
+        file_name = response.url.split(f"{domain}")[-1].lstrip('/')
         if not file_name.endswith(".html"):
             file_name = f"{file_name}.html"
         if file_name == ".html":
diff --git a/app/crawler/settings.py b/app/crawler/settings.py
index 4ff60d2..e17ad0c 100644
--- a/app/crawler/settings.py
+++ b/app/crawler/settings.py
@@ -105,3 +105,10 @@
 DEPTH_PRIORITY = 1
 SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleFifoDiskQueue"
 SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.FifoMemoryQueue"
+
+def should_abort_request(request):
+    return request.resource_type in ["stylesheet", "font"]
+
+
+PLAYWRIGHT_ABORT_REQUEST = should_abort_request
+PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 60 * 1000  # 60 seconds
diff --git a/app/crawler/spider.py b/app/crawler/spider.py
index 4c6d1bb..e56ee28 100644
--- a/app/crawler/spider.py
+++ b/app/crawler/spider.py
@@ -6,18 +6,22 @@ from app.models.process import CrawlProcess
 
 
-
 class MenesrSpider(CrawlSpider):
     name = "menesr"
     rules = (Rule(),)
     use_playwright = False
     allowed_url = None
+    page_count = 0
+    page_limit = 0
+    depth_limit = 0
 
     def __init__(self, crawl_process: CrawlProcess, *a, **kw):
         parsed_url = urlparse(crawl_process.config.url)
         self.use_playwright = crawl_process.config.parameters.use_playwright
         if parsed_url.path:
             self.allowed_url = parsed_url.path
+        self.page_limit = crawl_process.config.parameters.limit
+        self.depth_limit = crawl_process.config.parameters.depth
         self.allowed_domains = [parsed_url.netloc]
         self.start_urls = [crawl_process.config.url]
         self.crawl_process = crawl_process
@@ -39,9 +43,15 @@ def start_requests(self):
                 "depth": 0,  # Set the initial depth to 0
             })
 
-
     def parse(self, response, **kwargs):
         # Crawl the links in the response page and continue to crawl the next page
+        self.page_count += 1
+        # Retrieve the depth of the current request
+        depth = response.meta.get('depth', 0)
+        if depth > self.depth_limit or self.page_limit != 0 and self.page_count > self.page_limit:
+            self.crawler.engine.close_spider(self, 'page_or_depth_limit_reached')
+            return
+
         links = LinkExtractor(allow=self.allowed_url).extract_links(response)
         for link in links:
             if self.use_playwright:
@@ -52,6 +62,7 @@ def parse(self, response, **kwargs):
                     ]
                 })
             else:
+                # we don't need to add depth because the native Scrapy crawler already does it
                 yield Request(link.url, self.parse)
diff --git a/app/models/crawl.py b/app/models/crawl.py
index f89f1d0..1f1a695 100644
--- a/app/models/crawl.py
+++ b/app/models/crawl.py
@@ -18,7 +18,26 @@ class CrawlParameters(BaseModel):
 
 class CrawlConfig(BaseModel):
     url: str
     parameters: CrawlParameters
-    metadata_config: dict[MetadataType, MetadataConfig]
+    metadata_config: dict[MetadataType, MetadataConfig] = Field(default_factory=dict, examples=[
+        {
+            MetadataType.LIGHTHOUSE: {
+                "enabled": True,
+                "depth": 0
+            },
+            MetadataType.TECHNOLOGIES: {
+                "enabled": True,
+                "depth": 0
+            },
+            MetadataType.RESPONSIVENESS: {
+                "enabled": True,
+                "depth": 0
+            },
+            MetadataType.CARBON_FOOTPRINT: {
+                "enabled": True,
+                "depth": 0
+            }
+        }
+    ])
     headers: dict[str, Any]
     tags: list[str]
diff --git a/app/models/request.py b/app/models/request.py
index 56a6392..9d63058 100644
--- a/app/models/request.py
+++ b/app/models/request.py
@@ -11,7 +11,7 @@ class UpdateWebsiteRequest(BaseModel):
     depth: int | None = None
     limit: int | None = None
-    use_playwright: bool = True
+    use_playwright: bool = False
     lighthouse: MetadataConfig | None = None
     technologies_and_trackers: MetadataConfig | None = None
     responsiveness: MetadataConfig | None = None
@@ -26,7 +26,7 @@ class CreateWebsiteRequest(BaseModel):
     url: str
     depth: int = Field(ge=0, default=2)
     limit: int = Field(ge=0, default=400)
-    use_playwright: bool = Field(default=True)
+    use_playwright: bool = Field(default=False)
     lighthouse: MetadataConfig = Field(default=MetadataConfig())
     technologies_and_trackers: MetadataConfig = Field(
         default=MetadataConfig(enabled=False)
diff --git a/tests/tests_crawler/test_menesr.py b/tests/tests_crawler/test_menesr.py
index eea884f..b301d6c 100644
--- a/tests/tests_crawler/test_menesr.py
+++ b/tests/tests_crawler/test_menesr.py
@@ -2,6 +2,7 @@
 from unittest.mock import MagicMock
 from urllib.parse import urlparse
 
+from scrapy import Request
 from scrapy.http import HtmlResponse
 from scrapy.spiders import Rule
 
@@ -46,11 +47,15 @@ def test_start_requests(self):
         self.assertEqual(request.callback, spider.parse)
 
     def test_parse(self):
-        self.mock_crawl_process.config.url = "http://example.com/"
+        self.mock_crawl_process.config.url = "http://www.example.com/"
+        url = self.mock_crawl_process.config.url
         spider = MenesrSpider(self.mock_crawl_process)
+        spider.depth_limit = 1
+        spider.page_limit = 2
+        request = Request(url)
         body = ('<html><body><a href="http://www.example.com/recherche/lactualite-de-la-recherche">L\'actualité de la '
                 'recherche</a></body></html>').encode('utf-8')
-        response = HtmlResponse(url='http://www.example.com', body=body, encoding='utf-8')
+        response = HtmlResponse(url=url, body=body, encoding='utf-8', request=request)
         result = next(spider.parse(response))
         assert result.url == 'http://www.example.com/recherche/lactualite-de-la-recherche'
         # Add assertions here to check the result
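A note on the narrowed rglob pattern in upload_html: the old character-class glob "*.[hj][ts][mo][ln]" accepts any four-letter suffix built from those classes (".json", ".htol", ...), whereas "*.html" accepts HTML files only. Below is a minimal sketch of that difference using the standard-library fnmatch module, which understands the same character-class syntax; the file names are invented for illustration and are not from the patch.

from fnmatch import fnmatch

OLD_PATTERN = "*.[hj][ts][mo][ln]"  # matched .html, but also .json, .htol, ...
NEW_PATTERN = "*.html"              # matches .html only

for name in ["page.html", "data.json", "weird.htol"]:
    # prints: page.html True True / data.json True False / weird.htol True False
    print(name, fnmatch(name, OLD_PATTERN), fnmatch(name, NEW_PATTERN))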
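On the new guard in MenesrSpider.parse: since Python's "and" binds tighter than "or", the condition closes the spider when the depth limit is exceeded, or when a non-zero page limit is exceeded; a page_limit of 0 leaves the page count unlimited. A minimal sketch of that evaluation follows, where limit_reached is an illustrative helper and not part of the patch.

def limit_reached(depth: int, depth_limit: int, page_count: int, page_limit: int) -> bool:
    # Same expression as in parse(); equivalent to:
    # depth > depth_limit or (page_limit != 0 and page_count > page_limit)
    return depth > depth_limit or page_limit != 0 and page_count > page_limit


assert limit_reached(depth=3, depth_limit=2, page_count=1, page_limit=0)        # depth exceeded
assert not limit_reached(depth=1, depth_limit=2, page_count=500, page_limit=0)  # page_limit 0 disables the page limit
assert limit_reached(depth=1, depth_limit=2, page_count=3, page_limit=2)        # page limit exceeded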