Merge pull request #32 from dataesr/dev
Dev
folland87 authored Nov 30, 2023
2 parents fd92231 + fd6ab99 commit f3ff75c
Showing 7 changed files with 57 additions and 9 deletions.
8 changes: 7 additions & 1 deletion app/celery_broker/tasks.py
@@ -128,13 +128,19 @@ def upload_html(crawl: CrawlModel):
)
local_files_folder = f"/{settings.LOCAL_FILES_PATH.strip('/')}"

for file in crawl_files_path.rglob("*.[hj][ts][mo][ln]"):
file_counter = 0
for file in crawl_files_path.rglob("*.html"):
file_path = str(file)
file_name = file_path.removeprefix(local_files_folder).lstrip('/')
files.store_html_file(
object_name=file_name,
file_path=file_path,
content_type=assume_content_type(file_path),
)
file_counter += 1
os.remove(file)
shutil.rmtree(crawl_files_path, ignore_errors=True)

# If no HTML file was found, we consider the crawl failed
if file_counter == 0:
raise Exception("No HTML file was crawled for the website")
2 changes: 1 addition & 1 deletion app/crawler/middlewares.py
@@ -48,7 +48,7 @@ def from_crawler(cls, crawler):
def _format_file_path(self, response, spider) -> Path:
domain = spider.allowed_domains[0]
base_file_path = f"/{settings.LOCAL_FILES_PATH.strip('/')}/{spider.crawl_process.id}"
file_name = response.url.split(f"{domain}")[-1]
file_name = response.url.split(f"{domain}")[-1].lstrip('/')
if not file_name.endswith(".html"):
file_name = f"{file_name}.html"
if file_name == ".html":
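
The only functional change here is the added lstrip('/'), which keeps the generated object name relative instead of absolute. A quick illustration (URL and domain are made up):

domain = "www.example.com"
url = "https://www.example.com/recherche/actualites"

without_strip = url.split(domain)[-1]           # "/recherche/actualites"
with_strip = url.split(domain)[-1].lstrip('/')  # "recherche/actualites"
print(without_strip, with_strip)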
7 changes: 7 additions & 0 deletions app/crawler/settings.py
@@ -105,3 +105,10 @@
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleFifoDiskQueue"
SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.FifoMemoryQueue"

def should_abort_request(request):
return request.resource_type in ["stylesheet", "font"]


PLAYWRIGHT_ABORT_REQUEST = should_abort_request
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 60 * 1000 # 60 seconds
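
Both settings come from scrapy-playwright: PLAYWRIGHT_ABORT_REQUEST is called for each browser sub-request, so cancelling stylesheet and font downloads trims rendering time, while PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT caps page navigation at 60 seconds. A hedged sketch of a spider that would exercise them, assuming the scrapy-playwright download handlers are enabled elsewhere in the settings (the URL is a placeholder):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://www.example.com/"]

    def start_requests(self):
        for url in self.start_urls:
            # Requests flagged with playwright=True are rendered in the browser;
            # their stylesheet/font sub-requests are dropped by should_abort_request.
            yield scrapy.Request(url, meta={"playwright": True})

    def parse(self, response, **kwargs):
        yield {"url": response.url, "title": response.css("title::text").get()}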
15 changes: 13 additions & 2 deletions app/crawler/spider.py
@@ -6,18 +6,22 @@

from app.models.process import CrawlProcess


class MenesrSpider(CrawlSpider):
name = "menesr"
rules = (Rule(),)
use_playwright = False
allowed_url = None
page_count = 0
page_limit = 0
depth_limit = 0

def __init__(self, crawl_process: CrawlProcess, *a, **kw):
parsed_url = urlparse(crawl_process.config.url)
self.use_playwright = crawl_process.config.parameters.use_playwright
if parsed_url.path:
self.allowed_url = parsed_url.path
self.page_limit = crawl_process.config.parameters.limit
self.depth_limit = crawl_process.config.parameters.depth
self.allowed_domains = [parsed_url.netloc]
self.start_urls = [crawl_process.config.url]
self.crawl_process = crawl_process
@@ -39,9 +43,15 @@ def start_requests(self):
"depth": 0, # Set the initial depth to 0
})


def parse(self, response, **kwargs):
# Crawl the links in the response page and continue to crawl the next page
self.page_count += 1
# Retrieve the depth of the current request
depth = response.meta.get('depth', 0)
if depth > self.depth_limit or self.page_limit != 0 and self.page_count > self.page_limit:
self.crawler.engine.close_spider(self, 'page_or_depth_limit_reached')
return

links = LinkExtractor(allow=self.allowed_url).extract_links(response)
for link in links:
if self.use_playwright:
@@ -52,6 +62,7 @@ def parse(self, response, **kwargs):
]
})
else:
# we don't need to add depth because the native Scrapy crawler already does it
yield Request(link.url, self.parse)


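
Two things are worth noting in the new guard in parse(). First, `and` binds tighter than `or`, so the condition is equivalent to `depth > self.depth_limit or (self.page_limit != 0 and self.page_count > self.page_limit)`, meaning page_limit == 0 disables the page limit. Second, Scrapy's built-in CLOSESPIDER_PAGECOUNT and DEPTH_LIMIT settings cover similar ground, but the explicit check presumably keeps the limits tied to each CrawlProcess rather than to global settings. A small standalone sketch of the same condition with the grouping made explicit (limit values are arbitrary, matching the test further below):

def limit_reached(depth: int, page_count: int, depth_limit: int, page_limit: int) -> bool:
    """Return True when the crawl should stop; page_limit == 0 disables the page limit."""
    return depth > depth_limit or (page_limit != 0 and page_count > page_limit)


# depth_limit=1, page_limit=2, as in test_parse below.
assert limit_reached(depth=2, page_count=1, depth_limit=1, page_limit=2)      # too deep
assert limit_reached(depth=0, page_count=3, depth_limit=1, page_limit=2)      # too many pages
assert not limit_reached(depth=0, page_count=1, depth_limit=1, page_limit=2)  # still within limits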
21 changes: 20 additions & 1 deletion app/models/crawl.py
@@ -18,7 +18,26 @@ class CrawlParameters(BaseModel):
class CrawlConfig(BaseModel):
url: str
parameters: CrawlParameters
metadata_config: dict[MetadataType, MetadataConfig]
metadata_config: dict[MetadataType, MetadataConfig] = Field(default_factory=dict, examples=[
{
MetadataType.LIGHTHOUSE: {
"enabled": True,
"depth": 0
},
MetadataType.TECHNOLOGIES: {
"enabled": True,
"depth": 0
},
MetadataType.RESPONSIVENESS: {
"enabled": True,
"depth": 0
},
MetadataType.CARBON_FOOTPRINT: {
"enabled": True,
"depth": 0
}
}
])
headers: dict[str, Any]
tags: list[str]

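
Besides documenting the OpenAPI schema through examples, the new default_factory=dict makes metadata_config optional. A minimal sketch of constructing the model without it, assuming CrawlParameters exposes the depth, limit and use_playwright fields the spider reads (the values are illustrative):

from app.models.crawl import CrawlConfig, CrawlParameters

config = CrawlConfig(
    url="https://www.example.com/",
    parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
    headers={},
    tags=[],
)
print(config.metadata_config)  # {} — falls back to an empty dict instead of being required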
4 changes: 2 additions & 2 deletions app/models/request.py
@@ -11,7 +11,7 @@
class UpdateWebsiteRequest(BaseModel):
depth: int | None = None
limit: int | None = None
use_playwright: bool = True
use_playwright: bool = False
lighthouse: MetadataConfig | None = None
technologies_and_trackers: MetadataConfig | None = None
responsiveness: MetadataConfig | None = None
@@ -26,7 +26,7 @@ class CreateWebsiteRequest(BaseModel):
url: str
depth: int = Field(ge=0, default=2)
limit: int = Field(ge=0, default=400)
use_playwright: bool = Field(default=True)
use_playwright: bool = Field(default=False)
lighthouse: MetadataConfig = Field(default=MetadataConfig())
technologies_and_trackers: MetadataConfig = Field(
default=MetadataConfig(enabled=False)
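
With use_playwright now defaulting to False, Playwright rendering becomes opt-in for both request models. A tentative sketch of the new default in action, assuming the fields hidden by the truncated hunk all have defaults as well:

from app.models.request import CreateWebsiteRequest

req = CreateWebsiteRequest(url="https://www.example.com/")
print(req.depth, req.limit, req.use_playwright)  # 2 400 False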
9 changes: 7 additions & 2 deletions tests/tests_crawler/test_menesr.py
@@ -2,6 +2,7 @@
from unittest.mock import MagicMock
from urllib.parse import urlparse

from scrapy import Request
from scrapy.http import HtmlResponse
from scrapy.spiders import Rule

@@ -46,11 +47,15 @@ def test_start_requests(self):
self.assertEqual(request.callback, spider.parse)

def test_parse(self):
self.mock_crawl_process.config.url = "http://example.com/"
self.mock_crawl_process.config.url = "http://www.example.com/"
url = self.mock_crawl_process.config.url
spider = MenesrSpider(self.mock_crawl_process)
spider.depth_limit = 1
spider.page_limit = 2
request = Request(url)
body = ('<html><a href="/recherche/lactualite-de-la-recherche"><span>L\'actualité de la '
'recherche</span></a></html>').encode('utf-8')
response = HtmlResponse(url='http://www.example.com', body=body, encoding='utf-8')
response = HtmlResponse(url=url, body=body, encoding='utf-8', request=request)
result = next(spider.parse(response))
assert result.url == 'http://www.example.com/recherche/lactualite-de-la-recherche'
# Add assertions here to check the result
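
The key addition in the test is passing request= when building the HtmlResponse: response.meta is only available on responses tied to a request, and parse() now reads the crawl depth from it. A reduced sketch of that setup (URL and body are made up):

from scrapy import Request
from scrapy.http import HtmlResponse

request = Request("http://www.example.com/")
body = b'<html><a href="/page">page</a></html>'
response = HtmlResponse(
    url="http://www.example.com/",
    body=body,
    encoding="utf-8",
    request=request,  # binds the request so response.meta works
)
assert response.meta.get("depth", 0) == 0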
