Merge pull request #31 from dataesr/dev
Dev
folland87 authored Nov 28, 2023
2 parents abf87fc + 3374323 commit 54a0d40
Showing 15 changed files with 97 additions and 17 deletions.
Dockerfile (5 changes: 3 additions & 2 deletions)
@@ -1,4 +1,4 @@
-FROM python:3.11-alpine3.18
+FROM python:3.11

 ENV PYTHONUNBUFFERED 1
 ENV PYTHONDONTWRITEBYTECODE 1
@@ -8,7 +8,8 @@ WORKDIR /open-crawler

 COPY ./requirements.txt /open-crawler

-RUN pip install --no-cache-dir --upgrade -r requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt \
+    && playwright install --with-deps chromium

 COPY ./app/ /open-crawler/app

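The move off Alpine is consistent with Playwright's browser requirements: the bundled Chromium needs glibc, which Alpine's musl does not provide, hence the Debian-based python:3.11 image and the build-time browser install. A minimal smoke test for the resulting image, assuming Playwright's sync API (not part of the repository):

# smoke_test.py - hypothetical check that Chromium is usable in the image.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()  # uses the browser fetched by `playwright install`
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())  # "Example Domain" if rendering works
    browser.close()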
app/celery_broker/crawler_utils.py (6 changes: 6 additions & 0 deletions)
@@ -24,6 +24,12 @@ def init_crawler_settings(crawl_process: CrawlProcess):
         }
     )
     settings.update(custom_settings)
+    if crawl_process.config.parameters.use_playwright:
+        settings.set('DOWNLOAD_HANDLERS', {
+            'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
+            'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
+        })
+
     return settings


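scrapy-playwright routes requests through these handlers only when Twisted runs on its asyncio reactor; if the project does not already configure that elsewhere, a companion setting along these lines would be needed (an assumption, not part of this diff):

# Assumed companion setting; scrapy-playwright requires the asyncio reactor.
settings.set(
    'TWISTED_REACTOR',
    'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
)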
app/crawler/spider.py (37 changes: 36 additions & 1 deletion)
@@ -1,5 +1,6 @@
 from urllib.parse import urlparse

+from scrapy import Request
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
@@ -9,16 +10,50 @@
 class MenesrSpider(CrawlSpider):
     name = "menesr"
     rules = (Rule(),)
+    use_playwright = False
+    allowed_url = None

     def __init__(self, crawl_process: CrawlProcess, *a, **kw):
         parsed_url = urlparse(crawl_process.config.url)
+        self.use_playwright = crawl_process.config.parameters.use_playwright
         if parsed_url.path:
             self.rules = (Rule(LinkExtractor(allow=parsed_url.path)),)
+            self.allowed_url = parsed_url.path
         self.allowed_domains = [parsed_url.netloc]
         self.start_urls = [crawl_process.config.url]
         self.crawl_process = crawl_process
         super().__init__(*a, **kw)

+    def start_requests(self):
+        for url in self.start_urls:
+            if self.use_playwright:
+                yield Request(url, self.parse, meta={
+                    "depth": 0,  # Set the initial depth to 0
+                    "playwright": True,
+                    "playwright_page_methods": [
+                        ("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
+                    ]
+                })
+            else:
+                yield Request(url, self.parse, meta={
+                    "depth": 0,  # Set the initial depth to 0
+                })
+
+    def parse(self, response, **kwargs):
+        # Crawl the links in the response page and continue to crawl the next page
+        links = LinkExtractor(allow=self.allowed_url).extract_links(response)
+        for link in links:
+            if self.use_playwright:
+                yield Request(link.url, self.parse, meta={
+                    "playwright": True,
+                    "playwright_page_methods": [
+                        ("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
+                    ]
+                })
+            else:
+                yield Request(link.url, self.parse)


 if __name__ == "__main__":
     from scrapy.utils.project import get_project_settings
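Each Playwright request evaluates a scroll to the bottom of the page before the response is returned, so lazily loaded content is present in the HTML that Scrapy parses. scrapy-playwright's documented way to express such page actions is its PageMethod helper; a sketch of an equivalent request inside a spider callback (assuming the library's documented API):

from scrapy import Request
from scrapy_playwright.page import PageMethod

# Hypothetical rewrite of the meta dict above using PageMethod;
# this would live inside a spider callback such as start_requests().
def _scrolling_request(self, url):
    return Request(
        url,
        callback=self.parse,
        meta={
            "playwright": True,
            "playwright_page_methods": [
                PageMethod("evaluate",
                           "window.scrollTo(0, document.body.scrollHeight)"),
            ],
        },
    )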
app/models/crawl.py (1 change: 1 addition & 0 deletions)
@@ -12,6 +12,7 @@
 class CrawlParameters(BaseModel):
     depth: int
     limit: int
+    use_playwright: bool


 class CrawlConfig(BaseModel):
app/models/request.py (2 changes: 2 additions & 0 deletions)
@@ -11,6 +11,7 @@
 class UpdateWebsiteRequest(BaseModel):
     depth: int | None = None
     limit: int | None = None
+    use_playwright: bool = True
     lighthouse: MetadataConfig | None = None
     technologies_and_trackers: MetadataConfig | None = None
     responsiveness: MetadataConfig | None = None
@@ -25,6 +26,7 @@ class CreateWebsiteRequest(BaseModel):
     url: str
     depth: int = Field(ge=0, default=2)
     limit: int = Field(ge=0, default=400)
+    use_playwright: bool = Field(default=True)
     lighthouse: MetadataConfig = Field(default=MetadataConfig())
     technologies_and_trackers: MetadataConfig = Field(
         default=MetadataConfig(enabled=False)
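Unlike depth and limit, which stay None unless supplied, use_playwright defaults to True on UpdateWebsiteRequest, so an update that omits the field will presumably reset it to True. On creation the flag also defaults to True; a sketch of a payload that opts out (field values hypothetical):

from app.models.request import CreateWebsiteRequest

# Hypothetical creation payload; defaults shown explicitly for clarity.
payload = {
    "url": "https://www.example.com",
    "depth": 2,               # Field(ge=0, default=2)
    "limit": 400,             # Field(ge=0, default=400)
    "use_playwright": False,  # skip headless rendering for a static site
}
request = CreateWebsiteRequest(**payload)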
app/models/website.py (3 changes: 2 additions & 1 deletion)
@@ -16,6 +16,7 @@ class WebsiteModel(BaseModel):
     url: str
     depth: int
     limit: int
+    use_playwright: bool
     lighthouse: MetadataConfig
     technologies_and_trackers: MetadataConfig
     responsiveness: MetadataConfig
@@ -31,7 +32,7 @@ class WebsiteModel(BaseModel):
     def to_config(self) -> CrawlConfig:
         return CrawlConfig(
             url=self.url,
-            parameters=CrawlParameters(depth=self.depth, limit=self.limit),
+            parameters=CrawlParameters(depth=self.depth, limit=self.limit, use_playwright=self.use_playwright),
             metadata_config={
                 MetadataType.LIGHTHOUSE: self.lighthouse,
                 MetadataType.TECHNOLOGIES: self.technologies_and_trackers,
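With this field on WebsiteModel, the flag travels from the stored website settings through to_config() into CrawlParameters, and from there into the Scrapy settings assembled by init_crawler_settings() above. A sketch of that path (required metadata fields omitted for brevity):

# Sketch only; WebsiteModel has more required fields than shown here.
website = WebsiteModel(
    url="http://example.com",
    depth=2,
    limit=400,
    use_playwright=True,
    # lighthouse=..., technologies_and_trackers=..., etc.
)
config = website.to_config()  # CrawlParameters now carries use_playwright
# init_crawler_settings(crawl_process) will then register the
# scrapy-playwright download handlers for this crawl.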
app/services/carbon_calculator.py (5 changes: 3 additions & 2 deletions)
@@ -1,6 +1,6 @@
-from typing import Any
-
 import requests
+from retry import retry
+from typing import Any


 class CarbonCalculatorError(Exception):
@@ -11,6 +11,7 @@ class CarbonCalculator:
     BASE_URL = "https://api.websitecarbon.com/site"
     TIMEOUT = 300  # 5 minutes timeout for the API request

+    @retry(CarbonCalculatorError, tries=3, delay=2, backoff=2)
     def get_carbon_footprint(self, url: str) -> dict[str, Any]:
         if not url:
             raise ValueError("URL cannot be empty.")
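The same decorator pattern is applied to the lighthouse, responsiveness, and technologies services below. With tries=3, delay=2, backoff=2, the retry package re-invokes the method only when it raises the named exception, sleeping 2 s after the first failure and 4 s after the second before finally re-raising. A self-contained sketch of that behaviour (illustrative names only):

from retry import retry

class FlakyError(Exception):
    pass

attempts = 0

@retry(FlakyError, tries=3, delay=2, backoff=2)
def flaky_call():
    global attempts
    attempts += 1
    if attempts < 3:
        raise FlakyError("transient failure")  # retried after 2 s, then 4 s
    return "ok"

print(flaky_call())  # succeeds on the third attempt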
app/services/lighthouse_calculator.py (3 changes: 3 additions & 0 deletions)
@@ -1,5 +1,6 @@
 import json
 import subprocess
+from retry import retry
 from typing import Any


@@ -8,6 +9,8 @@ class LighthouseError(Exception):


 class LighthouseCalculator:
+
+    @retry(LighthouseError, tries=3, delay=2, backoff=2)
     def get_lighthouse(self, url: str) -> dict[str, Any]:
         try:
             lighthouse_process = subprocess.run(
app/services/responsiveness_calculator.py (6 changes: 3 additions & 3 deletions)
@@ -1,8 +1,7 @@
-from typing import Any
-
 import requests
-
 from app.config import settings
+from retry import retry
+from typing import Any


 class ResponsivenessCalculatorError(Exception):
@@ -14,6 +13,7 @@ def __init__(self):
         self.base_url = "https://content-searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run"
         self._api_key = settings.GOOGLE_API_KEY

+    @retry(ResponsivenessCalculatorError, tries=3, delay=2, backoff=2)
     def get_responsiveness(self, url: str) -> dict[str, Any]:
         response = None
         try:
app/services/technologies_calculator.py (3 changes: 3 additions & 0 deletions)
@@ -1,5 +1,6 @@
 import json
 import subprocess
+from retry import retry
 from typing import Any


@@ -8,6 +9,8 @@ class TechnologiesError(Exception):


 class TechnologiesCalculator:
+
+    @retry(TechnologiesError, tries=3, delay=2, backoff=2)
     def get_technologies(self, url: str) -> list[dict[str, Any]]:
         try:
             technologies_process = subprocess.run(
requirements.txt (6 changes: 5 additions & 1 deletion)
@@ -20,7 +20,7 @@ dnspython==2.4.2
 fastapi==0.103.1
 filelock==3.12.4
 gevent==23.7.0
-greenlet==2.0.2
+greenlet==3.0.1
 h11==0.14.0
 hyperlink==21.0.0
 idna==3.4
@@ -33,6 +33,7 @@ lxml==4.9.3
 minio==7.1.15
 packaging==23.1
 parsel==1.8.1
+playwright==1.40.0
 prompt-toolkit==3.0.39
 Protego==0.3.0
 pyasn1==0.5.0
@@ -41,6 +42,7 @@ pycparser==2.21
 pydantic==2.4.1
 pydantic_core==2.10.1
 PyDispatcher==2.0.7
+pyee==11.0.1
 pymongo==4.4.1
 pyOpenSSL==23.2.0
 python-dateutil==2.8.2
@@ -49,6 +51,7 @@ redis==4.6.0
 requests==2.31.0
 requests-file==1.5.1
 Scrapy==2.9.0
+scrapy-playwright==0.0.33
 service-identity==23.1.0
 six==1.16.0
 sniffio==1.3.0
@@ -65,3 +68,4 @@ watchfiles==0.19.0
 wcwidth==0.2.6
 zope.event==5.0
 zope.interface==6.0
+retry~=0.9.2
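The greenlet bump and the new pyee pin appear to track playwright 1.40.0's own dependency pins, while retry backs the @retry decorators added to the services above.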
tests/tests_crawler/test_menesr.py (18 changes: 18 additions & 0 deletions)
@@ -2,6 +2,7 @@
 from unittest.mock import MagicMock
 from urllib.parse import urlparse

+from scrapy.http import HtmlResponse
 from scrapy.spiders import Rule

 from app.crawler.spider import MenesrSpider
@@ -37,6 +38,23 @@ def test_init_with_path(self):
     def test_name(self):
         self.assertEqual(MenesrSpider.name, "menesr")

+    def test_start_requests(self):
+        self.mock_crawl_process.config.url = "http://www.example.com/"
+        spider = MenesrSpider(self.mock_crawl_process)
+        request = next(spider.start_requests())
+        self.assertEqual(request.url, 'http://www.example.com/')
+        self.assertEqual(request.callback, spider.parse)
+
+    def test_parse(self):
+        self.mock_crawl_process.config.url = "http://example.com/"
+        spider = MenesrSpider(self.mock_crawl_process)
+        body = ('<html><a href="/recherche/lactualite-de-la-recherche"><span>L\'actualité de la '
+                'recherche</span></a></html>').encode('utf-8')
+        response = HtmlResponse(url='http://www.example.com', body=body, encoding='utf-8')
+        result = next(spider.parse(response))
+        assert result.url == 'http://www.example.com/recherche/lactualite-de-la-recherche'
+        # Add assertions here to check the result


 if __name__ == "__main__":
     unittest.main()
tests/tests_models/test_crawl.py (10 changes: 5 additions & 5 deletions)
@@ -12,7 +12,7 @@
 class TestCrawlParametersConfig(unittest.TestCase):
     def test_instantiation(self):
-        params = CrawlParameters(depth=2, limit=400)
+        params = CrawlParameters(depth=2, limit=400, use_playwright=False)
         config = CrawlConfig(
             url="http://example.com",
             parameters=params,
@@ -29,7 +29,7 @@ class TestCrawlModel(unittest.TestCase):
     def test_default_values(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
@@ -43,7 +43,7 @@ def test_default_values(self):
     def test_enabled_metadata_property(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={
                 MetadataType.LIGHTHOUSE: MetadataConfig(),
                 MetadataType.TECHNOLOGIES: MetadataConfig(enabled=False),
@@ -57,7 +57,7 @@ def test_enabled_metadata_property(self):
     def test_init_tasks_method(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
@@ -73,7 +73,7 @@ class TestListCrawlResponse(unittest.TestCase):
     def test_instantiation(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
             metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
             headers={},
             tags=[],
tests/tests_models/test_process.py (4 changes: 2 additions & 2 deletions)
@@ -27,7 +27,7 @@ def test_from_model_classmethod(self):
             website_id="website_123",
             config=CrawlConfig(
                 url="http://example.com",
-                parameters=CrawlParameters(depth=2, limit=400),
+                parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
                 metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
                 headers={},
                 tags=[],
@@ -40,7 +40,7 @@ def test_enabled_metadata_property(self):
         config = CrawlConfig(
             url="http://example.com",
-            parameters=CrawlParameters(depth=2, limit=400),
+            parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
             metadata_config={
                 MetadataType.LIGHTHOUSE: MetadataConfig(),
                 MetadataType.TECHNOLOGIES: MetadataConfig(enabled=False),
tests/tests_models/test_website.py (5 changes: 5 additions & 0 deletions)
@@ -12,6 +12,7 @@ def test_default_values(self):
             url="http://example.com",
             depth=2,
             limit=400,
+            use_playwright=False,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -32,6 +33,7 @@ def test_to_config_method(self):
             url="http://example.com",
             depth=2,
             limit=400,
+            use_playwright=True,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -52,6 +54,7 @@ def test_refresh_next_crawl_date(self):
             url="http://example.com",
             depth=2,
             limit=400,
+            use_playwright=True,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -71,6 +74,7 @@ def test_instantiation(self):
             url="http://example1.com",
             depth=2,
             limit=400,
+            use_playwright=True,
             lighthouse=MetadataConfig(),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
@@ -83,6 +87,7 @@ def test_instantiation(self):
             url="http://example2.com",
             depth=3,
             limit=500,
+            use_playwright=False,
             lighthouse=MetadataConfig(enabled=False),
             technologies_and_trackers=MetadataConfig(),
             responsiveness=MetadataConfig(),
