Dev #31

Merged · 6 commits · Nov 28, 2023
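Summary (from the diff): this PR adds optional JavaScript rendering to the crawler via Playwright. The Docker image moves off Alpine and installs Chromium at build time; a use_playwright flag is threaded through the request, crawl, and website models; the Scrapy settings and MenesrSpider route requests through scrapy-playwright when the flag is set; and the carbon, Lighthouse, responsiveness, and technologies calculators gain retry-with-backoff behavior. Dependencies and tests are updated to match.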
5 changes: 3 additions & 2 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.11-alpine3.18
FROM python:3.11

ENV PYTHONUNBUFFERED 1
ENV PYTHONDONTWRITEBYTECODE 1
@@ -8,7 +8,8 @@ WORKDIR /open-crawler

COPY ./requirements.txt /open-crawler

RUN pip install --no-cache-dir --upgrade -r requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt \
&& playwright install --with-deps chromium

COPY ./app/ /open-crawler/app

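Note on the base-image change: Playwright's bundled browsers are not supported on musl-based images such as Alpine, which is the likely reason the image moves from python:3.11-alpine3.18 to the Debian-based python:3.11 (an inference; the PR does not state it). The added "playwright install --with-deps chromium" step downloads Chromium and its system dependencies at build time, so the crawler container can render pages without a separate browser install.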
6 changes: 6 additions & 0 deletions app/celery_broker/crawler_utils.py
@@ -24,6 +24,12 @@ def init_crawler_settings(crawl_process: CrawlProcess):
}
)
settings.update(custom_settings)
if crawl_process.config.parameters.use_playwright:
settings.set('DOWNLOAD_HANDLERS', {
'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
})

return settings


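Review note: the hunk above swaps in scrapy-playwright's download handler for both schemes when use_playwright is set. A minimal sketch of the equivalent standalone settings follows; scrapy-playwright's README also requires the asyncio Twisted reactor, which this hunk does not set, so it is presumably configured elsewhere (an assumption):

from scrapy.settings import Settings

# Sketch only, not project code: the settings the hunk above produces.
settings = Settings()
settings.set('DOWNLOAD_HANDLERS', {
    'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
    'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
})
# Required by scrapy-playwright per its README; not set in the hunk above.
settings.set('TWISTED_REACTOR', 'twisted.internet.asyncioreactor.AsyncioSelectorReactor')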
37 changes: 36 additions & 1 deletion app/crawler/spider.py
@@ -1,5 +1,6 @@
from urllib.parse import urlparse

from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

@@ -9,16 +10,50 @@
class MenesrSpider(CrawlSpider):
name = "menesr"
rules = (Rule(),)
use_playwright = False
allowed_url = None

def __init__(self, crawl_process: CrawlProcess, *a, **kw):
parsed_url = urlparse(crawl_process.config.url)
self.use_playwright = crawl_process.config.parameters.use_playwright
if parsed_url.path:
self.rules = (Rule(LinkExtractor(allow=parsed_url.path)),)
self.allowed_url = parsed_url.path
self.allowed_domains = [parsed_url.netloc]
self.start_urls = [crawl_process.config.url]
self.crawl_process = crawl_process
super().__init__(*a, **kw)


def start_requests(self):
for url in self.start_urls:
if self.use_playwright:
yield Request(url, self.parse, meta={
"depth": 0, # Set the initial depth to 0
"playwright": True,
"playwright_page_methods": [
("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
]
})
else:
yield Request(url, self.parse, meta={
"depth": 0, # Set the initial depth to 0
})


def parse(self, response, **kwargs):
# Crawl the links in the response page and continue to crawl the next page
links = LinkExtractor(allow=self.allowed_url).extract_links(response)
for link in links:
if self.use_playwright:
yield Request(link.url, self.parse, meta={
"playwright": True,
"playwright_page_methods": [
("evaluate", 'window.scrollTo(0, document.body.scrollHeight)')
]
})
else:
yield Request(link.url, self.parse)


if __name__ == "__main__":
from scrapy.utils.project import get_project_settings
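Review note: the Playwright branches in start_requests and parse build nearly identical requests. A deduplicating helper could collapse the four yield sites into two; a sketch (hypothetical _make_request, not part of this PR):

from scrapy import Request

class SpiderSketch:  # stand-in for MenesrSpider with the attributes above
    use_playwright = True

    def parse(self, response, **kwargs):  # placeholder callback
        pass

    def _make_request(self, url: str, depth: int | None = None) -> Request:
        # Build the meta dict once; Playwright keys are added only when enabled.
        meta = {} if depth is None else {'depth': depth}
        if self.use_playwright:
            meta['playwright'] = True
            meta['playwright_page_methods'] = [
                ('evaluate', 'window.scrollTo(0, document.body.scrollHeight)')
            ]
        return Request(url, self.parse, meta=meta)

start_requests would then yield self._make_request(url, depth=0) and parse would yield self._make_request(link.url).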
1 change: 1 addition & 0 deletions app/models/crawl.py
@@ -12,6 +12,7 @@
class CrawlParameters(BaseModel):
depth: int
limit: int
use_playwright: bool


class CrawlConfig(BaseModel):
2 changes: 2 additions & 0 deletions app/models/request.py
@@ -11,6 +11,7 @@
class UpdateWebsiteRequest(BaseModel):
depth: int | None = None
limit: int | None = None
use_playwright: bool = True
lighthouse: MetadataConfig | None = None
technologies_and_trackers: MetadataConfig | None = None
responsiveness: MetadataConfig | None = None
@@ -25,6 +26,7 @@ class CreateWebsiteRequest(BaseModel):
url: str
depth: int = Field(ge=0, default=2)
limit: int = Field(ge=0, default=400)
use_playwright: bool = Field(default=True)
lighthouse: MetadataConfig = Field(default=MetadataConfig())
technologies_and_trackers: MetadataConfig = Field(
default=MetadataConfig(enabled=False)
3 changes: 2 additions & 1 deletion app/models/website.py
@@ -16,6 +16,7 @@ class WebsiteModel(BaseModel):
url: str
depth: int
limit: int
use_playwright: bool
lighthouse: MetadataConfig
technologies_and_trackers: MetadataConfig
responsiveness: MetadataConfig
@@ -31,7 +32,7 @@ class WebsiteModel(BaseModel):
def to_config(self) -> CrawlConfig:
return CrawlConfig(
url=self.url,
parameters=CrawlParameters(depth=self.depth, limit=self.limit),
parameters=CrawlParameters(depth=self.depth, limit=self.limit, use_playwright=self.use_playwright),
metadata_config={
MetadataType.LIGHTHOUSE: self.lighthouse,
MetadataType.TECHNOLOGIES: self.technologies_and_trackers,
5 changes: 3 additions & 2 deletions app/services/carbon_calculator.py
@@ -1,6 +1,6 @@
from typing import Any

import requests
from retry import retry
from typing import Any


class CarbonCalculatorError(Exception):
@@ -11,6 +11,7 @@ class CarbonCalculator:
BASE_URL = "https://api.websitecarbon.com/site"
TIMEOUT = 300 # 5 minutes timeout for the API request

@retry(CarbonCalculatorError, tries=3, delay=2, backoff=2)
def get_carbon_footprint(self, url: str) -> dict[str, Any]:
if not url:
raise ValueError("URL cannot be empty.")
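Note on the new decorator: @retry(CarbonCalculatorError, tries=3, delay=2, backoff=2) re-invokes the method whenever it raises CarbonCalculatorError, for at most 3 attempts, sleeping 2 s before the second attempt and 4 s before the third (backoff=2 doubles the delay each time). A self-contained sketch of the same pattern:

from retry import retry

class TransientError(Exception):
    pass

attempts = 0

@retry(TransientError, tries=3, delay=2, backoff=2)
def flaky() -> str:
    # Fails twice, then succeeds; retry sleeps 2 s, then 4 s, between attempts.
    global attempts
    attempts += 1
    if attempts < 3:
        raise TransientError('transient failure')
    return 'ok'

print(flaky())  # 'ok' on the third attempt; three failures would propagate the error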
3 changes: 3 additions & 0 deletions app/services/lighthouse_calculator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import subprocess
from retry import retry
from typing import Any


@@ -8,6 +9,8 @@ class LighthouseError(Exception):


class LighthouseCalculator:

@retry(LighthouseError, tries=3, delay=2, backoff=2)
def get_lighthouse(self, url: str) -> dict[str, Any]:
try:
lighthouse_process = subprocess.run(
6 changes: 3 additions & 3 deletions app/services/responsiveness_calculator.py
@@ -1,8 +1,7 @@
from typing import Any

import requests

from app.config import settings
from retry import retry
from typing import Any


class ResponsivenessCalculatorError(Exception):
@@ -14,6 +13,7 @@ def __init__(self):
self.base_url = "https://content-searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run"
self._api_key = settings.GOOGLE_API_KEY

@retry(ResponsivenessCalculatorError, tries=3, delay=2, backoff=2)
def get_responsiveness(self, url: str) -> dict[str, Any]:
response = None
try:
3 changes: 3 additions & 0 deletions app/services/technologies_calculator.py
@@ -1,5 +1,6 @@
import json
import subprocess
from retry import retry
from typing import Any


@@ -8,6 +9,8 @@ class TechnologiesError(Exception):


class TechnologiesCalculator:

@retry(TechnologiesError, tries=3, delay=2, backoff=2)
def get_technologies(self, url: str) -> list[dict[str, Any]]:
try:
technologies_process = subprocess.run(
6 changes: 5 additions & 1 deletion requirements.txt
@@ -20,7 +20,7 @@ dnspython==2.4.2
fastapi==0.103.1
filelock==3.12.4
gevent==23.7.0
greenlet==2.0.2
greenlet==3.0.1
h11==0.14.0
hyperlink==21.0.0
idna==3.4
@@ -33,6 +33,7 @@ lxml==4.9.3
minio==7.1.15
packaging==23.1
parsel==1.8.1
playwright==1.40.0
prompt-toolkit==3.0.39
Protego==0.3.0
pyasn1==0.5.0
@@ -41,6 +42,7 @@ pycparser==2.21
pydantic==2.4.1
pydantic_core==2.10.1
PyDispatcher==2.0.7
pyee==11.0.1
pymongo==4.4.1
pyOpenSSL==23.2.0
python-dateutil==2.8.2
@@ -49,6 +51,7 @@ redis==4.6.0
requests==2.31.0
requests-file==1.5.1
Scrapy==2.9.0
scrapy-playwright==0.0.33
service-identity==23.1.0
six==1.16.0
sniffio==1.3.0
@@ -65,3 +68,4 @@ watchfiles==0.19.0
wcwidth==0.2.6
zope.event==5.0
zope.interface==6.0
retry~=0.9.2
18 changes: 18 additions & 0 deletions tests/tests_crawler/test_menesr.py
@@ -2,6 +2,7 @@
from unittest.mock import MagicMock
from urllib.parse import urlparse

from scrapy.http import HtmlResponse
from scrapy.spiders import Rule

from app.crawler.spider import MenesrSpider
@@ -37,6 +38,23 @@ def test_init_with_path(self):
def test_name(self):
self.assertEqual(MenesrSpider.name, "menesr")

def test_start_requests(self):
self.mock_crawl_process.config.url = "http://www.example.com/"
spider = MenesrSpider(self.mock_crawl_process)
request = next(spider.start_requests())
self.assertEqual(request.url, 'http://www.example.com/')
self.assertEqual(request.callback, spider.parse)

def test_parse(self):
self.mock_crawl_process.config.url = "http://example.com/"
spider = MenesrSpider(self.mock_crawl_process)
body = ('<html><a href="/recherche/lactualite-de-la-recherche"><span>L\'actualité de la '
'recherche</span></a></html>').encode('utf-8')
response = HtmlResponse(url='http://www.example.com', body=body, encoding='utf-8')
result = next(spider.parse(response))
assert result.url == 'http://www.example.com/recherche/lactualite-de-la-recherche'


if __name__ == "__main__":
unittest.main()
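Review note: the new tests assert the URL and callback but not the Playwright meta. A possible extension (a sketch reusing the existing setUp mock; not part of this PR):

    def test_start_requests_playwright_meta(self):
        # When use_playwright is enabled, the first request should carry Playwright meta.
        self.mock_crawl_process.config.url = 'http://www.example.com/'
        self.mock_crawl_process.config.parameters.use_playwright = True
        spider = MenesrSpider(self.mock_crawl_process)
        request = next(spider.start_requests())
        self.assertTrue(request.meta['playwright'])
        self.assertEqual(request.meta['depth'], 0)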
10 changes: 5 additions & 5 deletions tests/tests_models/test_crawl.py
@@ -12,7 +12,7 @@

class TestCrawlParametersConfig(unittest.TestCase):
def test_instantiation(self):
params = CrawlParameters(depth=2, limit=400)
params = CrawlParameters(depth=2, limit=400, use_playwright=False)
config = CrawlConfig(
url="http://example.com",
parameters=params,
@@ -29,7 +29,7 @@ class TestCrawlModel(unittest.TestCase):
def test_default_values(self):
config = CrawlConfig(
url="http://example.com",
parameters=CrawlParameters(depth=2, limit=400),
parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
headers={},
tags=[],
@@ -43,7 +43,7 @@ def test_default_values(self):
def test_enabled_metadata_property(self):
config = CrawlConfig(
url="http://example.com",
parameters=CrawlParameters(depth=2, limit=400),
parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
metadata_config={
MetadataType.LIGHTHOUSE: MetadataConfig(),
MetadataType.TECHNOLOGIES: MetadataConfig(enabled=False),
@@ -57,7 +57,7 @@ def test_enabled_metadata_property(self):
def test_init_tasks_method(self):
config = CrawlConfig(
url="http://example.com",
parameters=CrawlParameters(depth=2, limit=400),
parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
headers={},
tags=[],
@@ -73,7 +73,7 @@ class TestListCrawlResponse(unittest.TestCase):
def test_instantiation(self):
config = CrawlConfig(
url="http://example.com",
parameters=CrawlParameters(depth=2, limit=400),
parameters=CrawlParameters(depth=2, limit=400, use_playwright=True),
metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
headers={},
tags=[],
4 changes: 2 additions & 2 deletions tests/tests_models/test_process.py
@@ -27,7 +27,7 @@ def test_from_model_classmethod(self):
website_id="website_123",
config=CrawlConfig(
url="http://example.com",
parameters=CrawlParameters(depth=2, limit=400),
parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
metadata_config={MetadataType.LIGHTHOUSE: MetadataConfig()},
headers={},
tags=[],
@@ -40,7 +40,7 @@ def test_enabled_metadata_property(self):
def test_enabled_metadata_property(self):
config = CrawlConfig(
url="http://example.com",
parameters=CrawlParameters(depth=2, limit=400),
parameters=CrawlParameters(depth=2, limit=400, use_playwright=False),
metadata_config={
MetadataType.LIGHTHOUSE: MetadataConfig(),
MetadataType.TECHNOLOGIES: MetadataConfig(enabled=False),
5 changes: 5 additions & 0 deletions tests/tests_models/test_website.py
@@ -12,6 +12,7 @@ def test_default_values(self):
url="http://example.com",
depth=2,
limit=400,
use_playwright=False,
lighthouse=MetadataConfig(),
technologies_and_trackers=MetadataConfig(),
responsiveness=MetadataConfig(),
@@ -32,6 +33,7 @@ def test_to_config_method(self):
url="http://example.com",
depth=2,
limit=400,
use_playwright=True,
lighthouse=MetadataConfig(),
technologies_and_trackers=MetadataConfig(),
responsiveness=MetadataConfig(),
@@ -52,6 +54,7 @@ def test_refresh_next_crawl_date(self):
url="http://example.com",
depth=2,
limit=400,
use_playwright=True,
lighthouse=MetadataConfig(),
technologies_and_trackers=MetadataConfig(),
responsiveness=MetadataConfig(),
@@ -71,6 +74,7 @@ def test_instantiation(self):
url="http://example1.com",
depth=2,
limit=400,
use_playwright=True,
lighthouse=MetadataConfig(),
technologies_and_trackers=MetadataConfig(),
responsiveness=MetadataConfig(),
@@ -83,6 +87,7 @@ def test_instantiation(self):
url="http://example2.com",
depth=3,
limit=500,
use_playwright=False,
lighthouse=MetadataConfig(enabled=False),
technologies_and_trackers=MetadataConfig(),
responsiveness=MetadataConfig(),