Commit 5e4b55f: add trafilatura extractor

sbusso committed Mar 23, 2024
1 parent fdc37cb commit 5e4b55f
Showing 6 changed files with 347 additions and 10 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -16,6 +16,14 @@ First, clone the repository and install the dependencies:
 poetry add scrapework
 ```
 
+### Quick Start
+
+Flow:
+
+- Fetch: retrieve web pages
+- Extract: parse and extract structured data from pages
+- Pipeline: transform and export the structured data
+
 ### Spider Configuration
 
 - `start_urls`: A list of URLs to start scraping from.
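The Quick Start flow added above maps onto the classes touched in this commit. A minimal sketch of a spider wired through that flow; the `BlogSpider` name and URL are illustrative, not part of this diff, and the sketch assumes the `Spider` base class shown in `scrapework/spider.py` below:

```python
from scrapework.extractors import ArticleExtractor
from scrapework.spider import Spider


class BlogSpider(Spider):
    name = "blog"  # Spider subclasses must provide a name attribute
    start_urls = ["https://example.com/blog"]

    def extract(self, response):
        # Extract step: trafilatura pulls the main article text out of the page.
        return ArticleExtractor().extract(response)


BlogSpider().run()  # Fetch -> Extract -> Pipeline over each start URL
```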
266 changes: 265 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -13,6 +13,8 @@ python-dotenv = "^1.0.1"
 boto3 = "^1.34.68"
 pydantic = "^2.6.4"
 parsel = "^1.9.0"
+courlan = "^1.0.0"
+trafilatura = "^1.8.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.3.0"
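The two new dependencies back the extractor work below: trafilatura performs main-content extraction from HTML, and courlan, a URL-handling companion library from the same author, is pinned alongside it for URL cleaning and validation. A quick sanity check using their top-level APIs; the URLs are placeholders:

```python
import trafilatura
from courlan import check_url

# Hypothetical URL, purely for illustration.
html = trafilatura.fetch_url("https://example.com/")
if html:
    print(trafilatura.extract(html))  # main text of the page, or None if nothing is found

print(check_url("https://example.com/?utm_source=x"))  # (cleaned URL, domain) or None
```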
18 changes: 16 additions & 2 deletions scrapework/extractors.py
@@ -1,16 +1,30 @@
 from typing import Any, Dict, Iterable, Union
 
 from parsel import Selector
+from pydantic import BaseModel
+from trafilatura import bare_extraction
 
 
-class Extractor:
+class Extractor(BaseModel):
     def extract(self, response) -> Union[Dict[str, Any], Iterable[Dict[str, Any]]]:
         raise NotImplementedError
 
-    def extract_body(self, response) -> Dict[str, str]:
+
+class BodyExtractor(Extractor):
+    def extract(self, response) -> Dict[str, str]:
         body = Selector(response.text).xpath("//body/text()").get()
 
         if not body:
             raise ValueError("Body not found")
 
         return {"body": body}
+
+
+class ArticleExtractor(Extractor):
+    def extract(self, response) -> Dict[str, str]:
+        article = bare_extraction(response.text)
+
+        if not article:
+            raise ValueError("Article not found")
+
+        return {"text": article.text}
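One caveat on `ArticleExtractor`: in the trafilatura 1.x series pinned in `pyproject.toml`, `bare_extraction` returns a plain dict by default (an object exposing a `.text` attribute arrived in later releases), so `article.text` above may need to be `article["text"]` depending on the installed version. A version-tolerant sketch, with a made-up sample page:

```python
from trafilatura import bare_extraction

html = """<html><body><article>
<p>Trafilatura strips boilerplate and keeps the main text of a page.</p>
<p>Very short documents may yield no extraction at all.</p>
</article></body></html>"""

article = bare_extraction(html)
if article is not None:
    # 1.x returns a dict; newer releases return a Document object.
    text = article["text"] if isinstance(article, dict) else article.text
    print(text)
```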
57 changes: 53 additions & 4 deletions scrapework/spider.py
@@ -1,12 +1,18 @@
 import logging
 from abc import ABC, abstractmethod
+from http import HTTPStatus
 from typing import Any, Callable, ClassVar, Dict, Iterable, List, Optional, Union
 
 import requests
 from pydantic import BaseModel, Field
+from urllib3.exceptions import (
+    HTTPError,
+    MaxRetryError,
+    TimeoutError,
+)
 
 from scrapework.config import EnvConfig
-from scrapework.extractors import Extractor
+from scrapework.extractors import BodyExtractor
 from scrapework.logger import logger
 from scrapework.middleware import Middleware
 from scrapework.pipelines import Pipeline
@@ -31,7 +37,7 @@ def __init__(self, **args):
             raise ValueError("Subclass must provide a name attribute")
         super().__init__(**args)
         self.config = self.SpiderConfig.create_config()
-        self.callback = self.parse
+        self.callback = self.extract
         if not self.base_url and self.start_urls:
             self.base_url = self.start_urls[0]
 
@@ -48,8 +54,8 @@ def use(self, middleware: Middleware):
         self.middlewares.append(middleware)
 
     @abstractmethod
-    def parse(self, response) -> Union[Dict[str, Any], Iterable[Dict[str, Any]]]:
-        Extractor().extract_body(response)
+    def extract(self, response) -> Union[Dict[str, Any], Iterable[Dict[str, Any]]]:
+        BodyExtractor().extract(response)
 
     def run(self):
         for url in self.start_urls:
@@ -90,3 +96,46 @@ def make_request(self, url: str) -> Optional[requests.Response]:
         self.logger.info(f"Received response with status code {response.status_code}")
 
         return response
+
+    def fetch(self, url):
+        """
+        Fetches the HTML content of a given URL.
+
+        :param url: The URL to fetch.
+        :return: The fetched HTML content as a string, or None if there was an error.
+        """
+        r = None
+
+        try:
+            self.logger.debug(f"fetching {url}")
+            r = requests.get(str(url), timeout=10)
+
+            if r is None:
+                logger.error(f"Failed to fetch {url} returned NONE")
+                return None
+            if r.status_code != HTTPStatus.OK:
+                logger.error(f"Failed to fetch {url} returned {r.status_code}")
+                return None
+
+            return r.text  # noqa: TRY300
+
+        except MaxRetryError as err:
+            logger.error(f"MaxRetryError fetching {url}")  # type: ignore
+            raise err
+
+        except TimeoutError as err:
+            logger.error(f"TimeoutError fetching {url}: {err}")  # type: ignore
+            raise err
+
+        except HTTPError as err:
+            logger.error(f"HTTPError fetching {url}: {err}")  # type: ignore
+            raise err
+
+        except Exception as err:
+            logger.error(f"Exception fetching {url}: {err}")  # type: ignore
+            raise err
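A note on the error handling in `fetch`: `requests` normally wraps urllib3 transport failures such as `MaxRetryError` and `TimeoutError` in its own exception types (`requests.exceptions.ConnectionError`, `requests.exceptions.Timeout`), so the urllib3-specific branches above will rarely fire and the final `except Exception` handler is what usually triggers. A caller handling errors re-raised by `fetch` may therefore prefer catching the `requests` hierarchy; a hedged sketch:

```python
import requests

# `spider` stands in for any concrete Spider subclass instance (illustrative).
try:
    html = spider.fetch("https://example.com/")  # str on HTTP 200, None otherwise
except requests.exceptions.Timeout:
    html = None  # the 10-second timeout elapsed
except requests.exceptions.RequestException:
    html = None  # connection errors, too many redirects, and so on
```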
6 changes: 3 additions & 3 deletions tests/test_extractors.py
@@ -1,13 +1,13 @@
 from unittest.mock import MagicMock
 
-from scrapework.extractors import Extractor
+from scrapework.extractors import BodyExtractor, Extractor
 
 
 def test_extract_body():
-    extractor = Extractor()
+    extractor = BodyExtractor()
     response = MagicMock()
     response.text = "<body>Hello, world!</body>"
-    result = extractor.extract_body(response)
+    result = extractor.extract(response)
     assert result == {"body": "Hello, world!"}
 
 
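A companion test for `ArticleExtractor` would be a natural follow-up; it is not part of this diff. Since trafilatura's heuristics can reject very short documents, the sketch below stubs out `bare_extraction` rather than relying on real extraction:

```python
from unittest.mock import MagicMock, patch

from scrapework.extractors import ArticleExtractor


def test_extract_article():
    extractor = ArticleExtractor()
    response = MagicMock()
    response.text = "<html><body><article><p>Hello, world!</p></article></body></html>"

    # Stub out trafilatura so the test does not depend on its extraction heuristics.
    article = MagicMock()
    article.text = "Hello, world!"
    with patch("scrapework.extractors.bare_extraction", return_value=article):
        result = extractor.extract(response)

    assert result == {"text": "Hello, world!"}
```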
