Commit 5e4b55f: add trafilatura extractor

sbusso committed Mar 23, 2024
1 parent fdc37cb commit 5e4b55f
Showing 6 changed files with 347 additions and 10 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -16,6 +16,14 @@ First, clone the repository and install the dependencies:
 poetry add scrapework
 ```
 
+### Quick Start
+
+Flow:
+
+- Fetch: retrieve web pages
+- Extract: parse and extract structured data from pages
+- Pipeline: transform and export the structured data
+
 ### Spider Configuration
 
 - `start_urls`: A list of URLs to start scraping from.
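The Quick Start flow added above maps onto the classes touched in this commit. A minimal sketch of a spider wired through that flow; the `BlogSpider` name and URL are illustrative, not part of this diff, and the sketch assumes the `Spider` base class shown in `scrapework/spider.py` below:

```python
from scrapework.extractors import ArticleExtractor
from scrapework.spider import Spider


class BlogSpider(Spider):
    name = "blog"  # Spider subclasses must provide a name attribute
    start_urls = ["https://example.com/blog"]

    def extract(self, response):
        # Extract step: trafilatura pulls the main article text out of the page.
        return ArticleExtractor().extract(response)


BlogSpider().run()  # Fetch -> Extract -> Pipeline over each start URL
```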
266 changes: 265 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -13,6 +13,8 @@ python-dotenv = "^1.0.1"
 boto3 = "^1.34.68"
 pydantic = "^2.6.4"
 parsel = "^1.9.0"
+courlan = "^1.0.0"
+trafilatura = "^1.8.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.3.0"
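The two new dependencies back the extractor work below: trafilatura performs main-content extraction from HTML, and courlan, a URL-handling companion library from the same author, is pinned alongside it for URL cleaning and validation. A quick sanity check using their top-level APIs; the URLs are placeholders:

```python
import trafilatura
from courlan import check_url

# Hypothetical URL, purely for illustration.
html = trafilatura.fetch_url("https://example.com/")
if html:
    print(trafilatura.extract(html))  # main text of the page, or None if nothing is found

print(check_url("https://example.com/?utm_source=x"))  # (cleaned URL, domain) or None
```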
18 changes: 16 additions & 2 deletions scrapework/extractors.py
@@ -1,16 +1,30 @@
 from typing import Any, Dict, Iterable, Union
 
 from parsel import Selector
+from pydantic import BaseModel
+from trafilatura import bare_extraction
 
 
-class Extractor:
+class Extractor(BaseModel):
     def extract(self, response) -> Union[Dict[str, Any], Iterable[Dict[str, Any]]]:
         raise NotImplementedError
 
-    def extract_body(self, response) -> Dict[str, str]:
+
+class BodyExtractor(Extractor):
+    def extract(self, response) -> Dict[str, str]:
         body = Selector(response.text).xpath("//body/text()").get()
 
         if not body:
             raise ValueError("Body not found")
 
         return {"body": body}
+
+
+class ArticleExtractor(Extractor):
+    def extract(self, response) -> Dict[str, str]:
+        article = bare_extraction(response.text)
+
+        if not article:
+            raise ValueError("Article not found")
+
+        return {"text": article.text}
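One caveat on `ArticleExtractor`: in the trafilatura 1.x series pinned in `pyproject.toml`, `bare_extraction` returns a plain dict by default (an object exposing a `.text` attribute arrived in later releases), so `article.text` above may need to be `article["text"]` depending on the installed version. A version-tolerant sketch, with a made-up sample page:

```python
from trafilatura import bare_extraction

html = """<html><body><article>
<p>Trafilatura strips boilerplate and keeps the main text of a page.</p>
<p>Very short documents may yield no extraction at all.</p>
</article></body></html>"""

article = bare_extraction(html)
if article is not None:
    # 1.x returns a dict; newer releases return a Document object.
    text = article["text"] if isinstance(article, dict) else article.text
    print(text)
```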
57 changes: 53 additions & 4 deletions scrapework/spider.py
@@ -1,12 +1,18 @@
 import logging
 from abc import ABC, abstractmethod
+from http import HTTPStatus
 from typing import Any, Callable, ClassVar, Dict, Iterable, List, Optional, Union
 
 import requests
 from pydantic import BaseModel, Field
+from urllib3.exceptions import (
+    HTTPError,
+    MaxRetryError,
+    TimeoutError,
+)
 
 from scrapework.config import EnvConfig
-from scrapework.extractors import Extractor
+from scrapework.extractors import BodyExtractor
 from scrapework.logger import logger
 from scrapework.middleware import Middleware
 from scrapework.pipelines import Pipeline
@@ -31,7 +37,7 @@ def __init__(self, **args):
             raise ValueError("Subclass must provide a name attribute")
         super().__init__(**args)
         self.config = self.SpiderConfig.create_config()
-        self.callback = self.parse
+        self.callback = self.extract
         if not self.base_url and self.start_urls:
             self.base_url = self.start_urls[0]
 
@@ -48,8 +54,8 @@ def use(self, middleware: Middleware):
         self.middlewares.append(middleware)
 
     @abstractmethod
-    def parse(self, response) -> Union[Dict[str, Any], Iterable[Dict[str, Any]]]:
-        Extractor().extract_body(response)
+    def extract(self, response) -> Union[Dict[str, Any], Iterable[Dict[str, Any]]]:
+        BodyExtractor().extract(response)
 
     def run(self):
         for url in self.start_urls:
@@ -90,3 +96,46 @@ def make_request(self, url: str) -> Optional[requests.Response]:
         self.logger.info(f"Received response with status code {response.status_code}")
 
         return response
+
+    def fetch(self, url):
+        """
+        Fetches the HTML content of a given URL.
+
+        :param url: The URL to fetch.
+        :return: The fetched HTML content as a string, or None if there was an error.
+        """
+        r = None
+
+        try:
+            self.logger.debug(f"fetching {url}")
+            r = requests.get(str(url), timeout=10)
+
+            if r is None:
+                logger.error(f"Failed to fetch {url} returned NONE")
+                return None
+            if r.status_code != HTTPStatus.OK:
+                logger.error(f"Failed to fetch {url} returned {r.status_code}")
+                return None
+
+            return r.text  # noqa: TRY300
+
+        except MaxRetryError as err:
+            logger.error(f"MaxRetryError fetching {url}")  # type: ignore
+            raise err
+
+        except TimeoutError as err:
+            logger.error(f"TimeoutError fetching {url}: {err}")  # type: ignore
+            raise err
+
+        except HTTPError as err:
+            logger.error(f"HTTPError fetching {url}: {err}")  # type: ignore
+            raise err
+
+        except Exception as err:
+            logger.error(f"Exception fetching {url}: {err}")  # type: ignore
+            raise err
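A note on the error handling in `fetch`: `requests` normally wraps urllib3 transport failures such as `MaxRetryError` and `TimeoutError` in its own exception types (`requests.exceptions.ConnectionError`, `requests.exceptions.Timeout`), so the urllib3-specific branches above will rarely fire and the final `except Exception` handler is what usually triggers. A caller handling errors re-raised by `fetch` may therefore prefer catching the `requests` hierarchy; a hedged sketch:

```python
import requests

# `spider` stands in for any concrete Spider subclass instance (illustrative).
try:
    html = spider.fetch("https://example.com/")  # str on HTTP 200, None otherwise
except requests.exceptions.Timeout:
    html = None  # the 10-second timeout elapsed
except requests.exceptions.RequestException:
    html = None  # connection errors, too many redirects, and so on
```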
6 changes: 3 additions & 3 deletions tests/test_extractors.py
@@ -1,13 +1,13 @@
 from unittest.mock import MagicMock
 
-from scrapework.extractors import Extractor
+from scrapework.extractors import BodyExtractor, Extractor
 
 
 def test_extract_body():
-    extractor = Extractor()
+    extractor = BodyExtractor()
     response = MagicMock()
     response.text = "<body>Hello, world!</body>"
-    result = extractor.extract_body(response)
+    result = extractor.extract(response)
     assert result == {"body": "Hello, world!"}
 
 
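A companion test for `ArticleExtractor` would be a natural follow-up; it is not part of this diff. Since trafilatura's heuristics can reject very short documents, the sketch below stubs out `bare_extraction` rather than relying on real extraction:

```python
from unittest.mock import MagicMock, patch

from scrapework.extractors import ArticleExtractor


def test_extract_article():
    extractor = ArticleExtractor()
    response = MagicMock()
    response.text = "<html><body><article><p>Hello, world!</p></article></body></html>"

    # Stub out trafilatura so the test does not depend on its extraction heuristics.
    article = MagicMock()
    article.text = "Hello, world!"
    with patch("scrapework.extractors.bare_extraction", return_value=article):
        result = extractor.extract(response)

    assert result == {"text": "Hello, world!"}
```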
