From 1ffc02da813ba818a3e804e14062bd0867264f78 Mon Sep 17 00:00:00 2001
From: Nihit
Date: Tue, 5 Nov 2024 17:37:15 -0800
Subject: [PATCH] Remove static HTML based webscrape (#930)

---
 src/autolabel/transforms/__init__.py         |   2 -
 src/autolabel/transforms/schema.py           |   1 -
 src/autolabel/transforms/webpage_scrape.py   | 166 ------------------
 tests/unit/transforms/test_webpage_scrape.py |  87 ----------
 4 files changed, 256 deletions(-)
 delete mode 100644 src/autolabel/transforms/webpage_scrape.py
 delete mode 100644 tests/unit/transforms/test_webpage_scrape.py

diff --git a/src/autolabel/transforms/__init__.py b/src/autolabel/transforms/__init__.py
index e6210d94..5c8b3e16 100644
--- a/src/autolabel/transforms/__init__.py
+++ b/src/autolabel/transforms/__init__.py
@@ -7,7 +7,6 @@
 from .serper_maps import SerperMaps
 from .custom_api import CustomApi
 from .webpage_transform import WebpageTransform
-from .webpage_scrape import WebpageScrape
 from .image import ImageTransform
 from typing import Dict
 from autolabel.transforms.schema import TransformType
@@ -18,7 +17,6 @@
 TRANSFORM_REGISTRY = {
     TransformType.PDF: PDFTransform,
     TransformType.WEBPAGE_TRANSFORM: WebpageTransform,
-    TransformType.WEBPAGE_SCRAPE: WebpageScrape,
     TransformType.IMAGE: ImageTransform,
     TransformType.WEB_SEARCH_SERP_API: SerpApi,
     TransformType.WEB_SEARCH_SERPER: SerperApi,
diff --git a/src/autolabel/transforms/schema.py b/src/autolabel/transforms/schema.py
index 6f8da6b8..1618bda2 100644
--- a/src/autolabel/transforms/schema.py
+++ b/src/autolabel/transforms/schema.py
@@ -9,7 +9,6 @@
 class TransformType(str, Enum):
     """Enum containing all Transforms supported by autolabel"""
     WEBPAGE_TRANSFORM = "webpage_transform"
-    WEBPAGE_SCRAPE = "webpage_scrape"
     PDF = "pdf"
     IMAGE = "image"
     WEB_SEARCH_SERP_API = "web_search_serp_api"
diff --git a/src/autolabel/transforms/webpage_scrape.py b/src/autolabel/transforms/webpage_scrape.py
deleted file mode 100644
index 6b699411..00000000
--- a/src/autolabel/transforms/webpage_scrape.py
+++ /dev/null
@@ -1,166 +0,0 @@
-from autolabel.transforms.schema import (
-    TransformType,
-    TransformError,
-    TransformErrorType,
-)
-from autolabel.transforms import BaseTransform
-from typing import Dict, Any
-from urllib.parse import urlparse
-import asyncio
-import logging
-import pandas as pd
-import ssl
-
-from autolabel.cache import BaseCache
-
-logger = logging.getLogger(__name__)
-
-MAX_RETRIES = 5
-MAX_KEEPALIVE_CONNECTIONS = 20
-CONNECTION_TIMEOUT = 10
-MAX_CONNECTIONS = 100
-BACKOFF = 2
-HEADERS = {}
-HTML_PARSER = "html.parser"
-
-
-class WebpageScrape(BaseTransform):
-    COLUMN_NAMES = [
-        "content_column",
-        "content_in_bytes_column",
-        "soup_column",
-        "metadata_column",
-    ]
-
-    def __init__(
-        self,
-        cache: BaseCache,
-        output_columns: Dict[str, Any],
-        url_column: str,
-        timeout: int = 60,
-        headers: Dict[str, str] = HEADERS,
-        max_retries: int = MAX_RETRIES,
-    ) -> None:
-        super().__init__(cache, output_columns)
-        self.url_column = url_column
-        self.headers = headers
-        self.max_retries = max_retries
-        try:
-            from bs4 import BeautifulSoup
-            import httpx
-
-            if not headers.get("User-Agent"):
-                from fake_useragent import UserAgent
-
-                headers["User-Agent"] = UserAgent().random
-
-            self.httpx = httpx
-            self.timeout_time = timeout
-            self.timeout = httpx.Timeout(connect=CONNECTION_TIMEOUT, timeout=timeout)
-            limits = httpx.Limits(
-                max_keepalive_connections=MAX_KEEPALIVE_CONNECTIONS,
-                max_connections=MAX_CONNECTIONS,
-                keepalive_expiry=timeout,
-            )
-            self.client = httpx.AsyncClient(
-                timeout=self.timeout, limits=limits, follow_redirects=True
-            )
-            self.client_with_no_verify = httpx.AsyncClient(
-                timeout=self.timeout, limits=limits, follow_redirects=True, verify=False
-            )
-            self.beautiful_soup = BeautifulSoup
-        except ImportError:
-            raise ImportError(
-                "BeautifulSoup, httpx and fake_useragent are required to use the webpage transform. Please install them with the following command: pip install beautifulsoup4 httpx fake_useragent"
-            )
-
-    def name(self) -> str:
-        return TransformType.WEBPAGE_SCRAPE
-
-    def _load_metadata(self, url, soup) -> Dict[str, Any]:
-        metadata = {"url": url}
-        if soup.find("title"):
-            metadata["title"] = soup.find("title").get_text()
-        for meta in soup.find_all("meta"):
-            if meta.get("name") and meta.get("content"):
-                metadata[meta.get("name")] = meta.get("content")
-            elif meta.get("property") and meta.get("content"):
-                metadata[meta.get("property")] = meta.get("content")
-        return metadata
-
-    async def _load_url(
-        self, url: str, verify=True, headers=HEADERS, retry_count=0
-    ) -> Dict[str, Any]:
-        if retry_count >= self.max_retries:
-            logger.warning(f"Max retries reached for URL: {url}")
-            raise TransformError(
-                TransformErrorType.MAX_RETRIES_REACHED,
-                f"Max retries reached for URL: {url}",
-            )
-
-        try:
-            client = self.client
-            if not verify:
-                client = self.client_with_no_verify
-            response = await client.get(url, headers=headers)
-            response.raise_for_status()
-            # TODO: Add support for other parsers
-            content_bytes = response.content
-            soup = self.beautiful_soup(content_bytes, HTML_PARSER)
-            return {
-                "content": soup.get_text(),
-                "content_bytes": content_bytes,
-                "soup": soup,
-                "metadata": self._load_metadata(url, soup),
-            }
-        except self.httpx.ConnectTimeout as e:
-            logger.error(f"Timeout when fetching content from URL: {url}")
-            raise TransformError(
-                TransformErrorType.TRANSFORM_TIMEOUT,
-                "Timeout when fetching content from URL",
-            )
-        except ssl.SSLCertVerificationError as e:
-            logger.warning(
-                f"SSL verification error when fetching content from URL: {url}, retrying with verify=False"
-            )
-            await asyncio.sleep(BACKOFF**retry_count)
-            return await self._load_url(
-                url, verify=False, headers=headers, retry_count=retry_count + 1
-            )
-        except Exception as e:
-            logger.error(f"Error fetching content from URL: {url}. Exception: {e}")
-            raise e
-
-    async def _apply(self, row: Dict[str, Any]) -> Dict[str, Any]:
-        url = row[self.url_column]
-        url_response_data = {}
-        if pd.isna(url) or url == self.NULL_TRANSFORM_TOKEN:
-            raise TransformError(
-                TransformErrorType.INVALID_INPUT,
-                f"Empty url in row {row}",
-            )
-        else:
-            if not urlparse(url).scheme:
-                url = f"https://{url}"
-            url_response_data = await self._load_url(url)
-
-        transformed_row = {
-            self.output_columns["content_column"]: url_response_data.get("content"),
-            self.output_columns["content_in_bytes_column"]: url_response_data.get(
-                "content_bytes"
-            ),
-            self.output_columns["soup_column"]: url_response_data.get("soup"),
-            self.output_columns["metadata_column"]: url_response_data.get("metadata"),
-        }
-
-        return self._return_output_row(transformed_row)
-
-    def params(self):
-        return {
-            "url_column": self.url_column,
-            "output_columns": self.output_columns,
-            "timeout": self.timeout_time,
-        }
-
-    def input_columns(self):
-        return [self.url_column]
diff --git a/tests/unit/transforms/test_webpage_scrape.py b/tests/unit/transforms/test_webpage_scrape.py
deleted file mode 100644
index b2b1d702..00000000
--- a/tests/unit/transforms/test_webpage_scrape.py
+++ /dev/null
@@ -1,87 +0,0 @@
-from autolabel.transforms.webpage_scrape import WebpageScrape
-import pytest
-
-pytest_plugins = ("pytest_asyncio",)
-
-
-@pytest.mark.asyncio
-async def test_webpage_scrape():
-    # Initialize the transform class
-    transform = WebpageScrape(
-        output_columns={
-            "content_column": "webpage_content",
-            "metadata_column": "metadata",
-        },
-        url_column="url",
-        cache=None,
-    )
-
-    # Create a mock row
-    row = {"url": "en.wikipedia.org/wiki/Main_Page"}
-    # Transform the row
-    transformed_row = await transform.apply(row)
-    # Check the output
-    assert set(transformed_row.keys()) == set(
-        ["webpage_content", "metadata", "webpage_content_error", "metadata_error"]
-    )
-    assert isinstance(transformed_row["webpage_content"], str)
-    assert isinstance(transformed_row["metadata"], dict)
-    assert len(transformed_row["webpage_content"]) > 0
-
-
-@pytest.mark.asyncio
-async def test_empty_url():
-    # Initialize the transform class
-    transform = WebpageScrape(
-        output_columns={
-            "content_column": "webpage_content",
-        },
-        url_column="url",
-        cache=None,
-    )
-
-    # Create a mock row
-    row = {"url": transform.NULL_TRANSFORM_TOKEN}
-    # Transform the row
-    transformed_row = await transform.apply(row)
-    # Check the output
-    assert set(transformed_row.keys()) == set(
-        ["webpage_content", "webpage_content_error"]
-    )
-    assert (
-        transformed_row["webpage_content"]
-        == "INVALID_INPUT: Empty url in row {'url': 'NO_TRANSFORM'}"
-    )
-    assert (
-        transformed_row["webpage_content_error"]
-        == "INVALID_INPUT: Empty url in row {'url': 'NO_TRANSFORM'}"
-    )
-
-
-@pytest.mark.asyncio
-async def test_unreachable_url():
-    # Initialize the transform class
-    transform = WebpageScrape(
-        output_columns={
-            "content_column": "webpage_content",
-        },
-        url_column="url",
-        cache=None,
-    )
-
-    # Create a mock row
-    row = {"url": "http://portal.net.kp/"}
-    # Transform the row
-    transformed_row = await transform.apply(row)
-    # Check the output
-    assert set(transformed_row.keys()) == set(
-        ["webpage_content", "webpage_content_error"]
-    )
-    assert (
-        transformed_row["webpage_content"]
-        == "ENRICHMENT_TIMEOUT: Timeout when fetching content from URL"
-    )
-    assert (
-        transformed_row["webpage_content_error"]
-        == "ENRICHMENT_TIMEOUT: Timeout when fetching content from URL"
-    )