Skip to content

Commit

Permalink
fix for scrapingbee fallback (#931)
Browse files Browse the repository at this point in the history
* fix for scrapingbee fallback

* feature flag v2 web scraping

* black fmt
  • Loading branch information
nihit authored Nov 7, 2024
1 parent 1ffc02d commit d7340d2
Showing 1 changed file with 21 additions and 0 deletions.
21 changes: 21 additions & 0 deletions src/autolabel/transforms/webpage_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from langchain.docstore.document import Document
from autolabel.cache import BaseCache
from scrapingbee import ScrapingBeeClient
import requests

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -47,6 +48,8 @@ def __init__(
timeout: int = DEFAULT_TIMEOUT,
scrapingbee_api_key: str = None,
max_retries: int = MAX_RETRIES,
v2_enabled: bool = False,
v2_api_key: str = None,
) -> None:
super().__init__(cache, output_columns)
self.url_column = url_column
Expand All @@ -60,12 +63,30 @@ def __init__(
"transparent_status_code": "True",
"js_scenario": JS_SCENARIO,
}
self.v2_enabled = v2_enabled
self.v2_api_key = v2_api_key

def name(self) -> str:
    """Return the identifier of this transform, ``TransformType.WEBPAGE_TRANSFORM``."""
    transform_type = TransformType.WEBPAGE_TRANSFORM
    return transform_type

def _load_url_v2(self, url: str) -> str:
    """Fetch the content of ``url`` through the ``r.jina.ai`` reader endpoint.

    Args:
        url: Address of the page whose content should be fetched.

    Returns:
        The text body of the reader endpoint's response.

    Raises:
        TransformError: If the request fails, or if the endpoint answers with
            an HTTP error status (4xx/5xx).
    """
    # Keep the caller's URL untouched so that log and error messages report the
    # page that was requested rather than the proxied address.
    reader_url = f"https://r.jina.ai/{url}"

    # Only authenticate when a key was actually configured; otherwise the
    # header would literally read "Bearer None".
    headers = {}
    if self.v2_api_key:
        headers["Authorization"] = f"Bearer {self.v2_api_key}"

    try:
        # NOTE(review): no timeout is passed, so a stalled connection can block
        # indefinitely; this is also a blocking call made from the async
        # `_load_url` path. Pick a timeout in seconds once the unit of the
        # transform's configured timeout is confirmed.
        response = requests.get(reader_url, headers=headers)
        # Without this check an error page (bad key, rate limit, upstream
        # failure) would be returned to the caller as if it were page content.
        response.raise_for_status()
        return response.text
    except Exception as e:
        logger.warning(f"Error fetching content from URL: {url}. Exception: {e}")
        raise TransformError(
            TransformErrorType.TRANSFORM_ERROR,
            f"Error fetching content from URL: {url}. Exception: {e}",
        ) from e

# On error, retry fetching the URL with a premium proxy. Only use exponential backoff for certain status codes.
async def _load_url(self, url: str, retry_count=0) -> str:
if self.v2_enabled:
return self._load_url_v2(url)
if retry_count >= self.max_retries:
logger.warning(f"Max retries reached for URL: {url}")
raise TransformError(
Expand Down

0 comments on commit d7340d2

Please sign in to comment.