Skip to content

Commit

Permalink
add playwright
Browse files Browse the repository at this point in the history
  • Loading branch information
sbusso committed Apr 27, 2024
1 parent cd6dfbe commit 6e98aab
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 2 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,12 @@ To run the tests, use the following command:
pytest tests/
```

To fetch pages with Playwright, first install its browser binaries:

```sh
playwright install
```

## Contributing

Contributions are welcome! Please read the contributing guidelines first.
Expand Down
110 changes: 109 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ httpx = "^0.27.0"
hishel = ">=0.0.24,<0.0.27"
rich = "^13.7.1"
fake-useragent = "^1.5.1"
playwright = "^1.43.0"

[tool.poetry.group.dev.dependencies]
black = "^24.3.0"
Expand Down
6 changes: 6 additions & 0 deletions scrapework/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,9 @@ def process_request(self, ctx: Context, request: Request):
proxy = choice(self.proxies)
request.proxy = proxy.url
return request


class PlaywrightMiddleware(RequestMiddleware):
    """Request middleware that marks each request with ``playwright = True``."""

    def process_request(self, ctx: Context, request: Request):
        """Set the ``playwright`` flag on ``request`` and hand it back.

        :param ctx: Processing context; accepted but not used here.
        :param request: The request to flag.
        :return: The same ``request`` object, with ``playwright`` set to True.
        """
        setattr(request, "playwright", True)
        return request
64 changes: 63 additions & 1 deletion scrapework/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from typing import Any, Dict

import httpx
from httpx import URL, HTTPError, TimeoutException
from httpx import URL, Client, HTTPError, TimeoutException
from playwright.sync_api import Request as pRequest
from playwright.sync_api import Route, sync_playwright

from scrapework.core.http_client import HTTPClient, HttpxClient

Expand All @@ -19,6 +21,7 @@ class Request:
cls_client: type[HTTPClient] = HttpxClient
client_kwargs: Dict[str, Any] = {}
request_kwargs: Dict[str, Any] = {}
playwright: bool = False

def __init__(self, url: str, **kwargs):
self.url = url
Expand All @@ -39,6 +42,61 @@ class Config:
def urljoin(self, url: str) -> str:
return str(URL(self.url).join(URL(url)))

def httpx_request_handler(
    self, route: Route, request: pRequest, client: httpx.Client
):
    """Serve one intercepted browser request through the given HTTPX client.

    The browser request's method, URL, headers and body are replayed with
    ``client.request``; the HTTPX response is then handed back to the page
    via ``route.fulfill``. Any failure is logged and the route is aborted.

    :param route: Playwright route used to fulfill or abort the request.
    :param request: The intercepted Playwright request.
    :param client: HTTPX client that performs the actual network call.
    """
    method = request.method
    url = request.url

    try:
        # Work on a copy so self.request_kwargs is never mutated.
        kwargs = dict(self.request_kwargs)

        # Merge configured headers into the browser's headers instead of
        # passing ``headers`` twice (which raises TypeError). Configured
        # values win; keys are lower-cased so they replace the browser's
        # entry rather than being sent next to it (Playwright documents
        # request header names as lower-cased).
        headers = dict(request.headers)
        extra_headers = kwargs.pop("headers", None)
        if extra_headers:
            headers.update(
                {key.lower(): value for key, value in dict(extra_headers).items()}
            )

        # Forward the request body (e.g. POST payloads), unless the
        # configured kwargs already provide one.
        body = request.post_data_buffer
        body_keys = ("content", "data", "json", "files")
        if body is not None and not any(key in kwargs for key in body_keys):
            kwargs["content"] = body

        response = client.request(method, url, headers=headers, **kwargs)

        # ``response.content`` is already decoded by HTTPX, so headers that
        # describe the on-the-wire encoding/length would now be wrong and
        # make the browser mis-decode or truncate the body.
        stale = ("content-encoding", "content-length", "transfer-encoding")
        response_headers = {
            name: value
            for name, value in dict(response.headers).items()
            if name.lower() not in stale
        }

        route.fulfill(
            status=response.status_code,
            headers=response_headers,
            body=response.content,
        )
    except Exception as e:
        # Boundary handler: an unanswered route would leave the page
        # waiting, so log and abort whatever went wrong.
        self.logger.error(f"Error: {e} fetching {url}")
        route.abort()

def fetch_playwright(self, httpx_client: Client) -> httpx.Response:
    """Render ``self.url`` in headless Chromium and return the final HTML.

    Every request the page issues is routed through
    ``httpx_request_handler`` so it is performed by ``httpx_client``.
    The rendered document is wrapped in an ``httpx.Response``.

    :param httpx_client: HTTPX client used to perform the page's requests.
    :return: A response whose body is the rendered page content (UTF-8)
        and whose status is the navigation response's status, or 200 when
        Playwright reports no navigation response.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()

            # Intercept every request the page makes.
            page.route(
                "**/*",
                lambda route, request: self.httpx_request_handler(
                    route, request, httpx_client
                ),
            )

            navigation = page.goto(self.url)
            content = page.content()
        finally:
            # Close the browser even when navigation or rendering fails.
            browser.close()

    # Report the real navigation status instead of always claiming 200;
    # ``goto`` may return None, in which case 200 is kept as before.
    status = navigation.status if navigation is not None else 200

    return httpx.Response(
        status,
        request=httpx.Request("GET", self.url),
        content=content.encode("utf-8"),
        headers={},
    )

def fetch(self) -> httpx.Response:
"""
Fetches the HTML content of a given URL.
Expand All @@ -48,6 +106,7 @@ def fetch(self) -> httpx.Response:
:return: The fetched HTML content as a string, or None if there was an error.
"""

if self.proxy:
self.logger.debug(f"Using proxy: {self.proxy}")
mounts = {
Expand All @@ -64,6 +123,9 @@ def fetch(self) -> httpx.Response:
**self.client_kwargs,
)
try:
if self.playwright:
return self.fetch_playwright(client)

response: httpx.Response = client.get(
self.request_url,
**self.request_kwargs,
Expand Down

0 comments on commit 6e98aab

Please sign in to comment.