add playwright

sbusso · Apr 27, 2024 · 6e98aab · 6e98aab
1 parent cd6dfbe
commit 6e98aab
Show file tree

Hide file tree

Showing 5 changed files with 185 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -133,6 +133,12 @@ To run the tests, use the following command:
 pytest tests/
 ```
 
+using playwright:
+
+```sh
+playwright install
+```
+
 ## Contributing
 
 Contributions are welcome! Please read the contributing guidelines first.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,6 +21,7 @@ httpx = "^0.27.0"
 hishel = ">=0.0.24,<0.0.27"
 rich = "^13.7.1"
 fake-useragent = "^1.5.1"
+playwright = "^1.43.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.3.0"

diff --git a/scrapework/middleware.py b/scrapework/middleware.py
@@ -103,3 +103,9 @@ def process_request(self, ctx: Context, request: Request):
         proxy = choice(self.proxies)
         request.proxy = proxy.url
         return request
+
+
+class PlaywrightMiddleware(RequestMiddleware):
+    def process_request(self, ctx: Context, request: Request):
+        request.playwright = True
+        return request
diff --git a/scrapework/request.py b/scrapework/request.py
@@ -2,7 +2,9 @@
 from typing import Any, Dict
 
 import httpx
-from httpx import URL, HTTPError, TimeoutException
+from httpx import URL, Client, HTTPError, TimeoutException
+from playwright.sync_api import Request as pRequest
+from playwright.sync_api import Route, sync_playwright
 
 from scrapework.core.http_client import HTTPClient, HttpxClient
 
@@ -19,6 +21,7 @@ class Request:
     cls_client: type[HTTPClient] = HttpxClient
     client_kwargs: Dict[str, Any] = {}
     request_kwargs: Dict[str, Any] = {}
+    playwright: bool = False
 
     def __init__(self, url: str, **kwargs):
         self.url = url
@@ -39,6 +42,61 @@ class Config:
     def urljoin(self, url: str) -> str:
         return str(URL(self.url).join(URL(url)))
 
+    def httpx_request_handler(
+        self, route: Route, request: pRequest, client: httpx.Client
+    ):
+
+        # Extract request details from Playwright's Request object
+        method = request.method
+        url = request.url
+        headers = dict(request.headers)
+        # data = request.post_data
+
+        try:
+            # Perform the request using HTTPX client
+            response = client.request(
+                method,
+                url,
+                headers=headers,
+                **self.request_kwargs,
+            )
+
+            # Return the response to Playwright
+
+            route.fulfill(
+                status=response.status_code,
+                headers=dict(response.headers),
+                body=response.content,
+            )
+        except Exception as e:
+            self.logger.error(f"Error: {e} fetching {url}")
+            route.abort()
+
+    def fetch_playwright(self, httpx_client: Client) -> httpx.Response:
+
+        with sync_playwright() as p:
+            browser = p.chromium.launch()
+
+            page = browser.new_page()
+
+            page.route(
+                "**/*",
+                lambda route, request: self.httpx_request_handler(
+                    route, request, httpx_client
+                ),
+            )
+
+            page.goto(self.url)
+            content = page.content()
+            browser.close()
+
+        return httpx.Response(
+            200,
+            request=httpx.Request("GET", self.url),
+            content=content.encode("utf-8"),
+            headers={},
+        )
+
     def fetch(self) -> httpx.Response:
         """
         Fetches the HTML content of a given URL.
@@ -48,6 +106,7 @@ def fetch(self) -> httpx.Response:
 
         :return: The fetched HTML content as a string, or None if there was an error.
         """
+
         if self.proxy:
             self.logger.debug(f"Using proxy: {self.proxy}")
             mounts = {
@@ -64,6 +123,9 @@ def fetch(self) -> httpx.Response:
             **self.client_kwargs,
         )
         try:
+            if self.playwright:
+                return self.fetch_playwright(client)
+
             response: httpx.Response = client.get(
                 self.request_url,
                 **self.request_kwargs,