Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@ adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
"scikit-learn>=1.6.0",
"apify_fingerprint_datapoints>=0.0.3",
"apify_fingerprint_datapoints>=0.11.0",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why?

"browserforge>=1.2.4"
]
beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
curl-impersonate = ["curl-cffi>=0.9.0"]
httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.11.0", "browserforge>=1.2.3"]
parsel = ["parsel>=1.10.0"]
playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.11.0", "browserforge>=1.2.3"]
otel = [
"opentelemetry-api>=1.34.1",
"opentelemetry-distro[otlp]>=0.54",
Expand Down
54 changes: 47 additions & 7 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@

from __future__ import annotations

import inspect
from asyncio import Lock
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from typing import TYPE_CHECKING, Any, cast

from browserforge.injectors.playwright import AsyncNewContext
from playwright.async_api import Browser, BrowserContext, Page, ProxySettings
from playwright.async_api import BrowserType as PlaywrightBrowserType
from typing_extensions import override

from crawlee._utils.docs import docs_group
Expand All @@ -28,6 +31,18 @@
logger = getLogger(__name__)


# Cache Playwright signatures to avoid overhead in critical path
@lru_cache(maxsize=1)
def _get_context_params_cache() -> dict[str, set[str]]:
    """Classify Playwright context option names by the context kind that accepts them.

    The parameter names of `BrowserType.launch_persistent_context` and `Browser.new_context` are read via
    `inspect.signature`. The function takes no arguments and is wrapped in `lru_cache`, so the signatures are
    inspected only on the first call and the very same mapping object is returned afterwards; callers must
    treat the mapping and its sets as read-only.

    Returns:
        A mapping with three keys:
            - `'common'`: names accepted by both callables.
            - `'persistent_unique'`: names accepted only by `launch_persistent_context`.
            - `'incognito_unique'`: names accepted only by `new_context`.
    """
    launch_persistent_params = set(inspect.signature(PlaywrightBrowserType.launch_persistent_context).parameters)
    new_context_params = set(inspect.signature(Browser.new_context).parameters)

    # Both callables are looked up on the class (unbound), so their signatures include the implicit `self`
    # receiver. It is not a user-facing option; leaving it in would let a `self` key pass validation and then
    # fail later with a confusing "multiple values for argument" error.
    launch_persistent_params.discard('self')
    new_context_params.discard('self')

    return {
        'common': launch_persistent_params & new_context_params,
        'persistent_unique': launch_persistent_params - new_context_params,
        'incognito_unique': new_context_params - launch_persistent_params,
    }


@docs_group('Browser management')
class PlaywrightBrowserController(BrowserController):
"""Controller for managing Playwright browser instances and their pages.
Expand Down Expand Up @@ -222,11 +237,38 @@ async def _create_browser_context(
`self._fingerprint_generator` is available.
"""
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}

params_cache = _get_context_params_cache()

filtered_options = {}
for key, value in browser_new_context_options.items():
if self._use_incognito_pages:
# Incognito mode (new_context)
if key in params_cache['common'] or key in params_cache['incognito_unique']:
filtered_options[key] = value
elif key in params_cache['persistent_unique']:
logger.warning(
f'Option "{key}" is only supported in persistent context mode '
'(use_incognito_pages=False) and will be ignored.'
)
else:
raise TypeError(f'"{key}" is not a valid Playwright context option.')
elif key in params_cache['common'] or key in params_cache['persistent_unique']:
# Persistent mode (launch_persistent_context)
filtered_options[key] = value
elif key in params_cache['incognito_unique']:
logger.warning(
f'Option "{key}" is only supported in incognito context mode '
'(use_incognito_pages=True) and will be ignored.'
)
else:
raise TypeError(f'"{key}" is not a valid Playwright context option.')

if proxy_info:
if browser_new_context_options.get('proxy'):
if filtered_options.get('proxy'):
logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

browser_new_context_options['proxy'] = ProxySettings(
filtered_options['proxy'] = ProxySettings(
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
username=proxy_info.username,
password=proxy_info.password,
Expand All @@ -236,7 +278,7 @@ async def _create_browser_context(
return await AsyncNewContext(
browser=self._browser,
fingerprint=self._fingerprint_generator.generate(),
**browser_new_context_options,
**filtered_options,
)

if self._header_generator:
Expand All @@ -256,7 +298,5 @@ async def _create_browser_context(
else:
extra_http_headers = None

browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
'extra_http_headers', extra_http_headers
)
return await self._browser.new_context(**browser_new_context_options)
filtered_options['extra_http_headers'] = filtered_options.get('extra_http_headers', extra_http_headers)
return await self._browser.new_context(**filtered_options)
62 changes: 62 additions & 0 deletions tests/unit/browsers/test_playwright_controller_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import pytest
from playwright.async_api import Browser, Playwright, async_playwright

from crawlee.browsers import PlaywrightBrowserController

if TYPE_CHECKING:
from collections.abc import AsyncGenerator


@pytest.fixture
async def playwright() -> AsyncGenerator[Playwright, None]:
    """Yield a started Playwright driver; the `async with` block stops it on teardown."""
    async with async_playwright() as driver:
        yield driver


@pytest.fixture
async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
    """Yield a Chromium browser launched with default options and close it after the test."""
    chromium_browser = await playwright.chromium.launch()
    yield chromium_browser
    await chromium_browser.close()


async def test_controller_validation_typo(browser: Browser) -> None:
    """An option name unknown to both Playwright context factories must raise `TypeError`."""
    controller = PlaywrightBrowserController(browser)
    # `match` is applied as a regular expression, so the trailing dot is escaped to pin the literal message
    # instead of accepting any character in that position.
    with pytest.raises(TypeError, match=r'"headles" is not a valid Playwright context option\.'):
        await controller.new_page(browser_new_context_options={'headles': True})
    await controller.close()


async def test_controller_validation_cross_mode_persistent(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
    """An incognito-only option given in persistent mode is dropped with a warning, not an error."""
    # `storage_state` is accepted only by `Browser.new_context` (incognito mode).
    incognito_only_options = {'storage_state': {'cookies': [], 'origins': []}}
    expected_warning = 'Option "storage_state" is only supported in incognito context mode'

    # Persistent mode is requested explicitly via use_incognito_pages=False.
    persistent_controller = PlaywrightBrowserController(browser, use_incognito_pages=False)
    with caplog.at_level(logging.WARNING):
        opened_page = await persistent_controller.new_page(browser_new_context_options=incognito_only_options)

    assert expected_warning in caplog.text
    await opened_page.close()
    await persistent_controller.close()


async def test_controller_validation_cross_mode_incognito(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
    """A persistent-only option given in incognito mode is dropped with a warning, not an error."""
    # `env` is accepted only by `BrowserType.launch_persistent_context` (persistent mode).
    persistent_only_options = {'env': {}}
    expected_warning = 'Option "env" is only supported in persistent context mode'

    incognito_controller = PlaywrightBrowserController(browser, use_incognito_pages=True)
    with caplog.at_level(logging.WARNING):
        opened_page = await incognito_controller.new_page(browser_new_context_options=persistent_only_options)

    assert expected_warning in caplog.text
    await opened_page.close()
    await incognito_controller.close()


async def test_controller_validation_valid_common(browser: Browser) -> None:
    """An option accepted by both context kinds is passed through and takes effect on the page."""
    expected_size = {'width': 800, 'height': 600}
    default_controller = PlaywrightBrowserController(browser)

    # `viewport` is accepted by both context factories; a copy is passed so the expected value stays independent.
    opened_page = await default_controller.new_page(
        browser_new_context_options={'viewport': dict(expected_size)},
    )

    assert opened_page.viewport_size == expected_size
    await opened_page.close()
    await default_controller.close()
10 changes: 7 additions & 3 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.