Skip to content

Commit 4b92ae1

Browse files
committed
Harden Playwright routing/cancellation and bump to 0.2.4
1 parent d3974ab commit 4b92ae1

7 files changed

Lines changed: 382 additions & 198 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "web-scraper-toolkit"
7-
version = "0.2.3"
7+
version = "0.2.4"
88
description = "A powerful, standalone web scraping toolkit using Playwright and various parsers."
99
readme = "README.md"
1010
requires-python = ">=3.10"

src/web_scraper_toolkit/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
1212
"""
1313

14-
__version__ = "0.2.3"
14+
__version__ = "0.2.4"
1515

1616
# Configs (Modular)
1717
from .browser.config import BrowserConfig
@@ -37,6 +37,7 @@
3737
from .parsers.scraping_tools import (
3838
read_website_markdown,
3939
read_website_content,
40+
aread_website_markdown,
4041
capture_screenshot,
4142
save_as_pdf,
4243
extract_metadata,
@@ -71,6 +72,7 @@
7172
"smart_discover_urls",
7273
"read_website_markdown",
7374
"read_website_content",
75+
"aread_website_markdown",
7476
"capture_screenshot",
7577
"save_as_pdf",
7678
"extract_metadata",

src/web_scraper_toolkit/browser/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
class BrowserConfig:
1818
headless: bool = True
1919
browser_type: str = "chromium"
20+
stealth_mode: bool = True
2021
viewport_width: int = 1280
2122
viewport_height: int = 800
2223
timeout: int = 30000

src/web_scraper_toolkit/browser/playwright_handler.py

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,18 @@
4040

4141
from .config import BrowserConfig
4242

43+
try:
44+
from playwright_stealth import Stealth as _StealthClass # type: ignore[import-untyped]
45+
except ImportError: # pragma: no cover
46+
_StealthClass = None
47+
48+
try:
49+
from playwright_stealth import stealth_async as _legacy_stealth_async # type: ignore[import-untyped]
50+
except ImportError: # pragma: no cover - older/newer API compatibility
51+
_legacy_stealth_async = None
52+
4353
logger = logging.getLogger(__name__)
54+
_STEALTH_BACKEND_LOGGED = False
4455

4556
# --- OPTIMIZED CONFIGURATION ---
4657
DEFAULT_USER_AGENTS = [
@@ -94,6 +105,7 @@ def __init__(
94105

95106
self.browser_type_name = self.config.browser_type.lower()
96107
self.headless = self.config.headless
108+
self.stealth_mode = bool(getattr(self.config, "stealth_mode", True))
97109

98110
# Mapping properties
99111
self.user_agents = DEFAULT_USER_AGENTS
@@ -111,12 +123,64 @@ def __init__(
111123

112124
self._playwright: Optional[Playwright] = None
113125
self._browser: Optional[Browser] = None
126+
self._logged_socks_auth_skips: set[str] = set()
127+
self._stealth = _StealthClass() if _StealthClass is not None else None
128+
self._stealth_missing_warned = False
129+
global _STEALTH_BACKEND_LOGGED
130+
if not _STEALTH_BACKEND_LOGGED:
131+
if self._stealth is not None:
132+
logger.info("Playwright stealth backend: playwright_stealth.Stealth")
133+
elif callable(_legacy_stealth_async):
134+
logger.info(
135+
"Playwright stealth backend: playwright_stealth.stealth_async"
136+
)
137+
else:
138+
logger.info(
139+
"Playwright stealth backend: unavailable (basic webdriver scrub only)"
140+
)
141+
_STEALTH_BACKEND_LOGGED = True
114142

115143
logger.info(
116144
f"PlaywrightManager initialized: Browser={self.browser_type_name}, Headless={self.headless}, "
117145
f"Default Timeout={self.default_navigation_timeout_ms}ms"
118146
)
119147

148+
def _build_playwright_proxy_settings(self, proxy_obj: Any) -> Dict[str, str]:
149+
"""
150+
Build Playwright proxy settings from a Proxy model.
151+
NOTE: Playwright does not support SOCKS authentication fields.
152+
"""
153+
protocol = (
154+
proxy_obj.protocol.value
155+
if hasattr(proxy_obj.protocol, "value")
156+
else str(proxy_obj.protocol)
157+
).lower()
158+
proxy_settings: Dict[str, str] = {
159+
"server": f"{protocol}://{proxy_obj.hostname}:{proxy_obj.port}"
160+
}
161+
162+
username = str(getattr(proxy_obj, "username", "") or "")
163+
password = str(getattr(proxy_obj, "password", "") or "")
164+
has_auth = bool(username or password)
165+
supports_auth = protocol in {"http", "https"}
166+
167+
if supports_auth:
168+
if username:
169+
proxy_settings["username"] = username
170+
if password:
171+
proxy_settings["password"] = password
172+
elif has_auth:
173+
warning_key = f"{proxy_obj.hostname}:{proxy_obj.port}:{protocol}"
174+
if warning_key not in self._logged_socks_auth_skips:
175+
logger.warning(
176+
"Proxy %s provided SOCKS credentials, but Playwright does not "
177+
"support SOCKS authentication. Credentials will be ignored.",
178+
warning_key,
179+
)
180+
self._logged_socks_auth_skips.add(warning_key)
181+
182+
return proxy_settings
183+
120184
@property
121185
def browser(self) -> Browser:
122186
"""Expose active browser for backward-compatible advanced operations."""
@@ -202,21 +266,12 @@ async def get_new_page(
202266
# Construct Playwright Proxy Dict
203267
# protocol://user:pass@host:port OR separate fields
204268
# Playwright expects: { "server": "...", "username": "...", "password": "..." }
205-
206-
# Protocol handling
207269
protocol = (
208270
proxy_obj.protocol.value
209271
if hasattr(proxy_obj.protocol, "value")
210272
else str(proxy_obj.protocol)
211273
)
212-
213-
proxy_settings = {
214-
"server": f"{protocol}://{proxy_obj.hostname}:{proxy_obj.port}"
215-
}
216-
if proxy_obj.username:
217-
proxy_settings["username"] = proxy_obj.username
218-
if proxy_obj.password:
219-
proxy_settings["password"] = proxy_obj.password
274+
proxy_settings = self._build_playwright_proxy_settings(proxy_obj)
220275

221276
base_context_options["proxy"] = proxy_settings
222277
logger.info(
@@ -239,16 +294,25 @@ async def get_new_page(
239294
)
240295

241296
# Tracker and ad blocking for faster, cleaner page loads
242-
await context.route(
243-
"**/*",
244-
lambda route: (
245-
route.abort()
246-
if self._is_tracker_or_ad(route.request.url)
247-
else route.continue_()
248-
),
249-
)
297+
async def _route_handler(route: Any) -> None:
298+
if self._is_tracker_or_ad(route.request.url):
299+
await route.abort()
300+
return
301+
await route.continue_()
302+
303+
await context.route("**/*", _route_handler)
250304

251305
page = await context.new_page()
306+
if self.stealth_mode and self._stealth is not None:
307+
await self._stealth.apply_stealth_async(page)
308+
elif self.stealth_mode and callable(_legacy_stealth_async):
309+
await _legacy_stealth_async(page)
310+
elif self.stealth_mode and not self._stealth_missing_warned:
311+
logger.warning(
312+
"stealth_mode is enabled but playwright_stealth is unavailable; "
313+
"falling back to basic webdriver scrubbing only."
314+
)
315+
self._stealth_missing_warned = True
252316
return page, context
253317
except Exception as e:
254318
logger.error(f"Error creating new page and context: {e}", exc_info=True)
@@ -404,6 +468,14 @@ async def fetch_page_content(
404468
f"Playwright: Timeout on {current_url_val} "
405469
f"(attempt {attempt + 1}/{effective_retries + 1}): {pte}"
406470
)
471+
except asyncio.CancelledError:
472+
logger.warning(
473+
"Playwright: Fetch cancelled on %s (attempt %s/%s).",
474+
current_url_val,
475+
attempt + 1,
476+
effective_retries + 1,
477+
)
478+
raise
407479
except Exception as e:
408480
logger.error(
409481
f"Playwright: Unexpected error on {current_url_val} "
@@ -493,6 +565,9 @@ async def smart_fetch(
493565
)
494566

495567
return content, final_url, status
568+
except asyncio.CancelledError:
569+
logger.warning("SmartFetch cancelled for %s.", url)
570+
raise
496571

497572
finally:
498573
if page:

src/web_scraper_toolkit/parsers/content.py

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,17 @@ async def _arun_scrape(
5656
"""Async helper for scraping."""
5757
manager = None
5858
# Config handling
59-
# Config handling
6059
browser_cfg = BrowserConfig() # default
6160
if isinstance(config, BrowserConfig):
6261
browser_cfg = config
6362
elif isinstance(config, dict):
64-
# Convert dict to BrowserConfig
6563
browser_cfg = BrowserConfig(
6664
headless=config.get("headless", True),
6765
browser_type=config.get("browser_type", "chromium"),
66+
stealth_mode=config.get("stealth_mode", True),
67+
viewport_width=config.get("viewport_width", 1280),
68+
viewport_height=config.get("viewport_height", 800),
69+
timeout=config.get("timeout", 30000),
6870
)
6971

7072
try:
@@ -154,28 +156,54 @@ def read_website_content(
154156
return _run_coro_sync(_arun_scrape(website_url, config))
155157

156158

157-
async def _arun_scrape_markdown(
159+
def _dict_to_browser_config(config: Dict[str, Any]) -> "BrowserConfig":
160+
"""Convert a dict to BrowserConfig, preserving all supported fields."""
161+
return BrowserConfig(
162+
headless=config.get("headless", True),
163+
browser_type=config.get("browser_type", "chromium"),
164+
stealth_mode=config.get("stealth_mode", True),
165+
viewport_width=config.get("viewport_width", 1280),
166+
viewport_height=config.get("viewport_height", 800),
167+
timeout=config.get("timeout", 30000),
168+
)
169+
170+
171+
async def aread_website_markdown(
158172
website_url: str,
159173
config: Optional[Union[Dict[str, Any], ParserConfig, BrowserConfig]] = None,
160174
selector: Optional[str] = None,
161175
max_length: Optional[int] = None,
176+
playwright_manager: Optional[Any] = None,
162177
) -> str:
163-
"""Async helper for scraping and converting to Markdown."""
164-
manager = None
178+
"""Async version of read_website_markdown.
179+
180+
Fetches a website and converts the content to clean Markdown.
181+
182+
Args:
183+
website_url: The full URL of the website to read.
184+
config: Browser configuration (dict, ParserConfig, or BrowserConfig).
185+
selector: Optional CSS selector to extract only specific content.
186+
max_length: Optional character limit for the output.
187+
playwright_manager: Optional pre-started PlaywrightManager instance
188+
for browser session reuse. When provided, the caller owns the
189+
lifecycle — this function will NOT start or stop the manager.
190+
"""
191+
owns_manager = playwright_manager is None
192+
manager = playwright_manager
193+
165194
browser_cfg = BrowserConfig()
166195
if isinstance(config, BrowserConfig):
167196
browser_cfg = config
168197
elif isinstance(config, dict):
169-
browser_cfg = BrowserConfig(
170-
headless=config.get("headless", True),
171-
browser_type=config.get("browser_type", "chromium"),
172-
timeout=config.get("timeout", 30000),
173-
)
198+
browser_cfg = _dict_to_browser_config(config)
199+
174200
try:
175-
from ..browser.playwright_handler import PlaywrightManager
201+
if manager is None:
202+
from ..browser.playwright_handler import PlaywrightManager
203+
204+
manager = PlaywrightManager(config=browser_cfg)
205+
await manager.start()
176206

177-
manager = PlaywrightManager(config=browser_cfg)
178-
await manager.start()
179207
# Use Smart Fetch for robustness
180208
content, final_url, status_code = await manager.smart_fetch(url=website_url)
181209

@@ -213,10 +241,14 @@ async def _arun_scrape_markdown(
213241
)
214242
return f"An error occurred while scraping the website: {str(e)}"
215243
finally:
216-
if manager:
244+
if owns_manager and manager:
217245
await manager.stop()
218246

219247

248+
# Backward-compatible alias
249+
_arun_scrape_markdown = aread_website_markdown
250+
251+
220252
def read_website_markdown(
221253
website_url: str,
222254
config: Optional[Union[Dict[str, Any], ParserConfig, BrowserConfig]] = None,
@@ -235,5 +267,5 @@ def read_website_markdown(
235267
"""
236268
logger.info(f"Executing read_website_markdown for URL: {website_url}")
237269
return _run_coro_sync(
238-
_arun_scrape_markdown(website_url, config, selector, max_length)
270+
aread_website_markdown(website_url, config, selector, max_length)
239271
)

0 commit comments

Comments (0)