Commit 7f09ce6

Add fetch result metadata API and release 0.3.1
1 parent 6066f14 commit 7f09ce6

6 files changed

Lines changed: 254 additions & 64 deletions


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "web-scraper-toolkit"
-version = "0.3.0"
+version = "0.3.1"
 description = "A powerful, standalone web scraping toolkit using Playwright and various parsers."
 readme = "README.md"
 requires-python = ">=3.10"

src/web_scraper_toolkit/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 Operational notes: this is the canonical API surface; keep exports intentional and synchronized with docs/tests.
 """
 
-__version__ = "0.3.0"
+__version__ = "0.3.1"
 
 # Configs (Modular)
 from .browser.config import BrowserConfig

src/web_scraper_toolkit/core/script_diagnostics.py

Lines changed: 11 additions & 4 deletions
@@ -18,6 +18,7 @@
 import time
 from dataclasses import asdict, dataclass
 from datetime import datetime, timezone
+from functools import lru_cache
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Sequence
 
@@ -39,18 +40,24 @@ def _utc_now_iso() -> str:
     return datetime.now(timezone.utc).isoformat()
 
 
-def split_cli_args(raw: str) -> List[str]:
-    """Split a shell-like argument string into argv tokens."""
+@lru_cache(maxsize=512)
+def _split_cli_args_cached(raw: str) -> tuple[str, ...]:
+    """Cached argv splitter for hot-path diagnostic argument parsing."""
     if not raw.strip():
-        return []
+        return ()
     tokens = shlex.split(raw, posix=False)
     normalized: List[str] = []
     for token in tokens:
         if len(token) >= 2 and token[0] == token[-1] and token[0] in {"'", '"'}:
             normalized.append(token[1:-1])
         else:
            normalized.append(token)
-    return normalized
+    return tuple(normalized)
+
+
+def split_cli_args(raw: str) -> List[str]:
+    """Split a shell-like argument string into argv tokens."""
+    return list(_split_cli_args_cached(raw))
 
 
 def _safe_read_json(path: Path) -> Optional[Any]:
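
For context, a minimal usage sketch of the cached splitter above (illustrative call, not part of the commit; assumes the module imports from its file path):

# Illustrative sketch: split_cli_args now delegates to the lru_cache-backed helper.
from web_scraper_toolkit.core.script_diagnostics import split_cli_args

# shlex.split(..., posix=False) keeps surrounding quotes; the normalization loop strips them.
args = split_cli_args('--mode "smoke test" --retries 3')
print(args)  # expected: ['--mode', 'smoke test', '--retries', '3']

# A repeat call with the same raw string hits the cache; the public wrapper
# converts the cached tuple back into a fresh list for each caller.
assert split_cli_args('--mode "smoke test" --retries 3') == args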

src/web_scraper_toolkit/parsers/__init__.py

Lines changed: 54 additions & 51 deletions
@@ -1,51 +1,54 @@
-# ./src/web_scraper_toolkit/parsers/__init__.py
-"""
-Parsers Package
-===============
-
-Exports key parser utilities, extraction, search, and sitemap functions.
-
-Sub-packages:
-- extraction: Contact, metadata, media extraction
-- search: Web search and SERP parsing
-- sitemap: Sitemap discovery and parsing
-"""
-
-from .html_to_markdown import MarkdownConverter
-from .sitemap import (
-    fetch_sitemap_content as fetch_sitemap,
-    parse_sitemap_urls as parse_sitemap,
-    extract_sitemap_tree,
-)
-from .scraping_tools import read_website_markdown, read_website_content
-
-# Re-exports from extraction sub-package (backward compatibility)
-from .extraction.contacts import extract_emails, extract_phones, extract_socials
-from .extraction.metadata import extract_metadata
-from .extraction.media import capture_screenshot, save_as_pdf
-
-# Re-exports from search sub-package
-from .search.search import general_web_search, deep_research_with_google
-from .search.serp_parser import SerpParser
-
-__all__ = [
-    # Core
-    "MarkdownConverter",
-    "read_website_markdown",
-    "read_website_content",
-    # Sitemap
-    "fetch_sitemap",
-    "parse_sitemap",
-    "extract_sitemap_tree",
-    # Extraction
-    "extract_emails",
-    "extract_phones",
-    "extract_socials",
-    "extract_metadata",
-    "capture_screenshot",
-    "save_as_pdf",
-    # Search
-    "general_web_search",
-    "deep_research_with_google",
-    "SerpParser",
-]
+# ./src/web_scraper_toolkit/parsers/__init__.py
+"""
+Parsers Package
+===============
+
+Exports key parser utilities, extraction, search, and sitemap functions.
+
+Sub-packages:
+- extraction: Contact, metadata, media extraction
+- search: Web search and SERP parsing
+- sitemap: Sitemap discovery and parsing
+"""
+
+from .html_to_markdown import MarkdownConverter
+from .sitemap import (
+    fetch_sitemap_content as fetch_sitemap,
+    parse_sitemap_urls as parse_sitemap,
+    extract_sitemap_tree,
+)
+from .content import FetchResult, aread_website_markdown_result
+from .scraping_tools import read_website_markdown, read_website_content
+
+# Re-exports from extraction sub-package (backward compatibility)
+from .extraction.contacts import extract_emails, extract_phones, extract_socials
+from .extraction.metadata import extract_metadata
+from .extraction.media import capture_screenshot, save_as_pdf
+
+# Re-exports from search sub-package
+from .search.search import general_web_search, deep_research_with_google
+from .search.serp_parser import SerpParser
+
+__all__ = [
+    # Core
+    "MarkdownConverter",
+    "FetchResult",
+    "aread_website_markdown_result",
+    "read_website_markdown",
+    "read_website_content",
+    # Sitemap
+    "fetch_sitemap",
+    "parse_sitemap",
+    "extract_sitemap_tree",
+    # Extraction
+    "extract_emails",
+    "extract_phones",
+    "extract_socials",
+    "extract_metadata",
+    "capture_screenshot",
+    "save_as_pdf",
+    # Search
+    "general_web_search",
+    "deep_research_with_google",
+    "SerpParser",
+]
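
A quick, hypothetical import check for the new re-exports (not part of the commit):

# Hypothetical sanity check: the new result API is importable from the parsers package.
from web_scraper_toolkit.parsers import FetchResult, aread_website_markdown_result

# FetchResult is a frozen dataclass; as_dict() gives a JSON-friendly view of it.
result = FetchResult(markdown="# Hello", final_url="https://example.com", status_code=200)
print(result.as_dict()["status_code"])  # 200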

src/web_scraper_toolkit/parsers/content.py

Lines changed: 146 additions & 7 deletions
@@ -10,6 +10,7 @@
 import logging
 import re
 from concurrent.futures import Future
+from dataclasses import dataclass, field
 from threading import Thread
 from typing import Any, Coroutine, Dict, List, Optional, TypeVar, Union
 
@@ -23,6 +24,34 @@
 T = TypeVar("T")
 
 
+@dataclass(frozen=True)
+class FetchResult:
+    """Structured fetch result that preserves markdown plus routing metadata."""
+
+    markdown: str
+    final_url: str
+    status_code: Optional[int]
+    route_selected: str = ""
+    host_profile_applied: str = ""
+    challenge_detected: bool = False
+    blocked_reason: str = ""
+    artifact_paths: Dict[str, str] = field(default_factory=dict)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def as_dict(self) -> Dict[str, Any]:
+        return {
+            "markdown": self.markdown,
+            "final_url": self.final_url,
+            "status_code": self.status_code,
+            "route_selected": self.route_selected,
+            "host_profile_applied": self.host_profile_applied,
+            "challenge_detected": self.challenge_detected,
+            "blocked_reason": self.blocked_reason,
+            "artifact_paths": dict(self.artifact_paths),
+            "metadata": dict(self.metadata),
+        }
+
+
 def _run_coro_sync(coro: Coroutine[Any, Any, T]) -> T:
     """
     Run an async coroutine from synchronous call sites.
@@ -154,13 +183,51 @@ def _dict_to_browser_config(config: Dict[str, Any]) -> "BrowserConfig":
     return BrowserConfig.from_dict(config)
 
 
-async def aread_website_markdown(
+def _extract_route_selected(metadata: Dict[str, Any]) -> str:
+    """Resolve a stable route label from Playwright smart-fetch metadata."""
+    candidate_values: List[str] = []
+    for key in ("attempt_profile", "route_selected", "fetch_route", "strategy"):
+        value = metadata.get(key)
+        if isinstance(value, str) and value.strip():
+            candidate_values.append(value.strip())
+    resolved_routing = metadata.get("resolved_routing", {})
+    if isinstance(resolved_routing, dict):
+        for key in ("strategy", "primary_strategy", "mode"):
+            value = resolved_routing.get(key)
+            if isinstance(value, str) and value.strip():
+                candidate_values.append(value.strip())
+    return candidate_values[0] if candidate_values else ""
+
+
+def _extract_artifact_paths(metadata: Dict[str, Any]) -> Dict[str, str]:
+    """Collect artifact-like path fields without assuming a rigid metadata schema."""
+    artifact_paths: Dict[str, str] = {}
+    for key, value in metadata.items():
+        if not isinstance(value, str) or not value.strip():
+            continue
+        lowered_key = key.lower()
+        lowered_value = value.lower()
+        if (
+            "path" in lowered_key
+            or "artifact" in lowered_key
+            or "screenshot" in lowered_key
+        ):
+            artifact_paths[key] = value
+            continue
+        if lowered_value.endswith(
+            (".png", ".jpg", ".jpeg", ".pdf", ".html", ".json", ".md")
+        ):
+            artifact_paths[key] = value
+    return artifact_paths
+
+
+async def aread_website_markdown_result(
     website_url: str,
     config: Optional[Union[Dict[str, Any], ParserConfig, BrowserConfig]] = None,
     selector: Optional[str] = None,
     max_length: Optional[int] = None,
     playwright_manager: Optional[Any] = None,
-) -> str:
+) -> FetchResult:
     """Async version of read_website_markdown.
 
     Fetches a website and converts the content to clean Markdown.
@@ -192,6 +259,19 @@ async def aread_website_markdown(
 
         # Use Smart Fetch for robustness
        content, final_url, status_code = await manager.smart_fetch(url=website_url)
+        fetch_metadata = (
+            manager.get_last_fetch_metadata()
+            if hasattr(manager, "get_last_fetch_metadata")
+            else {}
+        )
+        if not isinstance(fetch_metadata, dict):
+            fetch_metadata = {}
+        route_selected = _extract_route_selected(fetch_metadata)
+        host_profile_applied = str(
+            fetch_metadata.get("active_host_profile_applied", "")
+        ).strip()
+        blocked_reason = str(fetch_metadata.get("blocked_reason", "")).strip()
+        challenge_detected = blocked_reason.lower() not in {"", "none"}
 
         if status_code == 200 and content:
             # Selector filtering (BeautifulSoup)
@@ -201,7 +281,17 @@
                 if selected_tag:
                     content = str(selected_tag)
                 else:
-                    return f"Error: Selector '{selector}' not found on {website_url}"
+                    return FetchResult(
+                        markdown=f"Error: Selector '{selector}' not found on {website_url}",
+                        final_url=final_url,
+                        status_code=status_code,
+                        route_selected=route_selected,
+                        host_profile_applied=host_profile_applied,
+                        challenge_detected=challenge_detected,
+                        blocked_reason=blocked_reason,
+                        artifact_paths=_extract_artifact_paths(fetch_metadata),
+                        metadata=fetch_metadata,
+                    )
 
             # Convert to Markdown
             markdown = MarkdownConverter.to_markdown(content, base_url=final_url)
@@ -218,19 +308,68 @@
             logger.info(
                 f"Successfully scraped and converted {len(markdown)} chars from {final_url}"
             )
-            return output
-        else:
-            return f"Error: Failed to retrieve content from {website_url}. Status code: {status_code}"
+            return FetchResult(
+                markdown=output,
+                final_url=final_url,
+                status_code=status_code,
+                route_selected=route_selected,
+                host_profile_applied=host_profile_applied,
+                challenge_detected=challenge_detected,
+                blocked_reason=blocked_reason,
+                artifact_paths=_extract_artifact_paths(fetch_metadata),
+                metadata=fetch_metadata,
+            )
+
+        return FetchResult(
+            markdown=(
+                f"Error: Failed to retrieve content from {website_url}. "
+                f"Status code: {status_code}"
+            ),
+            final_url=final_url,
+            status_code=status_code,
+            route_selected=route_selected,
+            host_profile_applied=host_profile_applied,
+            challenge_detected=challenge_detected,
+            blocked_reason=blocked_reason,
+            artifact_paths=_extract_artifact_paths(fetch_metadata),
+            metadata=fetch_metadata,
+        )
     except Exception as e:
         logger.error(
             f"An error occurred while scraping {website_url}: {e}", exc_info=True
         )
-        return f"An error occurred while scraping the website: {str(e)}"
+        return FetchResult(
+            markdown=f"An error occurred while scraping the website: {str(e)}",
+            final_url=website_url,
+            status_code=None,
+            blocked_reason="exception",
+            challenge_detected=False,
+            artifact_paths={},
+            metadata={"error": str(e)},
+        )
     finally:
         if owns_manager and manager:
             await manager.stop()
 
 
+async def aread_website_markdown(
+    website_url: str,
+    config: Optional[Union[Dict[str, Any], ParserConfig, BrowserConfig]] = None,
+    selector: Optional[str] = None,
+    max_length: Optional[int] = None,
+    playwright_manager: Optional[Any] = None,
+) -> str:
+    """Backward-compatible markdown fetch API returning only markdown text."""
+    result = await aread_website_markdown_result(
+        website_url,
+        config=config,
+        selector=selector,
+        max_length=max_length,
+        playwright_manager=playwright_manager,
+    )
+    return result.markdown
+
+
 # Backward-compatible alias
 _arun_scrape_markdown = aread_website_markdown
 
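
An end-to-end sketch of the new metadata-aware fetch API (URL and printed fields are illustrative; the signature and field names follow the diff above):

# Illustrative usage of aread_website_markdown_result; values are hypothetical.
import asyncio

from web_scraper_toolkit.parsers import aread_website_markdown_result


async def main() -> None:
    result = await aread_website_markdown_result("https://example.com")
    # Routing metadata now travels with the markdown instead of being discarded.
    print(result.status_code, result.route_selected, result.host_profile_applied)
    if result.challenge_detected:
        print("blocked:", result.blocked_reason, result.artifact_paths)
    else:
        print(result.markdown[:200])
    # as_dict() flattens everything for logging or persistence.
    print(sorted(result.as_dict()))


asyncio.run(main())

# Existing callers are unaffected: aread_website_markdown still returns only the
# markdown string, delegating to the new result-returning function.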
