1010import logging
1111import re
1212from concurrent .futures import Future
13+ from dataclasses import dataclass , field
1314from threading import Thread
1415from typing import Any , Coroutine , Dict , List , Optional , TypeVar , Union
1516
2324T = TypeVar ("T" )
2425
2526
@dataclass(frozen=True)
class FetchResult:
    """Structured fetch result that preserves markdown plus routing metadata."""

    markdown: str
    final_url: str
    status_code: Optional[int]
    route_selected: str = ""
    host_profile_applied: str = ""
    challenge_detected: bool = False
    blocked_reason: str = ""
    artifact_paths: Dict[str, str] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def as_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict, shallow-copying the mutable mappings."""
        serialized: Dict[str, Any] = {
            "markdown": self.markdown,
            "final_url": self.final_url,
            "status_code": self.status_code,
            "route_selected": self.route_selected,
            "host_profile_applied": self.host_profile_applied,
            "challenge_detected": self.challenge_detected,
            "blocked_reason": self.blocked_reason,
        }
        # Copy the dicts so callers cannot mutate this frozen instance's state.
        serialized["artifact_paths"] = dict(self.artifact_paths)
        serialized["metadata"] = dict(self.metadata)
        return serialized
54+
2655def _run_coro_sync (coro : Coroutine [Any , Any , T ]) -> T :
2756 """
2857 Run an async coroutine from synchronous call sites.
@@ -154,13 +183,51 @@ def _dict_to_browser_config(config: Dict[str, Any]) -> "BrowserConfig":
154183 return BrowserConfig .from_dict (config )
155184
156185
157- async def aread_website_markdown (
186+ def _extract_route_selected (metadata : Dict [str , Any ]) -> str :
187+ """Resolve a stable route label from Playwright smart-fetch metadata."""
188+ candidate_values : List [str ] = []
189+ for key in ("attempt_profile" , "route_selected" , "fetch_route" , "strategy" ):
190+ value = metadata .get (key )
191+ if isinstance (value , str ) and value .strip ():
192+ candidate_values .append (value .strip ())
193+ resolved_routing = metadata .get ("resolved_routing" , {})
194+ if isinstance (resolved_routing , dict ):
195+ for key in ("strategy" , "primary_strategy" , "mode" ):
196+ value = resolved_routing .get (key )
197+ if isinstance (value , str ) and value .strip ():
198+ candidate_values .append (value .strip ())
199+ return candidate_values [0 ] if candidate_values else ""
200+
201+
202+ def _extract_artifact_paths (metadata : Dict [str , Any ]) -> Dict [str , str ]:
203+ """Collect artifact-like path fields without assuming a rigid metadata schema."""
204+ artifact_paths : Dict [str , str ] = {}
205+ for key , value in metadata .items ():
206+ if not isinstance (value , str ) or not value .strip ():
207+ continue
208+ lowered_key = key .lower ()
209+ lowered_value = value .lower ()
210+ if (
211+ "path" in lowered_key
212+ or "artifact" in lowered_key
213+ or "screenshot" in lowered_key
214+ ):
215+ artifact_paths [key ] = value
216+ continue
217+ if lowered_value .endswith (
218+ (".png" , ".jpg" , ".jpeg" , ".pdf" , ".html" , ".json" , ".md" )
219+ ):
220+ artifact_paths [key ] = value
221+ return artifact_paths
222+
223+
224+ async def aread_website_markdown_result (
158225 website_url : str ,
159226 config : Optional [Union [Dict [str , Any ], ParserConfig , BrowserConfig ]] = None ,
160227 selector : Optional [str ] = None ,
161228 max_length : Optional [int ] = None ,
162229 playwright_manager : Optional [Any ] = None ,
163- ) -> str :
230+ ) -> FetchResult :
164231 """Async version of read_website_markdown.
165232
166233 Fetches a website and converts the content to clean Markdown.
@@ -192,6 +259,19 @@ async def aread_website_markdown(
192259
193260 # Use Smart Fetch for robustness
194261 content , final_url , status_code = await manager .smart_fetch (url = website_url )
262+ fetch_metadata = (
263+ manager .get_last_fetch_metadata ()
264+ if hasattr (manager , "get_last_fetch_metadata" )
265+ else {}
266+ )
267+ if not isinstance (fetch_metadata , dict ):
268+ fetch_metadata = {}
269+ route_selected = _extract_route_selected (fetch_metadata )
270+ host_profile_applied = str (
271+ fetch_metadata .get ("active_host_profile_applied" , "" )
272+ ).strip ()
273+ blocked_reason = str (fetch_metadata .get ("blocked_reason" , "" )).strip ()
274+ challenge_detected = blocked_reason .lower () not in {"" , "none" }
195275
196276 if status_code == 200 and content :
197277 # Selector filtering (BeautifulSoup)
@@ -201,7 +281,17 @@ async def aread_website_markdown(
201281 if selected_tag :
202282 content = str (selected_tag )
203283 else :
204- return f"Error: Selector '{ selector } ' not found on { website_url } "
284+ return FetchResult (
285+ markdown = f"Error: Selector '{ selector } ' not found on { website_url } " ,
286+ final_url = final_url ,
287+ status_code = status_code ,
288+ route_selected = route_selected ,
289+ host_profile_applied = host_profile_applied ,
290+ challenge_detected = challenge_detected ,
291+ blocked_reason = blocked_reason ,
292+ artifact_paths = _extract_artifact_paths (fetch_metadata ),
293+ metadata = fetch_metadata ,
294+ )
205295
206296 # Convert to Markdown
207297 markdown = MarkdownConverter .to_markdown (content , base_url = final_url )
@@ -218,19 +308,68 @@ async def aread_website_markdown(
218308 logger .info (
219309 f"Successfully scraped and converted { len (markdown )} chars from { final_url } "
220310 )
221- return output
222- else :
223- return f"Error: Failed to retrieve content from { website_url } . Status code: { status_code } "
311+ return FetchResult (
312+ markdown = output ,
313+ final_url = final_url ,
314+ status_code = status_code ,
315+ route_selected = route_selected ,
316+ host_profile_applied = host_profile_applied ,
317+ challenge_detected = challenge_detected ,
318+ blocked_reason = blocked_reason ,
319+ artifact_paths = _extract_artifact_paths (fetch_metadata ),
320+ metadata = fetch_metadata ,
321+ )
322+
323+ return FetchResult (
324+ markdown = (
325+ f"Error: Failed to retrieve content from { website_url } . "
326+ f"Status code: { status_code } "
327+ ),
328+ final_url = final_url ,
329+ status_code = status_code ,
330+ route_selected = route_selected ,
331+ host_profile_applied = host_profile_applied ,
332+ challenge_detected = challenge_detected ,
333+ blocked_reason = blocked_reason ,
334+ artifact_paths = _extract_artifact_paths (fetch_metadata ),
335+ metadata = fetch_metadata ,
336+ )
224337 except Exception as e :
225338 logger .error (
226339 f"An error occurred while scraping { website_url } : { e } " , exc_info = True
227340 )
228- return f"An error occurred while scraping the website: { str (e )} "
341+ return FetchResult (
342+ markdown = f"An error occurred while scraping the website: { str (e )} " ,
343+ final_url = website_url ,
344+ status_code = None ,
345+ blocked_reason = "exception" ,
346+ challenge_detected = False ,
347+ artifact_paths = {},
348+ metadata = {"error" : str (e )},
349+ )
229350 finally :
230351 if owns_manager and manager :
231352 await manager .stop ()
232353
233354
async def aread_website_markdown(
    website_url: str,
    config: Optional[Union[Dict[str, Any], ParserConfig, BrowserConfig]] = None,
    selector: Optional[str] = None,
    max_length: Optional[int] = None,
    playwright_manager: Optional[Any] = None,
) -> str:
    """Backward-compatible markdown fetch API returning only markdown text."""
    # Delegate to the structured variant and discard the routing metadata,
    # keeping the original string-returning contract for existing callers.
    fetch_result = await aread_website_markdown_result(
        website_url,
        config=config,
        selector=selector,
        max_length=max_length,
        playwright_manager=playwright_manager,
    )
    return fetch_result.markdown
371+
372+
234373# Backward-compatible alias
235374_arun_scrape_markdown = aread_website_markdown
236375
0 commit comments