diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index f7544866a..c763ab578 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1491,6 +1491,8 @@ def __init__( # Network and Console Capturing Parameters capture_network_requests: bool = False, capture_console_messages: bool = False, + # Mermaid diagram source capture + extract_mermaid_source: bool = False, # Connection Parameters method: str = "GET", stream: bool = False, @@ -1632,6 +1634,8 @@ def __init__( # Network and Console Capturing Parameters self.capture_network_requests = capture_network_requests self.capture_console_messages = capture_console_messages + # Mermaid diagram source capture + self.extract_mermaid_source = extract_mermaid_source # Connection Parameters self.stream = stream diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index b9de25f6b..e3a2b672f 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -27,6 +27,44 @@ import aiofiles import aiohttp import chardet + +# --------------------------------------------------------------------------- +# Mermaid source capture — injected via page.add_init_script() before goto() +# Runs at document creation time, before mermaid.js replaces
+# <pre class="mermaid"> blocks with SVGs. Captured sources land in window.__mermaidSources.
+# ---------------------------------------------------------------------------
+_MERMAID_OBSERVER_SCRIPT = """
+(function () {
+ if (window.__mermaidSources) return; // guard against double-injection
+ window.__mermaidSources = [];
+ const _captured = new WeakSet();
+
+ function _captureMermaid() {
+ document.querySelectorAll(
+ 'pre.mermaid, div.mermaid, pre[class*="mermaid"], [class="mermaid"]'
+ ).forEach(function (el) {
+ if (!_captured.has(el) && el.textContent.trim()) {
+ _captured.add(el);
+ window.__mermaidSources.push(el.textContent.trim());
+ }
+ });
+ }
+
+ // Capture anything already in the DOM
+ _captureMermaid();
+
+ // Watch for mermaid elements added dynamically (e.g. React/Next.js SSR hydration)
+ var _observer = new MutationObserver(_captureMermaid);
+ _observer.observe(document.documentElement || document, {
+ subtree: true, childList: true
+ });
+
+ // Stop observing once the page is interactive (mermaid.js will have run)
+ document.addEventListener('DOMContentLoaded', function () {
+ setTimeout(function () { _observer.disconnect(); }, 3000);
+ }, { once: true });
+})();
+"""
from aiohttp.client import ClientTimeout
from urllib.parse import urlparse
from types import MappingProxyType
@@ -720,6 +758,12 @@ async def handle_request_failed_capture(request):
),
)
+ # Inject mermaid source capture observer before navigation
+ # Must run before mermaid.js replaces <pre class="mermaid"> blocks with SVGs
+ mermaid_sources: List[str] = []
+ if getattr(config, "extract_mermaid_source", False):
+ await page.add_init_script(_MERMAID_OBSERVER_SCRIPT)
+
# Handle page navigation and content loading
if not config.js_only:
await self.execute_hook("before_goto", page, context=context, url=url, config=config)
@@ -1019,6 +1063,27 @@ async def handle_request_failed_capture(request):
await self.execute_hook("on_execution_started", page, context=context, config=config)
await self.execute_hook("on_execution_ended", page, context=context, config=config, result=execution_result)
+ # Retrieve mermaid sources captured by the init_script observer
+ if getattr(config, "extract_mermaid_source", False):
+ try:
+ raw = await self.adapter.evaluate(
+ page, "() => window.__mermaidSources || []"
+ )
+ mermaid_sources = [s for s in (raw or []) if isinstance(s, str) and s.strip()]
+ if mermaid_sources:
+ self.logger.info(
+ message="Captured {count} mermaid diagram source(s)",
+ tag="MERMAID",
+ params={"count": len(mermaid_sources)},
+ )
+ except Exception as e:
+ self.logger.warning(
+ message="Failed to retrieve mermaid sources: {error}",
+ tag="MERMAID",
+ params={"error": str(e)},
+ )
+ mermaid_sources = []
+
# --- Phase 4: DOM processing before HTML capture ---
# Update image dimensions if needed
@@ -1162,6 +1227,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
# Include captured data if enabled
network_requests=captured_requests if config.capture_network_requests else None,
console_messages=captured_console if config.capture_console_messages else None,
+ mermaid_sources=mermaid_sources if mermaid_sources else None,
)
except Exception as e:
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index b0950ff8f..f0109e4eb 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -461,6 +461,7 @@ async def arun(
screenshot_data = async_response.screenshot
pdf_data = async_response.pdf_data
js_execution_result = async_response.js_execution_result
+ mermaid_sources = async_response.mermaid_sources
self.logger.url_status(
url=cache_context.display_url,
@@ -479,6 +480,7 @@ async def arun(
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
original_scheme=urlparse(url).scheme,
+ mermaid_sources=mermaid_sources,
**kwargs,
)
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 9853f788f..8575192d2 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -721,30 +721,117 @@ def _scrap(
elif content_element is None:
content_element = body
- # Replace mermaid SVGs with text before they get stripped
+ # Replace mermaid SVGs with their source (or extracted labels) before
+ # they get stripped by the HTML cleaner.
+ mermaid_sources: list = list(kwargs.get("mermaid_sources") or [])
+ mermaid_source_iter = iter(mermaid_sources)
+
for svg in body.xpath('.//svg[starts-with(@id, "mermaid-")]'):
try:
+ parent = svg.getparent()
+ if parent is None:
+ continue
+
+ # --- Path A: actual mermaid source captured pre-render ---
+ source = next(mermaid_source_iter, None)
+ if source:
+ placeholder = lhtml.Element("pre")
+ placeholder.set("class", "language-mermaid")
+ code = etree.SubElement(placeholder, "code")
+ code.set("class", "language-mermaid")
+ code.text = source
+
+ # Find the <pre> ancestor (mermaid.js wraps the SVG in the
+ # original <pre class="mermaid"> element). Replace the
+ # whole ancestor so we don't create nested fences.
+ pre_ancestor = None
+ node = parent
+ while node is not None:
+ if node.tag == "pre":
+ pre_ancestor = node
+ break
+ node = node.getparent()
+
+ if pre_ancestor is not None:
+ pre_parent = pre_ancestor.getparent()
+ if pre_parent is not None:
+ pre_parent.replace(pre_ancestor, placeholder)
+ else:
+ parent.replace(svg, placeholder)
+ continue
+
+ # --- Path B: fallback — extract visible labels from the SVG ---
diagram_type = svg.get("aria-roledescription", "diagram")
- # Extract text from node/edge labels
labels = []
- seen = set()
+ seen: set = set()
+
+ # foreignObject-based labels (flowchart, class, state…)
for el in svg.cssselect(".nodeLabel, .label span, .edgeLabel span"):
text = el.text_content().strip()
if text and text not in seen:
seen.add(text)
labels.append(text)
- if labels:
- # Build a pre block so it survives markdown conversion
+
+ # Native SVG text/tspan fallback (sequence, gantt, git…)
+ if not labels:
+ for el in svg.xpath(
+ './/*[local-name()="text"] | .//*[local-name()="tspan"]'
+ ):
+ text = (el.text or "").strip()
+ if text and text not in seen:
+ seen.add(text)
+ labels.append(text)
+
+ if not labels:
+ continue
+
+ # Detect an outer <pre> to avoid nested fences
+ ancestor = parent
+ inside_pre = False
+ while ancestor is not None:
+ if ancestor.tag == "pre":
+ inside_pre = True
+ break
+ ancestor = ancestor.getparent()
+
+ if inside_pre:
+ placeholder = lhtml.Element("span")
+ placeholder.text = "\n".join(labels)
+ parent.replace(svg, placeholder)
+ else:
placeholder = lhtml.Element("pre")
+ placeholder.set("class", "language-mermaid")
code = etree.SubElement(placeholder, "code")
code.set("class", "language-mermaid")
code.text = f"%% {diagram_type} diagram\n" + "\n".join(labels)
- parent = svg.getparent()
- if parent is not None:
- parent.replace(svg, placeholder)
+ parent.replace(svg, placeholder)
except Exception:
pass
+ # --- Pass 2: handle raw .mermaid containers whose source was
+ # captured by the observer but mermaid.js never rendered them
+ # (e.g. MkDocs Material with Intersection-Observer lazy rendering).
+ # Only process elements that still contain raw text (no SVG child).
+ remaining_sources = list(mermaid_source_iter) # sources not consumed above
+ if remaining_sources:
+ raw_containers = body.xpath(
+ './/*[contains(concat(" ", normalize-space(@class), " "), " mermaid ")]'
+ '[not(.//svg)]'
+ )
+ for container, source in zip(raw_containers, remaining_sources):
+ try:
+ parent = container.getparent()
+ if parent is None:
+ continue
+ placeholder = lhtml.Element("pre")
+ placeholder.set("class", "language-mermaid")
+ code = etree.SubElement(placeholder, "code")
+ code.set("class", "language-mermaid")
+ code.text = source
+ parent.replace(container, placeholder)
+ except Exception:
+ pass
+
# Remove script and style tags
for tag in ["style", "link", "meta", "noscript"]:
for element in body.xpath(f".//{tag}"):
diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py
index 9f241bacd..269356ec2 100644
--- a/crawl4ai/html2text/__init__.py
+++ b/crawl4ai/html2text/__init__.py
@@ -1146,7 +1146,14 @@ def handle_tag(self, tag, attrs, start):
# Handle pre tags
if tag == "pre":
if start:
- self.o("\n```\n") # Markdown code block start
+ # Detect language from class="language-X" on the element
+ cls = attrs.get("class", "")
+ lang = ""
+ for part in cls.split():
+ if part.startswith("language-"):
+ lang = part[len("language-"):]
+ break
+ self.o(f"\n```{lang}\n") # Markdown code block start
self.inside_pre = True
else:
self.o("\n```\n") # Markdown code block end
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index 506538970..0f9d9879c 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -343,6 +343,7 @@ class AsyncCrawlResponse(BaseModel):
redirected_status_code: Optional[int] = None
network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None
+ mermaid_sources: Optional[List[str]] = None
model_config = ConfigDict(arbitrary_types_allowed=True)