From 2d2285796da5985a2a30bd5a98bff74bdb8893d0 Mon Sep 17 00:00:00 2001
From: hafezparast
Date: Sat, 25 Apr 2026 12:01:58 +0800
Subject: [PATCH 1/2] feat: capture mermaid diagram source code during crawl
 (#1043)

When `extract_mermaid_source=True` is set on CrawlerRunConfig, crawl4ai
injects a MutationObserver via `page.add_init_script()` that captures
`<pre class="mermaid">` source text before mermaid.js replaces them with
SVGs. Captured sources are stored in `window.__mermaidSources`, retrieved
after page load, and embedded as proper ` ```mermaid ``` ` fenced blocks
in the markdown output.

- Two-path SVG replacement in content_scraping_strategy:
  - Path A: use actual captured source code (exact fidelity)
  - Path B: fallback label extraction from rendered SVG (aria-roledescription,
    .nodeLabel, SVG text/tspan elements) with outer-pre guard to prevent
    nested fenced code blocks
- html2text updated to read `class="language-*"` on `<pre>` tags and emit
  the correct fence language identifier
- `AsyncCrawlResponse.mermaid_sources` carries sources through the pipeline
- Default is `False`; set to `True` to enable

Co-Authored-By: Claude Sonnet 4.6 
---
 crawl4ai/async_configs.py             |  4 ++
 crawl4ai/async_crawler_strategy.py    | 66 ++++++++++++++++++++++
 crawl4ai/async_webcrawler.py          |  2 +
 crawl4ai/content_scraping_strategy.py | 79 ++++++++++++++++++++++++---
 crawl4ai/html2text/__init__.py        |  9 ++-
 crawl4ai/models.py                    |  1 +
 6 files changed, 152 insertions(+), 9 deletions(-)

diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index f7544866a..c763ab578 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1491,6 +1491,8 @@ def __init__(
         # Network and Console Capturing Parameters
         capture_network_requests: bool = False,
         capture_console_messages: bool = False,
+        # Mermaid diagram source capture
+        extract_mermaid_source: bool = False,
         # Connection Parameters
         method: str = "GET",
         stream: bool = False,
@@ -1632,6 +1634,8 @@ def __init__(
         # Network and Console Capturing Parameters
         self.capture_network_requests = capture_network_requests
         self.capture_console_messages = capture_console_messages
+        # Mermaid diagram source capture
+        self.extract_mermaid_source = extract_mermaid_source
 
         # Connection Parameters
         self.stream = stream
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index b9de25f6b..e3a2b672f 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -27,6 +27,44 @@
 import aiofiles
 import aiohttp
 import chardet
+
+# ---------------------------------------------------------------------------
+# Mermaid source capture — injected via page.add_init_script() before goto()
+# Runs at document creation time, before mermaid.js replaces <pre class="mermaid">
+# with SVGs.  Captured sources land in window.__mermaidSources.
+# ---------------------------------------------------------------------------
+_MERMAID_OBSERVER_SCRIPT = """
+(function () {
+  if (window.__mermaidSources) return;   // guard against double-injection
+  window.__mermaidSources = [];
+  const _captured = new WeakSet();
+
+  function _captureMermaid() {
+    document.querySelectorAll(
+      'pre.mermaid, div.mermaid, pre[class*="mermaid"], [class="mermaid"]'
+    ).forEach(function (el) {
+      if (!_captured.has(el) && el.textContent.trim()) {
+        _captured.add(el);
+        window.__mermaidSources.push(el.textContent.trim());
+      }
+    });
+  }
+
+  // Capture anything already in the DOM
+  _captureMermaid();
+
+  // Watch for mermaid elements added dynamically (e.g. React/Next.js SSR hydration)
+  var _observer = new MutationObserver(_captureMermaid);
+  _observer.observe(document.documentElement || document, {
+    subtree: true, childList: true
+  });
+
+  // Stop observing once the page is interactive (mermaid.js will have run)
+  document.addEventListener('DOMContentLoaded', function () {
+    setTimeout(function () { _observer.disconnect(); }, 3000);
+  }, { once: true });
+})();
+"""
 from aiohttp.client import ClientTimeout
 from urllib.parse import urlparse
 from types import MappingProxyType
@@ -720,6 +758,12 @@ async def handle_request_failed_capture(request):
                     ),
                 )
 
+            # Inject mermaid source capture observer before navigation
+            # Must run before mermaid.js replaces <pre class="mermaid"> with SVGs
+            mermaid_sources: List[str] = []
+            if getattr(config, "extract_mermaid_source", False):
+                await page.add_init_script(_MERMAID_OBSERVER_SCRIPT)
+
             # Handle page navigation and content loading
             if not config.js_only:
                 await self.execute_hook("before_goto", page, context=context, url=url, config=config)
@@ -1019,6 +1063,27 @@ async def handle_request_failed_capture(request):
                 await self.execute_hook("on_execution_started", page, context=context, config=config)
                 await self.execute_hook("on_execution_ended", page, context=context, config=config, result=execution_result)
 
+            # Retrieve mermaid sources captured by the init_script observer
+            if getattr(config, "extract_mermaid_source", False):
+                try:
+                    raw = await self.adapter.evaluate(
+                        page, "() => window.__mermaidSources || []"
+                    )
+                    mermaid_sources = [s for s in (raw or []) if isinstance(s, str) and s.strip()]
+                    if mermaid_sources:
+                        self.logger.info(
+                            message="Captured {count} mermaid diagram source(s)",
+                            tag="MERMAID",
+                            params={"count": len(mermaid_sources)},
+                        )
+                except Exception as e:
+                    self.logger.warning(
+                        message="Failed to retrieve mermaid sources: {error}",
+                        tag="MERMAID",
+                        params={"error": str(e)},
+                    )
+                    mermaid_sources = []
+
             # --- Phase 4: DOM processing before HTML capture ---
 
             # Update image dimensions if needed
@@ -1162,6 +1227,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
                 # Include captured data if enabled
                 network_requests=captured_requests if config.capture_network_requests else None,
                 console_messages=captured_console if config.capture_console_messages else None,
+                mermaid_sources=mermaid_sources if mermaid_sources else None,
             )
 
         except Exception as e:
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index b0950ff8f..f0109e4eb 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -461,6 +461,7 @@ async def arun(
                                 screenshot_data = async_response.screenshot
                                 pdf_data = async_response.pdf_data
                                 js_execution_result = async_response.js_execution_result
+                                mermaid_sources = async_response.mermaid_sources
 
                                 self.logger.url_status(
                                     url=cache_context.display_url,
@@ -479,6 +480,7 @@ async def arun(
                                     is_raw_html=True if url.startswith("raw:") else False,
                                     redirected_url=async_response.redirected_url,
                                     original_scheme=urlparse(url).scheme,
+                                    mermaid_sources=mermaid_sources,
                                     **kwargs,
                                 )
 
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 9853f788f..9fc3e4540 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -721,27 +721,90 @@ def _scrap(
             elif content_element is None:
                 content_element = body
 
-            # Replace mermaid SVGs with text before they get stripped
+            # Replace mermaid SVGs with their source (or extracted labels) before
+            # they get stripped by the HTML cleaner.
+            mermaid_sources: list = list(kwargs.get("mermaid_sources") or [])
+            mermaid_source_iter = iter(mermaid_sources)
+
             for svg in body.xpath('.//svg[starts-with(@id, "mermaid-")]'):
                 try:
+                    parent = svg.getparent()
+                    if parent is None:
+                        continue
+
+                    # --- Path A: actual mermaid source captured pre-render ---
+                    source = next(mermaid_source_iter, None)
+                    if source:
+                        placeholder = lhtml.Element("pre")
+                        placeholder.set("class", "language-mermaid")
+                        code = etree.SubElement(placeholder, "code")
+                        code.set("class", "language-mermaid")
+                        code.text = source
+
+                        # Find ancestor <pre> (mermaid.js wraps the SVG in the
+                        # original <pre class="mermaid"> element).  Replace the
+                        # whole ancestor so we don't create nested fences.
+                        pre_ancestor = None
+                        node = parent
+                        while node is not None:
+                            if node.tag == "pre":
+                                pre_ancestor = node
+                                break
+                            node = node.getparent()
+
+                        if pre_ancestor is not None:
+                            pre_parent = pre_ancestor.getparent()
+                            if pre_parent is not None:
+                                pre_parent.replace(pre_ancestor, placeholder)
+                        else:
+                            parent.replace(svg, placeholder)
+                        continue
+
+                    # --- Path B: fallback — extract visible labels from the SVG ---
                     diagram_type = svg.get("aria-roledescription", "diagram")
-                    # Extract text from node/edge labels
                     labels = []
-                    seen = set()
+                    seen: set = set()
+
+                    # foreignObject-based labels (flowchart, class, state…)
                     for el in svg.cssselect(".nodeLabel, .label span, .edgeLabel span"):
                         text = el.text_content().strip()
                         if text and text not in seen:
                             seen.add(text)
                             labels.append(text)
-                    if labels:
-                        # Build a pre block so it survives markdown conversion
+
+                    # Native SVG text/tspan fallback (sequence, gantt, git…)
+                    if not labels:
+                        for el in svg.xpath(
+                            './/*[local-name()="text"] | .//*[local-name()="tspan"]'
+                        ):
+                            text = (el.text or "").strip()
+                            if text and text not in seen:
+                                seen.add(text)
+                                labels.append(text)
+
+                    if not labels:
+                        continue
+
+                    # Detect outer <pre> to avoid nested fences
+                    ancestor = parent
+                    inside_pre = False
+                    while ancestor is not None:
+                        if ancestor.tag == "pre":
+                            inside_pre = True
+                            break
+                        ancestor = ancestor.getparent()
+
+                    if inside_pre:
+                        placeholder = lhtml.Element("span")
+                        placeholder.text = "\n".join(labels)
+                        parent.replace(svg, placeholder)
+                    else:
                         placeholder = lhtml.Element("pre")
+                        placeholder.set("class", "language-mermaid")
                         code = etree.SubElement(placeholder, "code")
                         code.set("class", "language-mermaid")
                         code.text = f"%% {diagram_type} diagram\n" + "\n".join(labels)
-                        parent = svg.getparent()
-                        if parent is not None:
-                            parent.replace(svg, placeholder)
+                        parent.replace(svg, placeholder)
                 except Exception:
                     pass
 
diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py
index 9f241bacd..269356ec2 100644
--- a/crawl4ai/html2text/__init__.py
+++ b/crawl4ai/html2text/__init__.py
@@ -1146,7 +1146,14 @@ def handle_tag(self, tag, attrs, start):
         # Handle pre tags
         if tag == "pre":
             if start:
-                self.o("\n```\n")  # Markdown code block start
+                # Detect language from class="language-X" on the <pre> element
+                cls = attrs.get("class", "")
+                lang = ""
+                for part in cls.split():
+                    if part.startswith("language-"):
+                        lang = part[len("language-"):]
+                        break
+                self.o(f"\n```{lang}\n")  # Markdown code block start
                 self.inside_pre = True
             else:
                 self.o("\n```\n")  # Markdown code block end
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index 506538970..0f9d9879c 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -343,6 +343,7 @@ class AsyncCrawlResponse(BaseModel):
     redirected_status_code: Optional[int] = None
     network_requests: Optional[List[Dict[str, Any]]] = None
     console_messages: Optional[List[Dict[str, Any]]] = None
+    mermaid_sources: Optional[List[str]] = None
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 

From 527512d5aafaf5d45370c644d2f329799c056081 Mon Sep 17 00:00:00 2001
From: hafezparast 
Date: Sun, 26 Apr 2026 12:05:23 +0800
Subject: [PATCH 2/2] fix: handle unrendered .mermaid containers (MkDocs
 Material lazy rendering)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add Pass 2 in mermaid processing: after SVG replacement, also convert raw
.mermaid containers that still contain text (no <svg> child). This handles
frameworks like MkDocs Material that use Intersection Observer to lazily
render mermaid — the observer captures sources pre-render, but SVGs are
never in the DOM at crawl time.

Co-Authored-By: Claude Sonnet 4.6 
---
 crawl4ai/content_scraping_strategy.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 9fc3e4540..8575192d2 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -808,6 +808,30 @@ def _scrap(
                 except Exception:
                     pass
 
+            # --- Pass 2: handle raw .mermaid containers whose source was
+            # captured by the observer but mermaid.js never rendered them
+            # (e.g. MkDocs Material with Intersection-Observer lazy rendering).
+            # Only process elements that still contain raw text (no SVG child).
+            remaining_sources = list(mermaid_source_iter)  # sources not consumed above
+            if remaining_sources:
+                raw_containers = body.xpath(
+                    './/*[contains(concat(" ", normalize-space(@class), " "), " mermaid ")]'
+                    '[not(.//svg)]'
+                )
+                for container, source in zip(raw_containers, remaining_sources):
+                    try:
+                        parent = container.getparent()
+                        if parent is None:
+                            continue
+                        placeholder = lhtml.Element("pre")
+                        placeholder.set("class", "language-mermaid")
+                        code = etree.SubElement(placeholder, "code")
+                        code.set("class", "language-mermaid")
+                        code.text = source
+                        parent.replace(container, placeholder)
+                    except Exception:
+                        pass
+
             # Remove script and style tags
             for tag in ["style", "link", "meta", "noscript"]:
                 for element in body.xpath(f".//{tag}"):