unclecode · hafezparast · Apr 25, 2026 · Apr 26, 2026
diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
@@ -1491,6 +1491,8 @@ def __init__(
         # Network and Console Capturing Parameters
         capture_network_requests: bool = False,
         capture_console_messages: bool = False,
+        # Mermaid diagram source capture
+        extract_mermaid_source: bool = False,
         # Connection Parameters
         method: str = "GET",
         stream: bool = False,
@@ -1632,6 +1634,8 @@ def __init__(
         # Network and Console Capturing Parameters
         self.capture_network_requests = capture_network_requests
         self.capture_console_messages = capture_console_messages
+        # Mermaid diagram source capture
+        self.extract_mermaid_source = extract_mermaid_source
 
         # Connection Parameters
         self.stream = stream

diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
@@ -27,6 +27,44 @@
 import aiofiles
 import aiohttp
 import chardet
+
+# ---------------------------------------------------------------------------
+# Mermaid source capture — injected via page.add_init_script() before goto().
+# Runs at document creation time and records the text of mermaid containers in
+# window.__mermaidSources before mermaid.js replaces that text with an SVG.
+# ---------------------------------------------------------------------------
+_MERMAID_OBSERVER_SCRIPT = """
+(function () {
+  if (window.__mermaidSources) return;   // guard against double-injection
+  window.__mermaidSources = [];
+  const _captured = new WeakSet();       // elements whose text was already recorded
+
+  function _captureMermaid() {
+    document.querySelectorAll(
+      'pre.mermaid, div.mermaid, pre[class*="mermaid"], [class="mermaid"]'
+    ).forEach(function (el) {
+      // Skip containers that already hold an <svg>: their textContent is rendered
+      // output (labels etc.), not diagram source, and must not be recorded as such.
+      if (!_captured.has(el) && !el.querySelector('svg') && el.textContent.trim()) {
+        _captured.add(el);
+        window.__mermaidSources.push(el.textContent.trim());
+      }
+    });
+  }
+
+  // Capture anything already in the DOM
+  _captureMermaid();
+
+  // Re-scan on every child-list mutation so dynamically added containers are seen
+  var _observer = new MutationObserver(_captureMermaid);
+  _observer.observe(document.documentElement || document, { subtree: true, childList: true });
+
+  // Stop observing 3 s after DOMContentLoaded (mermaid.js is expected to have run by then)
+  document.addEventListener('DOMContentLoaded', function () {
+    setTimeout(function () { _observer.disconnect(); }, 3000);
+  }, { once: true });
+})();
+"""
 from aiohttp.client import ClientTimeout
 from urllib.parse import urlparse
 from types import MappingProxyType
@@ -720,6 +758,12 @@ async def handle_request_failed_capture(request):
                     ),
                 )
 
+            # Inject mermaid source capture observer before navigation
+            # Must run before mermaid.js replaces <pre class="mermaid"> with SVGs
+            mermaid_sources: List[str] = []
+            if getattr(config, "extract_mermaid_source", False):
+                await page.add_init_script(_MERMAID_OBSERVER_SCRIPT)
+
             # Handle page navigation and content loading
             if not config.js_only:
                 await self.execute_hook("before_goto", page, context=context, url=url, config=config)
@@ -1019,6 +1063,27 @@ async def handle_request_failed_capture(request):
                 await self.execute_hook("on_execution_started", page, context=context, config=config)
                 await self.execute_hook("on_execution_ended", page, context=context, config=config, result=execution_result)
 
+            # Retrieve mermaid sources captured by the init_script observer
+            if getattr(config, "extract_mermaid_source", False):
+                try:
+                    raw = await self.adapter.evaluate(
+                        page, "() => window.__mermaidSources || []"
+                    )
+                    mermaid_sources = [s for s in (raw or []) if isinstance(s, str) and s.strip()]
+                    if mermaid_sources:
+                        self.logger.info(
+                            message="Captured {count} mermaid diagram source(s)",
+                            tag="MERMAID",
+                            params={"count": len(mermaid_sources)},
+                        )
+                except Exception as e:
+                    self.logger.warning(
+                        message="Failed to retrieve mermaid sources: {error}",
+                        tag="MERMAID",
+                        params={"error": str(e)},
+                    )
+                    mermaid_sources = []
+
             # --- Phase 4: DOM processing before HTML capture ---
 
             # Update image dimensions if needed
@@ -1162,6 +1227,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
                 # Include captured data if enabled
                 network_requests=captured_requests if config.capture_network_requests else None,
                 console_messages=captured_console if config.capture_console_messages else None,
+                mermaid_sources=mermaid_sources if mermaid_sources else None,
             )
 
         except Exception as e:

diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
@@ -461,6 +461,7 @@ async def arun(
                                 screenshot_data = async_response.screenshot
                                 pdf_data = async_response.pdf_data
                                 js_execution_result = async_response.js_execution_result
+                                mermaid_sources = async_response.mermaid_sources
 
                                 self.logger.url_status(
                                     url=cache_context.display_url,
@@ -479,6 +480,7 @@ async def arun(
                                     is_raw_html=True if url.startswith("raw:") else False,
                                     redirected_url=async_response.redirected_url,
                                     original_scheme=urlparse(url).scheme,
+                                    mermaid_sources=mermaid_sources,
                                     **kwargs,
                                 )
 

diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
@@ -721,30 +721,117 @@ def _scrap(
             elif content_element is None:
                 content_element = body
 
-            # Replace mermaid SVGs with text before they get stripped
+            # Replace mermaid SVGs with their source (or extracted labels) before
+            # they get stripped by the HTML cleaner.
+            mermaid_sources: list = list(kwargs.get("mermaid_sources") or [])
+            mermaid_source_iter = iter(mermaid_sources)
+
             for svg in body.xpath('.//svg[starts-with(@id, "mermaid-")]'):
                 try:
+                    parent = svg.getparent()
+                    if parent is None:
+                        continue
+
+                    # --- Path A: actual mermaid source captured pre-render ---
+                    source = next(mermaid_source_iter, None)
+                    if source:
+                        placeholder = lhtml.Element("pre")
+                        placeholder.set("class", "language-mermaid")
+                        code = etree.SubElement(placeholder, "code")
+                        code.set("class", "language-mermaid")
+                        code.text = source
+
+                        # Find ancestor <pre> (mermaid.js wraps the SVG in the
+                        # original <pre class="mermaid"> element).  Replace the
+                        # whole ancestor so we don't create nested fences.
+                        pre_ancestor = None
+                        node = parent
+                        while node is not None:
+                            if node.tag == "pre":
+                                pre_ancestor = node
+                                break
+                            node = node.getparent()
+
+                        if pre_ancestor is not None:
+                            pre_parent = pre_ancestor.getparent()
+                            if pre_parent is not None:
+                                pre_parent.replace(pre_ancestor, placeholder)
+                        else:
+                            parent.replace(svg, placeholder)
+                        continue
+
+                    # --- Path B: fallback — extract visible labels from the SVG ---
                     diagram_type = svg.get("aria-roledescription", "diagram")
-                    # Extract text from node/edge labels
                     labels = []
-                    seen = set()
+                    seen: set = set()
+
+                    # foreignObject-based labels (flowchart, class, state…)
                     for el in svg.cssselect(".nodeLabel, .label span, .edgeLabel span"):
                         text = el.text_content().strip()
                         if text and text not in seen:
                             seen.add(text)
                             labels.append(text)
-                    if labels:
-                        # Build a pre block so it survives markdown conversion
+
+                    # Native SVG text/tspan fallback (sequence, gantt, git…)
+                    if not labels:
+                        for el in svg.xpath(
+                            './/*[local-name()="text"] | .//*[local-name()="tspan"]'
+                        ):
+                            text = (el.text or "").strip()
+                            if text and text not in seen:
+                                seen.add(text)
+                                labels.append(text)
+
+                    if not labels:
+                        continue
+
+                    # Detect outer <pre> to avoid nested fences
+                    ancestor = parent
+                    inside_pre = False
+                    while ancestor is not None:
+                        if ancestor.tag == "pre":
+                            inside_pre = True
+                            break
+                        ancestor = ancestor.getparent()
+
+                    if inside_pre:
+                        placeholder = lhtml.Element("span")
+                        placeholder.text = "\n".join(labels)
+                        parent.replace(svg, placeholder)
+                    else:
                         placeholder = lhtml.Element("pre")
+                        placeholder.set("class", "language-mermaid")
                         code = etree.SubElement(placeholder, "code")
                         code.set("class", "language-mermaid")
                         code.text = f"%% {diagram_type} diagram\n" + "\n".join(labels)
-                        parent = svg.getparent()
-                        if parent is not None:
-                            parent.replace(svg, placeholder)
+                        parent.replace(svg, placeholder)
                 except Exception:
                     pass
 
+            # --- Pass 2: .mermaid containers that mermaid.js never rendered (e.g.
+            # Intersection-Observer lazy rendering) but whose source was captured.
+            # Skip containers Pass 1 already rewrote: a div.mermaid keeps its class
+            # after its SVG is swapped for a placeholder and must not be replaced again.
+            remaining_sources = list(mermaid_source_iter)  # sources not consumed above
+            if remaining_sources:
+                raw_containers = body.xpath(
+                    './/*[contains(concat(" ", normalize-space(@class), " "), " mermaid ")]'
+                    '[not(.//svg) and not(.//pre[@class="language-mermaid"])]'
+                )
+                for container, source in zip(raw_containers, remaining_sources):
+                    try:
+                        parent = container.getparent()
+                        if parent is None:
+                            continue
+                        placeholder = lhtml.Element("pre")
+                        placeholder.set("class", "language-mermaid")
+                        code = etree.SubElement(placeholder, "code")
+                        code.set("class", "language-mermaid")
+                        code.text = source
+                        parent.replace(container, placeholder)
+                    except Exception:
+                        pass
+
             # Remove script and style tags
             for tag in ["style", "link", "meta", "noscript"]:
                 for element in body.xpath(f".//{tag}"):

diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py
@@ -1146,7 +1146,14 @@ def handle_tag(self, tag, attrs, start):
         # Handle pre tags
         if tag == "pre":
             if start:
-                self.o("\n```\n")  # Markdown code block start
+                # Language comes from class="language-X"; a valueless class attribute may be None
+                cls = attrs.get("class") or ""
+                lang = ""
+                for part in cls.split():
+                    if part.startswith("language-"):
+                        lang = part[len("language-"):]
+                        break
+                self.o(f"\n```{lang}\n")  # Markdown code block start
                 self.inside_pre = True
             else:
                 self.o("\n```\n")  # Markdown code block end

diff --git a/crawl4ai/models.py b/crawl4ai/models.py
@@ -343,6 +343,7 @@ class AsyncCrawlResponse(BaseModel):
     redirected_status_code: Optional[int] = None
     network_requests: Optional[List[Dict[str, Any]]] = None
     console_messages: Optional[List[Dict[str, Any]]] = None
+    mermaid_sources: Optional[List[str]] = None
 
     model_config = ConfigDict(arbitrary_types_allowed=True)