Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions crawl4ai/async_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1491,6 +1491,8 @@ def __init__(
# Network and Console Capturing Parameters
capture_network_requests: bool = False,
capture_console_messages: bool = False,
# Mermaid diagram source capture
extract_mermaid_source: bool = False,
# Connection Parameters
method: str = "GET",
stream: bool = False,
Expand Down Expand Up @@ -1632,6 +1634,8 @@ def __init__(
# Network and Console Capturing Parameters
self.capture_network_requests = capture_network_requests
self.capture_console_messages = capture_console_messages
# Mermaid diagram source capture
self.extract_mermaid_source = extract_mermaid_source

# Connection Parameters
self.stream = stream
Expand Down
66 changes: 66 additions & 0 deletions crawl4ai/async_crawler_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,44 @@
import aiofiles
import aiohttp
import chardet

# ---------------------------------------------------------------------------
# Mermaid source capture — injected via page.add_init_script() before goto()
# Runs at document creation time, before mermaid.js replaces <pre class="mermaid">
# with SVGs. Captured sources land in window.__mermaidSources.
# ---------------------------------------------------------------------------
_MERMAID_OBSERVER_SCRIPT = """
(function () {
if (window.__mermaidSources) return; // guard against double-injection
window.__mermaidSources = [];
const _captured = new WeakSet();

function _captureMermaid() {
document.querySelectorAll(
'pre.mermaid, div.mermaid, pre[class*="mermaid"], [class="mermaid"]'
).forEach(function (el) {
if (!_captured.has(el) && el.textContent.trim()) {
_captured.add(el);
window.__mermaidSources.push(el.textContent.trim());
}
});
}

// Capture anything already in the DOM
_captureMermaid();

// Watch for mermaid elements added dynamically (e.g. React/Next.js SSR hydration)
var _observer = new MutationObserver(_captureMermaid);
_observer.observe(document.documentElement || document, {
subtree: true, childList: true
});

// Stop observing once the page is interactive (mermaid.js will have run)
document.addEventListener('DOMContentLoaded', function () {
setTimeout(function () { _observer.disconnect(); }, 3000);
}, { once: true });
})();
"""
from aiohttp.client import ClientTimeout
from urllib.parse import urlparse
from types import MappingProxyType
Expand Down Expand Up @@ -720,6 +758,12 @@ async def handle_request_failed_capture(request):
),
)

# Inject mermaid source capture observer before navigation
# Must run before mermaid.js replaces <pre class="mermaid"> with SVGs
mermaid_sources: List[str] = []
if getattr(config, "extract_mermaid_source", False):
await page.add_init_script(_MERMAID_OBSERVER_SCRIPT)

# Handle page navigation and content loading
if not config.js_only:
await self.execute_hook("before_goto", page, context=context, url=url, config=config)
Expand Down Expand Up @@ -1019,6 +1063,27 @@ async def handle_request_failed_capture(request):
await self.execute_hook("on_execution_started", page, context=context, config=config)
await self.execute_hook("on_execution_ended", page, context=context, config=config, result=execution_result)

# Retrieve mermaid sources captured by the init_script observer
if getattr(config, "extract_mermaid_source", False):
try:
raw = await self.adapter.evaluate(
page, "() => window.__mermaidSources || []"
)
mermaid_sources = [s for s in (raw or []) if isinstance(s, str) and s.strip()]
if mermaid_sources:
self.logger.info(
message="Captured {count} mermaid diagram source(s)",
tag="MERMAID",
params={"count": len(mermaid_sources)},
)
except Exception as e:
self.logger.warning(
message="Failed to retrieve mermaid sources: {error}",
tag="MERMAID",
params={"error": str(e)},
)
mermaid_sources = []

# --- Phase 4: DOM processing before HTML capture ---

# Update image dimensions if needed
Expand Down Expand Up @@ -1162,6 +1227,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
# Include captured data if enabled
network_requests=captured_requests if config.capture_network_requests else None,
console_messages=captured_console if config.capture_console_messages else None,
mermaid_sources=mermaid_sources if mermaid_sources else None,
)

except Exception as e:
Expand Down
2 changes: 2 additions & 0 deletions crawl4ai/async_webcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ async def arun(
screenshot_data = async_response.screenshot
pdf_data = async_response.pdf_data
js_execution_result = async_response.js_execution_result
mermaid_sources = async_response.mermaid_sources

self.logger.url_status(
url=cache_context.display_url,
Expand All @@ -479,6 +480,7 @@ async def arun(
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
original_scheme=urlparse(url).scheme,
mermaid_sources=mermaid_sources,
**kwargs,
)

Expand Down
103 changes: 95 additions & 8 deletions crawl4ai/content_scraping_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,30 +721,117 @@ def _scrap(
elif content_element is None:
content_element = body

# Replace mermaid SVGs with text before they get stripped
# Replace mermaid SVGs with their source (or extracted labels) before
# they get stripped by the HTML cleaner.
mermaid_sources: list = list(kwargs.get("mermaid_sources") or [])
mermaid_source_iter = iter(mermaid_sources)

for svg in body.xpath('.//svg[starts-with(@id, "mermaid-")]'):
try:
parent = svg.getparent()
if parent is None:
continue

# --- Path A: actual mermaid source captured pre-render ---
source = next(mermaid_source_iter, None)
if source:
placeholder = lhtml.Element("pre")
placeholder.set("class", "language-mermaid")
code = etree.SubElement(placeholder, "code")
code.set("class", "language-mermaid")
code.text = source

# Find ancestor <pre> (mermaid.js wraps the SVG in the
# original <pre class="mermaid"> element). Replace the
# whole ancestor so we don't create nested fences.
pre_ancestor = None
node = parent
while node is not None:
if node.tag == "pre":
pre_ancestor = node
break
node = node.getparent()

if pre_ancestor is not None:
pre_parent = pre_ancestor.getparent()
if pre_parent is not None:
pre_parent.replace(pre_ancestor, placeholder)
else:
parent.replace(svg, placeholder)
continue

# --- Path B: fallback — extract visible labels from the SVG ---
diagram_type = svg.get("aria-roledescription", "diagram")
# Extract text from node/edge labels
labels = []
seen = set()
seen: set = set()

# foreignObject-based labels (flowchart, class, state…)
for el in svg.cssselect(".nodeLabel, .label span, .edgeLabel span"):
text = el.text_content().strip()
if text and text not in seen:
seen.add(text)
labels.append(text)
if labels:
# Build a pre block so it survives markdown conversion

# Native SVG text/tspan fallback (sequence, gantt, git…)
if not labels:
for el in svg.xpath(
'.//*[local-name()="text"] | .//*[local-name()="tspan"]'
):
text = (el.text or "").strip()
if text and text not in seen:
seen.add(text)
labels.append(text)

if not labels:
continue

# Detect outer <pre> to avoid nested fences
ancestor = parent
inside_pre = False
while ancestor is not None:
if ancestor.tag == "pre":
inside_pre = True
break
ancestor = ancestor.getparent()

if inside_pre:
placeholder = lhtml.Element("span")
placeholder.text = "\n".join(labels)
parent.replace(svg, placeholder)
else:
placeholder = lhtml.Element("pre")
placeholder.set("class", "language-mermaid")
code = etree.SubElement(placeholder, "code")
code.set("class", "language-mermaid")
code.text = f"%% {diagram_type} diagram\n" + "\n".join(labels)
parent = svg.getparent()
if parent is not None:
parent.replace(svg, placeholder)
parent.replace(svg, placeholder)
except Exception:
pass

# --- Pass 2: handle raw .mermaid containers whose source was
# captured by the observer but mermaid.js never rendered them
# (e.g. MkDocs Material with Intersection-Observer lazy rendering).
# Only process elements that still contain raw text (no SVG child).
remaining_sources = list(mermaid_source_iter) # sources not consumed above
if remaining_sources:
raw_containers = body.xpath(
'.//*[contains(concat(" ", normalize-space(@class), " "), " mermaid ")]'
'[not(.//svg)]'
)
for container, source in zip(raw_containers, remaining_sources):
try:
parent = container.getparent()
if parent is None:
continue
placeholder = lhtml.Element("pre")
placeholder.set("class", "language-mermaid")
code = etree.SubElement(placeholder, "code")
code.set("class", "language-mermaid")
code.text = source
parent.replace(container, placeholder)
except Exception:
pass

# Remove script and style tags
for tag in ["style", "link", "meta", "noscript"]:
for element in body.xpath(f".//{tag}"):
Expand Down
9 changes: 8 additions & 1 deletion crawl4ai/html2text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,14 @@ def handle_tag(self, tag, attrs, start):
# Handle pre tags
if tag == "pre":
if start:
self.o("\n```\n") # Markdown code block start
# Detect language from class="language-X" on the <pre> element
cls = attrs.get("class", "")
lang = ""
for part in cls.split():
if part.startswith("language-"):
lang = part[len("language-"):]
break
self.o(f"\n```{lang}\n") # Markdown code block start
self.inside_pre = True
else:
self.o("\n```\n") # Markdown code block end
Expand Down
1 change: 1 addition & 0 deletions crawl4ai/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ class AsyncCrawlResponse(BaseModel):
redirected_status_code: Optional[int] = None
network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None
mermaid_sources: Optional[List[str]] = None

model_config = ConfigDict(arbitrary_types_allowed=True)

Expand Down