Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 12 additions & 25 deletions crawl4ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,14 @@
from array import array
from .html2text import html2text, CustomHTML2Text
# from .config import *
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
from .config import (
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
IMAGE_SCORE_THRESHOLD,
DEFAULT_PROVIDER,
PROVIDER_MODELS,
ONLY_TEXT_ELIGIBLE_TAGS,
)
import httpx
from socket import gaierror
from pathlib import Path
Expand Down Expand Up @@ -1382,33 +1389,13 @@ def process_element(element: element.PageElement) -> bool:
return True # Always keep video and audio elements

if element.name != "pre":
if element.name in [
"b",
"i",
"u",
"span",
"del",
"ins",
"sub",
"sup",
"strong",
"em",
"code",
"kbd",
"var",
"s",
"q",
"abbr",
"cite",
"dfn",
"time",
"small",
"mark",
]:
if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
replacement_text = element.get_text()
if kwargs.get("only_text", False):
element.replace_with(element.get_text())
element.replace_with(replacement_text)
else:
element.unwrap()
return bool(replacement_text.strip())
elif element.name != "img":
element.attrs = {}

Expand Down
17 changes: 17 additions & 0 deletions tests/regression/test_reg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from crawl4ai.utils import (
extract_xml_data,
extract_xml_data_legacy,
get_content_of_website_optimized,
normalize_url,
normalize_url_for_deep_crawl,
efficient_normalize_url_for_deep_crawl,
Expand Down Expand Up @@ -113,6 +114,22 @@ def test_missing_tag(self):
assert result["missing"] == ""


class TestContentExtraction:
    """Verify utility HTML cleanup preserves neighboring text."""

    def test_only_text_sup_preserves_following_text(self):
        """Fix for sup cleanup removing text after the closing tag."""
        # Inline <sup> content must be merged with the surrounding text,
        # not dropped along with the trailing sibling text node.
        page_url = "https://example.com"
        raw_html = "<html><body><p>Alpha<sup>1</sup>Beta</p></body></html>"

        extracted = get_content_of_website_optimized(
            page_url,
            raw_html,
            only_text=True,
        )

        assert extracted is not None
        assert "Alpha1Beta" in extracted["cleaned_html"]
        assert "Beta" in extracted["markdown"]


# ===================================================================
# URL normalization
# ===================================================================
Expand Down