Patch summary: when an inline tag listed in ONLY_TEXT_ELIGIBLE_TAGS is cleaned
up, process_element now reads the tag's text once, replaces (only_text) or
unwraps the tag, and returns immediately whether that text is non-blank.
The inline tag list in utils.py is replaced by the ONLY_TEXT_ELIGIBLE_TAGS
name imported from .config, and a regression test for the sup case is added.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Context-line indentation, blank context lines
and the HTML fixture string in the new test are reconstructed; confirm them
against the upstream commit before applying.

diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 4b3d96906..141d0d0d4 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -12,7 +12,14 @@
 from array import array
 from .html2text import html2text, CustomHTML2Text
 # from .config import *
-from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
+from .config import (
+    MIN_WORD_THRESHOLD,
+    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+    IMAGE_SCORE_THRESHOLD,
+    DEFAULT_PROVIDER,
+    PROVIDER_MODELS,
+    ONLY_TEXT_ELIGIBLE_TAGS,
+)
 import httpx
 from socket import gaierror
 from pathlib import Path
@@ -1382,33 +1389,13 @@ def process_element(element: element.PageElement) -> bool:
                 return True  # Always keep video and audio elements
 
             if element.name != "pre":
-                if element.name in [
-                    "b",
-                    "i",
-                    "u",
-                    "span",
-                    "del",
-                    "ins",
-                    "sub",
-                    "sup",
-                    "strong",
-                    "em",
-                    "code",
-                    "kbd",
-                    "var",
-                    "s",
-                    "q",
-                    "abbr",
-                    "cite",
-                    "dfn",
-                    "time",
-                    "small",
-                    "mark",
-                ]:
+                if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
+                    replacement_text = element.get_text()
                     if kwargs.get("only_text", False):
-                        element.replace_with(element.get_text())
+                        element.replace_with(replacement_text)
                     else:
                         element.unwrap()
+                    return bool(replacement_text.strip())
                 elif element.name != "img":
                     element.attrs = {}
 
diff --git a/tests/regression/test_reg_utils.py b/tests/regression/test_reg_utils.py
index dfc63c42d..d090e6342 100644
--- a/tests/regression/test_reg_utils.py
+++ b/tests/regression/test_reg_utils.py
@@ -10,6 +10,7 @@
 from crawl4ai.utils import (
     extract_xml_data,
     extract_xml_data_legacy,
+    get_content_of_website_optimized,
     normalize_url,
     normalize_url_for_deep_crawl,
     efficient_normalize_url_for_deep_crawl,
@@ -113,6 +114,22 @@ def test_missing_tag(self):
         assert result["missing"] == ""
 
 
+class TestContentExtraction:
+    """Verify utility HTML cleanup preserves neighboring text."""
+
+    def test_only_text_sup_preserves_following_text(self):
+        """Fix for sup cleanup removing text after the closing tag."""
+        result = get_content_of_website_optimized(
+            "https://example.com",
+            "<html><body><p>Alpha<sup>1</sup>Beta</p></body></html>",
+            only_text=True,
+        )
+
+        assert result is not None
+        assert "Alpha1Beta" in result["cleaned_html"]
+        assert "Beta" in result["markdown"]
+
+
 # ===================================================================
 # URL normalization
 # ===================================================================