Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 12 additions & 25 deletions crawl4ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,14 @@
from array import array
from .html2text import html2text, CustomHTML2Text
# from .config import *
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
from .config import (
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
IMAGE_SCORE_THRESHOLD,
DEFAULT_PROVIDER,
PROVIDER_MODELS,
ONLY_TEXT_ELIGIBLE_TAGS,
)
import httpx
from socket import gaierror
from pathlib import Path
Expand Down Expand Up @@ -1382,33 +1389,13 @@ def process_element(element: element.PageElement) -> bool:
return True # Always keep video and audio elements

if element.name != "pre":
if element.name in [
"b",
"i",
"u",
"span",
"del",
"ins",
"sub",
"sup",
"strong",
"em",
"code",
"kbd",
"var",
"s",
"q",
"abbr",
"cite",
"dfn",
"time",
"small",
"mark",
]:
if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
replacement_text = element.get_text()
if kwargs.get("only_text", False):
element.replace_with(element.get_text())
element.replace_with(replacement_text)
else:
element.unwrap()
return bool(replacement_text.strip())
elif element.name != "img":
element.attrs = {}

Expand Down
17 changes: 17 additions & 0 deletions tests/regression/test_reg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from crawl4ai.utils import (
extract_xml_data,
extract_xml_data_legacy,
get_content_of_website_optimized,
normalize_url,
normalize_url_for_deep_crawl,
efficient_normalize_url_for_deep_crawl,
Expand Down Expand Up @@ -113,6 +114,22 @@ def test_missing_tag(self):
assert result["missing"] == ""


class TestContentExtraction:
    """Verify utility HTML cleanup preserves neighboring text."""

    def test_only_text_sup_preserves_following_text(self):
        """Fix for sup cleanup removing text after the closing tag."""
        # Inline <sup> content must be merged with the surrounding text,
        # not dropped along with the trailing sibling text node.
        page_url = "https://example.com"
        raw_html = "<html><body><p>Alpha<sup>1</sup>Beta</p></body></html>"

        extracted = get_content_of_website_optimized(
            page_url,
            raw_html,
            only_text=True,
        )

        assert extracted is not None
        assert "Alpha1Beta" in extracted["cleaned_html"]
        assert "Beta" in extracted["markdown"]


# ===================================================================
# URL normalization
# ===================================================================
Expand Down