From 2445c39f24790bb2022f7afb5d0d354a458141f5 Mon Sep 17 00:00:00 2001
From: Kyle Duren <kyle.duren@yahooinc.com>
Date: Mon, 18 May 2026 16:39:43 -0400
Subject: [PATCH 1/3] esi: replace _findOpeningTag with memchr

Replace the hand-rolled two-state-machine loop with memchr + memcmp.
memchr delegates scanning to the platform's optimized implementation
(e.g. __memchr_avx2 on glibc x86-64) to skip non-'<' bytes, then
memcmp verifies each candidate anchor.

This eliminates the KMP-failure limitation noted in the original
comment: the old state machine was only correct because neither opening
tag repeats its own prefix (as '<e<esi' would); the new code has no such restriction.

Adds four parser_test sections covering boundary cases specific to
the new implementation: exact-length prefix at chunk end for both
<esi: and <!--esi, <!--esi without required trailing whitespace, and
multiple false '<' anchors before a valid tag.
---
 plugins/esi/lib/EsiParser.cc    | 94 ++++++++++++++-------------------
 plugins/esi/test/parser_test.cc | 72 +++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 55 deletions(-)
diff --git a/plugins/esi/lib/EsiParser.cc b/plugins/esi/lib/EsiParser.cc
index 44d5a0bb9de..d28f5c9db65 100644
--- a/plugins/esi/lib/EsiParser.cc
+++ b/plugins/esi/lib/EsiParser.cc
@@ -27,6 +27,7 @@
 #include <ts/ts.h>
 
 #include <cctype>
+#include <cstring>
 
 using std::string;
 using namespace EsiLib;
@@ -182,68 +183,51 @@ EsiParser::_compareData(const string &data, size_t pos, const char *str, int str
   return PARTIAL_MATCH;
 }
 
-/** This implementation is optimized but not completely correct.  If
- * the opening tag were to have a repeating opening sequence ('<e<esi'
- * or something like that), this will break. However that is not the
- * case for the two opening tags we are looking for */
+/** Finds the first opening tag ('<esi:', or '<!--esi' followed by whitespace)
+ * at or after start_pos.  memchr skips to each '<' and memcmp verifies it, so
+ * every candidate is checked independently of any earlier false start.
+ * Returns PARTIAL_MATCH when the data ends inside a possible tag. */
 EsiParser::MATCH_TYPE
 EsiParser::_findOpeningTag(const string &data, size_t start_pos, size_t &opening_tag_pos, bool &is_html_comment_node) const
 {
-  size_t i_data = start_pos;
-  int    i_esi = 0, i_html_comment = 0;
-
-  while (i_data < data.size()) {
-    if (data[i_data] == ESI_TAG_PREFIX[i_esi]) {
-      if (++i_esi == ESI_TAG_PREFIX_LEN) {
-        is_html_comment_node = false;
-        opening_tag_pos      = i_data - i_esi + 1;
+  const char *const buf     = data.data();
+  const size_t      total   = data.size();
+  const size_t      esi_len = ESI_TAG_PREFIX_LEN;
+  const size_t      hlen    = HTML_COMMENT_NODE_INFO.tag_suffix_len;
+  size_t            i       = start_pos;
+
+  while (i < total) {
+    const char *p = static_cast<const char *>(memchr(buf + i, '<', total - i));
+    if (!p)
+      return NO_MATCH;
+    const size_t pos   = static_cast<size_t>(p - buf);
+    const size_t avail = total - pos;
+
+    if (avail >= esi_len && memcmp(p, ESI_TAG_PREFIX, esi_len) == 0) {
+      is_html_comment_node = false;
+      opening_tag_pos      = pos;
+      return COMPLETE_MATCH;
+    }
+    // hlen+1 bytes needed: hlen for the tag, 1 for the required trailing whitespace
+    if (avail > hlen && memcmp(p, HTML_COMMENT_NODE_INFO.tag_suffix, hlen) == 0) {
+      const char ch = p[hlen];
+      if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
+        is_html_comment_node = true;
+        opening_tag_pos      = pos;
         return COMPLETE_MATCH;
       }
-    } else {
-      if (i_esi) {
-        i_esi = 0;
-        --i_data; // we do this to reexamine the current char as target string might start from here
-        if (i_html_comment) {
-          --i_html_comment; // in case other target string has started matching, adjust it's index
-        }
-      }
     }
-    // doing the exact same thing for the other target string
-    if (i_html_comment < HTML_COMMENT_NODE_INFO.tag_suffix_len &&
-        data[i_data] == HTML_COMMENT_NODE_INFO.tag_suffix[i_html_comment]) {
-      if (++i_html_comment == HTML_COMMENT_NODE_INFO.tag_suffix_len && i_data + 1 < data.size()) {
-        char ch = data[i_data + 1]; //<!--esi must follow by a space char
-        if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
-          is_html_comment_node = true;
-          opening_tag_pos      = i_data - i_html_comment + 1;
-          return COMPLETE_MATCH;
-        }
-      }
-    } else {
-      if (i_html_comment) {
-        i_html_comment = 0;
-        --i_data; // same comments from above applies
-        if (i_esi) {
-          --i_esi;
-        }
-      }
+    if (avail < esi_len && memcmp(p, ESI_TAG_PREFIX, avail) == 0) {
+      is_html_comment_node = false;
+      opening_tag_pos      = pos;
+      return PARTIAL_MATCH;
     }
-    ++i_data;
-  }
-  // partial matches; with the nature of our current opening tags, the
-  // only way we can have a partial match for both target strings is
-  // if the last char of the input string is '<' and that is not
-  // enough information to differentiate the tags; Anyway, the parser
-  // takes no action for a partial match
-  if (i_esi) {
-    is_html_comment_node = false;
-    opening_tag_pos      = i_data - i_esi;
-    return PARTIAL_MATCH;
-  }
-  if (i_html_comment) {
-    is_html_comment_node = true;
-    opening_tag_pos      = i_data - i_html_comment;
-    return PARTIAL_MATCH;
+    if (avail <= hlen && memcmp(p, HTML_COMMENT_NODE_INFO.tag_suffix, avail) == 0) {
+      is_html_comment_node = true;
+      opening_tag_pos      = pos;
+      return PARTIAL_MATCH;
+    }
+    i = pos + 1;
   }
   return NO_MATCH;
 }
diff --git a/plugins/esi/test/parser_test.cc b/plugins/esi/test/parser_test.cc
index 36aff86cf21..535b5eab3fb 100644
--- a/plugins/esi/test/parser_test.cc
+++ b/plugins/esi/test/parser_test.cc
@@ -986,6 +986,78 @@ TEST_CASE("esi parser test")
     REQUIRE(node_list.size() == 6);
   }
 
+  SECTION("chunk boundary: <esi: prefix exact at chunk end")
+  {
+    // avail == ESI_TAG_PREFIX_LEN (5) at end of first chunk — exercises the
+    // avail >= ESI_TAG_PREFIX_LEN complete-match branch with nothing left in the buffer.
+    DocNodeList node_list;
+    REQUIRE(parser.parseChunk("pre<esi:", node_list) == true);
+    REQUIRE(parser.parseChunk("include src=url/>", node_list) == true);
+    REQUIRE(parser.completeParse(node_list) == true);
+    REQUIRE(node_list.size() == 2);
+    auto it = node_list.begin();
+    REQUIRE(it->type == DocNode::TYPE_PRE);
+    REQUIRE(it->data_len == 3);
+    REQUIRE(strncmp(it->data, "pre", 3) == 0);
+    ++it;
+    REQUIRE(it->type == DocNode::TYPE_INCLUDE);
+    REQUIRE(it->attr_list.size() == 1);
+    check_node_attr(it->attr_list.front(), "src", "url");
+  }
+
+  SECTION("chunk boundary: <!--esi prefix exact at chunk end")
+  {
+    // avail == HTML_COMMENT_NODE_INFO.tag_suffix_len (7) at end of first chunk — exercises the
+    // avail <= tag_suffix_len partial-match branch; the trailing whitespace arrives in the next chunk.
+    DocNodeList node_list;
+    REQUIRE(parser.parseChunk("pre<!--esi", node_list) == true);
+    REQUIRE(parser.parseChunk(" <esi:comment text=blah/>-->", node_list) == true);
+    REQUIRE(parser.completeParse(node_list) == true);
+    REQUIRE(node_list.size() == 2);
+    auto it = node_list.begin();
+    REQUIRE(it->type == DocNode::TYPE_PRE);
+    REQUIRE(it->data_len == 3);
+    REQUIRE(strncmp(it->data, "pre", 3) == 0);
+    ++it;
+    REQUIRE(it->type == DocNode::TYPE_HTML_COMMENT);
+    REQUIRE(it->data_len == static_cast<int>(strlen("<esi:comment text=blah/>")));
+    REQUIRE(strncmp(it->data, "<esi:comment text=blah/>", it->data_len) == 0);
+  }
+
+  SECTION("<!--esi without trailing whitespace is not a tag")
+  {
+    // <!--esi_ has no whitespace after the prefix — must be treated as PRE
+    // and scanning must continue to find the real <esi: tag that follows.
+    DocNodeList node_list;
+    REQUIRE(parser.parse(node_list, "<!--esi_nospace<esi:include src=url/>") == true);
+    REQUIRE(node_list.size() == 2);
+    auto it = node_list.begin();
+    REQUIRE(it->type == DocNode::TYPE_PRE);
+    REQUIRE(it->data_len == static_cast<int>(strlen("<!--esi_nospace")));
+    REQUIRE(strncmp(it->data, "<!--esi_nospace", it->data_len) == 0);
+    ++it;
+    REQUIRE(it->type == DocNode::TYPE_INCLUDE);
+    REQUIRE(it->attr_list.size() == 1);
+    check_node_attr(it->attr_list.front(), "src", "url");
+  }
+
+  SECTION("multiple false '<' anchors before valid tag")
+  {
+    // Exercises the memchr loop iterating past several non-tag '<' bytes
+    // before landing on a real <esi: tag.  All skipped content becomes PRE.
+    DocNodeList node_list;
+    REQUIRE(parser.parse(node_list, "< <a> <!--esi_bad <esi:include src=url/>") == true);
+    REQUIRE(node_list.size() == 2);
+    auto it = node_list.begin();
+    REQUIRE(it->type == DocNode::TYPE_PRE);
+    REQUIRE(it->data_len == static_cast<int>(strlen("< <a> <!--esi_bad ")));
+    REQUIRE(strncmp(it->data, "< <a> <!--esi_bad ", it->data_len) == 0);
+    ++it;
+    REQUIRE(it->type == DocNode::TYPE_INCLUDE);
+    REQUIRE(it->attr_list.size() == 1);
+    check_node_attr(it->attr_list.front(), "src", "url");
+  }
+
   SECTION("No handler attr")
   {
     string input_data = "<esi:special-include />";

From fdf09bf3b709e38dbc2470771f28129c30409567 Mon Sep 17 00:00:00 2001
From: Kyle Duren <pixitha@users.noreply.github.com>
Date: Mon, 18 May 2026 18:27:01 -0400
Subject: [PATCH 2/3] fixing style issue

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 plugins/esi/lib/EsiParser.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/plugins/esi/lib/EsiParser.cc b/plugins/esi/lib/EsiParser.cc
index d28f5c9db65..6a7ce4ee2e0 100644
--- a/plugins/esi/lib/EsiParser.cc
+++ b/plugins/esi/lib/EsiParser.cc
@@ -198,8 +198,9 @@ EsiParser::_findOpeningTag(const string &data, size_t start_pos, size_t &opening
 
   while (i < total) {
     const char *p = static_cast<const char *>(memchr(buf + i, '<', total - i));
-    if (!p)
+    if (!p) {
       return NO_MATCH;
+    }
     const size_t pos   = static_cast<size_t>(p - buf);
     const size_t avail = total - pos;
 

From 6f60f189e3d89bf14ecfcf8cb712cc01d5419b71 Mon Sep 17 00:00:00 2001
From: Kyle Duren <kyle.duren@yahooinc.com>
Date: Tue, 19 May 2026 17:30:15 -0400
Subject: [PATCH 3/3] esi: inline ESI_TAG_PREFIX_LEN and
 HTML_COMMENT_NODE_INFO.tag_suffix_len

Remove the hlen and esi_len local aliases so the length argument to each
memcmp call is read from the same named constant as the string argument,
consistent with how ESI_TAG_PREFIX and HTML_COMMENT_NODE_INFO.tag_suffix
were already written in full.

Addresses review feedback from zwoop on PR #13173.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 plugins/esi/lib/EsiParser.cc | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/plugins/esi/lib/EsiParser.cc b/plugins/esi/lib/EsiParser.cc
index 6a7ce4ee2e0..c376a2d93b9 100644
--- a/plugins/esi/lib/EsiParser.cc
+++ b/plugins/esi/lib/EsiParser.cc
@@ -190,11 +190,9 @@ EsiParser::_compareData(const string &data, size_t pos, const char *str, int str
 EsiParser::MATCH_TYPE
 EsiParser::_findOpeningTag(const string &data, size_t start_pos, size_t &opening_tag_pos, bool &is_html_comment_node) const
 {
-  const char *const buf     = data.data();
-  const size_t      total   = data.size();
-  const size_t      esi_len = ESI_TAG_PREFIX_LEN;
-  const size_t      hlen    = HTML_COMMENT_NODE_INFO.tag_suffix_len;
-  size_t            i       = start_pos;
+  const char *const buf   = data.data();
+  const size_t      total = data.size();
+  size_t            i     = start_pos;
 
   while (i < total) {
     const char *p = static_cast<const char *>(memchr(buf + i, '<', total - i));
@@ -204,26 +202,28 @@ EsiParser::_findOpeningTag(const string &data, size_t start_pos, size_t &opening
     const size_t pos   = static_cast<size_t>(p - buf);
     const size_t avail = total - pos;
 
-    if (avail >= esi_len && memcmp(p, ESI_TAG_PREFIX, esi_len) == 0) {
+    if (avail >= static_cast<size_t>(ESI_TAG_PREFIX_LEN) && memcmp(p, ESI_TAG_PREFIX, ESI_TAG_PREFIX_LEN) == 0) {
       is_html_comment_node = false;
       opening_tag_pos      = pos;
       return COMPLETE_MATCH;
     }
-    // hlen+1 bytes needed: hlen for the tag, 1 for the required trailing whitespace
-    if (avail > hlen && memcmp(p, HTML_COMMENT_NODE_INFO.tag_suffix, hlen) == 0) {
-      const char ch = p[hlen];
+    // tag_suffix_len+1 bytes needed: tag_suffix_len for the tag, 1 for the required trailing whitespace
+    if (avail > static_cast<size_t>(HTML_COMMENT_NODE_INFO.tag_suffix_len) &&
+        memcmp(p, HTML_COMMENT_NODE_INFO.tag_suffix, HTML_COMMENT_NODE_INFO.tag_suffix_len) == 0) {
+      const char ch = p[HTML_COMMENT_NODE_INFO.tag_suffix_len];
       if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
         is_html_comment_node = true;
         opening_tag_pos      = pos;
         return COMPLETE_MATCH;
       }
     }
-    if (avail < esi_len && memcmp(p, ESI_TAG_PREFIX, avail) == 0) {
+    if (avail < static_cast<size_t>(ESI_TAG_PREFIX_LEN) && memcmp(p, ESI_TAG_PREFIX, avail) == 0) {
       is_html_comment_node = false;
       opening_tag_pos      = pos;
       return PARTIAL_MATCH;
     }
-    if (avail <= hlen && memcmp(p, HTML_COMMENT_NODE_INFO.tag_suffix, avail) == 0) {
+    if (avail <= static_cast<size_t>(HTML_COMMENT_NODE_INFO.tag_suffix_len) &&
+        memcmp(p, HTML_COMMENT_NODE_INFO.tag_suffix, avail) == 0) {
       is_html_comment_node = true;
       opening_tag_pos      = pos;
       return PARTIAL_MATCH;