From 2445c39f24790bb2022f7afb5d0d354a458141f5 Mon Sep 17 00:00:00 2001 From: Kyle Duren Date: Mon, 18 May 2026 16:39:43 -0400 Subject: [PATCH 1/3] esi: replace _findOpeningTag with memchr Replace the hand-rolled two-state-machine loop with memchr + memcmp. memchr delegates scanning to the platform's optimized implementation (e.g. __memchr_avx2 on glibc x86-64) to skip non-'<' bytes, then memcmp verifies each candidate anchor. This eliminates the KMP-failure limitation noted in the original comment: the old implementation could mishandle opening sequences like ' #include +#include using std::string; using namespace EsiLib; @@ -182,68 +183,51 @@ EsiParser::_compareData(const string &data, size_t pos, const char *str, int str return PARTIAL_MATCH; } -/** This implementation is optimized but not completely correct. If - * the opening tag were to have a repeating opening sequence ('(memchr(buf + i, '<', total - i)); + if (!p) + return NO_MATCH; + const size_t pos = static_cast(p - buf); + const size_t avail = total - pos; + + if (avail >= esi_len && memcmp(p, ESI_TAG_PREFIX, esi_len) == 0) { + is_html_comment_node = false; + opening_tag_pos = pos; + return COMPLETE_MATCH; + } + // hlen+1 bytes needed: hlen for the tag, 1 for the required trailing whitespace + if (avail > hlen && memcmp(p, HTML_COMMENT_NODE_INFO.tag_suffix, hlen) == 0) { + const char ch = p[hlen]; + if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') { + is_html_comment_node = true; + opening_tag_pos = pos; return COMPLETE_MATCH; } - } else { - if (i_esi) { - i_esi = 0; - --i_data; // we do this to reexamine the current char as target string might start from here - if (i_html_comment) { - --i_html_comment; // in case other target string has started matching, adjust it's index - } - } } - // doing the exact same thing for the other target string - if (i_html_comment < HTML_COMMENT_NODE_INFO.tag_suffix_len && - data[i_data] == HTML_COMMENT_NODE_INFO.tag_suffix[i_html_comment]) { - if (++i_html_comment == HTML_COMMENT_NODE_INFO.tag_suffix_len && i_data + 1 < data.size()) { - char ch = data[i_data + 1]; //", node_list) == true); + REQUIRE(parser.completeParse(node_list) == true); + REQUIRE(node_list.size() == 2); + auto it = node_list.begin(); + REQUIRE(it->type == DocNode::TYPE_PRE); + REQUIRE(it->data_len == 3); + REQUIRE(strncmp(it->data, "pre", 3) == 0); + ++it; + REQUIRE(it->type == DocNode::TYPE_HTML_COMMENT); + REQUIRE(it->data_len == static_cast(strlen(""))); + REQUIRE(strncmp(it->data, "", it->data_len) == 0); + } + + SECTION("