Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 40 additions & 55 deletions plugins/esi/lib/EsiParser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <ts/ts.h>

#include <cctype>
#include <cstring>

using std::string;
using namespace EsiLib;
Expand Down Expand Up @@ -182,68 +183,52 @@ EsiParser::_compareData(const string &data, size_t pos, const char *str, int str
return PARTIAL_MATCH;
}

/** Finds the first opening tag in @a data at or after @a start_pos.
 *
 * Two openers are recognized: ESI_TAG_PREFIX ('<esi:') and the HTML comment
 * opener HTML_COMMENT_NODE_INFO.tag_suffix ('<!--esi'). The comment opener
 * only counts when it is immediately followed by a space, tab, CR or LF.
 *
 * The scan relies on both openers starting with '<': memchr jumps from one
 * '<' to the next and each candidate is verified on its own with memcmp. A
 * rejected candidate advances the scan by a single byte, so input with a
 * repeated opening sequence such as '<e<esi:' is still matched correctly.
 *
 * @param data                  buffer to scan
 * @param start_pos             offset in @a data at which scanning begins
 * @param opening_tag_pos       set to the offset of the candidate '<' on a
 *                              complete or partial match
 * @param is_html_comment_node  set to true when the match is the HTML comment
 *                              opener, false when it is the ESI tag prefix;
 *                              left untouched when nothing matches
 * @return COMPLETE_MATCH when a whole opener is present, PARTIAL_MATCH when
 *         @a data ends part-way through a possible opener, NO_MATCH otherwise
 */
EsiParser::MATCH_TYPE
EsiParser::_findOpeningTag(const string &data, size_t start_pos, size_t &opening_tag_pos, bool &is_html_comment_node) const
{
  const char *const buf         = data.data();
  const size_t      total       = data.size();
  const size_t      esi_len     = static_cast<size_t>(ESI_TAG_PREFIX_LEN);
  const size_t      comment_len = static_cast<size_t>(HTML_COMMENT_NODE_INFO.tag_suffix_len);

  size_t i = start_pos;
  while (i < total) {
    const char *const p = static_cast<const char *>(std::memchr(buf + i, '<', total - i));
    if (p == nullptr) {
      return NO_MATCH;
    }
    const size_t pos   = static_cast<size_t>(p - buf);
    const size_t avail = total - pos; // bytes from the candidate '<' to the end of data

    // Whole ESI prefix present at this '<'.
    if (avail >= esi_len && std::memcmp(p, ESI_TAG_PREFIX, esi_len) == 0) {
      is_html_comment_node = false;
      opening_tag_pos      = pos;
      return COMPLETE_MATCH;
    }
    // comment_len + 1 bytes needed: comment_len for the opener itself plus one
    // for the whitespace character that must follow it.
    if (avail > comment_len && std::memcmp(p, HTML_COMMENT_NODE_INFO.tag_suffix, comment_len) == 0) {
      const char ch = p[comment_len];
      if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
        is_html_comment_node = true;
        opening_tag_pos      = pos;
        return COMPLETE_MATCH;
      }
    }
    // The data ends inside what could still become an ESI prefix.
    if (avail < esi_len && std::memcmp(p, ESI_TAG_PREFIX, avail) == 0) {
      is_html_comment_node = false;
      opening_tag_pos      = pos;
      return PARTIAL_MATCH;
    }
    // The data ends inside (or exactly at the end of) the comment opener; the
    // required trailing whitespace has not been seen yet.
    if (avail <= comment_len && std::memcmp(p, HTML_COMMENT_NODE_INFO.tag_suffix, avail) == 0) {
      is_html_comment_node = true;
      opening_tag_pos      = pos;
      return PARTIAL_MATCH;
    }
    // Not an opener: resume the search just past this '<'.
    i = pos + 1;
  }
  return NO_MATCH;
}
Expand Down
72 changes: 72 additions & 0 deletions plugins/esi/test/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,78 @@ TEST_CASE("esi parser test")
REQUIRE(node_list.size() == 6);
}

SECTION("chunk boundary: <esi: prefix exact at chunk end")
{
  // The first chunk stops right after the five bytes of "<esi:", so the
  // opener is complete with nothing behind it in that chunk; the rest of the
  // tag is supplied by the second chunk.
  DocNodeList nodes;
  REQUIRE(parser.parseChunk("pre<esi:", nodes) == true);
  REQUIRE(parser.parseChunk("include src=url/>", nodes) == true);
  REQUIRE(parser.completeParse(nodes) == true);
  REQUIRE(nodes.size() == 2);

  auto cursor = nodes.begin();

  // Leading text before the tag.
  const auto &pre_node = *cursor;
  REQUIRE(pre_node.type == DocNode::TYPE_PRE);
  REQUIRE(pre_node.data_len == 3);
  REQUIRE(strncmp(pre_node.data, "pre", 3) == 0);

  // The include tag assembled from both chunks.
  ++cursor;
  const auto &include_node = *cursor;
  REQUIRE(include_node.type == DocNode::TYPE_INCLUDE);
  REQUIRE(include_node.attr_list.size() == 1);
  check_node_attr(include_node.attr_list.front(), "src", "url");
}

SECTION("chunk boundary: <!--esi prefix exact at chunk end")
{
  // The first chunk stops right after the seven bytes of "<!--esi"; the
  // whitespace that must follow the opener only shows up at the start of the
  // second chunk.
  const char *const comment_body = "<esi:comment text=blah/>";

  DocNodeList nodes;
  REQUIRE(parser.parseChunk("pre<!--esi", nodes) == true);
  REQUIRE(parser.parseChunk(" <esi:comment text=blah/>-->", nodes) == true);
  REQUIRE(parser.completeParse(nodes) == true);
  REQUIRE(nodes.size() == 2);

  auto cursor = nodes.begin();

  // Leading text before the comment opener.
  const auto &pre_node = *cursor;
  REQUIRE(pre_node.type == DocNode::TYPE_PRE);
  REQUIRE(pre_node.data_len == 3);
  REQUIRE(strncmp(pre_node.data, "pre", 3) == 0);

  // The HTML comment node carries the text between the opener and "-->".
  ++cursor;
  const auto &comment_node = *cursor;
  REQUIRE(comment_node.type == DocNode::TYPE_HTML_COMMENT);
  REQUIRE(comment_node.data_len == static_cast<int>(strlen(comment_body)));
  REQUIRE(strncmp(comment_node.data, comment_body, comment_node.data_len) == 0);
}

SECTION("<!--esi without trailing whitespace is not a tag")
{
  // "<!--esi" followed by '_' instead of whitespace must end up as plain PRE
  // text, and the scan has to carry on to the genuine <esi: tag behind it.
  const char *const leading_text = "<!--esi_nospace";

  DocNodeList nodes;
  REQUIRE(parser.parse(nodes, "<!--esi_nospace<esi:include src=url/>") == true);
  REQUIRE(nodes.size() == 2);

  auto cursor = nodes.begin();

  // Everything up to the real tag.
  const auto &pre_node = *cursor;
  REQUIRE(pre_node.type == DocNode::TYPE_PRE);
  REQUIRE(pre_node.data_len == static_cast<int>(strlen(leading_text)));
  REQUIRE(strncmp(pre_node.data, leading_text, pre_node.data_len) == 0);

  // The include tag that follows.
  ++cursor;
  const auto &include_node = *cursor;
  REQUIRE(include_node.type == DocNode::TYPE_INCLUDE);
  REQUIRE(include_node.attr_list.size() == 1);
  check_node_attr(include_node.attr_list.front(), "src", "url");
}

SECTION("multiple false '<' anchors before valid tag")
{
  // Several '<' bytes that do not start a tag precede the real <esi: tag; the
  // scan has to step over each of them, and all of that text becomes PRE.
  const char *const leading_text = "< <a> <!--esi_bad ";

  DocNodeList nodes;
  REQUIRE(parser.parse(nodes, "< <a> <!--esi_bad <esi:include src=url/>") == true);
  REQUIRE(nodes.size() == 2);

  auto cursor = nodes.begin();

  // Everything up to the real tag, false anchors included.
  const auto &pre_node = *cursor;
  REQUIRE(pre_node.type == DocNode::TYPE_PRE);
  REQUIRE(pre_node.data_len == static_cast<int>(strlen(leading_text)));
  REQUIRE(strncmp(pre_node.data, leading_text, pre_node.data_len) == 0);

  // The include tag that follows.
  ++cursor;
  const auto &include_node = *cursor;
  REQUIRE(include_node.type == DocNode::TYPE_INCLUDE);
  REQUIRE(include_node.attr_list.size() == 1);
  check_node_attr(include_node.attr_list.front(), "src", "url");
}

SECTION("No handler attr")
{
string input_data = "<esi:special-include />";
Expand Down