diff --git a/src/mwparserfromhell/parser/ctokenizer/common.h b/src/mwparserfromhell/parser/ctokenizer/common.h
index dbd13b6..602a999 100644
--- a/src/mwparserfromhell/parser/ctokenizer/common.h
+++ b/src/mwparserfromhell/parser/ctokenizer/common.h
@@ -47,6 +47,9 @@ SOFTWARE.
 #define PyUnicode_FROM_SINGLE(chr) \
     PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
 
+/* EOF sentinel: use a value beyond valid Unicode range (0x110000 is first invalid) */
+#define TOKENIZER_EOF ((Py_UCS4) 0x110000)
+
 /* Error handling macros */
 
 #define BAD_ROUTE self->route_state
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
index f42add1..7c4c20b 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -509,7 +509,7 @@ Tokenizer_parse_bracketed_uri_scheme(Tokenizer *self)
     if (!buffer) {
         return -1;
     }
-    while ((this = Tokenizer_read(self, 0))) {
+    while ((this = Tokenizer_read(self, 0)) != TOKENIZER_EOF) {
         i = 0;
         while (1) {
             if (!valid[i]) {
@@ -678,7 +678,7 @@ Tokenizer_is_uri_end(Tokenizer *self, Py_UCS4 this, Py_UCS4 next)
     Py_UCS4 after = Tokenizer_read(self, 2);
     uint64_t ctx = self->topstack->context;
 
-    return (!this || this == '\n' || this == '[' || this == ']' || this == '<' ||
+    return (this == TOKENIZER_EOF || this == '\n' || this == '[' || this == ']' || this == '<' ||
             this == '>' || this == '"' || this == ' ' || (this == '\'' && next == '\'') ||
             (this == '|' && ctx & LC_TEMPLATE) ||
             (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
@@ -703,7 +703,7 @@ Tokenizer_really_parse_external_link(Tokenizer *self, int brackets, Textbuffer *
         return NULL;
     }
     this = Tokenizer_read(self, 0);
-    if (!this || this == '\n' || this == ' ' || this == ']') {
+    if (this == TOKENIZER_EOF || this == '\n' || this == ' ' || this == ']') {
         return Tokenizer_fail_route(self);
     }
     if (!brackets && this == '[') {
@@ -729,7 +729,7 @@ Tokenizer_really_parse_external_link(Tokenizer *self, int brackets, Textbuffer *
             return NULL;
         }
     } else if (brackets) {
-        if (!this || this == '\n') {
+        if (this == TOKENIZER_EOF || this == '\n') {
             return Tokenizer_fail_route(self);
         }
         if (this == ']') {
@@ -1040,7 +1040,7 @@ Tokenizer_really_parse_entity(Tokenizer *self)
         }
         self->head++;
         this = Tokenizer_read(self, 0);
-        if (!this) {
+        if (this == TOKENIZER_EOF) {
             Tokenizer_fail_route(self);
             return 0;
         }
@@ -1051,7 +1051,7 @@ Tokenizer_really_parse_entity(Tokenizer *self)
             }
             self->head++;
             this = Tokenizer_read(self, 0);
-            if (!this) {
+            if (this == TOKENIZER_EOF) {
                 Tokenizer_fail_route(self);
                 return 0;
             }
@@ -1233,7 +1233,7 @@ Tokenizer_parse_comment(Tokenizer *self)
     }
     while (1) {
         this = Tokenizer_read(self, 0);
-        if (!this) {
+        if (this == TOKENIZER_EOF) {
             comment = Tokenizer_pop(self);
             Py_XDECREF(comment);
             self->head = reset;
@@ -1597,7 +1597,7 @@ Tokenizer_handle_blacklisted_tag(Tokenizer *self)
     while (1) {
         this = Tokenizer_read(self, 0);
         next = Tokenizer_read(self, 1);
-        if (!this) {
+        if (this == TOKENIZER_EOF) {
             return Tokenizer_fail_route(self);
         } else if (this == '<' && next == '/') {
             self->head += 2;
@@ -1639,7 +1639,7 @@ Tokenizer_handle_blacklisted_tag(Tokenizer *self)
             }
             return Tokenizer_pop(self);
         }
-        if (!this || this == '\n') {
+        if (this == TOKENIZER_EOF || this == '\n') {
         no_matching_end:
             Textbuffer_dealloc(buffer);
             self->head = reset;
@@ -1790,7 +1790,7 @@ Tokenizer_really_parse_tag(Tokenizer *self)
         next = Tokenizer_read(self, 1);
         can_exit = (!(data->context & (TAG_QUOTED | TAG_NAME)) ||
                     data->context & TAG_NOTE_SPACE);
-        if (!this) {
+        if (this == TOKENIZER_EOF) {
             if (self->topstack->context & LC_TAG_ATTR) {
                 if (data->context & TAG_QUOTED) {
                     // Unclosed attribute quote: reset, don't die
@@ -1863,7 +1863,7 @@ Tokenizer_handle_invalid_tag_start(Tokenizer *self)
     }
     while (1) {
         this = Tokenizer_read(self, pos);
-        if (Py_UNICODE_ISSPACE(this) || is_marker(this)) {
+        if (this == TOKENIZER_EOF || Py_UNICODE_ISSPACE(this) || is_marker(this)) {
             name = Textbuffer_render(buf);
             if (!name) {
                 Textbuffer_dealloc(buf);
@@ -2442,7 +2442,7 @@ Tokenizer_handle_table_style(Tokenizer *self, Py_UCS4 end_token)
             return NULL;
         }
         return padding;
-    } else if (!this || this == end_token) {
+    } else if (this == TOKENIZER_EOF || this == end_token) {
         if (self->topstack->context & LC_TAG_ATTR) {
             if (data->context & TAG_QUOTED) {
                 // Unclosed attribute quote: reset, don't die
@@ -2845,7 +2845,7 @@ Tokenizer_has_leading_whitespace(Tokenizer *self)
     Py_UCS4 current_character;
     while (1) {
         current_character = Tokenizer_read_backwards(self, offset);
-        if (!current_character || current_character == '\n') {
+        if (current_character == TOKENIZER_EOF || current_character == '\n') {
             return 1;
         } else if (!Py_UNICODE_ISSPACE(current_character)) {
             return 0;
@@ -2876,6 +2876,9 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
     while (1) {
         this = Tokenizer_read(self, 0);
         this_context = self->topstack->context;
+        if (this == TOKENIZER_EOF) {
+            return Tokenizer_handle_end(self, this_context);
+        }
         if (this_context & AGG_UNSAFE) {
             if (Tokenizer_verify_safe(self, this_context, this) < 0) {
                 if (this_context & AGG_DOUBLE) {
@@ -2892,9 +2895,6 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
             self->head++;
             continue;
         }
-        if (!this) {
-            return Tokenizer_handle_end(self, this_context);
-        }
         if (PyErr_CheckSignals()) {
             return NULL;
         }
@@ -2913,7 +2913,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
                 return NULL;
             }
         } else if (this == '=' && this_context & LC_TEMPLATE_PARAM_KEY) {
-            if (!(self->global & GL_HEADING) && (!last || last == '\n') &&
+            if (!(self->global & GL_HEADING) && (last == TOKENIZER_EOF || last == '\n') &&
                 next == '=') {
                 if (Tokenizer_parse_heading(self)) {
                     return NULL;
@@ -2956,7 +2956,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
             if (Tokenizer_parse_external_link(self, 1)) {
                 return NULL;
             }
-        } else if (this == ':' && !is_marker(last)) {
+        } else if (this == ':' && last != TOKENIZER_EOF && !is_marker(last)) {
             if (Tokenizer_parse_external_link(self, 0)) {
                 return NULL;
             }
@@ -2964,7 +2964,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
             return Tokenizer_pop(self);
         } else if (this == '=' && !(self->global & GL_HEADING) &&
                    !(this_context & LC_TEMPLATE)) {
-            if (!last || last == '\n') {
+            if (last == TOKENIZER_EOF || last == '\n') {
                 if (Tokenizer_parse_heading(self)) {
                     return NULL;
                 }
@@ -2988,7 +2988,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
             } else if (Tokenizer_emit_char(self, this)) {
                 return NULL;
             }
-        } else if (this == '<' && next == '/' && Tokenizer_read(self, 2)) {
+        } else if (this == '<' && next == '/' && Tokenizer_read(self, 2) != TOKENIZER_EOF) {
            if (this_context & LC_TAG_BODY ? Tokenizer_handle_tag_open_close(self)
                                           : Tokenizer_handle_invalid_tag_start(self)) {
                return NULL;
@@ -3008,12 +3008,12 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
             if (temp != Py_None) {
                 return temp;
             }
-        } else if ((!last || last == '\n') &&
+        } else if ((last == TOKENIZER_EOF || last == '\n') &&
                    (this == '#' || this == '*' || this == ';' || this == ':')) {
             if (Tokenizer_handle_list(self)) {
                 return NULL;
             }
-        } else if ((!last || last == '\n') &&
+        } else if ((last == TOKENIZER_EOF || last == '\n') &&
                    (this == '-' && this == next && this == Tokenizer_read(self, 2) &&
                     this == Tokenizer_read(self, 3))) {
             if (Tokenizer_handle_hr(self)) {
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.h b/src/mwparserfromhell/parser/ctokenizer/tok_parse.h
index d72b2b2..9f6da8f 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.h
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.h
@@ -25,11 +25,11 @@
 
 #include "common.h"
 
 static const Py_UCS4 MARKERS[] = {
-    '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
-    '#', '*', ';', ':', '/', '-', '!', '\n', '\0',
+    '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
+    '#', '*', ';', ':', '/', '-', '!', '\n',
 };
 
-#define NUM_MARKERS 19
+#define NUM_MARKERS 18
 
 /* Functions */
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_support.c b/src/mwparserfromhell/parser/ctokenizer/tok_support.c
index 4c495ce..aa9d80b 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_support.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_support.c
@@ -442,7 +442,7 @@ Tokenizer_read(Tokenizer *self, Py_ssize_t delta)
     Py_ssize_t index = self->head + delta;
 
     if (index >= self->text.length) {
-        return '\0';
+        return TOKENIZER_EOF;
     }
     return read_codepoint(&self->text, index);
 }
@@ -456,7 +456,7 @@ Tokenizer_read_backwards(Tokenizer *self, Py_ssize_t delta)
     Py_ssize_t index;
 
     if (delta > self->head) {
-        return '\0';
+        return TOKENIZER_EOF;
     }
     index = self->head - delta;
     return read_codepoint(&self->text, index);
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 5864b48..4b196d0 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -145,3 +145,31 @@
     assert "" == contexts.describe(0)
     ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY | contexts.HAS_TEXT)
     assert "TEMPLATE_PARAM_KEY|HAS_TEXT" == ctx
+
+
+@pytest.mark.parametrize(
+    "tokenizer",
+    filter(None, (CTokenizer, PyTokenizer)),
+    ids=lambda t: "CTokenizer" if t.USES_C else "PyTokenizer",
+)
+def test_nul_byte_preservation(tokenizer):
+    """Test that NUL bytes (\\x00) are preserved in the input instead of being treated as EOF.
+
+    Regression test for issue #355: C tokenizer was returning '\\0' both for real NUL bytes
+    in the input and for EOF, causing silent truncation at the first NUL.
+    """
+    # Basic NUL preservation
+    result = str(Builder().build(tokenizer().tokenize("a\x00b")))
+    assert result == "a\x00b", f"Expected 'a\\x00b', got {result!r}"
+
+    # NUL in the middle of text
+    result = str(Builder().build(tokenizer().tokenize("hello\x00world")))
+    assert result == "hello\x00world", f"Expected 'hello\\x00world', got {result!r}"
+
+    # NUL inside template
+    result = str(Builder().build(tokenizer().tokenize("{{a\x00b}}")))
+    assert result == "{{a\x00b}}", f"Expected '{{{{a\\x00b}}}}', got {result!r}"
+
+    # Multiple NULs
+    result = str(Builder().build(tokenizer().tokenize("a\x00b\x00c")))
+    assert result == "a\x00b\x00c", f"Expected 'a\\x00b\\x00c', got {result!r}"