From 986b71a8548b29781dcdb97460648bd208c32647 Mon Sep 17 00:00:00 2001
From: June Kim
Date: Mon, 11 May 2026 20:49:14 -0700
Subject: [PATCH 1/4] Fix C tokenizer NUL byte truncation (issue #355)

The C tokenizer was silently truncating input at the first NUL byte (\x00)
because it used '\0' both as a valid input character and as the EOF sentinel.

Root cause:
- Tokenizer_read() returned '\0' for both:
  1. Real NUL bytes in the input
  2. End-of-input (when index >= text.length)
- This made them indistinguishable, causing real NULs to be treated as EOF

Fix:
1. Define TOKENIZER_EOF as 0x110000 (the first invalid Unicode code point)
2. Update Tokenizer_read() and Tokenizer_read_backwards() to return
   TOKENIZER_EOF instead of '\0' for out-of-bounds reads
3. Replace all `!this` and `'\0'` checks with explicit `TOKENIZER_EOF` checks
4. Remove '\0' from the MARKERS array (no longer needed as an EOF marker)
5. Move the EOF check before is_marker() in the main parse loop so that
   TOKENIZER_EOF is never emitted as a character
6. Fix Tokenizer_has_leading_whitespace() to recognize TOKENIZER_EOF

The Python tokenizer already preserved NUL bytes correctly; this brings the
C tokenizer into parity.

Regression test added: test_nul_byte_preservation() verifies that both
tokenizers now preserve NUL bytes in plain text, templates, and multiple-NUL
scenarios.
---
 .../parser/ctokenizer/common.h      |  3 ++
 .../parser/ctokenizer/tok_parse.c   | 30 +++++++++----------
 .../parser/ctokenizer/tok_parse.h   |  6 ++--
 .../parser/ctokenizer/tok_support.c |  4 +--
 tests/test_tokenizer.py             | 28 +++++++++++++++++
 5 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/src/mwparserfromhell/parser/ctokenizer/common.h b/src/mwparserfromhell/parser/ctokenizer/common.h
index dbd13b68..602a9994 100644
--- a/src/mwparserfromhell/parser/ctokenizer/common.h
+++ b/src/mwparserfromhell/parser/ctokenizer/common.h
@@ -47,6 +47,9 @@ SOFTWARE.
 #define PyUnicode_FROM_SINGLE(chr) \
     PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
 
+/* EOF sentinel: use a value beyond valid Unicode range (0x110000 is first invalid) */
+#define TOKENIZER_EOF ((Py_UCS4) 0x110000)
+
 /* Error handling macros */
 
 #define BAD_ROUTE self->route_state
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
index f42add15..999df041 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -509,7 +509,7 @@ Tokenizer_parse_bracketed_uri_scheme(Tokenizer *self)
     if (!buffer) {
         return -1;
     }
-    while ((this = Tokenizer_read(self, 0))) {
+    while ((this = Tokenizer_read(self, 0)) != TOKENIZER_EOF) {
         i = 0;
         while (1) {
             if (!valid[i]) {
@@ -678,7 +678,7 @@ Tokenizer_is_uri_end(Tokenizer *self, Py_UCS4 this, Py_UCS4 next)
     Py_UCS4 after = Tokenizer_read(self, 2);
     uint64_t ctx = self->topstack->context;
 
-    return (!this || this == '\n' || this == '[' || this == ']' || this == '<' ||
+    return (this == TOKENIZER_EOF || this == '\n' || this == '[' || this == ']' || this == '<' ||
            this == '>' || this == '"' || this == ' ' || (this == '\'' && next == '\'') ||
            (this == '|' && ctx & LC_TEMPLATE) ||
            (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
@@ -703,7 +703,7 @@ Tokenizer_really_parse_external_link(Tokenizer *self, int brackets, Textbuffer *
         return NULL;
     }
     this = Tokenizer_read(self, 0);
-    if (!this || this == '\n' || this == ' ' || this == ']') {
+    if (this == TOKENIZER_EOF || this == '\n' || this == ' ' || this == ']') {
         return Tokenizer_fail_route(self);
     }
     if (!brackets && this == '[') {
@@ -729,7 +729,7 @@ Tokenizer_really_parse_external_link(Tokenizer *self, int brackets, Textbuffer *
                 return NULL;
             }
         } else if (brackets) {
-            if (!this || this == '\n') {
+            if (this == TOKENIZER_EOF || this == '\n') {
                 return Tokenizer_fail_route(self);
             }
             if (this == ']') {
@@ -1040,7 +1040,7 @@ Tokenizer_really_parse_entity(Tokenizer *self)
     }
     self->head++;
     this = Tokenizer_read(self, 0);
-    if (!this) {
+    if (this == TOKENIZER_EOF) {
         Tokenizer_fail_route(self);
         return 0;
     }
@@ -1051,7 +1051,7 @@ Tokenizer_really_parse_entity(Tokenizer *self)
     }
     self->head++;
     this = Tokenizer_read(self, 0);
-    if (!this) {
+    if (this == TOKENIZER_EOF) {
         Tokenizer_fail_route(self);
         return 0;
     }
@@ -1233,7 +1233,7 @@ Tokenizer_parse_comment(Tokenizer *self)
     }
     while (1) {
         this = Tokenizer_read(self, 0);
-        if (!this) {
+        if (this == TOKENIZER_EOF) {
            comment = Tokenizer_pop(self);
            Py_XDECREF(comment);
            self->head = reset;
@@ -1597,7 +1597,7 @@ Tokenizer_handle_blacklisted_tag(Tokenizer *self)
     while (1) {
         this = Tokenizer_read(self, 0);
         next = Tokenizer_read(self, 1);
-        if (!this) {
+        if (this == TOKENIZER_EOF) {
            return Tokenizer_fail_route(self);
         } else if (this == '<' && next == '/') {
            self->head += 2;
@@ -1639,7 +1639,7 @@ Tokenizer_handle_blacklisted_tag(Tokenizer *self)
            }
            return Tokenizer_pop(self);
         }
-        if (!this || this == '\n') {
+        if (this == TOKENIZER_EOF || this == '\n') {
        no_matching_end:
            Textbuffer_dealloc(buffer);
            self->head = reset;
@@ -1790,7 +1790,7 @@ Tokenizer_really_parse_tag(Tokenizer *self)
         next = Tokenizer_read(self, 1);
         can_exit = (!(data->context & (TAG_QUOTED | TAG_NAME)) ||
                     data->context & TAG_NOTE_SPACE);
-        if (!this) {
+        if (this == TOKENIZER_EOF) {
            if (self->topstack->context & LC_TAG_ATTR) {
                if (data->context & TAG_QUOTED) {
                    // Unclosed attribute quote: reset, don't die
@@ -2442,7 +2442,7 @@ Tokenizer_handle_table_style(Tokenizer *self, Py_UCS4 end_token)
            return NULL;
         }
         return padding;
-    } else if (!this || this == end_token) {
+    } else if (this == TOKENIZER_EOF || this == end_token) {
         if (self->topstack->context & LC_TAG_ATTR) {
            if (data->context & TAG_QUOTED) {
                // Unclosed attribute quote: reset, don't die
@@ -2845,7 +2845,7 @@ Tokenizer_has_leading_whitespace(Tokenizer *self)
     Py_UCS4 current_character;
     while (1) {
         current_character = Tokenizer_read_backwards(self, offset);
-        if (!current_character || current_character == '\n') {
+        if (current_character == TOKENIZER_EOF || current_character == '\n') {
            return 1;
         } else if (!Py_UNICODE_ISSPACE(current_character)) {
            return 0;
@@ -2876,6 +2876,9 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
     while (1) {
         this = Tokenizer_read(self, 0);
         this_context = self->topstack->context;
+        if (this == TOKENIZER_EOF) {
+            return Tokenizer_handle_end(self, this_context);
+        }
         if (this_context & AGG_UNSAFE) {
            if (Tokenizer_verify_safe(self, this_context, this) < 0) {
                if (this_context & AGG_DOUBLE) {
@@ -2892,9 +2895,6 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
                self->head++;
                continue;
            }
-        if (!this) {
-            return Tokenizer_handle_end(self, this_context);
-        }
         if (PyErr_CheckSignals()) {
            return NULL;
         }
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.h b/src/mwparserfromhell/parser/ctokenizer/tok_parse.h
index d72b2b29..9f6da8f6 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.h
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.h
@@ -25,11 +25,11 @@ SOFTWARE.
 #include "common.h"
 
 static const Py_UCS4 MARKERS[] = {
-    '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
-    '#', '*', ';', ':', '/', '-', '!', '\n', '\0',
+    '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
+    '#', '*', ';', ':', '/', '-', '!', '\n',
 };
 
-#define NUM_MARKERS 19
+#define NUM_MARKERS 18
 
 /* Functions */
 
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_support.c b/src/mwparserfromhell/parser/ctokenizer/tok_support.c
index 4c495ceb..aa9d80b0 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_support.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_support.c
@@ -442,7 +442,7 @@ Tokenizer_read(Tokenizer *self, Py_ssize_t delta)
     Py_ssize_t index = self->head + delta;
 
     if (index >= self->text.length) {
-        return '\0';
+        return TOKENIZER_EOF;
     }
     return read_codepoint(&self->text, index);
 }
@@ -456,7 +456,7 @@ Tokenizer_read_backwards(Tokenizer *self, Py_ssize_t delta)
     Py_ssize_t index;
 
     if (delta > self->head) {
-        return '\0';
+        return TOKENIZER_EOF;
     }
     index = self->head - delta;
     return read_codepoint(&self->text, index);
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 5864b48c..4b196d00 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -145,3 +145,31 @@ def test_describe_context():
     assert "" == contexts.describe(0)
     ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY | contexts.HAS_TEXT)
     assert "TEMPLATE_PARAM_KEY|HAS_TEXT" == ctx
+
+
+@pytest.mark.parametrize(
+    "tokenizer",
+    filter(None, (CTokenizer, PyTokenizer)),
+    ids=lambda t: "CTokenizer" if t.USES_C else "PyTokenizer",
+)
+def test_nul_byte_preservation(tokenizer):
+    """Test that NUL bytes (\\x00) are preserved in the input instead of being treated as EOF.
+
+    Regression test for issue #355: C tokenizer was returning '\\0' both for real NUL bytes
+    in the input and for EOF, causing silent truncation at the first NUL.
+    """
+    # Basic NUL preservation
+    result = str(Builder().build(tokenizer().tokenize("a\x00b")))
+    assert result == "a\x00b", f"Expected 'a\\x00b', got {result!r}"
+
+    # NUL in the middle of text
+    result = str(Builder().build(tokenizer().tokenize("hello\x00world")))
+    assert result == "hello\x00world", f"Expected 'hello\\x00world', got {result!r}"
+
+    # NUL inside template
+    result = str(Builder().build(tokenizer().tokenize("{{a\x00b}}")))
+    assert result == "{{a\x00b}}", f"Expected '{{{{a\\x00b}}}}', got {result!r}"
+
+    # Multiple NULs
+    result = str(Builder().build(tokenizer().tokenize("a\x00b\x00c")))
+    assert result == "a\x00b\x00c", f"Expected 'a\\x00b\\x00c', got {result!r}"

From 0f5f898bb76c18331d588794c1a84ade74a1afe7 Mon Sep 17 00:00:00 2001
From: June Kim
Date: Mon, 11 May 2026 22:27:11 -0700
Subject: [PATCH 2/4] fix: replace remaining !last checks with TOKENIZER_EOF comparison

Four start-of-input checks in the main parse loop still used !last (falsy NUL)
to detect beginning-of-input. After the TOKENIZER_EOF sentinel change,
Tokenizer_read_backwards returns 0x110000 instead of '\0', so !last is always
false and headings, lists, and horizontal rules at position 0 would silently
fail to parse.
---
 src/mwparserfromhell/parser/ctokenizer/tok_parse.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
index 999df041..04d252a7 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -2913,7 +2913,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
                return NULL;
            }
         } else if (this == '=' && this_context & LC_TEMPLATE_PARAM_KEY) {
-            if (!(self->global & GL_HEADING) && (!last || last == '\n') &&
+            if (!(self->global & GL_HEADING) && (last == TOKENIZER_EOF || last == '\n') &&
                next == '=') {
                if (Tokenizer_parse_heading(self)) {
                    return NULL;
@@ -2964,7 +2964,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
            return Tokenizer_pop(self);
         } else if (this == '=' && !(self->global & GL_HEADING) &&
                   !(this_context & LC_TEMPLATE)) {
-            if (!last || last == '\n') {
+            if (last == TOKENIZER_EOF || last == '\n') {
                if (Tokenizer_parse_heading(self)) {
                    return NULL;
                }
@@ -3008,12 +3008,12 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
            if (temp != Py_None) {
                return temp;
            }
-        } else if ((!last || last == '\n') &&
+        } else if ((last == TOKENIZER_EOF || last == '\n') &&
                   (this == '#' || this == '*' || this == ';' || this == ':')) {
            if (Tokenizer_handle_list(self)) {
                return NULL;
            }
-        } else if ((!last || last == '\n') &&
+        } else if ((last == TOKENIZER_EOF || last == '\n') &&
                   (this == '-' && this == next && this == Tokenizer_read(self, 2) &&
                    this == Tokenizer_read(self, 3))) {
            if (Tokenizer_handle_hr(self)) {

From 158b8967e17c0bf387518e3e423be19f17ca33db Mon Sep 17 00:00:00 2001
From: June Kim
Date: Mon, 11 May 2026 22:41:58 -0700
Subject: [PATCH 3/4] fix: check for EOF instead of truthiness in tag close handling

Tokenizer_read returns TOKENIZER_EOF for end-of-input, not a falsy value. The
old truthiness check let NUL bytes truncate parsing.
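For illustration only, a hypothetical snippet (not part of the diff below)
showing the intended behaviour through the public mwparserfromhell API: "</"
at the very end of the input has no tag name after it, so it should fall
through to plain text instead of entering the tag-close handlers at EOF.

    import mwparserfromhell

    text = "a</"
    parsed = mwparserfromhell.parse(text)
    assert str(parsed) == text       # round-trips unchanged
    assert not parsed.filter_tags()  # and produces no Tag nodes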
---
 src/mwparserfromhell/parser/ctokenizer/tok_parse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
index 04d252a7..c8f7c0e2 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -2988,7 +2988,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
         } else if (Tokenizer_emit_char(self, this)) {
            return NULL;
         }
-    } else if (this == '<' && next == '/' && Tokenizer_read(self, 2)) {
+    } else if (this == '<' && next == '/' && Tokenizer_read(self, 2) != TOKENIZER_EOF) {
         if (this_context & LC_TAG_BODY ? Tokenizer_handle_tag_open_close(self)
                                        : Tokenizer_handle_invalid_tag_start(self)) {
            return NULL;

From 669edbcefa2c52673a4447f41b1f23e600c946ca Mon Sep 17 00:00:00 2001
From: June Kim
Date: Tue, 12 May 2026 01:29:14 -0700
Subject: [PATCH 4/4] fix: handle TOKENIZER_EOF in invalid tag start loop and bare colon check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two bugs introduced by the NUL truncation fix:

1. Tokenizer_handle_invalid_tag_start: the tag name scanning loop relied on
   is_marker() and Py_UNICODE_ISSPACE() to terminate, but TOKENIZER_EOF
   (0x110000) matches neither, causing an infinite loop when an incomplete
   closing tag runs to the end of the input.

2. Tokenizer_parse colon handling: the bare external link check
   "this == ':' && !is_marker(last)" fired at start-of-input because
   is_marker(TOKENIZER_EOF) returns false (0x110000 is not in MARKERS). This
   intercepted ":" before the list handler could run, breaking definition
   list items like ":text", which should produce a dd tag rather than plain
   text.
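A minimal reproduction sketch for bug 2 (hypothetical, not included in the
diff; it uses the public mwparserfromhell API rather than the C internals):

    import mwparserfromhell

    # With the bug, the bare external link check swallows the leading colon
    # and ":item" comes back as plain text with no definition-list tag.
    parsed = mwparserfromhell.parse(":item")
    assert parsed.filter_tags(), parsed.nodes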
---
 src/mwparserfromhell/parser/ctokenizer/tok_parse.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
index c8f7c0e2..7c4c20b4 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -1863,7 +1863,7 @@ Tokenizer_handle_invalid_tag_start(Tokenizer *self)
     }
     while (1) {
         this = Tokenizer_read(self, pos);
-        if (Py_UNICODE_ISSPACE(this) || is_marker(this)) {
+        if (this == TOKENIZER_EOF || Py_UNICODE_ISSPACE(this) || is_marker(this)) {
            name = Textbuffer_render(buf);
            if (!name) {
                Textbuffer_dealloc(buf);
@@ -2956,7 +2956,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
            if (Tokenizer_parse_external_link(self, 1)) {
                return NULL;
            }
-        } else if (this == ':' && !is_marker(last)) {
+        } else if (this == ':' && last != TOKENIZER_EOF && !is_marker(last)) {
            if (Tokenizer_parse_external_link(self, 0)) {
                return NULL;
            }
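
Not part of the series itself: a sketch of additional regression coverage for
the start-of-input and unterminated-tag paths touched by PATCH 2/4 and
PATCH 4/4, assuming the same helpers already used in tests/test_tokenizer.py
(pytest, Builder, CTokenizer, PyTokenizer):

    @pytest.mark.parametrize(
        "tokenizer",
        filter(None, (CTokenizer, PyTokenizer)),
        ids=lambda t: "CTokenizer" if t.USES_C else "PyTokenizer",
    )
    def test_start_of_input_constructs(tokenizer):
        """Headings at position 0 must still be recognized, and an
        unterminated closing tag must not hang the tokenizer."""
        heading = Builder().build(tokenizer().tokenize("== h =="))
        assert heading.filter_headings(), "heading at start of input not parsed"

        # Would loop forever in Tokenizer_handle_invalid_tag_start before PATCH 4/4
        result = str(Builder().build(tokenizer().tokenize("a</b")))
        assert result == "a</b"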