Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/mwparserfromhell/parser/ctokenizer/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ SOFTWARE.
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)

/* EOF sentinel: use a value beyond valid Unicode range (0x110000 is first invalid) */
#define TOKENIZER_EOF ((Py_UCS4) 0x110000)

/* Error handling macros */

#define BAD_ROUTE self->route_state
Expand Down
44 changes: 22 additions & 22 deletions src/mwparserfromhell/parser/ctokenizer/tok_parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ Tokenizer_parse_bracketed_uri_scheme(Tokenizer *self)
if (!buffer) {
return -1;
}
while ((this = Tokenizer_read(self, 0))) {
while ((this = Tokenizer_read(self, 0)) != TOKENIZER_EOF) {
i = 0;
while (1) {
if (!valid[i]) {
Expand Down Expand Up @@ -678,7 +678,7 @@ Tokenizer_is_uri_end(Tokenizer *self, Py_UCS4 this, Py_UCS4 next)
Py_UCS4 after = Tokenizer_read(self, 2);
uint64_t ctx = self->topstack->context;

return (!this || this == '\n' || this == '[' || this == ']' || this == '<' ||
return (this == TOKENIZER_EOF || this == '\n' || this == '[' || this == ']' || this == '<' ||
this == '>' || this == '"' || this == ' ' ||
(this == '\'' && next == '\'') || (this == '|' && ctx & LC_TEMPLATE) ||
(this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
Expand All @@ -703,7 +703,7 @@ Tokenizer_really_parse_external_link(Tokenizer *self, int brackets, Textbuffer *
return NULL;
}
this = Tokenizer_read(self, 0);
if (!this || this == '\n' || this == ' ' || this == ']') {
if (this == TOKENIZER_EOF || this == '\n' || this == ' ' || this == ']') {
return Tokenizer_fail_route(self);
}
if (!brackets && this == '[') {
Expand All @@ -729,7 +729,7 @@ Tokenizer_really_parse_external_link(Tokenizer *self, int brackets, Textbuffer *
return NULL;
}
} else if (brackets) {
if (!this || this == '\n') {
if (this == TOKENIZER_EOF || this == '\n') {
return Tokenizer_fail_route(self);
}
if (this == ']') {
Expand Down Expand Up @@ -1040,7 +1040,7 @@ Tokenizer_really_parse_entity(Tokenizer *self)
}
self->head++;
this = Tokenizer_read(self, 0);
if (!this) {
if (this == TOKENIZER_EOF) {
Tokenizer_fail_route(self);
return 0;
}
Expand All @@ -1051,7 +1051,7 @@ Tokenizer_really_parse_entity(Tokenizer *self)
}
self->head++;
this = Tokenizer_read(self, 0);
if (!this) {
if (this == TOKENIZER_EOF) {
Tokenizer_fail_route(self);
return 0;
}
Expand Down Expand Up @@ -1233,7 +1233,7 @@ Tokenizer_parse_comment(Tokenizer *self)
}
while (1) {
this = Tokenizer_read(self, 0);
if (!this) {
if (this == TOKENIZER_EOF) {
comment = Tokenizer_pop(self);
Py_XDECREF(comment);
self->head = reset;
Expand Down Expand Up @@ -1597,7 +1597,7 @@ Tokenizer_handle_blacklisted_tag(Tokenizer *self)
while (1) {
this = Tokenizer_read(self, 0);
next = Tokenizer_read(self, 1);
if (!this) {
if (this == TOKENIZER_EOF) {
return Tokenizer_fail_route(self);
} else if (this == '<' && next == '/') {
self->head += 2;
Expand Down Expand Up @@ -1639,7 +1639,7 @@ Tokenizer_handle_blacklisted_tag(Tokenizer *self)
}
return Tokenizer_pop(self);
}
if (!this || this == '\n') {
if (this == TOKENIZER_EOF || this == '\n') {
no_matching_end:
Textbuffer_dealloc(buffer);
self->head = reset;
Expand Down Expand Up @@ -1790,7 +1790,7 @@ Tokenizer_really_parse_tag(Tokenizer *self)
next = Tokenizer_read(self, 1);
can_exit = (!(data->context & (TAG_QUOTED | TAG_NAME)) ||
data->context & TAG_NOTE_SPACE);
if (!this) {
if (this == TOKENIZER_EOF) {
if (self->topstack->context & LC_TAG_ATTR) {
if (data->context & TAG_QUOTED) {
// Unclosed attribute quote: reset, don't die
Expand Down Expand Up @@ -1863,7 +1863,7 @@ Tokenizer_handle_invalid_tag_start(Tokenizer *self)
}
while (1) {
this = Tokenizer_read(self, pos);
if (Py_UNICODE_ISSPACE(this) || is_marker(this)) {
if (this == TOKENIZER_EOF || Py_UNICODE_ISSPACE(this) || is_marker(this)) {
name = Textbuffer_render(buf);
if (!name) {
Textbuffer_dealloc(buf);
Expand Down Expand Up @@ -2442,7 +2442,7 @@ Tokenizer_handle_table_style(Tokenizer *self, Py_UCS4 end_token)
return NULL;
}
return padding;
} else if (!this || this == end_token) {
} else if (this == TOKENIZER_EOF || this == end_token) {
if (self->topstack->context & LC_TAG_ATTR) {
if (data->context & TAG_QUOTED) {
// Unclosed attribute quote: reset, don't die
Expand Down Expand Up @@ -2845,7 +2845,7 @@ Tokenizer_has_leading_whitespace(Tokenizer *self)
Py_UCS4 current_character;
while (1) {
current_character = Tokenizer_read_backwards(self, offset);
if (!current_character || current_character == '\n') {
if (current_character == TOKENIZER_EOF || current_character == '\n') {
return 1;
} else if (!Py_UNICODE_ISSPACE(current_character)) {
return 0;
Expand Down Expand Up @@ -2876,6 +2876,9 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
while (1) {
this = Tokenizer_read(self, 0);
this_context = self->topstack->context;
if (this == TOKENIZER_EOF) {
return Tokenizer_handle_end(self, this_context);
}
if (this_context & AGG_UNSAFE) {
if (Tokenizer_verify_safe(self, this_context, this) < 0) {
if (this_context & AGG_DOUBLE) {
Expand All @@ -2892,9 +2895,6 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
self->head++;
continue;
}
if (!this) {
return Tokenizer_handle_end(self, this_context);
}
if (PyErr_CheckSignals()) {
return NULL;
}
Expand All @@ -2913,7 +2913,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
return NULL;
}
} else if (this == '=' && this_context & LC_TEMPLATE_PARAM_KEY) {
if (!(self->global & GL_HEADING) && (!last || last == '\n') &&
if (!(self->global & GL_HEADING) && (last == TOKENIZER_EOF || last == '\n') &&
next == '=') {
if (Tokenizer_parse_heading(self)) {
return NULL;
Expand Down Expand Up @@ -2956,15 +2956,15 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
if (Tokenizer_parse_external_link(self, 1)) {
return NULL;
}
} else if (this == ':' && !is_marker(last)) {
} else if (this == ':' && last != TOKENIZER_EOF && !is_marker(last)) {
if (Tokenizer_parse_external_link(self, 0)) {
return NULL;
}
} else if (this == ']' && this_context & LC_EXT_LINK_TITLE) {
return Tokenizer_pop(self);
} else if (this == '=' && !(self->global & GL_HEADING) &&
!(this_context & LC_TEMPLATE)) {
if (!last || last == '\n') {
if (last == TOKENIZER_EOF || last == '\n') {
if (Tokenizer_parse_heading(self)) {
return NULL;
}
Expand All @@ -2988,7 +2988,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
} else if (Tokenizer_emit_char(self, this)) {
return NULL;
}
} else if (this == '<' && next == '/' && Tokenizer_read(self, 2)) {
} else if (this == '<' && next == '/' && Tokenizer_read(self, 2) != TOKENIZER_EOF) {
if (this_context & LC_TAG_BODY ? Tokenizer_handle_tag_open_close(self)
: Tokenizer_handle_invalid_tag_start(self)) {
return NULL;
Expand All @@ -3008,12 +3008,12 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
if (temp != Py_None) {
return temp;
}
} else if ((!last || last == '\n') &&
} else if ((last == TOKENIZER_EOF || last == '\n') &&
(this == '#' || this == '*' || this == ';' || this == ':')) {
if (Tokenizer_handle_list(self)) {
return NULL;
}
} else if ((!last || last == '\n') &&
} else if ((last == TOKENIZER_EOF || last == '\n') &&
(this == '-' && this == next && this == Tokenizer_read(self, 2) &&
this == Tokenizer_read(self, 3))) {
if (Tokenizer_handle_hr(self)) {
Expand Down
6 changes: 3 additions & 3 deletions src/mwparserfromhell/parser/ctokenizer/tok_parse.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ SOFTWARE.
#include "common.h"

static const Py_UCS4 MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
'#', '*', ';', ':', '/', '-', '!', '\n', '\0',
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
'#', '*', ';', ':', '/', '-', '!', '\n',
};

#define NUM_MARKERS 19
#define NUM_MARKERS 18

/* Functions */

Expand Down
4 changes: 2 additions & 2 deletions src/mwparserfromhell/parser/ctokenizer/tok_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ Tokenizer_read(Tokenizer *self, Py_ssize_t delta)
Py_ssize_t index = self->head + delta;

if (index >= self->text.length) {
return '\0';
return TOKENIZER_EOF;
}
return read_codepoint(&self->text, index);
}
Expand All @@ -456,7 +456,7 @@ Tokenizer_read_backwards(Tokenizer *self, Py_ssize_t delta)
Py_ssize_t index;

if (delta > self->head) {
return '\0';
return TOKENIZER_EOF;
}
index = self->head - delta;
return read_codepoint(&self->text, index);
Expand Down
28 changes: 28 additions & 0 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,31 @@ def test_describe_context():
assert "" == contexts.describe(0)
ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY | contexts.HAS_TEXT)
assert "TEMPLATE_PARAM_KEY|HAS_TEXT" == ctx


@pytest.mark.parametrize(
    "tokenizer",
    filter(None, (CTokenizer, PyTokenizer)),
    ids=lambda t: "CTokenizer" if t.USES_C else "PyTokenizer",
)
def test_nul_byte_preservation(tokenizer):
    """Verify that literal NUL characters (``\\x00``) survive a tokenize/build round-trip.

    Regression test for issue #355: the C tokenizer used ``'\\0'`` both as its
    EOF sentinel and as the value read for a genuine NUL in the wikitext, so
    input was silently truncated at the first NUL. With the dedicated
    ``TOKENIZER_EOF`` sentinel, NUL is an ordinary character.
    """

    def roundtrip(text):
        # Tokenize the text and rebuild it; a lossless parser returns it verbatim.
        return str(Builder().build(tokenizer().tokenize(text)))

    # Basic NUL preservation
    result = roundtrip("a\x00b")
    assert result == "a\x00b", f"Expected 'a\\x00b', got {result!r}"

    # NUL in the middle of text
    result = roundtrip("hello\x00world")
    assert result == "hello\x00world", f"Expected 'hello\\x00world', got {result!r}"

    # NUL inside template
    result = roundtrip("{{a\x00b}}")
    assert result == "{{a\x00b}}", f"Expected '{{{{a\\x00b}}}}', got {result!r}"

    # Multiple NULs
    result = roundtrip("a\x00b\x00c")
    assert result == "a\x00b\x00c", f"Expected 'a\\x00b\\x00c', got {result!r}"