From 986b71a8548b29781dcdb97460648bd208c32647 Mon Sep 17 00:00:00 2001
From: June Kim
Date: Mon, 11 May 2026 20:49:14 -0700
Subject: [PATCH 1/4] Fix C tokenizer NUL byte truncation (issue #355)

The C tokenizer was silently truncating input at the first NUL byte (\x00)
because it used '\0' both as a valid input character and as the EOF sentinel.

Root cause:
- Tokenizer_read() returned '\0' for both:
  1. Real NUL bytes in the input
  2. End-of-input (when index >= text.length)
- This made them indistinguishable, causing real NULs to be treated as EOF

Fix:
1. Define TOKENIZER_EOF as 0x110000 (the first invalid Unicode code point)
2. Update Tokenizer_read() and Tokenizer_read_backwards() to return
   TOKENIZER_EOF instead of '\0' for out-of-bounds reads
3. Replace all `!this` and `'\0'` checks with explicit `TOKENIZER_EOF` checks
4. Remove '\0' from the MARKERS array (no longer needed as an EOF marker)
5. Move the EOF check before is_marker() in the main parse loop so that
   TOKENIZER_EOF is never emitted as a character
6. Fix Tokenizer_has_leading_whitespace() to recognize TOKENIZER_EOF

The Python tokenizer already preserved NUL bytes correctly; this brings the
C tokenizer into parity.

Regression test added: test_nul_byte_preservation() verifies that both
tokenizers now preserve NUL bytes in plain text, templates, and multiple-NUL
scenarios.
---
 .../parser/ctokenizer/common.h      |  3 ++
 .../parser/ctokenizer/tok_parse.c   | 30 +++++++++----------
 .../parser/ctokenizer/tok_parse.h   |  6 ++--
 .../parser/ctokenizer/tok_support.c |  4 +--
 tests/test_tokenizer.py             | 28 +++++++++++++++++
 5 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/src/mwparserfromhell/parser/ctokenizer/common.h b/src/mwparserfromhell/parser/ctokenizer/common.h
index dbd13b68..602a9994 100644
--- a/src/mwparserfromhell/parser/ctokenizer/common.h
+++ b/src/mwparserfromhell/parser/ctokenizer/common.h
@@ -47,6 +47,9 @@ SOFTWARE.
 #define PyUnicode_FROM_SINGLE(chr) \
     PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
 
+/* EOF sentinel: use a value beyond valid Unicode range (0x110000 is first invalid) */
+#define TOKENIZER_EOF ((Py_UCS4) 0x110000)
+
 /* Error handling macros */
 
 #define BAD_ROUTE self->route_state
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
index f42add15..999df041 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -509,7 +509,7 @@ Tokenizer_parse_bracketed_uri_scheme(Tokenizer *self)
     if (!buffer) {
         return -1;
     }
-    while ((this = Tokenizer_read(self, 0))) {
+    while ((this = Tokenizer_read(self, 0)) != TOKENIZER_EOF) {
         i = 0;
         while (1) {
             if (!valid[i]) {
@@ -678,7 +678,7 @@ Tokenizer_is_uri_end(Tokenizer *self, Py_UCS4 this, Py_UCS4 next)
     Py_UCS4 after = Tokenizer_read(self, 2);
     uint64_t ctx = self->topstack->context;
 
-    return (!this || this == '\n' || this == '[' || this == ']' || this == '<' ||
+    return (this == TOKENIZER_EOF || this == '\n' || this == '[' || this == ']' || this == '<' ||
            this == '>' || this == '"' || this == ' ' || (this == '\'' && next == '\'') ||
            (this == '|' && ctx & LC_TEMPLATE) ||
            (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
@@ -703,7 +703,7 @@ Tokenizer_really_parse_external_link(Tokenizer *self, int brackets, Textbuffer *
         return NULL;
     }
     this = Tokenizer_read(self, 0);
-    if (!this || this == '\n' || this == ' ' || this == ']') {
+    if (this == TOKENIZER_EOF || this == '\n' || this == ' ' || this == ']') {
         return Tokenizer_fail_route(self);
     }
     if (!brackets && this == '[') {
@@ -729,7 +729,7 @@ Tokenizer_really_parse_external_link(Tokenizer *self, int brackets, Textbuffer *
                 return NULL;
             }
         } else if (brackets) {
-            if (!this || this == '\n') {
+            if (this == TOKENIZER_EOF || this == '\n') {
                 return Tokenizer_fail_route(self);
             }
             if (this == ']') {
@@ -1040,7 +1040,7 @@ Tokenizer_really_parse_entity(Tokenizer *self)
     }
     self->head++;
     this = Tokenizer_read(self, 0);
-    if (!this) {
+    if (this == TOKENIZER_EOF) {
         Tokenizer_fail_route(self);
         return 0;
     }
@@ -1051,7 +1051,7 @@ Tokenizer_really_parse_entity(Tokenizer *self)
     }
     self->head++;
     this = Tokenizer_read(self, 0);
-    if (!this) {
+    if (this == TOKENIZER_EOF) {
         Tokenizer_fail_route(self);
         return 0;
     }
@@ -1233,7 +1233,7 @@ Tokenizer_parse_comment(Tokenizer *self)
     }
     while (1) {
         this = Tokenizer_read(self, 0);
-        if (!this) {
+        if (this == TOKENIZER_EOF) {
            comment = Tokenizer_pop(self);
            Py_XDECREF(comment);
            self->head = reset;
@@ -1597,7 +1597,7 @@ Tokenizer_handle_blacklisted_tag(Tokenizer *self)
     while (1) {
         this = Tokenizer_read(self, 0);
         next = Tokenizer_read(self, 1);
-        if (!this) {
+        if (this == TOKENIZER_EOF) {
            return Tokenizer_fail_route(self);
         } else if (this == '<' && next == '/') {
            self->head += 2;
@@ -1639,7 +1639,7 @@ Tokenizer_handle_blacklisted_tag(Tokenizer *self)
            }
            return Tokenizer_pop(self);
         }
-        if (!this || this == '\n') {
+        if (this == TOKENIZER_EOF || this == '\n') {
        no_matching_end:
            Textbuffer_dealloc(buffer);
            self->head = reset;
@@ -1790,7 +1790,7 @@ Tokenizer_really_parse_tag(Tokenizer *self)
         next = Tokenizer_read(self, 1);
         can_exit = (!(data->context & (TAG_QUOTED | TAG_NAME)) ||
                     data->context & TAG_NOTE_SPACE);
-        if (!this) {
+        if (this == TOKENIZER_EOF) {
            if (self->topstack->context & LC_TAG_ATTR) {
                if (data->context & TAG_QUOTED) {
                    // Unclosed attribute quote: reset, don't die
@@ -2442,7 +2442,7 @@ Tokenizer_handle_table_style(Tokenizer *self, Py_UCS4 end_token)
            return NULL;
         }
         return padding;
-    } else if (!this || this == end_token) {
+    } else if (this == TOKENIZER_EOF || this == end_token) {
         if (self->topstack->context & LC_TAG_ATTR) {
            if (data->context & TAG_QUOTED) {
                // Unclosed attribute quote: reset, don't die
@@ -2845,7 +2845,7 @@ Tokenizer_has_leading_whitespace(Tokenizer *self)
     Py_UCS4 current_character;
     while (1) {
         current_character = Tokenizer_read_backwards(self, offset);
-        if (!current_character || current_character == '\n') {
+        if (current_character == TOKENIZER_EOF || current_character == '\n') {
            return 1;
         } else if (!Py_UNICODE_ISSPACE(current_character)) {
            return 0;
@@ -2876,6 +2876,9 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
     while (1) {
         this = Tokenizer_read(self, 0);
         this_context = self->topstack->context;
+        if (this == TOKENIZER_EOF) {
+            return Tokenizer_handle_end(self, this_context);
+        }
         if (this_context & AGG_UNSAFE) {
            if (Tokenizer_verify_safe(self, this_context, this) < 0) {
                if (this_context & AGG_DOUBLE) {
@@ -2892,9 +2895,6 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
                self->head++;
                continue;
            }
-        if (!this) {
-            return Tokenizer_handle_end(self, this_context);
-        }
         if (PyErr_CheckSignals()) {
            return NULL;
         }
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.h b/src/mwparserfromhell/parser/ctokenizer/tok_parse.h
index d72b2b29..9f6da8f6 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.h
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.h
@@ -25,11 +25,11 @@ SOFTWARE.
 #include "common.h"
 
 static const Py_UCS4 MARKERS[] = {
-    '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
-    '#', '*', ';', ':', '/', '-', '!', '\n', '\0',
+    '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
+    '#', '*', ';', ':', '/', '-', '!', '\n',
 };
 
-#define NUM_MARKERS 19
+#define NUM_MARKERS 18
 
 /* Functions */
 
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_support.c b/src/mwparserfromhell/parser/ctokenizer/tok_support.c
index 4c495ceb..aa9d80b0 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_support.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_support.c
@@ -442,7 +442,7 @@ Tokenizer_read(Tokenizer *self, Py_ssize_t delta)
     Py_ssize_t index = self->head + delta;
 
     if (index >= self->text.length) {
-        return '\0';
+        return TOKENIZER_EOF;
     }
     return read_codepoint(&self->text, index);
 }
@@ -456,7 +456,7 @@ Tokenizer_read_backwards(Tokenizer *self, Py_ssize_t delta)
     Py_ssize_t index;
 
     if (delta > self->head) {
-        return '\0';
+        return TOKENIZER_EOF;
     }
     index = self->head - delta;
     return read_codepoint(&self->text, index);
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 5864b48c..4b196d00 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -145,3 +145,31 @@ def test_describe_context():
     assert "" == contexts.describe(0)
     ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY | contexts.HAS_TEXT)
     assert "TEMPLATE_PARAM_KEY|HAS_TEXT" == ctx
+
+
+@pytest.mark.parametrize(
+    "tokenizer",
+    filter(None, (CTokenizer, PyTokenizer)),
+    ids=lambda t: "CTokenizer" if t.USES_C else "PyTokenizer",
+)
+def test_nul_byte_preservation(tokenizer):
+    """Test that NUL bytes (\\x00) are preserved in the input instead of being treated as EOF.
+
+    Regression test for issue #355: C tokenizer was returning '\\0' both for real NUL bytes
+    in the input and for EOF, causing silent truncation at the first NUL.
+    """
+    # Basic NUL preservation
+    result = str(Builder().build(tokenizer().tokenize("a\x00b")))
+    assert result == "a\x00b", f"Expected 'a\\x00b', got {result!r}"
+
+    # NUL in the middle of text
+    result = str(Builder().build(tokenizer().tokenize("hello\x00world")))
+    assert result == "hello\x00world", f"Expected 'hello\\x00world', got {result!r}"
+
+    # NUL inside template
+    result = str(Builder().build(tokenizer().tokenize("{{a\x00b}}")))
+    assert result == "{{a\x00b}}", f"Expected '{{{{a\\x00b}}}}', got {result!r}"
+
+    # Multiple NULs
+    result = str(Builder().build(tokenizer().tokenize("a\x00b\x00c")))
+    assert result == "a\x00b\x00c", f"Expected 'a\\x00b\\x00c', got {result!r}"

From 0f5f898bb76c18331d588794c1a84ade74a1afe7 Mon Sep 17 00:00:00 2001
From: June Kim
Date: Mon, 11 May 2026 22:27:11 -0700
Subject: [PATCH 2/4] fix: replace remaining !last checks with TOKENIZER_EOF comparison

Four start-of-input checks in the main parse loop still used !last (falsy NUL)
to detect beginning-of-input. After the TOKENIZER_EOF sentinel change,
Tokenizer_read_backwards returns 0x110000 instead of '\0', so !last is always
false and headings, lists, and horizontal rules at position 0 would silently
fail to parse.
---
 src/mwparserfromhell/parser/ctokenizer/tok_parse.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
index 999df041..04d252a7 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -2913,7 +2913,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
                return NULL;
            }
         } else if (this == '=' && this_context & LC_TEMPLATE_PARAM_KEY) {
-            if (!(self->global & GL_HEADING) && (!last || last == '\n') &&
+            if (!(self->global & GL_HEADING) && (last == TOKENIZER_EOF || last == '\n') &&
                next == '=') {
                if (Tokenizer_parse_heading(self)) {
                    return NULL;
@@ -2964,7 +2964,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
            return Tokenizer_pop(self);
         } else if (this == '=' && !(self->global & GL_HEADING) &&
                   !(this_context & LC_TEMPLATE)) {
-            if (!last || last == '\n') {
+            if (last == TOKENIZER_EOF || last == '\n') {
                if (Tokenizer_parse_heading(self)) {
                    return NULL;
                }
@@ -3008,12 +3008,12 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
            if (temp != Py_None) {
                return temp;
            }
-        } else if ((!last || last == '\n') &&
+        } else if ((last == TOKENIZER_EOF || last == '\n') &&
                   (this == '#' || this == '*' || this == ';' || this == ':')) {
            if (Tokenizer_handle_list(self)) {
                return NULL;
            }
-        } else if ((!last || last == '\n') &&
+        } else if ((last == TOKENIZER_EOF || last == '\n') &&
                   (this == '-' && this == next && this == Tokenizer_read(self, 2) &&
                    this == Tokenizer_read(self, 3))) {
            if (Tokenizer_handle_hr(self)) {

From 158b8967e17c0bf387518e3e423be19f17ca33db Mon Sep 17 00:00:00 2001
From: June Kim
Date: Mon, 11 May 2026 22:41:58 -0700
Subject: [PATCH 3/4] fix: check for EOF instead of truthiness in tag close handling

Tokenizer_read returns TOKENIZER_EOF for end-of-input, not a falsy value. The
old truthiness check let NUL bytes truncate parsing.
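For illustration only, a hypothetical snippet (not part of the diff below)
showing the intended behaviour through the public mwparserfromhell API: "</"
at the very end of the input has no tag name after it, so it should fall
through to plain text instead of entering the tag-close handlers at EOF.

    import mwparserfromhell

    text = "a</"
    parsed = mwparserfromhell.parse(text)
    assert str(parsed) == text       # round-trips unchanged
    assert not parsed.filter_tags()  # and produces no Tag nodes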
---
 src/mwparserfromhell/parser/ctokenizer/tok_parse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
index 04d252a7..c8f7c0e2 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -2988,7 +2988,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
         } else if (Tokenizer_emit_char(self, this)) {
            return NULL;
         }
-    } else if (this == '<' && next == '/' && Tokenizer_read(self, 2)) {
+    } else if (this == '<' && next == '/' && Tokenizer_read(self, 2) != TOKENIZER_EOF) {
         if (this_context & LC_TAG_BODY ? Tokenizer_handle_tag_open_close(self)
                                        : Tokenizer_handle_invalid_tag_start(self)) {
            return NULL;

From 669edbcefa2c52673a4447f41b1f23e600c946ca Mon Sep 17 00:00:00 2001
From: June Kim
Date: Tue, 12 May 2026 01:29:14 -0700
Subject: [PATCH 4/4] fix: handle TOKENIZER_EOF in invalid tag start loop and bare colon check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two bugs introduced by the NUL truncation fix:

1. Tokenizer_handle_invalid_tag_start: the tag name scanning loop relied on
   is_marker() and Py_UNICODE_ISSPACE() to terminate, but TOKENIZER_EOF
   (0x110000) matches neither, causing an infinite loop when an incomplete
   closing tag runs to the end of the input.

2. Tokenizer_parse colon handling: the bare external link check
   "this == ':' && !is_marker(last)" fired at start-of-input because
   is_marker(TOKENIZER_EOF) returns false (0x110000 is not in MARKERS). This
   intercepted ":" before the list handler could run, breaking definition
   list items like ":text", which should produce a dd tag rather than plain
   text.
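A minimal reproduction sketch for bug 2 (hypothetical, not included in the
diff; it uses the public mwparserfromhell API rather than the C internals):

    import mwparserfromhell

    # With the bug, the bare external link check swallows the leading colon
    # and ":item" comes back as plain text with no definition-list tag.
    parsed = mwparserfromhell.parse(":item")
    assert parsed.filter_tags(), parsed.nodes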
---
 src/mwparserfromhell/parser/ctokenizer/tok_parse.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
index c8f7c0e2..7c4c20b4 100644
--- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -1863,7 +1863,7 @@ Tokenizer_handle_invalid_tag_start(Tokenizer *self)
     }
     while (1) {
         this = Tokenizer_read(self, pos);
-        if (Py_UNICODE_ISSPACE(this) || is_marker(this)) {
+        if (this == TOKENIZER_EOF || Py_UNICODE_ISSPACE(this) || is_marker(this)) {
            name = Textbuffer_render(buf);
            if (!name) {
                Textbuffer_dealloc(buf);
@@ -2956,7 +2956,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
            if (Tokenizer_parse_external_link(self, 1)) {
                return NULL;
            }
-        } else if (this == ':' && !is_marker(last)) {
+        } else if (this == ':' && last != TOKENIZER_EOF && !is_marker(last)) {
            if (Tokenizer_parse_external_link(self, 0)) {
                return NULL;
            }
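
Not part of the series itself: a sketch of additional regression coverage for
the start-of-input and unterminated-tag paths touched by PATCH 2/4 and
PATCH 4/4, assuming the same helpers already used in tests/test_tokenizer.py
(pytest, Builder, CTokenizer, PyTokenizer):

    @pytest.mark.parametrize(
        "tokenizer",
        filter(None, (CTokenizer, PyTokenizer)),
        ids=lambda t: "CTokenizer" if t.USES_C else "PyTokenizer",
    )
    def test_start_of_input_constructs(tokenizer):
        """Headings at position 0 must still be recognized, and an
        unterminated closing tag must not hang the tokenizer."""
        heading = Builder().build(tokenizer().tokenize("== h =="))
        assert heading.filter_headings(), "heading at start of input not parsed"

        # Would loop forever in Tokenizer_handle_invalid_tag_start before PATCH 4/4
        result = str(Builder().build(tokenizer().tokenize("a</b")))
        assert result == "a</b"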