Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/mwparserfromhell/parser/ctokenizer/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ SOFTWARE.
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)

/* EOF sentinel: use a value beyond valid Unicode range (0x110000 is first invalid) */
#define TOKENIZER_EOF ((Py_UCS4) 0x110000)

/* Error handling macros */

#define BAD_ROUTE self->route_state
Expand Down
44 changes: 22 additions & 22 deletions src/mwparserfromhell/parser/ctokenizer/tok_parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ Tokenizer_parse_bracketed_uri_scheme(Tokenizer *self)
if (!buffer) {
return -1;
}
while ((this = Tokenizer_read(self, 0))) {
while ((this = Tokenizer_read(self, 0)) != TOKENIZER_EOF) {
i = 0;
while (1) {
if (!valid[i]) {
Expand Down Expand Up @@ -678,7 +678,7 @@ Tokenizer_is_uri_end(Tokenizer *self, Py_UCS4 this, Py_UCS4 next)
Py_UCS4 after = Tokenizer_read(self, 2);
uint64_t ctx = self->topstack->context;

return (!this || this == '\n' || this == '[' || this == ']' || this == '<' ||
return (this == TOKENIZER_EOF || this == '\n' || this == '[' || this == ']' || this == '<' ||
this == '>' || this == '"' || this == ' ' ||
(this == '\'' && next == '\'') || (this == '|' && ctx & LC_TEMPLATE) ||
(this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
Expand All @@ -703,7 +703,7 @@ Tokenizer_really_parse_external_link(Tokenizer *self, int brackets, Textbuffer *
return NULL;
}
this = Tokenizer_read(self, 0);
if (!this || this == '\n' || this == ' ' || this == ']') {
if (this == TOKENIZER_EOF || this == '\n' || this == ' ' || this == ']') {
return Tokenizer_fail_route(self);
}
if (!brackets && this == '[') {
Expand All @@ -729,7 +729,7 @@ Tokenizer_really_parse_external_link(Tokenizer *self, int brackets, Textbuffer *
return NULL;
}
} else if (brackets) {
if (!this || this == '\n') {
if (this == TOKENIZER_EOF || this == '\n') {
return Tokenizer_fail_route(self);
}
if (this == ']') {
Expand Down Expand Up @@ -1040,7 +1040,7 @@ Tokenizer_really_parse_entity(Tokenizer *self)
}
self->head++;
this = Tokenizer_read(self, 0);
if (!this) {
if (this == TOKENIZER_EOF) {
Tokenizer_fail_route(self);
return 0;
}
Expand All @@ -1051,7 +1051,7 @@ Tokenizer_really_parse_entity(Tokenizer *self)
}
self->head++;
this = Tokenizer_read(self, 0);
if (!this) {
if (this == TOKENIZER_EOF) {
Tokenizer_fail_route(self);
return 0;
}
Expand Down Expand Up @@ -1233,7 +1233,7 @@ Tokenizer_parse_comment(Tokenizer *self)
}
while (1) {
this = Tokenizer_read(self, 0);
if (!this) {
if (this == TOKENIZER_EOF) {
comment = Tokenizer_pop(self);
Py_XDECREF(comment);
self->head = reset;
Expand Down Expand Up @@ -1597,7 +1597,7 @@ Tokenizer_handle_blacklisted_tag(Tokenizer *self)
while (1) {
this = Tokenizer_read(self, 0);
next = Tokenizer_read(self, 1);
if (!this) {
if (this == TOKENIZER_EOF) {
return Tokenizer_fail_route(self);
} else if (this == '<' && next == '/') {
self->head += 2;
Expand Down Expand Up @@ -1639,7 +1639,7 @@ Tokenizer_handle_blacklisted_tag(Tokenizer *self)
}
return Tokenizer_pop(self);
}
if (!this || this == '\n') {
if (this == TOKENIZER_EOF || this == '\n') {
no_matching_end:
Textbuffer_dealloc(buffer);
self->head = reset;
Expand Down Expand Up @@ -1790,7 +1790,7 @@ Tokenizer_really_parse_tag(Tokenizer *self)
next = Tokenizer_read(self, 1);
can_exit = (!(data->context & (TAG_QUOTED | TAG_NAME)) ||
data->context & TAG_NOTE_SPACE);
if (!this) {
if (this == TOKENIZER_EOF) {
if (self->topstack->context & LC_TAG_ATTR) {
if (data->context & TAG_QUOTED) {
// Unclosed attribute quote: reset, don't die
Expand Down Expand Up @@ -1863,7 +1863,7 @@ Tokenizer_handle_invalid_tag_start(Tokenizer *self)
}
while (1) {
this = Tokenizer_read(self, pos);
if (Py_UNICODE_ISSPACE(this) || is_marker(this)) {
if (this == TOKENIZER_EOF || Py_UNICODE_ISSPACE(this) || is_marker(this)) {
name = Textbuffer_render(buf);
if (!name) {
Textbuffer_dealloc(buf);
Expand Down Expand Up @@ -2442,7 +2442,7 @@ Tokenizer_handle_table_style(Tokenizer *self, Py_UCS4 end_token)
return NULL;
}
return padding;
} else if (!this || this == end_token) {
} else if (this == TOKENIZER_EOF || this == end_token) {
if (self->topstack->context & LC_TAG_ATTR) {
if (data->context & TAG_QUOTED) {
// Unclosed attribute quote: reset, don't die
Expand Down Expand Up @@ -2845,7 +2845,7 @@ Tokenizer_has_leading_whitespace(Tokenizer *self)
Py_UCS4 current_character;
while (1) {
current_character = Tokenizer_read_backwards(self, offset);
if (!current_character || current_character == '\n') {
if (current_character == TOKENIZER_EOF || current_character == '\n') {
return 1;
} else if (!Py_UNICODE_ISSPACE(current_character)) {
return 0;
Expand Down Expand Up @@ -2876,6 +2876,9 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
while (1) {
this = Tokenizer_read(self, 0);
this_context = self->topstack->context;
if (this == TOKENIZER_EOF) {
return Tokenizer_handle_end(self, this_context);
}
if (this_context & AGG_UNSAFE) {
if (Tokenizer_verify_safe(self, this_context, this) < 0) {
if (this_context & AGG_DOUBLE) {
Expand All @@ -2892,9 +2895,6 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
self->head++;
continue;
}
if (!this) {
return Tokenizer_handle_end(self, this_context);
}
if (PyErr_CheckSignals()) {
return NULL;
}
Expand All @@ -2913,7 +2913,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
return NULL;
}
} else if (this == '=' && this_context & LC_TEMPLATE_PARAM_KEY) {
if (!(self->global & GL_HEADING) && (!last || last == '\n') &&
if (!(self->global & GL_HEADING) && (last == TOKENIZER_EOF || last == '\n') &&
next == '=') {
if (Tokenizer_parse_heading(self)) {
return NULL;
Expand Down Expand Up @@ -2956,15 +2956,15 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
if (Tokenizer_parse_external_link(self, 1)) {
return NULL;
}
} else if (this == ':' && !is_marker(last)) {
} else if (this == ':' && last != TOKENIZER_EOF && !is_marker(last)) {
if (Tokenizer_parse_external_link(self, 0)) {
return NULL;
}
} else if (this == ']' && this_context & LC_EXT_LINK_TITLE) {
return Tokenizer_pop(self);
} else if (this == '=' && !(self->global & GL_HEADING) &&
!(this_context & LC_TEMPLATE)) {
if (!last || last == '\n') {
if (last == TOKENIZER_EOF || last == '\n') {
if (Tokenizer_parse_heading(self)) {
return NULL;
}
Expand All @@ -2988,7 +2988,7 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
} else if (Tokenizer_emit_char(self, this)) {
return NULL;
}
} else if (this == '<' && next == '/' && Tokenizer_read(self, 2)) {
} else if (this == '<' && next == '/' && Tokenizer_read(self, 2) != TOKENIZER_EOF) {
if (this_context & LC_TAG_BODY ? Tokenizer_handle_tag_open_close(self)
: Tokenizer_handle_invalid_tag_start(self)) {
return NULL;
Expand All @@ -3008,12 +3008,12 @@ Tokenizer_parse(Tokenizer *self, uint64_t context, int push)
if (temp != Py_None) {
return temp;
}
} else if ((!last || last == '\n') &&
} else if ((last == TOKENIZER_EOF || last == '\n') &&
(this == '#' || this == '*' || this == ';' || this == ':')) {
if (Tokenizer_handle_list(self)) {
return NULL;
}
} else if ((!last || last == '\n') &&
} else if ((last == TOKENIZER_EOF || last == '\n') &&
(this == '-' && this == next && this == Tokenizer_read(self, 2) &&
this == Tokenizer_read(self, 3))) {
if (Tokenizer_handle_hr(self)) {
Expand Down
6 changes: 3 additions & 3 deletions src/mwparserfromhell/parser/ctokenizer/tok_parse.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ SOFTWARE.
#include "common.h"

static const Py_UCS4 MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
'#', '*', ';', ':', '/', '-', '!', '\n', '\0',
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'',
'#', '*', ';', ':', '/', '-', '!', '\n',
};

#define NUM_MARKERS 19
#define NUM_MARKERS 18

/* Functions */

Expand Down
4 changes: 2 additions & 2 deletions src/mwparserfromhell/parser/ctokenizer/tok_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ Tokenizer_read(Tokenizer *self, Py_ssize_t delta)
Py_ssize_t index = self->head + delta;

if (index >= self->text.length) {
return '\0';
return TOKENIZER_EOF;
}
return read_codepoint(&self->text, index);
}
Expand All @@ -456,7 +456,7 @@ Tokenizer_read_backwards(Tokenizer *self, Py_ssize_t delta)
Py_ssize_t index;

if (delta > self->head) {
return '\0';
return TOKENIZER_EOF;
}
index = self->head - delta;
return read_codepoint(&self->text, index);
Expand Down
28 changes: 28 additions & 0 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,31 @@ def test_describe_context():
assert "" == contexts.describe(0)
ctx = contexts.describe(contexts.TEMPLATE_PARAM_KEY | contexts.HAS_TEXT)
assert "TEMPLATE_PARAM_KEY|HAS_TEXT" == ctx


@pytest.mark.parametrize(
    "tokenizer",
    filter(None, (CTokenizer, PyTokenizer)),
    ids=lambda t: "CTokenizer" if t.USES_C else "PyTokenizer",
)
def test_nul_byte_preservation(tokenizer):
    """Verify that literal NUL characters (``\\x00``) survive a tokenize/build round-trip.

    Regression test for issue #355: the C tokenizer used ``'\\0'`` both as its
    EOF sentinel and as the value read for a genuine NUL in the wikitext, so
    input was silently truncated at the first NUL. With the dedicated
    ``TOKENIZER_EOF`` sentinel, NUL is an ordinary character.
    """

    def roundtrip(text):
        # Tokenize the text and rebuild it; a lossless parser returns it verbatim.
        return str(Builder().build(tokenizer().tokenize(text)))

    # Basic NUL preservation
    result = roundtrip("a\x00b")
    assert result == "a\x00b", f"Expected 'a\\x00b', got {result!r}"

    # NUL in the middle of text
    result = roundtrip("hello\x00world")
    assert result == "hello\x00world", f"Expected 'hello\\x00world', got {result!r}"

    # NUL inside template
    result = roundtrip("{{a\x00b}}")
    assert result == "{{a\x00b}}", f"Expected '{{{{a\\x00b}}}}', got {result!r}"

    # Multiple NULs
    result = roundtrip("a\x00b\x00c")
    assert result == "a\x00b\x00c", f"Expected 'a\\x00b\\x00c', got {result!r}"