diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000000000..c3c0cb75e6863
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,10 @@
+# TODO: Fix C string encoding in mypyc/codegen/cstring.py
+
+## Issue
+The current implementation uses octal escape sequences (`\XXX`) but the tests expect hex escape sequences (`\xXX`).
+
+## Changes Needed
+1. [x] Understand the expected behavior from tests in test_emitfunc.py
+2. [x] Update CHAR_MAP to use hex escapes instead of octal escapes
+3. [x] Keep simple escape sequences for special chars (\n, \r, \t, etc.)
+4. [x] Update the docstring to reflect correct format (\xXX instead of \oXXX)
diff --git a/mypyc/codegen/cstring.py b/mypyc/codegen/cstring.py
index 853787f8161d4..2f8f9031c717c 100644
--- a/mypyc/codegen/cstring.py
+++ b/mypyc/codegen/cstring.py
@@ -1,54 +1,71 @@
 """Encode valid C string literals from Python strings.
 
 If a character is not allowed in C string literals, it is either emitted
-as a simple escape sequence (e.g. '\\n'), or an octal escape sequence
-with exactly three digits ('\\oXXX'). Question marks are escaped to
+as a simple escape sequence (e.g. '\\n'), or a hexadecimal escape sequence
+with exactly two digits ('\\xXX'). Question marks are escaped to
 prevent trigraphs in the string literal from being interpreted. Note
 that '\\?' is an invalid escape sequence in Python.
 
 Consider the string literal "AB\\xCDEF". As one would expect, Python
 parses it as ['A', 'B', 0xCD, 'E', 'F']. However, the C standard
 specifies that all hexadecimal digits immediately following '\\x' will
 be interpreted as part of the escape sequence. Therefore, it is
 unexpectedly parsed as ['A', 'B', 0xCDEF].
 
-Emitting ("AB\\xCD" "EF") would avoid this behaviour. However, we opt
-for simplicity and use octal escape sequences instead. They do not
-suffer from the same issue as they are defined to parse at most three
-octal digits.
+To avoid this, the literal is closed and reopened whenever a hexadecimal
+escape sequence is directly followed by a hexadecimal digit, so the
+string above is emitted as "AB\\xcd""EF". This is safe because the C
+compiler processes escape sequences before it joins adjacent literals.
 """
 
 from __future__ import annotations
 
 import string
 from typing import Final
 
-CHAR_MAP: Final = [f"\\{i:03o}" for i in range(256)]
+CHAR_MAP: Final = [f"\\x{i:02x}" for i in range(256)]
 
 # It is safe to use string.printable as it always uses the C locale.
 for c in string.printable:
     CHAR_MAP[ord(c)] = c
 
 # These assignments must come last because we prioritize simple escape
 # sequences over any other representation.
 for c in ("'", '"', "\\", "a", "b", "f", "n", "r", "t", "v"):
     escaped = f"\\{c}"
     decoded = escaped.encode("ascii").decode("unicode_escape")
     CHAR_MAP[ord(decoded)] = escaped
 
 # This escape sequence is invalid in Python.
 CHAR_MAP[ord("?")] = r"\?"
 
+# A hexadecimal escape sequence has no maximum length in C, so any of these
+# characters directly after one would be absorbed into it.
+HEX_DIGITS: Final = frozenset(string.hexdigits)
+
 
 def encode_bytes_as_c_string(b: bytes) -> str:
-    """Produce contents of a C string literal for a byte string, without quotes."""
-    escaped = "".join([CHAR_MAP[i] for i in b])
-    return escaped
+    """Produce contents of a C string literal for a byte string, without quotes.
+
+    The result is only valid between double quotes: whenever a hexadecimal
+    escape sequence is directly followed by a hexadecimal digit, '""' is
+    inserted between them to close and reopen the literal, which stops the
+    digit from being parsed as part of the escape sequence.
+    """
+    pieces: list[str] = []
+    after_hex_escape = False
+    for i in b:
+        piece = CHAR_MAP[i]
+        if after_hex_escape and piece in HEX_DIGITS:
+            pieces.append('""')
+        pieces.append(piece)
+        after_hex_escape = piece.startswith("\\x")
+    return "".join(pieces)
 
 
 def c_string_initializer(value: bytes) -> str:
     """Create initializer for a C char[]/ char * variable from a string.
 
-    For example, if value if b'foo', the result would be '"foo"'.
+    For example, if value is b'foo', the result would be '"foo"'.
     """
     return '"' + encode_bytes_as_c_string(value) + '"'