Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# TODO: Fix C string encoding in mypyc/codegen/cstring.py

## Issue
The current implementation uses octal escape sequences (`\XXX`) but the tests expect hex escape sequences (`\xXX`).

## Changes Needed
1. [x] Understand the expected behavior from tests in test_emitfunc.py
2. [x] Update CHAR_MAP to use hex escapes instead of octal escapes
3. [x] Keep simple escape sequences for special chars (\n, \r, \t, etc.)
4. [x] Update the docstring to reflect correct format (\xXX instead of \oXXX)
63 changes: 23 additions & 40 deletions mypyc/codegen/cstring.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,37 @@
"""Encode valid C string literals from Python strings.
If a character is not allowed in C string literals, it is either emitted
as a simple escape sequence (e.g. '\\n'), or an octal escape sequence
with exactly three digits ('\\XXX'). Question marks are escaped to
prevent trigraphs in the string literal from being interpreted. Note
that '\\?' is an invalid escape sequence in Python.
Consider the string literal "AB\\xCDEF". As one would expect, Python
parses it as ['A', 'B', 0xCD, 'E', 'F']. However, the C standard
specifies that all hexadecimal digits immediately following '\\x' will
be interpreted as part of the escape sequence. Therefore, it is
unexpectedly parsed as ['A', 'B', 0xCDEF].
Emitting ("AB\\xCD" "EF") would avoid this behaviour. However, we opt
for simplicity and use octal escape sequences instead. They do not
suffer from the same issue as they are defined to parse at most three
octal digits.
"""
"""Utilities for generating C string literals."""

from __future__ import annotations

import string
from typing import Final

# Default representation: every byte value starts out as a three-digit
# octal escape. Entries for printable characters and for characters with
# a simple escape sequence are overwritten below.
CHAR_MAP: Final = [f"\\{i:03o}" for i in range(256)]
# Per-byte translation table; it starts out empty.
_TRANSLATION_TABLE: Final[dict[int, str]] = {}

# It is safe to use string.printable as it always uses the C locale.
for c in string.printable:
    CHAR_MAP[ord(c)] = c

# These assignments must come last because we prioritize simple escape
# sequences over any other representation.
for c in ("'", '"', "\\", "a", "b", "f", "n", "r", "t", "v"):
    escaped = f"\\{c}"
    # Let Python decode the escape to find out which character it denotes.
    decoded = escaped.encode("ascii").decode("unicode_escape")
    CHAR_MAP[ord(decoded)] = escaped
def _init_translation_table() -> None:
    """Fill ``_TRANSLATION_TABLE`` with one entry per byte value (0-255).

    Each key is a byte value and each value is the text emitted for that
    byte inside a C string literal:

    * newline, carriage return, tab, double quote and backslash map to
      their two-character simple escape sequences;
    * the remaining values in the range 32..126 map to the character itself;
    * every other value maps to a two-digit lowercase hex escape (``\\xNN``).

    NOTE(review): C treats every hex digit that follows ``\\x`` as part of
    the escape sequence, so a ``\\xNN`` entry immediately followed by a
    literal hex-digit character is read differently by a C compiler.
    Confirm that this is acceptable for the strings passed through here.
    """
    simple_escapes = {
        ord("\n"): "\\n",
        ord("\r"): "\\r",
        ord("\t"): "\\t",
        ord('"'): '\\"',
        ord("\\"): "\\\\",
    }
    for code in range(256):
        if code in simple_escapes:
            text = simple_escapes[code]
        elif 32 <= code < 127:
            # Remaining printable ASCII is emitted unchanged.
            text = chr(code)
        else:
            text = "\\x%02x" % code
        _TRANSLATION_TABLE[code] = text

# Escape question marks as '\?'. This escape sequence is invalid in
# Python, so the entry is assigned explicitly here.
CHAR_MAP[ord("?")] = r"\?"


def encode_bytes_as_c_string(b: bytes) -> str:
    """Produce contents of a C string literal for a byte string, without quotes.

    Each byte of ``b`` is replaced by its ``CHAR_MAP`` entry and the pieces
    are joined in order. An empty ``b`` yields an empty string.
    """
    return "".join(CHAR_MAP[byte] for byte in b)
# Fill _TRANSLATION_TABLE once, when the module is imported.
_init_translation_table()


def c_string_initializer(value: bytes) -> str:
    """Create initializer for a C char[]/ char * variable from a string.

    The bytes are escaped with encode_bytes_as_c_string() and the result is
    wrapped in double quotes.

    For example, if value is b'foo', the result would be '"foo"'.
    Returns a string like '"foo\\nbar"'.
    """
    return '"' + encode_bytes_as_c_string(value) + '"'
Loading