python · lshariprasad · Feb 16, 2026 · Feb 16, 2026
diff --git a/TODO.md b/TODO.md
@@ -0,0 +1,10 @@
+# TODO: Fix C string encoding in mypyc/codegen/cstring.py
+
+## Issue
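+The current implementation uses octal escape sequences (`\XXX`), but the tests expect hex escape sequences (`\xXX`). Note that C reads every hex digit after `\x` as part of the escape, so a `\xXX` escape directly followed by a hex-digit character must be separated from it (e.g. `"\xCD" "EF"`).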
+
+## Changes Needed
+1. [x] Understand the expected behavior from tests in test_emitfunc.py
+2. [x] Update CHAR_MAP to use hex escapes instead of octal escapes
+3. [x] Keep simple escape sequences for special chars (`\n`, `\r`, `\t`, etc.)
+4. [x] Update the docstring to reflect the correct format (`\xXX` instead of `\oXXX`)
diff --git a/mypyc/codegen/cstring.py b/mypyc/codegen/cstring.py
@@ -1,54 +1,37 @@
-"""Encode valid C string literals from Python strings.
-
-If a character is not allowed in C string literals, it is either emitted
-as a simple escape sequence (e.g. '\\n'), or an octal escape sequence
-with exactly three digits ('\\oXXX'). Question marks are escaped to
-prevent trigraphs in the string literal from being interpreted. Note
-that '\\?' is an invalid escape sequence in Python.
-
-Consider the string literal "AB\\xCDEF". As one would expect, Python
-parses it as ['A', 'B', 0xCD, 'E', 'F']. However, the C standard
-specifies that all hexadecimal digits immediately following '\\x' will
-be interpreted as part of the escape sequence. Therefore, it is
-unexpectedly parsed as ['A', 'B', 0xCDEF].
-
-Emitting ("AB\\xCD" "EF") would avoid this behaviour. However, we opt
-for simplicity and use octal escape sequences instead. They do not
-suffer from the same issue as they are defined to parse at most three
-octal digits.
-"""
+"""Utilities for generating C string literals."""
 
 from __future__ import annotations
 
-import string
 from typing import Final
 
-CHAR_MAP: Final = [f"\\{i:03o}" for i in range(256)]
+# Byte value (0-255) -> text emitted for that byte inside a C string literal.
+# Populated by _init_translation_table().
+_TRANSLATION_TABLE: Final[dict[int, str]] = {}
 
-# It is safe to use string.printable as it always uses the C locale.
-for c in string.printable:
-    CHAR_MAP[ord(c)] = c
 
-# These assignments must come last because we prioritize simple escape
-# sequences over any other representation.
-for c in ("'", '"', "\\", "a", "b", "f", "n", "r", "t", "v"):
-    escaped = f"\\{c}"
-    decoded = escaped.encode("ascii").decode("unicode_escape")
-    CHAR_MAP[ord(decoded)] = escaped
+def _init_translation_table() -> None:
+    """Fill _TRANSLATION_TABLE with the C spelling of every byte value 0-255.
+
+    Printable ASCII maps to itself, except for the characters that need a
+    simple escape sequence ('\\n', '\\r', '\\t', '\\"', '\\\\' and '\\?').
+    Every other byte becomes a two-digit hex escape ('\\xXX').
+    """
+    # '?' is escaped so that no "??x" trigraph can form inside the literal;
+    # C compilers running in strict pre-C23 modes still replace trigraphs.
+    simple_escapes = {
+        "\n": "\\n",
+        "\r": "\\r",
+        "\t": "\\t",
+        '"': '\\"',
+        "\\": "\\\\",
+        "?": "\\?",
+    }
+    for i in range(256):
+        ch = chr(i)
+        if ch in simple_escapes:
+            s = simple_escapes[ch]
+        elif 32 <= i < 127:
+            s = ch
+        else:
+            s = f"\\x{i:02x}"
+        _TRANSLATION_TABLE[i] = s
 
-# This escape sequence is invalid in Python.
-CHAR_MAP[ord("?")] = r"\?"
 
-
-def encode_bytes_as_c_string(b: bytes) -> str:
-    """Produce contents of a C string literal for a byte string, without quotes."""
-    escaped = "".join([CHAR_MAP[i] for i in b])
-    return escaped
+# Fill the table once, when this module is first executed.
+_init_translation_table()
 
 
 def c_string_initializer(value: bytes) -> str:
-    """Create initializer for a C char[]/ char * variable from a string.
+    """Return *value* as a double-quoted C string literal initializer.
 
-    For example, if value if b'foo', the result would be '"foo"'.
+    Every byte is replaced by its _TRANSLATION_TABLE entry. For example,
+    b'foo\\nbar' gives '"foo\\nbar"'.
+
+    C hex escapes have no length limit: in "\\xCDEF" the compiler reads
+    the single value 0xCDEF rather than 0xCD followed by 'E' and 'F'. When
+    a hex escape is directly followed by a hex-digit character, the
+    literal is therefore closed and reopened ('"\\xcd""EF"'); adjacent
+    literals are concatenated after their escapes have been decoded.
     """
-    return '"' + encode_bytes_as_c_string(value) + '"'
+    hex_digits = "0123456789abcdefABCDEF"
+    pieces: list[str] = []
+    after_hex_escape = False
+    for byte in value:
+        piece = _TRANSLATION_TABLE[byte]
+        # Only an unescaped hex digit can be absorbed into a preceding \x escape.
+        if after_hex_escape and len(piece) == 1 and piece in hex_digits:
+            pieces.append('""')
+        pieces.append(piece)
+        after_hex_escape = piece.startswith("\\x")
+    return '"' + "".join(pieces) + '"'