Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mypy/typeshed/stubs/librt/librt/strings.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ def isspace(c: i32, /) -> bool: ...
# Single-codepoint classifiers. Each takes a codepoint as an i32; the runtime
# tests check that the result agrees with the matching str method.

# Counterpart of str.isdigit() for one codepoint.
def isdigit(c: i32, /) -> bool: ...
# Counterpart of str.isalnum() for one codepoint.
def isalnum(c: i32, /) -> bool: ...
# Counterpart of str.isalpha() for one codepoint.
def isalpha(c: i32, /) -> bool: ...
# Counterpart of str.isidentifier() for one codepoint: true when the codepoint
# on its own forms a valid identifier (an identifier-start character or '_').
def isidentifier(c: i32, /) -> bool: ...
1 change: 0 additions & 1 deletion mypyc/ir/deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,5 +116,4 @@ def get_header(self) -> str:
STRING_WRITER_EXTRA_OPS: Final = SourceDep("stringwriter_extra_ops.c")
BYTEARRAY_EXTRA_OPS: Final = SourceDep("bytearray_extra_ops.c")
STR_EXTRA_OPS: Final = SourceDep("str_extra_ops.c")
CODEPOINT_EXTRA_OPS: Final = SourceDep("codepoint_extra_ops.c")
VECS_EXTRA_OPS: Final = SourceDep("vecs_extra_ops.c")
8 changes: 0 additions & 8 deletions mypyc/lib-rt/codepoint_extra_ops.c

This file was deleted.

28 changes: 0 additions & 28 deletions mypyc/lib-rt/codepoint_extra_ops.h

This file was deleted.

19 changes: 9 additions & 10 deletions mypyc/lib-rt/strings/librt_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <Python.h>
#include <stdint.h>
#include "CPy.h"
#include "codepoint_extra_ops.h"
#include "librt_strings.h"

#define CPY_BOOL_ERROR 2
Expand Down Expand Up @@ -1154,15 +1153,11 @@ read_f64_be(PyObject *module, PyObject *const *args, size_t nargs) {
return PyFloat_FromDouble(CPyBytes_ReadF64BEUnsafe(data + index));
}

// Codepoint classification helpers exposed to interpreted callers.
// The C-side names are prefixed `cp_` to avoid colliding with libc's
// <ctype.h> isspace / isdigit / etc. Compiled callers go through the
// LibRTStrings_* static inlines in codepoint_extra_ops.h instead.
//
// All wrappers parse a single int argument as i32 (codepoint) and
// dispatch to the corresponding LibRTStrings_* function. The parse
// step accepts any int but rejects values outside the i32 range with
// OverflowError, matching the input domain of the compiled fast path.
// Python-level wrappers (`cp_*`) for interpreted callers. The C-side names
// are prefixed `cp_` to avoid colliding with libc's <ctype.h> isspace etc.
// The LibRTStrings_Is* helpers themselves are static inline in librt_strings.h
// so they compile directly into mypyc-emitted code with no capsule
// indirection.

// Parse a Python int as i32 codepoint. Returns 0 on success and writes
// the value to *out; returns -1 on error with a Python exception set.
Expand Down Expand Up @@ -1194,6 +1189,7 @@ DEFINE_CP_BOOL_WRAPPER(isspace, LibRTStrings_IsSpace)
DEFINE_CP_BOOL_WRAPPER(isdigit, LibRTStrings_IsDigit)
DEFINE_CP_BOOL_WRAPPER(isalnum, LibRTStrings_IsAlnum)
DEFINE_CP_BOOL_WRAPPER(isalpha, LibRTStrings_IsAlpha)
DEFINE_CP_BOOL_WRAPPER(isidentifier, LibRTStrings_IsIdentifier)

static PyMethodDef librt_strings_module_methods[] = {
{"write_i16_le", (PyCFunction) write_i16_le, METH_FASTCALL,
Expand Down Expand Up @@ -1268,6 +1264,9 @@ static PyMethodDef librt_strings_module_methods[] = {
{"isalpha", cp_isalpha, METH_O,
PyDoc_STR("Test whether a codepoint (i32) is a Unicode letter.")
},
{"isidentifier", cp_isidentifier, METH_O,
PyDoc_STR("Test whether a codepoint (i32) is a valid identifier start (XID_Start).")
},
{NULL, NULL, 0, NULL}
};

Expand Down
45 changes: 45 additions & 0 deletions mypyc/lib-rt/strings/librt_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include <Python.h>
#include <stdbool.h>
#include <stdint.h>
#include "CPy.h"
#include "librt_strings_common.h"

// ABI version -- only an exact match is compatible. This will only be changed in
Expand All @@ -28,4 +30,47 @@ typedef struct {
char data[WRITER_EMBEDDED_BUF_LEN]; // Default buffer
} StringWriterObject;

// Codepoint classification helpers. Inputs are signed i32 for compatibility
// with mypyc's int32_rprimitive; negative values are non-codepoints and
// return false. Defined `static inline` so they compile statically into
// both the librt.strings module and any mypyc-compiled extension that
// includes this header, avoiding the capsule indirection that would dwarf
// the work of a single Py_UNICODE_IS* macro call.

// Whitespace test for one codepoint. Negative inputs are not codepoints
// and are reported as false without consulting Py_UNICODE_ISSPACE.
static inline bool LibRTStrings_IsSpace(int32_t c) {
    if (c < 0) {
        return false;
    }
    return Py_UNICODE_ISSPACE((Py_UCS4)c) != 0;
}

// Digit test for one codepoint. Negative inputs are not codepoints and
// are reported as false without consulting Py_UNICODE_ISDIGIT.
static inline bool LibRTStrings_IsDigit(int32_t c) {
    if (c < 0) {
        return false;
    }
    return Py_UNICODE_ISDIGIT((Py_UCS4)c) != 0;
}

// Alphanumeric test for one codepoint. Negative inputs are not codepoints
// and are reported as false without consulting Py_UNICODE_ISALNUM.
static inline bool LibRTStrings_IsAlnum(int32_t c) {
    if (c < 0) {
        return false;
    }
    return Py_UNICODE_ISALNUM((Py_UCS4)c) != 0;
}

// Letter test for one codepoint. Negative inputs are not codepoints and
// are reported as false without consulting Py_UNICODE_ISALPHA.
static inline bool LibRTStrings_IsAlpha(int32_t c) {
    if (c < 0) {
        return false;
    }
    return Py_UNICODE_ISALPHA((Py_UCS4)c) != 0;
}

// True if c, taken on its own, is a valid identifier -- i.e. it is a
// character that may start an identifier: XID_Start or '_' (see PEP 3131).
//
// Values that are not Unicode codepoints (negative, or above 0x10FFFF)
// return false. The upper bound has to be rejected here rather than left
// to CPython: PyUnicode_FromOrdinal fails with ValueError for ordinals
// outside range(0x110000), and a NULL result below is treated as an
// allocation failure, which would abort the process.
//
// ASCII fast path covers `[A-Za-z_]`; non-ASCII delegates to CPython's
// PyUnicode_IsIdentifier on a 1-character string. Aborts via
// CPyError_OutOfMemory on allocation failure to keep this ERR_NEVER.
static inline bool LibRTStrings_IsIdentifier(int32_t c) {
    if (c < 0 || c > 0x10FFFF) {
        return false;
    }
    if (c < 128) {
        return (c >= 'a' && c <= 'z')
            || (c >= 'A' && c <= 'Z')
            || c == '_';
    }
    // With the range check above, the only remaining failure mode of
    // PyUnicode_FromOrdinal is running out of memory.
    PyObject *s = PyUnicode_FromOrdinal((int)c);
    if (s == NULL) {
        CPyError_OutOfMemory();
    }
    int r = PyUnicode_IsIdentifier(s);
    Py_DECREF(s);
    // Compare against 1 explicitly so any non-"yes" result maps to false.
    return r == 1;
}

#endif // LIBRT_STRINGS_H
27 changes: 17 additions & 10 deletions mypyc/primitives/librt_strings_ops.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
from mypyc.ir.deps import (
BYTES_WRITER_EXTRA_OPS,
CODEPOINT_EXTRA_OPS,
LIBRT_STRINGS,
STRING_WRITER_EXTRA_OPS,
)
from mypyc.ir.deps import BYTES_WRITER_EXTRA_OPS, LIBRT_STRINGS, STRING_WRITER_EXTRA_OPS
from mypyc.ir.ops import ERR_MAGIC, ERR_MAGIC_OVERLAPPING, ERR_NEVER
from mypyc.ir.rtypes import (
bool_rprimitive,
Expand Down Expand Up @@ -402,7 +397,7 @@
return_type=bool_rprimitive,
c_function_name="LibRTStrings_IsSpace",
error_kind=ERR_NEVER,
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
dependencies=[LIBRT_STRINGS],
)

function_op(
Expand All @@ -411,7 +406,7 @@
return_type=bool_rprimitive,
c_function_name="LibRTStrings_IsDigit",
error_kind=ERR_NEVER,
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
dependencies=[LIBRT_STRINGS],
)

function_op(
Expand All @@ -420,7 +415,7 @@
return_type=bool_rprimitive,
c_function_name="LibRTStrings_IsAlnum",
error_kind=ERR_NEVER,
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
dependencies=[LIBRT_STRINGS],
)

function_op(
Expand All @@ -429,5 +424,17 @@
return_type=bool_rprimitive,
c_function_name="LibRTStrings_IsAlpha",
error_kind=ERR_NEVER,
dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS],
dependencies=[LIBRT_STRINGS],
)

# Primitive for librt.strings.isidentifier(c: i32) -> bool. Calls are lowered
# to the C helper LibRTStrings_IsIdentifier, which tests whether the single
# codepoint on its own forms a valid identifier (an identifier-start
# character or '_'), matching str.isidentifier() on a 1-character string.
# The helper's non-ASCII path allocates a temporary str and aborts via
# CPyError_OutOfMemory if that allocation fails, so no Python-level error is
# ever reported and this stays ERR_NEVER.
function_op(
name="librt.strings.isidentifier",
arg_types=[int32_rprimitive],
return_type=bool_rprimitive,
c_function_name="LibRTStrings_IsIdentifier",
error_kind=ERR_NEVER,
dependencies=[LIBRT_STRINGS],
)
14 changes: 14 additions & 0 deletions mypyc/test-data/irbuild-librt-strings.test
Original file line number Diff line number Diff line change
Expand Up @@ -387,3 +387,17 @@ def is_a(c):
L0:
r0 = LibRTStrings_IsAlpha(c)
return r0

[case testLibrtStringsIsIdentifierIR]
from librt.strings import isidentifier
from mypy_extensions import i32

def is_id(c: i32) -> bool:
return isidentifier(c)
[out]
def is_id(c):
c :: i32
r0 :: bool
L0:
r0 = LibRTStrings_IsIdentifier(c)
return r0
5 changes: 4 additions & 1 deletion mypyc/test-data/run-librt-strings.test
Original file line number Diff line number Diff line change
Expand Up @@ -1443,7 +1443,7 @@ def test_new_without_init_is_usable() -> None:
[case testLibrtStringsCodepointClassifiers_librt]
from typing import Any
from mypy_extensions import i32
from librt.strings import isspace, isdigit, isalnum, isalpha
from librt.strings import isspace, isdigit, isalnum, isalpha, isidentifier

from testutil import assertRaises

Expand All @@ -1455,6 +1455,7 @@ def test_codepoint_classifiers() -> None:
assert not isdigit(bad)
assert not isalnum(bad)
assert not isalpha(bad)
assert not isidentifier(bad)
# Verify each codepoint primitive agrees with the matching str method
# across all Unicode codepoints, including the ord(chr(i)) round-trip.
# Any forces generic dispatch on the str side.
Expand All @@ -1466,6 +1467,7 @@ def test_codepoint_classifiers() -> None:
assert isdigit(o) == isdigit(i) == a.isdigit()
assert isalnum(o) == isalnum(i) == a.isalnum()
assert isalpha(o) == isalpha(i) == a.isalpha()
assert isidentifier(o) == isidentifier(i) == a.isidentifier()


def test_codepoint_classifiers_via_any() -> None:
Expand All @@ -1476,6 +1478,7 @@ def test_codepoint_classifiers_via_any() -> None:
(isdigit, "5", "a"),
(isalnum, "A", " "),
(isalpha, "A", " "),
(isidentifier, "A", "0"),
):
f: Any = fn
assert f(ord(true_input)) is True
Expand Down