From 283b2f8ffcf2e9c832b295f7a48255a054e555fb Mon Sep 17 00:00:00 2001 From: vaggelisd Date: Fri, 8 May 2026 17:04:15 +0300 Subject: [PATCH 1/2] [mypyc] Add `librt.strings.isidentifier` codepoint primitive True if a codepoint can start a valid identifier (XID_Start, per PEP 3131). The ASCII fast path covers `[A-Za-z_]` inline; non-ASCII codepoints round-trip through PyUnicode_FromOrdinal + PyUnicode_IsIdentifier so the answer matches str.isidentifier on a 1-character string. The non-ASCII path is the first allocating helper in this series, so its body lives out-of-line in codepoint_extra_ops.c (it would otherwise be emitted as a separate copy in every translation unit that includes the header). On OOM it swallows the exception via PyErr_Clear() and returns False, which keeps the function ERR_NEVER. Documented at the call site so callers don't get a surprising silent failure. Stack: depends on the librt.strings.isspace primitive. --- mypy/typeshed/stubs/librt/librt/strings.pyi | 1 + mypyc/build.py | 7 ++++++- mypyc/lib-rt/codepoint_extra_ops.c | 16 ++++++++++++++-- mypyc/lib-rt/codepoint_extra_ops.h | 19 +++++++++++++++++++ mypyc/lib-rt/setup.py | 1 + mypyc/lib-rt/strings/librt_strings.c | 4 ++++ mypyc/primitives/librt_strings_ops.py | 12 ++++++++++++ mypyc/test-data/irbuild-librt-strings.test | 14 ++++++++++++++ mypyc/test-data/run-librt-strings.test | 5 ++++- 9 files changed, 75 insertions(+), 4 deletions(-) diff --git a/mypy/typeshed/stubs/librt/librt/strings.pyi b/mypy/typeshed/stubs/librt/librt/strings.pyi index 01aee3ff758d..7a028f9e7859 100644 --- a/mypy/typeshed/stubs/librt/librt/strings.pyi +++ b/mypy/typeshed/stubs/librt/librt/strings.pyi @@ -47,3 +47,4 @@ def isspace(c: i32, /) -> bool: ... def isdigit(c: i32, /) -> bool: ... def isalnum(c: i32, /) -> bool: ... def isalpha(c: i32, /) -> bool: ... +def isidentifier(c: i32, /) -> bool: ... 
diff --git a/mypyc/build.py b/mypyc/build.py index 13bd50fef3b1..4dad74a4e349 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -54,7 +54,12 @@ class ModDesc(NamedTuple): LIBRT_MODULES = [ ModDesc("librt.internal", ["internal/librt_internal.c"], [], ["internal"]), - ModDesc("librt.strings", ["strings/librt_strings.c"], [], ["strings"]), + ModDesc( + "librt.strings", + ["strings/librt_strings.c", "codepoint_extra_ops.c"], + ["codepoint_extra_ops.h"], + ["strings"], + ), ModDesc( "librt.base64", [ diff --git a/mypyc/lib-rt/codepoint_extra_ops.c b/mypyc/lib-rt/codepoint_extra_ops.c index ca03eba4e6f5..c66351141dbf 100644 --- a/mypyc/lib-rt/codepoint_extra_ops.c +++ b/mypyc/lib-rt/codepoint_extra_ops.c @@ -4,5 +4,17 @@ // The classification helpers and the ASCII fast paths for case conversion // stay inline in codepoint_extra_ops.h; this file holds the slow paths // that round-trip through PyUnicode_FromOrdinal and CPython's Unicode -// machinery. Currently empty; populated as later commits add -// isidentifier, toupper, and tolower. +// machinery. + +bool LibRTStrings_IsIdentifier_slow(int32_t c) { + PyObject *s = PyUnicode_FromOrdinal((int)c); + if (s == NULL) { + // OOM. Swallow and return false to keep the function ERR_NEVER; + // callers expect a defined answer, not a propagated exception. + PyErr_Clear(); + return false; + } + int r = PyUnicode_IsIdentifier(s); + Py_DECREF(s); + return r == 1; +} diff --git a/mypyc/lib-rt/codepoint_extra_ops.h b/mypyc/lib-rt/codepoint_extra_ops.h index bb83f92e4b87..6c2a6c12b564 100644 --- a/mypyc/lib-rt/codepoint_extra_ops.h +++ b/mypyc/lib-rt/codepoint_extra_ops.h @@ -25,4 +25,23 @@ static inline bool LibRTStrings_IsAlpha(int32_t c) { return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c); } +// Slow path for non-ASCII isidentifier; defined out-of-line in +// codepoint_extra_ops.c because it allocates and calls into CPython. 
+bool LibRTStrings_IsIdentifier_slow(int32_t c); + +// True if c could start a valid identifier (matches XID_Start +// semantics, which is what str.isidentifier reports for a 1-character +// string). The ASCII fast path covers `[A-Za-z_]` inline; non-ASCII +// delegates to PyUnicode_IsIdentifier for correct PEP 3131 handling. +// Returns false on OOM in the slow path (the function stays ERR_NEVER). +static inline bool LibRTStrings_IsIdentifier(int32_t c) { + if (c < 0) return false; + if (c < 128) { + return (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || c == '_'; + } + return LibRTStrings_IsIdentifier_slow(c); +} + #endif // MYPYC_CODEPOINT_EXTRA_OPS_H diff --git a/mypyc/lib-rt/setup.py b/mypyc/lib-rt/setup.py index 371b322ca18b..1801c4e7dfa9 100644 --- a/mypyc/lib-rt/setup.py +++ b/mypyc/lib-rt/setup.py @@ -103,6 +103,7 @@ def run(self) -> None: "librt.strings", [ "strings/librt_strings.c", + "codepoint_extra_ops.c", "init.c", "int_ops.c", "exc_ops.c", diff --git a/mypyc/lib-rt/strings/librt_strings.c b/mypyc/lib-rt/strings/librt_strings.c index cbc3e5f753fa..62b4edffcd7f 100644 --- a/mypyc/lib-rt/strings/librt_strings.c +++ b/mypyc/lib-rt/strings/librt_strings.c @@ -1194,6 +1194,7 @@ DEFINE_CP_BOOL_WRAPPER(isspace, LibRTStrings_IsSpace) DEFINE_CP_BOOL_WRAPPER(isdigit, LibRTStrings_IsDigit) DEFINE_CP_BOOL_WRAPPER(isalnum, LibRTStrings_IsAlnum) DEFINE_CP_BOOL_WRAPPER(isalpha, LibRTStrings_IsAlpha) +DEFINE_CP_BOOL_WRAPPER(isidentifier, LibRTStrings_IsIdentifier) static PyMethodDef librt_strings_module_methods[] = { {"write_i16_le", (PyCFunction) write_i16_le, METH_FASTCALL, @@ -1268,6 +1269,9 @@ static PyMethodDef librt_strings_module_methods[] = { {"isalpha", cp_isalpha, METH_O, PyDoc_STR("Test whether a codepoint (i32) is a Unicode letter.") }, + {"isidentifier", cp_isidentifier, METH_O, + PyDoc_STR("Test whether a codepoint (i32) is a valid identifier start (XID_Start).") + }, {NULL, NULL, 0, NULL} }; diff --git a/mypyc/primitives/librt_strings_ops.py 
b/mypyc/primitives/librt_strings_ops.py index 93fa717cf529..312d5a16195b 100644 --- a/mypyc/primitives/librt_strings_ops.py +++ b/mypyc/primitives/librt_strings_ops.py @@ -431,3 +431,15 @@ error_kind=ERR_NEVER, dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], ) + +# isidentifier checks XID_Start semantics for a single codepoint, matching +# str.isidentifier() on a 1-character string. The non-ASCII path allocates +# but swallows OOM (returning False), keeping the function ERR_NEVER. +function_op( + name="librt.strings.isidentifier", + arg_types=[int32_rprimitive], + return_type=bool_rprimitive, + c_function_name="LibRTStrings_IsIdentifier", + error_kind=ERR_NEVER, + dependencies=[LIBRT_STRINGS, CODEPOINT_EXTRA_OPS], +) diff --git a/mypyc/test-data/irbuild-librt-strings.test b/mypyc/test-data/irbuild-librt-strings.test index e5d18b6eb852..e3aaa49bd6f9 100644 --- a/mypyc/test-data/irbuild-librt-strings.test +++ b/mypyc/test-data/irbuild-librt-strings.test @@ -387,3 +387,17 @@ def is_a(c): L0: r0 = LibRTStrings_IsAlpha(c) return r0 + +[case testLibrtStringsIsIdentifierIR] +from librt.strings import isidentifier +from mypy_extensions import i32 + +def is_id(c: i32) -> bool: + return isidentifier(c) +[out] +def is_id(c): + c :: i32 + r0 :: bool +L0: + r0 = LibRTStrings_IsIdentifier(c) + return r0 diff --git a/mypyc/test-data/run-librt-strings.test b/mypyc/test-data/run-librt-strings.test index aa38c713d384..0a3320ff6522 100644 --- a/mypyc/test-data/run-librt-strings.test +++ b/mypyc/test-data/run-librt-strings.test @@ -1443,7 +1443,7 @@ def test_new_without_init_is_usable() -> None: [case testLibrtStringsCodepointClassifiers_librt] from typing import Any from mypy_extensions import i32 -from librt.strings import isspace, isdigit, isalnum, isalpha +from librt.strings import isspace, isdigit, isalnum, isalpha, isidentifier from testutil import assertRaises @@ -1455,6 +1455,7 @@ def test_codepoint_classifiers() -> None: assert not isdigit(bad) assert not isalnum(bad) 
assert not isalpha(bad) + assert not isidentifier(bad) # Verify each codepoint primitive agrees with the matching str method # across all Unicode codepoints, including the ord(chr(i)) round-trip. # Any forces generic dispatch on the str side. @@ -1466,6 +1467,7 @@ def test_codepoint_classifiers() -> None: assert isdigit(o) == isdigit(i) == a.isdigit() assert isalnum(o) == isalnum(i) == a.isalnum() assert isalpha(o) == isalpha(i) == a.isalpha() + assert isidentifier(o) == isidentifier(i) == a.isidentifier() def test_codepoint_classifiers_via_any() -> None: @@ -1476,6 +1478,7 @@ def test_codepoint_classifiers_via_any() -> None: (isdigit, "5", "a"), (isalnum, "A", " "), (isalpha, "A", " "), + (isidentifier, "A", "0"), ): f: Any = fn assert f(ord(true_input)) is True From 472aab01704158c2bc9b441e5b0f8a34750ae228 Mon Sep 17 00:00:00 2001 From: vaggelisd Date: Fri, 22 May 2026 18:31:16 +0300 Subject: [PATCH 2/2] Address review: abort on OOM, keep _extra_ops out of librt build - codepoint_extra_ops.h: include CPy.h and move the isidentifier slow path inline into LibRTStrings_IsIdentifier. Aborts via CPyError_OutOfMemory on allocation failure (the helper is ERR_NEVER, so returning a silently-wrong bool under memory pressure was the wrong contract). Matches the pattern in the sibling _extra_ops.h files (all bodies static inline, CPy.h included for runtime helpers). - codepoint_extra_ops.c: reduce to a single-line shim that #includes the header. Exists only so SourceDep("codepoint_extra_ops.c") pulls the header into user mypyc extensions in include_runtime_files mode. - build.py / lib-rt/setup.py: drop codepoint_extra_ops.c from the librt.strings module sources. The _extra_ops.c files are mypyc-internal (linked into user extensions via SourceDep in mypyc/ir/deps.py); the librt.strings Python module shouldn't depend on them, matching how bytes_extra_ops, str_extra_ops, etc. are organized. librt.strings now picks up LibRTStrings_IsIdentifier via #include of the header. 
--- mypyc/build.py | 7 +------ mypyc/lib-rt/codepoint_extra_ops.c | 24 +++++------------------- mypyc/lib-rt/codepoint_extra_ops.h | 16 ++++++++++------ mypyc/lib-rt/setup.py | 1 - 4 files changed, 16 insertions(+), 32 deletions(-) diff --git a/mypyc/build.py b/mypyc/build.py index 4dad74a4e349..13bd50fef3b1 100644 --- a/mypyc/build.py +++ b/mypyc/build.py @@ -54,12 +54,7 @@ class ModDesc(NamedTuple): LIBRT_MODULES = [ ModDesc("librt.internal", ["internal/librt_internal.c"], [], ["internal"]), - ModDesc( - "librt.strings", - ["strings/librt_strings.c", "codepoint_extra_ops.c"], - ["codepoint_extra_ops.h"], - ["strings"], - ), + ModDesc("librt.strings", ["strings/librt_strings.c"], [], ["strings"]), ModDesc( "librt.base64", [ diff --git a/mypyc/lib-rt/codepoint_extra_ops.c b/mypyc/lib-rt/codepoint_extra_ops.c index c66351141dbf..3eba41727d25 100644 --- a/mypyc/lib-rt/codepoint_extra_ops.c +++ b/mypyc/lib-rt/codepoint_extra_ops.c @@ -1,20 +1,6 @@ +// All codepoint helper bodies live in codepoint_extra_ops.h as static +// inline. This translation unit exists so the header is pulled into +// mypyc-compiled extensions via SourceDep("codepoint_extra_ops.c") in +// mypyc/ir/deps.py (which, in include_runtime_files mode, emits +// `#include ` into the generated __native.c). #include "codepoint_extra_ops.h" - -// Out-of-line bodies for codepoint helpers that are too large to inline. -// The classification helpers and the ASCII fast paths for case conversion -// stay inline in codepoint_extra_ops.h; this file holds the slow paths -// that round-trip through PyUnicode_FromOrdinal and CPython's Unicode -// machinery. - -bool LibRTStrings_IsIdentifier_slow(int32_t c) { - PyObject *s = PyUnicode_FromOrdinal((int)c); - if (s == NULL) { - // OOM. Swallow and return false to keep the function ERR_NEVER; - // callers expect a defined answer, not a propagated exception. 
- PyErr_Clear(); - return false; - } - int r = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - return r == 1; -} diff --git a/mypyc/lib-rt/codepoint_extra_ops.h b/mypyc/lib-rt/codepoint_extra_ops.h index 6c2a6c12b564..8d7201fdd70a 100644 --- a/mypyc/lib-rt/codepoint_extra_ops.h +++ b/mypyc/lib-rt/codepoint_extra_ops.h @@ -4,6 +4,7 @@ #include #include #include +#include "CPy.h" // Codepoint helpers for librt.strings. // Inputs are signed int32_t for compatibility with mypyc's i32 type. @@ -25,15 +26,12 @@ static inline bool LibRTStrings_IsAlpha(int32_t c) { return c >= 0 && Py_UNICODE_ISALPHA((Py_UCS4)c); } -// Slow path for non-ASCII isidentifier; defined out-of-line in -// codepoint_extra_ops.c because it allocates and calls into CPython. -bool LibRTStrings_IsIdentifier_slow(int32_t c); - // True if c could start a valid identifier (matches XID_Start // semantics, which is what str.isidentifier reports for a 1-character // string). The ASCII fast path covers `[A-Za-z_]` inline; non-ASCII // delegates to PyUnicode_IsIdentifier for correct PEP 3131 handling. -// Returns false on OOM in the slow path (the function stays ERR_NEVER). +// Aborts via CPyError_OutOfMemory on allocation failure, so this helper +// stays ERR_NEVER. 
static inline bool LibRTStrings_IsIdentifier(int32_t c) { if (c < 0) return false; if (c < 128) { @@ -41,7 +39,17 @@ static inline bool LibRTStrings_IsIdentifier(int32_t c) { || (c >= 'A' && c <= 'Z') || c == '_'; } - return LibRTStrings_IsIdentifier_slow(c); + // PyUnicode_FromOrdinal fails with ValueError, not MemoryError, for + // values above 0x10FFFF. No such value can start an identifier, so + // return false here rather than aborting as if allocation had failed. + if (c > 0x10FFFF) return false; + PyObject *s = PyUnicode_FromOrdinal((int)c); + if (s == NULL) { + CPyError_OutOfMemory(); + } + int r = PyUnicode_IsIdentifier(s); + Py_DECREF(s); + return r == 1; } #endif // MYPYC_CODEPOINT_EXTRA_OPS_H diff --git a/mypyc/lib-rt/setup.py b/mypyc/lib-rt/setup.py index 1801c4e7dfa9..371b322ca18b 100644 --- a/mypyc/lib-rt/setup.py +++ b/mypyc/lib-rt/setup.py @@ -103,7 +103,6 @@ def run(self) -> None: "librt.strings", [ "strings/librt_strings.c", - "codepoint_extra_ops.c", "init.c", "int_ops.c", "exc_ops.c",