Skip to content

Commit 2f35410

Browse files
[3.13] gh-80667: Fix case-sensitivity of some Unicode literal escapes (GH-107281) (GH-144754)
Name lookup for CJK unified ideographs and Hangul syllables is now case-insensitive, matching the existing behavior for other character names. (cherry picked from commit e66f4a5) Co-authored-by: James <snoopjedi@gmail.com>
1 parent 99cc3d1 commit 2f35410

File tree

3 files changed

+18
-7
lines changed

3 files changed

+18
-7
lines changed

Lib/test/test_ucn.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,9 @@ def test_hangul_syllables(self):
8888
self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
8989
self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
9090

91+
self.checkletter("haNGul SYllABle WAe", '\uc65c')
92+
self.checkletter("HAngUL syLLabLE waE", '\uc65c')
93+
9194
self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
9295

9396
def test_cjk_unified_ideographs(self):
@@ -103,6 +106,11 @@ def test_cjk_unified_ideographs(self):
103106
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
104107
self.checkletter("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A")
105108

109+
self.checkletter("cjK UniFIeD idEogRAph-3aBc", "\u3abc")
110+
self.checkletter("CJk uNIfiEd IDeOGraPH-3AbC", "\u3abc")
111+
self.checkletter("cjK UniFIeD idEogRAph-2aBcD", "\U0002abcd")
112+
self.checkletter("CJk uNIfiEd IDeOGraPH-2AbCd", "\U0002abcd")
113+
106114
def test_bmp_characters(self):
107115
for code in range(0x10000):
108116
char = chr(code)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Literals using the ``\N{name}`` escape syntax can now construct CJK
2+
ideographs and Hangul syllables using case-insensitive names.

Modules/unicodedata.c

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1360,7 +1360,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
13601360
len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
13611361
if (len1 <= *len)
13621362
continue;
1363-
if (strncmp(str, s, len1) == 0) {
1363+
if (PyOS_strnicmp(str, s, len1) == 0) {
13641364
*len = len1;
13651365
*pos = i;
13661366
}
@@ -1392,7 +1392,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
13921392
* PUA */
13931393

13941394
/* Check for hangul syllables. */
1395-
if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1395+
if (PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0) {
13961396
int len, L = -1, V = -1, T = -1;
13971397
const char *pos = name + 16;
13981398
find_syllable(pos, &len, &L, LCount, 0);
@@ -1410,7 +1410,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
14101410
}
14111411

14121412
/* Check for unified ideographs. */
1413-
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1413+
if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
14141414
/* Four or five hexdigits must follow. */
14151415
unsigned int v;
14161416
v = 0;
@@ -1420,10 +1420,11 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
14201420
return 0;
14211421
while (namelen--) {
14221422
v *= 16;
1423-
if (*name >= '0' && *name <= '9')
1424-
v += *name - '0';
1425-
else if (*name >= 'A' && *name <= 'F')
1426-
v += *name - 'A' + 10;
1423+
Py_UCS1 c = Py_TOUPPER(*name);
1424+
if (c >= '0' && c <= '9')
1425+
v += c - '0';
1426+
else if (c >= 'A' && c <= 'F')
1427+
v += c - 'A' + 10;
14271428
else
14281429
return 0;
14291430
name++;

0 commit comments

Comments
 (0)