Skip to content

Commit 8b7b5a9

Browse files
gh-80667: Fix Tangut ideographs names in unicodedata (GH-144789)
Co-authored-by: Pierre Le Marre <dev@wismill.eu>
1 parent ebe02e4 commit 8b7b5a9

File tree

6 files changed

+254
-64
lines changed

6 files changed

+254
-64
lines changed

Lib/test/test_ucn.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,30 @@ def test_cjk_unified_ideographs(self):
111111
self.checkletter("cjK UniFIeD idEogRAph-2aBcD", "\U0002abcd")
112112
self.checkletter("CJk uNIfiEd IDeOGraPH-2AbCd", "\U0002abcd")
113113

114+
def test_tangut_ideographs(self):
    """Run ``checkletter`` on derived TANGUT IDEOGRAPH names.

    Covers several upper-case spellings and one lower-case spelling of
    the same name.
    """
    samples = (
        ("TANGUT IDEOGRAPH-17000", "\U00017000"),
        ("TANGUT IDEOGRAPH-187FF", "\U000187ff"),
        ("TANGUT IDEOGRAPH-18D00", "\U00018D00"),
        ("TANGUT IDEOGRAPH-18D1E", "\U00018d1e"),
        ("tangut ideograph-18d1e", "\U00018d1e"),
    )
    for ucn, expected_char in samples:
        self.checkletter(ucn, expected_char)
120+
121+
def test_egyptian_hieroglyphs(self):
    """Run ``checkletter`` on derived EGYPTIAN HIEROGLYPH names.

    The last sample repeats a name in lower case.
    """
    samples = (
        ("EGYPTIAN HIEROGLYPH-13460", "\U00013460"),
        ("EGYPTIAN HIEROGLYPH-143FA", "\U000143fa"),
        ("egyptian hieroglyph-143fa", "\U000143fa"),
    )
    for ucn, expected_char in samples:
        self.checkletter(ucn, expected_char)
125+
126+
def test_khitan_small_script_characters(self):
    """Run ``checkletter`` on derived KHITAN SMALL SCRIPT CHARACTER names.

    The last sample repeats a name in lower case.
    """
    samples = (
        ("KHITAN SMALL SCRIPT CHARACTER-18B00", "\U00018b00"),
        ("KHITAN SMALL SCRIPT CHARACTER-18CD5", "\U00018cd5"),
        ("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff"),
        # NOTE(review): this entry repeats the previous one; it is kept so
        # the sequence of checkletter calls stays unchanged.
        ("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff"),
        ("khitan small script character-18cff", "\U00018cff"),
    )
    for ucn, expected_char in samples:
        self.checkletter(ucn, expected_char)
132+
133+
def test_nushu_characters(self):
    """Run ``checkletter`` on derived NUSHU CHARACTER names.

    The last sample repeats a name in lower case.
    """
    samples = (
        ("NUSHU CHARACTER-1B170", "\U0001b170"),
        ("NUSHU CHARACTER-1B2FB", "\U0001b2fb"),
        ("nushu character-1b2fb", "\U0001b2fb"),
    )
    for ucn, expected_char in samples:
        self.checkletter(ucn, expected_char)
137+
114138
def test_bmp_characters(self):
115139
for code in range(0x10000):
116140
char = chr(code)

Lib/test/test_unicodedata.py

Lines changed: 106 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,60 @@ def test_function_checksum(self):
128128
result = h.hexdigest()
129129
self.assertEqual(result, self.expectedchecksum)
130130

131+
def test_name(self):
    """Exercise ``self.db.name()`` on unnamed, listed and derived names.

    Three groups of checks run in the same order as before:

    * characters for which ``name()`` without a default raises ValueError;
    * the explicit default being returned for such a character;
    * a table of expected names.  Entries flagged ``new_only`` are looked
      up with a ``None`` default and are expected to be ``None`` when
      ``self.old`` is true.
    """
    lookup = self.db.name

    # No name and no default supplied: ValueError.
    for unnamed in ('\0', '\n', '\x1F', '\x7F', '\x9F',
                    '\uFFFE', '\uFFFF', '\U0010FFFF'):
        self.assertRaises(ValueError, lookup, unnamed)
    # A supplied default is handed back instead of raising.
    self.assertEqual(lookup('\U0010FFFF', 42), 42)

    # (character, expected name, new_only)
    expectations = [
        (' ', 'SPACE', False),
        ('1', 'DIGIT ONE', False),
        ('A', 'LATIN CAPITAL LETTER A', False),
        ('\xA0', 'NO-BREAK SPACE', False),
        ('\u0221', 'LATIN SMALL LETTER D WITH CURL', True),
        ('\u3400', 'CJK UNIFIED IDEOGRAPH-3400', False),
        ('\u9FA5', 'CJK UNIFIED IDEOGRAPH-9FA5', False),
        ('\uAC00', 'HANGUL SYLLABLE GA', False),
        ('\uD7A3', 'HANGUL SYLLABLE HIH', False),
        ('\uF900', 'CJK COMPATIBILITY IDEOGRAPH-F900', False),
        ('\uFA6A', 'CJK COMPATIBILITY IDEOGRAPH-FA6A', False),
        ('\uFBF9',
         'ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA '
         'ABOVE WITH ALEF MAKSURA ISOLATED FORM',
         False),
        ('\U00013460', 'EGYPTIAN HIEROGLYPH-13460', True),
        ('\U000143FA', 'EGYPTIAN HIEROGLYPH-143FA', True),
        ('\U00017000', 'TANGUT IDEOGRAPH-17000', True),
        ('\U00018B00', 'KHITAN SMALL SCRIPT CHARACTER-18B00', True),
        ('\U00018CD5', 'KHITAN SMALL SCRIPT CHARACTER-18CD5', True),
        ('\U00018CFF', 'KHITAN SMALL SCRIPT CHARACTER-18CFF', True),
        ('\U00018D1E', 'TANGUT IDEOGRAPH-18D1E', True),
        ('\U0001B170', 'NUSHU CHARACTER-1B170', True),
        ('\U0001B2FB', 'NUSHU CHARACTER-1B2FB', True),
        ('\U0001FBA8',
         'BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO '
         'MIDDLE LEFT AND MIDDLE RIGHT TO LOWER CENTRE',
         True),
        ('\U0002A6D6', 'CJK UNIFIED IDEOGRAPH-2A6D6', False),
        ('\U0002FA1D', 'CJK COMPATIBILITY IDEOGRAPH-2FA1D', False),
        ('\U00033479', 'CJK UNIFIED IDEOGRAPH-33479', True),
    ]
    for char, expected, new_only in expectations:
        if new_only:
            # Looked up with a None default; None is expected when
            # self.old is set.
            self.assertEqual(lookup(char, None),
                             None if self.old else expected)
        else:
            self.assertEqual(lookup(char), expected)
183+
184+
@requires_resource('cpu')
131185
def test_name_inverse_lookup(self):
132186
for char in iterallchars():
133187
looked_name = self.db.name(char, None)
@@ -151,6 +205,17 @@ def test_lookup_nonexistant(self):
151205
"HANDBUG",
152206
"MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
153207
"???",
208+
"CJK UNIFIED IDEOGRAPH-03400",
209+
"CJK UNIFIED IDEOGRAPH-020000",
210+
"CJK UNIFIED IDEOGRAPH-33FF",
211+
"CJK UNIFIED IDEOGRAPH-F900",
212+
"CJK UNIFIED IDEOGRAPH-13460",
213+
"CJK UNIFIED IDEOGRAPH-17000",
214+
"CJK UNIFIED IDEOGRAPH-18B00",
215+
"CJK UNIFIED IDEOGRAPH-1B170",
216+
"CJK COMPATIBILITY IDEOGRAPH-3400",
217+
"TANGUT IDEOGRAPH-3400",
218+
"HANGUL SYLLABLE AC00",
154219
]:
155220
self.assertRaises(KeyError, self.db.lookup, nonexistent)
156221

@@ -613,7 +678,47 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
613678
# (e.g. 'make distclean && make') to get the correct checksum.
614679
expectedchecksum = ('83cc43a2fbb779185832b4c049217d80b05bf349'
615680
if quicktest else
616-
'65670ae03a324c5f9e826a4de3e25bae4d73c9b7')
681+
'180bdc91143d8aa2eb9dd6726e66d37606205942')
682+
683+
@requires_resource('network')
def test_all_names(self):
    """Check every character name against the DerivedName.txt data file.

    The file is obtained through ``download_test_data_file`` and handed
    to ``run_name_tests``; the test is gated on the 'network' resource.
    """
    derived_names = download_test_data_file("DerivedName.txt")

    # The returned object serves both as the context manager and as the
    # line source passed on to run_name_tests().
    with derived_names:
        self.run_name_tests(derived_names)
690+
691+
def run_name_tests(self, testdata):
    """Compare ``self.db.name()`` with reference data for all code points.

    *testdata* is an iterable of text lines of the form
    ``<code point or first..last> ; <name>``.  Blank lines and lines
    starting with ``#`` are ignored.  A name ending in ``*`` is a
    pattern: the ``*`` is replaced by the code point written as at least
    four upper-case hex digits.  Ranges (``first..last``) must use a
    pattern name.

    Malformed data is reported through ``self.fail()`` rather than a bare
    ``assert``, so the checks still run under ``python -O``.  Each name
    mismatch is reported together with the offending code point.
    """
    names_ref = {}

    def parse_cp(s):
        return int(s, 16)

    # Parse data
    for lineno, raw_line in enumerate(testdata, 1):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        # partition() tolerates any amount of padding around the ';'.
        raw_cp, sep, name = line.partition(";")
        raw_cp = raw_cp.strip()
        name = name.strip()
        if not sep or not raw_cp or not name:
            self.fail(f"malformed line {lineno}: {raw_line!r}")

        is_pattern = name.endswith("*")
        if is_pattern:
            # remove '*' at the end
            name = name[:-1]
        elif "*" in name:
            self.fail(f"unexpected '*' in name on line {lineno}: "
                      f"{raw_line!r}")

        if ".." in raw_cp:
            if not is_pattern:
                self.fail(f"range without a pattern name on line "
                          f"{lineno}: {raw_line!r}")
            first, last = map(parse_cp, raw_cp.split(".."))
        else:
            first = last = parse_cp(raw_cp)

        for cp in range(first, last + 1):
            names_ref[cp] = f"{name}{cp:04X}" if is_pattern else name

    for cp in range(sys.maxunicode + 1):
        # The message identifies the code point; without it a failure
        # only shows the two differing names.
        self.assertEqual(self.db.name(chr(cp), None), names_ref.get(cp),
                         f"U+{cp:04X}")
617722

618723
def test_isxidstart(self):
619724
self.assertTrue(self.db.isxidstart('S'))
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add support for Tangut Ideographs names in :mod:`unicodedata`.

Modules/unicodedata.c

Lines changed: 63 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,22 +1052,18 @@ static const char * const hangul_syllables[][3] = {
10521052
{ 0, 0, "H" }
10531053
};
10541054

1055-
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
10561055
static int
1057-
is_unified_ideograph(Py_UCS4 code)
1058-
{
1059-
return
1060-
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1061-
(0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
1062-
(0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1063-
(0x2A700 <= code && code <= 0x2B73F) || /* CJK Ideograph Extension C */
1064-
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1065-
(0x2B820 <= code && code <= 0x2CEAD) || /* CJK Ideograph Extension E */
1066-
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1067-
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
1068-
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
1069-
(0x31350 <= code && code <= 0x323AF) || /* CJK Ideograph Extension H */
1070-
(0x323B0 <= code && code <= 0x33479); /* CJK Ideograph Extension J */
1056+
find_prefix_id(Py_UCS4 code)
1057+
{
1058+
for (int i = 0; i < (int)Py_ARRAY_LENGTH(derived_name_ranges); i++) {
1059+
if (code < derived_name_ranges[i].first) {
1060+
return -1;
1061+
}
1062+
if (code <= derived_name_ranges[i].last) {
1063+
return derived_name_ranges[i].prefixid;
1064+
}
1065+
}
1066+
return -1;
10711067
}
10721068

10731069
/* macros used to determine if the given code point is in the PUA range that
@@ -1345,7 +1341,9 @@ _getucname(PyObject *self,
13451341
}
13461342
}
13471343

1348-
if (SBase <= code && code < SBase+SCount) {
1344+
int prefixid = find_prefix_id(code);
1345+
if (prefixid == 0) {
1346+
assert(SBase <= code && code < SBase+SCount);
13491347
/* Hangul syllable. */
13501348
int SIndex = code - SBase;
13511349
int L = SIndex / NCount;
@@ -1367,11 +1365,11 @@ _getucname(PyObject *self,
13671365
return 1;
13681366
}
13691367

1370-
if (is_unified_ideograph(code)) {
1371-
if (buflen < 28)
1372-
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1368+
if (prefixid > 0) {
1369+
const char *prefix = derived_name_prefixes[prefixid];
1370+
if (snprintf(buffer, buflen, "%s%04X", prefix, code) >= buflen) {
13731371
return 0;
1374-
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1372+
}
13751373
return 1;
13761374
}
13771375

@@ -1428,6 +1426,35 @@ _check_alias_and_seq(Py_UCS4* code, int with_named_seq)
14281426
return 1;
14291427
}
14301428

1429+
static Py_UCS4
1430+
parse_hex_code(const char *name, int namelen)
1431+
{
1432+
if (namelen < 4 || namelen > 6) {
1433+
return (Py_UCS4)-1;
1434+
}
1435+
if (*name == '0') {
1436+
return (Py_UCS4)-1;
1437+
}
1438+
int v = 0;
1439+
while (namelen--) {
1440+
v *= 16;
1441+
Py_UCS1 c = Py_TOUPPER(*name);
1442+
if (c >= '0' && c <= '9') {
1443+
v += c - '0';
1444+
}
1445+
else if (c >= 'A' && c <= 'F') {
1446+
v += c - 'A' + 10;
1447+
}
1448+
else {
1449+
return (Py_UCS4)-1;
1450+
}
1451+
name++;
1452+
}
1453+
if (v > 0x10ffff) {
1454+
return (Py_UCS4)-1;
1455+
}
1456+
return v;
1457+
}
14311458

14321459
static int
14331460
_getcode(const char* name, int namelen, Py_UCS4* code)
@@ -1436,8 +1463,19 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
14361463
* Named aliases are not resolved, they are returned as a code point in the
14371464
* PUA */
14381465

1439-
/* Check for hangul syllables. */
1440-
if (PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1466+
int i = 0;
1467+
size_t prefixlen;
1468+
for (; i < (int)Py_ARRAY_LENGTH(derived_name_prefixes); i++) {
1469+
const char *prefix = derived_name_prefixes[i];
1470+
prefixlen = strlen(derived_name_prefixes[i]);
1471+
if (PyOS_strnicmp(name, prefix, prefixlen) == 0) {
1472+
break;
1473+
}
1474+
}
1475+
1476+
if (i == 0) {
1477+
/* Hangul syllables. */
1478+
assert(PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0);
14411479
int len, L = -1, V = -1, T = -1;
14421480
const char *pos = name + 16;
14431481
find_syllable(pos, &len, &L, LCount, 0);
@@ -1454,28 +1492,11 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
14541492
return 0;
14551493
}
14561494

1457-
/* Check for unified ideographs. */
1458-
if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1459-
/* Four or five hexdigits must follow. */
1460-
unsigned int v;
1461-
v = 0;
1462-
name += 22;
1463-
namelen -= 22;
1464-
if (namelen != 4 && namelen != 5)
1495+
if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) {
1496+
Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen);
1497+
if (find_prefix_id(v) != i) {
14651498
return 0;
1466-
while (namelen--) {
1467-
v *= 16;
1468-
Py_UCS1 c = Py_TOUPPER(*name);
1469-
if (c >= '0' && c <= '9')
1470-
v += c - '0';
1471-
else if (c >= 'A' && c <= 'F')
1472-
v += c - 'A' + 10;
1473-
else
1474-
return 0;
1475-
name++;
14761499
}
1477-
if (!is_unified_ideograph(v))
1478-
return 0;
14791500
*code = v;
14801501
return 1;
14811502
}

Modules/unicodename_db.h

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)