Skip to content

Commit bcabbd0

Browse files
[3.14] gh-80667: Fix lookup for Tangut ideographs in unicodedata (GH-144789) (GH-144871)
(cherry picked from commit 8b7b5a9) Co-authored-by: Pierre Le Marre <dev@wismill.eu>
1 parent 46e7189 commit bcabbd0

File tree

6 files changed

+209
-60
lines changed

6 files changed

+209
-60
lines changed

Lib/test/test_ucn.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,30 @@ def test_cjk_unified_ideographs(self):
111111
self.checkletter("cjK UniFIeD idEogRAph-2aBcD", "\U0002abcd")
112112
self.checkletter("CJk uNIfiEd IDeOGraPH-2AbCd", "\U0002abcd")
113113

114+
def test_tangut_ideographs(self):
115+
self.checkletter("TANGUT IDEOGRAPH-17000", "\U00017000")
116+
self.checkletter("TANGUT IDEOGRAPH-187F7", "\U000187f7")
117+
self.checkletter("TANGUT IDEOGRAPH-18D00", "\U00018D00")
118+
self.checkletter("TANGUT IDEOGRAPH-18D08", "\U00018d08")
119+
self.checkletter("tangut ideograph-18d08", "\U00018d08")
120+
121+
def test_egyptian_hieroglyphs(self):
122+
self.checkletter("EGYPTIAN HIEROGLYPH-13460", "\U00013460")
123+
self.checkletter("EGYPTIAN HIEROGLYPH-143FA", "\U000143fa")
124+
self.checkletter("egyptian hieroglyph-143fa", "\U000143fa")
125+
126+
def test_khitan_small_script_characters(self):
127+
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18B00", "\U00018b00")
128+
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CD5", "\U00018cd5")
129+
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff")
130+
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff")
131+
self.checkletter("khitan small script character-18cff", "\U00018cff")
132+
133+
def test_nushu_characters(self):
134+
self.checkletter("NUSHU CHARACTER-1B170", "\U0001b170")
135+
self.checkletter("NUSHU CHARACTER-1B2FB", "\U0001b2fb")
136+
self.checkletter("nushu character-1b2fb", "\U0001b2fb")
137+
114138
def test_bmp_characters(self):
115139
for code in range(0x10000):
116140
char = chr(code)

Lib/test/test_unicodedata.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,56 @@ def test_function_checksum(self):
116116
result = h.hexdigest()
117117
self.assertEqual(result, self.expectedchecksum)
118118

119+
def test_name(self):
120+
name = self.db.name
121+
self.assertRaises(ValueError, name, '\0')
122+
self.assertRaises(ValueError, name, '\n')
123+
self.assertRaises(ValueError, name, '\x1F')
124+
self.assertRaises(ValueError, name, '\x7F')
125+
self.assertRaises(ValueError, name, '\x9F')
126+
self.assertRaises(ValueError, name, '\uFFFE')
127+
self.assertRaises(ValueError, name, '\uFFFF')
128+
self.assertRaises(ValueError, name, '\U0010FFFF')
129+
self.assertEqual(name('\U0010FFFF', 42), 42)
130+
131+
self.assertEqual(name(' '), 'SPACE')
132+
self.assertEqual(name('1'), 'DIGIT ONE')
133+
self.assertEqual(name('A'), 'LATIN CAPITAL LETTER A')
134+
self.assertEqual(name('\xA0'), 'NO-BREAK SPACE')
135+
self.assertEqual(name('\u0221', None), None if self.old else
136+
'LATIN SMALL LETTER D WITH CURL')
137+
self.assertEqual(name('\u3400'), 'CJK UNIFIED IDEOGRAPH-3400')
138+
self.assertEqual(name('\u9FA5'), 'CJK UNIFIED IDEOGRAPH-9FA5')
139+
self.assertEqual(name('\uAC00'), 'HANGUL SYLLABLE GA')
140+
self.assertEqual(name('\uD7A3'), 'HANGUL SYLLABLE HIH')
141+
self.assertEqual(name('\uF900'), 'CJK COMPATIBILITY IDEOGRAPH-F900')
142+
self.assertEqual(name('\uFA6A'), 'CJK COMPATIBILITY IDEOGRAPH-FA6A')
143+
self.assertEqual(name('\uFBF9'),
144+
'ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA '
145+
'ABOVE WITH ALEF MAKSURA ISOLATED FORM')
146+
self.assertEqual(name('\U00013460', None), None if self.old else
147+
'EGYPTIAN HIEROGLYPH-13460')
148+
self.assertEqual(name('\U000143FA', None), None if self.old else
149+
'EGYPTIAN HIEROGLYPH-143FA')
150+
self.assertEqual(name('\U00018B00', None), None if self.old else
151+
'KHITAN SMALL SCRIPT CHARACTER-18B00')
152+
self.assertEqual(name('\U00018CD5', None), None if self.old else
153+
'KHITAN SMALL SCRIPT CHARACTER-18CD5')
154+
self.assertEqual(name('\U00018CFF', None), None if self.old else
155+
'KHITAN SMALL SCRIPT CHARACTER-18CFF')
156+
self.assertEqual(name('\U0001B170', None), None if self.old else
157+
'NUSHU CHARACTER-1B170')
158+
self.assertEqual(name('\U0001B2FB', None), None if self.old else
159+
'NUSHU CHARACTER-1B2FB')
160+
self.assertEqual(name('\U0001FBA8', None), None if self.old else
161+
'BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO '
162+
'MIDDLE LEFT AND MIDDLE RIGHT TO LOWER CENTRE')
163+
self.assertEqual(name('\U0002A6D6'), 'CJK UNIFIED IDEOGRAPH-2A6D6')
164+
self.assertEqual(name('\U0002FA1D'), 'CJK COMPATIBILITY IDEOGRAPH-2FA1D')
165+
self.assertEqual(name('\U000323AF', None), None if self.old else
166+
'CJK UNIFIED IDEOGRAPH-323AF')
167+
168+
@requires_resource('cpu')
119169
def test_name_inverse_lookup(self):
120170
for char in iterallchars():
121171
looked_name = self.db.name(char, None)
@@ -139,6 +189,17 @@ def test_lookup_nonexistant(self):
139189
"HANDBUG",
140190
"MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
141191
"???",
192+
"CJK UNIFIED IDEOGRAPH-03400",
193+
"CJK UNIFIED IDEOGRAPH-020000",
194+
"CJK UNIFIED IDEOGRAPH-33FF",
195+
"CJK UNIFIED IDEOGRAPH-F900",
196+
"CJK UNIFIED IDEOGRAPH-13460",
197+
"CJK UNIFIED IDEOGRAPH-17000",
198+
"CJK UNIFIED IDEOGRAPH-18B00",
199+
"CJK UNIFIED IDEOGRAPH-1B170",
200+
"CJK COMPATIBILITY IDEOGRAPH-3400",
201+
"TANGUT IDEOGRAPH-3400",
202+
"HANGUL SYLLABLE AC00",
142203
]:
143204
self.assertRaises(KeyError, self.db.lookup, nonexistent)
144205

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Support lookup for Tangut Ideographs in :mod:`unicodedata`.

Modules/unicodedata.c

Lines changed: 64 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,21 +1010,18 @@ static const char * const hangul_syllables[][3] = {
10101010
{ 0, 0, "H" }
10111011
};
10121012

1013-
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
10141013
static int
1015-
is_unified_ideograph(Py_UCS4 code)
1014+
find_prefix_id(Py_UCS4 code)
10161015
{
1017-
return
1018-
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1019-
(0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
1020-
(0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1021-
(0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */
1022-
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1023-
(0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1024-
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1025-
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
1026-
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
1027-
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
1016+
for (int i = 0; i < (int)Py_ARRAY_LENGTH(derived_name_ranges); i++) {
1017+
if (code < derived_name_ranges[i].first) {
1018+
return -1;
1019+
}
1020+
if (code <= derived_name_ranges[i].last) {
1021+
return derived_name_ranges[i].prefixid;
1022+
}
1023+
}
1024+
return -1;
10281025
}
10291026

10301027
/* macros used to determine if the given code point is in the PUA range that
@@ -1302,7 +1299,9 @@ _getucname(PyObject *self,
13021299
}
13031300
}
13041301

1305-
if (SBase <= code && code < SBase+SCount) {
1302+
int prefixid = find_prefix_id(code);
1303+
if (prefixid == 0) {
1304+
assert(SBase <= code && code < SBase+SCount);
13061305
/* Hangul syllable. */
13071306
int SIndex = code - SBase;
13081307
int L = SIndex / NCount;
@@ -1324,11 +1323,13 @@ _getucname(PyObject *self,
13241323
return 1;
13251324
}
13261325

1327-
if (is_unified_ideograph(code)) {
1328-
if (buflen < 28)
1329-
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1326+
/* Only generate prefix + hex code names for CJK unified ideographs.
1327+
* Generating them for Tangut ideographs is a new feature in 3.15. */
1328+
if (prefixid == 1) {
1329+
const char *prefix = derived_name_prefixes[prefixid];
1330+
if (snprintf(buffer, buflen, "%s%04X", prefix, code) >= buflen) {
13301331
return 0;
1331-
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1332+
}
13321333
return 1;
13331334
}
13341335

@@ -1385,6 +1386,35 @@ _check_alias_and_seq(Py_UCS4* code, int with_named_seq)
13851386
return 1;
13861387
}
13871388

1389+
static Py_UCS4
1390+
parse_hex_code(const char *name, int namelen)
1391+
{
1392+
if (namelen < 4 || namelen > 6) {
1393+
return (Py_UCS4)-1;
1394+
}
1395+
if (*name == '0') {
1396+
return (Py_UCS4)-1;
1397+
}
1398+
int v = 0;
1399+
while (namelen--) {
1400+
v *= 16;
1401+
Py_UCS1 c = Py_TOUPPER(*name);
1402+
if (c >= '0' && c <= '9') {
1403+
v += c - '0';
1404+
}
1405+
else if (c >= 'A' && c <= 'F') {
1406+
v += c - 'A' + 10;
1407+
}
1408+
else {
1409+
return (Py_UCS4)-1;
1410+
}
1411+
name++;
1412+
}
1413+
if (v > 0x10ffff) {
1414+
return (Py_UCS4)-1;
1415+
}
1416+
return v;
1417+
}
13881418

13891419
static int
13901420
_getcode(const char* name, int namelen, Py_UCS4* code)
@@ -1393,8 +1423,19 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
13931423
* Named aliases are not resolved, they are returned as a code point in the
13941424
* PUA */
13951425

1396-
/* Check for hangul syllables. */
1397-
if (PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1426+
int i = 0;
1427+
size_t prefixlen;
1428+
for (; i < (int)Py_ARRAY_LENGTH(derived_name_prefixes); i++) {
1429+
const char *prefix = derived_name_prefixes[i];
1430+
prefixlen = strlen(derived_name_prefixes[i]);
1431+
if (PyOS_strnicmp(name, prefix, prefixlen) == 0) {
1432+
break;
1433+
}
1434+
}
1435+
1436+
if (i == 0) {
1437+
/* Hangul syllables. */
1438+
assert(PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0);
13981439
int len, L = -1, V = -1, T = -1;
13991440
const char *pos = name + 16;
14001441
find_syllable(pos, &len, &L, LCount, 0);
@@ -1411,28 +1452,11 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
14111452
return 0;
14121453
}
14131454

1414-
/* Check for unified ideographs. */
1415-
if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1416-
/* Four or five hexdigits must follow. */
1417-
unsigned int v;
1418-
v = 0;
1419-
name += 22;
1420-
namelen -= 22;
1421-
if (namelen != 4 && namelen != 5)
1455+
if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) {
1456+
Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen);
1457+
if (find_prefix_id(v) != i) {
14221458
return 0;
1423-
while (namelen--) {
1424-
v *= 16;
1425-
Py_UCS1 c = Py_TOUPPER(*name);
1426-
if (c >= '0' && c <= '9')
1427-
v += c - '0';
1428-
else if (c >= 'A' && c <= 'F')
1429-
v += c - 'A' + 10;
1430-
else
1431-
return 0;
1432-
name++;
14331459
}
1434-
if (!is_unified_ideograph(v))
1435-
return 0;
14361460
*code = v;
14371461
return 1;
14381462
}

Modules/unicodename_db.h

Lines changed: 27 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Tools/unicode/makeunicodedata.py

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -99,18 +99,13 @@
9999
CASED_MASK = 0x2000
100100
EXTENDED_CASE_MASK = 0x4000
101101

102-
# these ranges need to match unicodedata.c:is_unified_ideograph
103-
cjk_ranges = [
104-
('3400', '4DBF'), # CJK Ideograph Extension A CJK
105-
('4E00', '9FFF'), # CJK Ideograph
106-
('20000', '2A6DF'), # CJK Ideograph Extension B
107-
('2A700', '2B739'), # CJK Ideograph Extension C
108-
('2B740', '2B81D'), # CJK Ideograph Extension D
109-
('2B820', '2CEA1'), # CJK Ideograph Extension E
110-
('2CEB0', '2EBE0'), # CJK Ideograph Extension F
111-
('2EBF0', '2EE5D'), # CJK Ideograph Extension I
112-
('30000', '3134A'), # CJK Ideograph Extension G
113-
('31350', '323AF'), # CJK Ideograph Extension H
102+
# Maps the range names in UnicodeData.txt to prefixes for
103+
# derived names specified by rules NR1 and NR2.
104+
# Hangul (rule NR1) should always be at index 0, since it uses a special format.
105+
derived_name_range_names = [
106+
("Hangul Syllable", "HANGUL SYLLABLE "),
107+
("CJK Ideograph", "CJK UNIFIED IDEOGRAPH-"),
108+
("Tangut Ideograph", "TANGUT IDEOGRAPH-"),
114109
]
115110

116111

@@ -124,7 +119,7 @@ def maketables(trace=0):
124119

125120
for version in old_versions:
126121
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
127-
old_unicode = UnicodeData(version, cjk_check=False)
122+
old_unicode = UnicodeData(version, ideograph_check=False)
128123
print(len(list(filter(None, old_unicode.table))), "characters")
129124
merge_old_version(version, unicode, old_unicode)
130125

@@ -698,6 +693,23 @@ def makeunicodename(unicode, trace):
698693
fprint(' {%d, {%s}},' % (len(sequence), seq_str))
699694
fprint('};')
700695

696+
fprint(dedent("""
697+
typedef struct {
698+
Py_UCS4 first;
699+
Py_UCS4 last;
700+
int prefixid;
701+
} derived_name_range;
702+
"""))
703+
704+
fprint('static const derived_name_range derived_name_ranges[] = {')
705+
for name_range in unicode.derived_name_ranges:
706+
fprint(' {0x%s, 0x%s, %d},' % name_range)
707+
fprint('};')
708+
709+
fprint('static const char * const derived_name_prefixes[] = {')
710+
for _, prefix in derived_name_range_names:
711+
fprint(' "%s",' % prefix)
712+
fprint('};')
701713

702714
def merge_old_version(version, new, old):
703715
# Changes to exclusion file not implemented yet
@@ -905,14 +917,14 @@ def from_row(row: List[str]) -> UcdRecord:
905917
class UnicodeData:
906918
# table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
907919

908-
def __init__(self, version, cjk_check=True):
920+
def __init__(self, version, ideograph_check=True):
909921
self.changed = []
910922
table = [None] * 0x110000
911923
for s in UcdFile(UNICODE_DATA, version):
912924
char = int(s[0], 16)
913925
table[char] = from_row(s)
914926

915-
cjk_ranges_found = []
927+
self.derived_name_ranges = []
916928

917929
# expand first-last ranges
918930
field = None
@@ -926,15 +938,15 @@ def __init__(self, version, cjk_check=True):
926938
s.name = ""
927939
field = dataclasses.astuple(s)[:15]
928940
elif s.name[-5:] == "Last>":
929-
if s.name.startswith("<CJK Ideograph"):
930-
cjk_ranges_found.append((field[0],
931-
s.codepoint))
941+
for j, (rangename, _) in enumerate(derived_name_range_names):
942+
if s.name.startswith("<" + rangename):
943+
self.derived_name_ranges.append(
944+
(field[0], s.codepoint, j))
945+
break
932946
s.name = ""
933947
field = None
934948
elif field:
935949
table[i] = from_row(('%X' % i,) + field[1:])
936-
if cjk_check and cjk_ranges != cjk_ranges_found:
937-
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
938950

939951
# public attributes
940952
self.filename = UNICODE_DATA % ''

0 commit comments

Comments
 (0)