Skip to content

Commit 0609eb2

Browse files
committed
unicodedata: Fix Tangut Ideograph names
1 parent 33838fe commit 0609eb2

File tree

3 files changed: 64 additions and 9 deletions.

Lib/test/test_unicodedata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
7171

7272
# Update this if the database changes. Make sure to do a full rebuild
7373
# (e.g. 'make distclean && make') to get the correct checksum.
74-
expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331'
74+
expectedchecksum = '95cc75e49b140c61b884c16d0a9fbbb0b93a7fa9'
7575

7676
@requires_resource('cpu')
7777
def test_function_checksum(self):

Modules/unicodedata.c

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,7 +1025,7 @@ static const char * const hangul_syllables[][3] = {
10251025

10261026
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
10271027
static int
1028-
is_unified_ideograph(Py_UCS4 code)
1028+
is_cjk_unified_ideograph(Py_UCS4 code)
10291029
{
10301030
return
10311031
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
@@ -1039,6 +1039,15 @@ is_unified_ideograph(Py_UCS4 code)
10391039
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
10401040
}
10411041

1042+
/* These ranges need to match makeunicodedata.py:tangut_ranges. */
1043+
static int
1044+
is_tangut_ideograph(Py_UCS4 code)
1045+
{
1046+
return
1047+
(0x17000 <= code && code <= 0x187F7) || /* Tangut */
1048+
(0x18D00 <= code && code <= 0x18D08); /* Tangut Supplement */
1049+
}
1050+
10421051
/* macros used to determine if the given code point is in the PUA range that
10431052
* we are using to store aliases and named sequences */
10441053
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
@@ -1098,14 +1107,22 @@ _getucname(PyObject *self,
10981107
return 1;
10991108
}
11001109

1101-
if (is_unified_ideograph(code)) {
1110+
if (is_cjk_unified_ideograph(code)) {
11021111
if (buflen < 28)
11031112
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
11041113
return 0;
11051114
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
11061115
return 1;
11071116
}
11081117

1118+
if (is_tangut_ideograph(code)) {
1119+
if (buflen < 23)
1120+
/* Worst case: TANGUT IDEOGRAPH-18D08 */
1121+
return 0;
1122+
sprintf(buffer, "TANGUT IDEOGRAPH-%X", code);
1123+
return 1;
1124+
}
1125+
11091126
/* get offset into phrasebook */
11101127
offset = phrasebook_offset1[(code>>phrasebook_shift)];
11111128
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
@@ -1236,7 +1253,7 @@ _getcode(PyObject* self,
12361253
return 0;
12371254
}
12381255

1239-
/* Check for unified ideographs. */
1256+
/* Check for CJK unified ideographs. */
12401257
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
12411258
/* Four or five hexdigits must follow. */
12421259
v = 0;
@@ -1254,12 +1271,38 @@ _getcode(PyObject* self,
12541271
return 0;
12551272
name++;
12561273
}
1257-
if (!is_unified_ideograph(v))
1274+
if (!is_cjk_unified_ideograph(v))
1275+
return 0;
1276+
*code = v;
1277+
return 1;
1278+
}
1279+
1280+
1281+
/* Check for Tangut ideographs. */
1282+
if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
1283+
/* Five hexdigits must follow. */
1284+
v = 0;
1285+
name += 17;
1286+
namelen -= 17;
1287+
if (namelen != 5)
1288+
return 0;
1289+
while (namelen--) {
1290+
v *= 16;
1291+
if (*name >= '0' && *name <= '9')
1292+
v += *name - '0';
1293+
else if (*name >= 'A' && *name <= 'F')
1294+
v += *name - 'A' + 10;
1295+
else
1296+
return 0;
1297+
name++;
1298+
}
1299+
if (!is_tangut_ideograph(v))
12581300
return 0;
12591301
*code = v;
12601302
return 1;
12611303
}
12621304

1305+
12631306
/* the following is the same as python's dictionary lookup, with
12641307
only minor changes. see the makeunicodedata script for more
12651308
details */

Tools/unicode/makeunicodedata.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@
9999
CASED_MASK = 0x2000
100100
EXTENDED_CASE_MASK = 0x4000
101101

102-
# these ranges need to match unicodedata.c:is_unified_ideograph
102+
# these ranges need to match unicodedata.c:is_cjk_unified_ideograph
103103
cjk_ranges = [
104104
('3400', '4DBF'),
105105
('4E00', '9FFF'),
@@ -112,6 +112,12 @@
112112
('31350', '323AF'),
113113
]
114114

115+
# these ranges need to match unicodedata.c:is_tangut_ideograph
116+
tangut_ranges = [
117+
('17000', '187F7'),
118+
('18D00', '18D08')
119+
]
120+
115121

116122
def maketables(trace=0):
117123

@@ -123,7 +129,7 @@ def maketables(trace=0):
123129

124130
for version in old_versions:
125131
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
126-
old_unicode = UnicodeData(version, cjk_check=False)
132+
old_unicode = UnicodeData(version, ideograph_check=False)
127133
print(len(list(filter(None, old_unicode.table))), "characters")
128134
merge_old_version(version, unicode, old_unicode)
129135

@@ -1020,14 +1026,15 @@ def from_row(row: List[str]) -> UcdRecord:
10201026
class UnicodeData:
10211027
# table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
10221028

1023-
def __init__(self, version, cjk_check=True):
1029+
def __init__(self, version, ideograph_check=True):
10241030
self.changed = []
10251031
table = [None] * 0x110000
10261032
for s in UcdFile(UNICODE_DATA, version):
10271033
char = int(s[0], 16)
10281034
table[char] = from_row(s)
10291035

10301036
cjk_ranges_found = []
1037+
tangut_ranges_found = []
10311038

10321039
# expand first-last ranges
10331040
field = None
@@ -1044,12 +1051,17 @@ def __init__(self, version, cjk_check=True):
10441051
if s.name.startswith("<CJK Ideograph"):
10451052
cjk_ranges_found.append((field[0],
10461053
s.codepoint))
1054+
elif s.name.startswith("<Tangut Ideograph"):
1055+
tangut_ranges_found.append((field[0],
1056+
s.codepoint))
10471057
s.name = ""
10481058
field = None
10491059
elif field:
10501060
table[i] = from_row(('%X' % i,) + field[1:])
1051-
if cjk_check and cjk_ranges != cjk_ranges_found:
1061+
if ideograph_check and cjk_ranges != cjk_ranges_found:
10521062
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
1063+
if ideograph_check and tangut_ranges != tangut_ranges_found:
1064+
raise ValueError("Tangut ranges deviate: have %r" % tangut_ranges_found)
10531065

10541066
# public attributes
10551067
self.filename = UNICODE_DATA % ''

0 commit comments

Comments
 (0)