unicodedata: Fix Tangut Ideograph names

wismill · wismill · commit 0609eb29586b · 2023-07-26T06:36:53.000+02:00
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
@@ -71,7 +71,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
 
     # Update this if the database changes. Make sure to do a full rebuild
     # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331'
+    expectedchecksum = '95cc75e49b140c61b884c16d0a9fbbb0b93a7fa9'
 
     @requires_resource('cpu')
     def test_function_checksum(self):
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
@@ -1025,7 +1025,7 @@ static const char * const hangul_syllables[][3] = {
 
 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
 static int
-is_unified_ideograph(Py_UCS4 code)
+is_cjk_unified_ideograph(Py_UCS4 code)
 {
     return
         (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
@@ -1039,6 +1039,15 @@ is_unified_ideograph(Py_UCS4 code)
         (0x31350 <= code && code <= 0x323AF);   /* CJK Ideograph Extension H */
 }
 
+/* Inclusive ranges; must match makeunicodedata.py:tangut_ranges. */
+static int
+is_tangut_ideograph(Py_UCS4 code)
+{
+    return
+        (0x17000 <= code && code <= 0x187F7) || /* Tangut */
+        (0x18D00 <= code && code <= 0x18D08);   /* Tangut Supplement */
+}
+
 /* macros used to determine if the given code point is in the PUA range that
  * we are using to store aliases and named sequences */
 #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
@@ -1098,14 +1107,22 @@ _getucname(PyObject *self,
         return 1;
     }
 
-    if (is_unified_ideograph(code)) {
+    if (is_cjk_unified_ideograph(code)) {
         if (buflen < 28)
             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
             return 0;
         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
         return 1;
     }
 
+    if (is_tangut_ideograph(code)) {
+        if (buflen < 23)
+            /* Worst case: TANGUT IDEOGRAPH-18D08 (22 chars + NUL) */
+            return 0;
+        sprintf(buffer, "TANGUT IDEOGRAPH-%X", code);
+        return 1;
+    }
+
     /* get offset into phrasebook */
     offset = phrasebook_offset1[(code>>phrasebook_shift)];
     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
@@ -1236,7 +1253,7 @@ _getcode(PyObject* self,
         return 0;
     }
 
-    /* Check for unified ideographs. */
+    /* Check for CJK unified ideographs. */
     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
         /* Four or five hexdigits must follow. */
         v = 0;
@@ -1254,12 +1271,38 @@ _getcode(PyObject* self,
                 return 0;
             name++;
         }
-        if (!is_unified_ideograph(v))
+        if (!is_cjk_unified_ideograph(v))
+            return 0;
+        *code = v;
+        return 1;
+    }
+
+
+    /* Check for Tangut ideographs. */
+    if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
+        /* Five hexdigits must follow. */
+        v = 0;
+        name += 17;
+        namelen -= 17;
+        if (namelen != 5)
+            return 0;
+        while (namelen--) {
+            v *= 16;
+            if (*name >= '0' && *name <= '9')
+                v += *name - '0';
+            else if (*name >= 'A' && *name <= 'F')
+                v += *name - 'A' + 10;
+            else
+                return 0;
+            name++;
+        }
+        if (!is_tangut_ideograph(v))
             return 0;
         *code = v;
         return 1;
     }
 
+
     /* the following is the same as python's dictionary lookup, with
        only minor changes.  see the makeunicodedata script for more
        details */
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
@@ -99,7 +99,7 @@
 CASED_MASK = 0x2000
 EXTENDED_CASE_MASK = 0x4000
 
-# these ranges need to match unicodedata.c:is_unified_ideograph
+# these ranges need to match unicodedata.c:is_cjk_unified_ideograph
 cjk_ranges = [
     ('3400', '4DBF'),
     ('4E00', '9FFF'),
@@ -112,6 +112,12 @@
     ('31350', '323AF'),
 ]
 
+# these ranges need to match unicodedata.c:is_tangut_ideograph
+tangut_ranges = [
+    ('17000', '187F7'),
+    ('18D00', '18D08')
+]
+
 
 def maketables(trace=0):
 
@@ -123,7 +129,7 @@ def maketables(trace=0):
 
     for version in old_versions:
         print("--- Reading", UNICODE_DATA % ("-"+version), "...")
-        old_unicode = UnicodeData(version, cjk_check=False)
+        old_unicode = UnicodeData(version, ideograph_check=False)
         print(len(list(filter(None, old_unicode.table))), "characters")
         merge_old_version(version, unicode, old_unicode)
 
@@ -1020,14 +1026,15 @@ def from_row(row: List[str]) -> UcdRecord:
 class UnicodeData:
     # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned
 
-    def __init__(self, version, cjk_check=True):
+    def __init__(self, version, ideograph_check=True):
         self.changed = []
         table = [None] * 0x110000
         for s in UcdFile(UNICODE_DATA, version):
             char = int(s[0], 16)
             table[char] = from_row(s)
 
         cjk_ranges_found = []
+        tangut_ranges_found = []
 
         # expand first-last ranges
         field = None
@@ -1044,12 +1051,17 @@ def __init__(self, version, cjk_check=True):
                     if s.name.startswith("<CJK Ideograph"):
                         cjk_ranges_found.append((field[0],
                                                  s.codepoint))
+                    elif s.name.startswith("<Tangut Ideograph"):
+                        tangut_ranges_found.append((field[0],
+                                                    s.codepoint))
                     s.name = ""
                     field = None
             elif field:
                 table[i] = from_row(('%X' % i,) + field[1:])
-        if cjk_check and cjk_ranges != cjk_ranges_found:
+        if ideograph_check and cjk_ranges != cjk_ranges_found:
             raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
+        if ideograph_check and tangut_ranges != tangut_ranges_found:
+            raise ValueError("Tangut ranges deviate: have %r" % tangut_ranges_found)
 
         # public attributes
         self.filename = UNICODE_DATA % ''