Issue python#28563: Fixed possible DoS and arbitrary code execution when handling

serhiy-storchaka · serhiy-storchaka · commit 1c3fdd900d10 · 2016-11-08T21:20:09.000+02:00
plural form selections in the gettext module.  The expression parser now
supports the exact syntax supported by GNU gettext.
diff --git a/Lib/gettext.py b/Lib/gettext.py
@@ -59,55 +59,139 @@
 
 _default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')
 
+# Expression parsing for plural form selection.
+#
+# The gettext library supports a small subset of C syntax.  The only
+# incompatible difference is that integer literals starting with zero are
+# decimal.
+#
+# https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
+# http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y
+
+_token_pattern = re.compile(r"""
+        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
+        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
+        (?P<NAME>n\b)                              | # only n is allowed
+        (?P<PARENTHESIS>[()])                      |
+        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
+                                                     # <=, >=, ==, !=, &&, ||,
+                                                     # ? :
+                                                     # unary and bitwise ops
+                                                     # not allowed
+        (?P<INVALID>\w+|.)                           # invalid token
+    """, re.VERBOSE|re.DOTALL)
+
+def _tokenize(plural):
+    for mo in re.finditer(_token_pattern, plural):
+        kind = mo.lastgroup
+        if kind == 'WHITESPACES':
+            continue
+        value = mo.group(kind)
+        if kind == 'INVALID':
+            raise ValueError('invalid token in plural form: %s' % value)
+        yield value
+    yield ''
+
+def _error(value):
+    if value:
+        return ValueError('unexpected token in plural form: %s' % value)
+    else:
+        return ValueError('unexpected end of plural form')
+
+_binary_ops = (
+    ('||',),
+    ('&&',),
+    ('==', '!='),
+    ('<', '>', '<=', '>='),
+    ('+', '-'),
+    ('*', '/', '%'),
+)
+_binary_ops = {op: i for i, ops in enumerate(_binary_ops, 1) for op in ops}
+_c2py_ops = {'||': 'or', '&&': 'and', '/': '//'}
+
+def _parse(tokens, priority=-1):
+    result = ''
+    nexttok = next(tokens)
+    while nexttok == '!':
+        result += 'not '
+        nexttok = next(tokens)
+
+    if nexttok == '(':
+        sub, nexttok = _parse(tokens)
+        result = '%s(%s)' % (result, sub)
+        if nexttok != ')':
+            raise ValueError('unbalanced parenthesis in plural form')
+    elif nexttok == 'n':
+        result = '%s%s' % (result, nexttok)
+    else:
+        try:
+            value = int(nexttok, 10)
+        except ValueError:
+            raise _error(nexttok) from None
+        result = '%s%d' % (result, value)
+    nexttok = next(tokens)
+
+    j = 100
+    while nexttok in _binary_ops:
+        i = _binary_ops[nexttok]
+        if i < priority:
+            break
+        # Break chained comparisons
+        if i in (3, 4) and j in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
+            result = '(%s)' % result
+        # Replace some C operators by their Python equivalents
+        op = _c2py_ops.get(nexttok, nexttok)
+        right, nexttok = _parse(tokens, i + 1)
+        result = '%s %s %s' % (result, op, right)
+        j = i
+    if j == priority == 4:  # '<', '>', '<=', '>='
+        result = '(%s)' % result
+
+    if nexttok == '?' and priority <= 0:
+        if_true, nexttok = _parse(tokens, 0)
+        if nexttok != ':':
+            raise _error(nexttok)
+        if_false, nexttok = _parse(tokens)
+        result = '%s if %s else %s' % (if_true, result, if_false)
+        if priority == 0:
+            result = '(%s)' % result
+
+    return result, nexttok
 
 def c2py(plural):
     """Gets a C expression as used in PO files for plural forms and returns a
-    Python lambda function that implements an equivalent expression.
+    Python function that implements an equivalent expression.
     """
-    # Security check, allow only the "n" identifier
-    import token, tokenize
-    tokens = tokenize.generate_tokens(io.StringIO(plural).readline)
-    try:
-        danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
-    except tokenize.TokenError:
-        raise ValueError('plural forms expression error, maybe unbalanced parenthesis')
-    else:
-        if danger:
-            raise ValueError('plural forms expression could be dangerous')
-
-    # Replace some C operators by their Python equivalents
-    plural = plural.replace('&&', ' and ')
-    plural = plural.replace('||', ' or ')
-
-    expr = re.compile(r'\!([^=])')
-    plural = expr.sub(' not \\1', plural)
-
-    # Regular expression and replacement function used to transform
-    # "a?b:c" to "b if a else c".
-    expr = re.compile(r'(.*?)\?(.*?):(.*)')
-    def repl(x):
-        return "(%s if %s else %s)" % (x.group(2), x.group(1),
-                                       expr.sub(repl, x.group(3)))
-
-    # Code to transform the plural expression, taking care of parentheses
-    stack = ['']
-    for c in plural:
-        if c == '(':
-            stack.append('')
-        elif c == ')':
-            if len(stack) == 1:
-                # Actually, we never reach this code, because unbalanced
-                # parentheses get caught in the security check at the
-                # beginning.
-                raise ValueError('unbalanced parenthesis in plural form')
-            s = expr.sub(repl, stack.pop())
-            stack[-1] += '(%s)' % s
-        else:
-            stack[-1] += c
-    plural = expr.sub(repl, stack.pop())
-
-    return eval('lambda n: int(%s)' % plural)
 
+    if len(plural) > 1000:
+        raise ValueError('plural form expression is too long')
+    try:
+        result, nexttok = _parse(_tokenize(plural))
+        if nexttok:
+            raise _error(nexttok)
+
+        depth = 0
+        for c in result:
+            if c == '(':
+                depth += 1
+                if depth > 20:
+                    # Python compiler limit is about 90.
+                    # The most complex example has 2.
+                    raise ValueError('plural form expression is too complex')
+            elif c == ')':
+                depth -= 1
+
+        ns = {}
+        exec('''if True:
+            def func(n):
+                if not isinstance(n, int):
+                    raise ValueError('Plural value must be an integer.')
+                return int(%s)
+            ''' % result, ns)
+        return ns['func']
+    except RuntimeError:
+        # Recursion error can be raised in _parse() or exec().
+        raise ValueError('plural form expression is too complex')
 
 
 def _expand_lang(loc):
diff --git a/Lib/test/test_gettext.py b/Lib/test/test_gettext.py
@@ -236,7 +236,9 @@ def test_plural_forms2(self):
         x = t.ngettext('There is %s file', 'There are %s files', 2)
         eq(x, 'Hay %s ficheros')
 
-    def test_hu(self):
+    # Examples from http://www.gnu.org/software/gettext/manual/gettext.html
+
+    def test_ja(self):
         eq = self.assertEqual
         f = gettext.c2py('0')
         s = ''.join([ str(f(x)) for x in range(200) ])
@@ -254,6 +256,12 @@ def test_fr(self):
         s = ''.join([ str(f(x)) for x in range(200) ])
         eq(s, "00111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111")
 
+    def test_lv(self):
+        eq = self.assertEqual
+        f = gettext.c2py('n%10==1 && n%100!=11 ? 0 : n != 0 ? 1 : 2')
+        s = ''.join([ str(f(x)) for x in range(200) ])
+        eq(s, "20111111111111111111101111111110111111111011111111101111111110111111111011111111101111111110111111111011111111111111111110111111111011111111101111111110111111111011111111101111111110111111111011111111")
+
     def test_gd(self):
         eq = self.assertEqual
         f = gettext.c2py('n==1 ? 0 : n==2 ? 1 : 2')
@@ -267,6 +275,12 @@ def test_gd2(self):
         s = ''.join([ str(f(x)) for x in range(200) ])
         eq(s, "20122222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222")
 
+    def test_ro(self):
+        eq = self.assertEqual
+        f = gettext.c2py('n==1 ? 0 : (n==0 || (n%100 > 0 && n%100 < 20)) ? 1 : 2')
+        s = ''.join([ str(f(x)) for x in range(200) ])
+        eq(s, "10111111111111111111222222222222222222222222222222222222222222222222222222222222222222222222222222222111111111111111111122222222222222222222222222222222222222222222222222222222222222222222222222222222")
+
     def test_lt(self):
         eq = self.assertEqual
         f = gettext.c2py('n%10==1 && n%100!=11 ? 0 : n%10>=2 && (n%100<10 || n%100>=20) ? 1 : 2')
@@ -279,6 +293,12 @@ def test_ru(self):
         s = ''.join([ str(f(x)) for x in range(200) ])
         eq(s, "20111222222222222222201112222220111222222011122222201112222220111222222011122222201112222220111222222011122222222222222220111222222011122222201112222220111222222011122222201112222220111222222011122222")
 
+    def test_cs(self):
+        eq = self.assertEqual
+        f = gettext.c2py('(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2')
+        s = ''.join([ str(f(x)) for x in range(200) ])
+        eq(s, "20111222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222")
+
     def test_pl(self):
         eq = self.assertEqual
         f = gettext.c2py('n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2')
@@ -291,10 +311,73 @@ def test_sl(self):
         s = ''.join([ str(f(x)) for x in range(200) ])
         eq(s, "30122333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333012233333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333")
 
+    def test_ar(self):
+        eq = self.assertEqual
+        f = gettext.c2py('n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n%100>=3 && n%100<=10 ? 3 : n%100>=11 ? 4 : 5')
+        s = ''.join([ str(f(x)) for x in range(200) ])
+        eq(s, "01233333333444444444444444444444444444444444444444444444444444444444444444444444444444444444444444445553333333344444444444444444444444444444444444444444444444444444444444444444444444444444444444444444")
+
     def test_security(self):
         raises = self.assertRaises
         # Test for a dangerous expression
         raises(ValueError, gettext.c2py, "os.chmod('/etc/passwd',0777)")
+        # issue28563
+        raises(ValueError, gettext.c2py, '"(eval(foo) && ""')
+        raises(ValueError, gettext.c2py, 'f"{os.system(\'sh\')}"')
+        # Maximum recursion depth exceeded during compilation
+        raises(ValueError, gettext.c2py, 'n+'*10000 + 'n')
+        self.assertEqual(gettext.c2py('n+'*100 + 'n')(1), 101)
+        # MemoryError during compilation
+        raises(ValueError, gettext.c2py, '('*100 + 'n' + ')'*100)
+        # Maximum recursion depth exceeded in C to Python translator
+        raises(ValueError, gettext.c2py, '('*10000 + 'n' + ')'*10000)
+        self.assertEqual(gettext.c2py('('*20 + 'n' + ')'*20)(1), 1)
+
+    def test_chained_comparison(self):
+        # C doesn't chain comparisons as Python does, so 2 == 2 == 2 gets different results
+        f = gettext.c2py('n == n == n')
+        self.assertEqual(''.join(str(f(x)) for x in range(3)), '010')
+        f = gettext.c2py('1 < n == n')
+        self.assertEqual(''.join(str(f(x)) for x in range(3)), '100')
+        f = gettext.c2py('n == n < 2')
+        self.assertEqual(''.join(str(f(x)) for x in range(3)), '010')
+        f = gettext.c2py('0 < n < 2')
+        self.assertEqual(''.join(str(f(x)) for x in range(3)), '111')
+
+    def test_decimal_number(self):
+        self.assertEqual(gettext.c2py('0123')(1), 123)
+
+    def test_invalid_syntax(self):
+        invalid_expressions = [
+            'x>1', '(n>1', 'n>1)', '42**42**42', '0xa', '1.0', '1e2',
+            'n>0x1', '+n', '-n', 'n()', 'n(1)', '1+', 'nn', 'n n',
+        ]
+        for expr in invalid_expressions:
+            with self.assertRaises(ValueError):
+                gettext.c2py(expr)
+
+    def test_nested_condition_operator(self):
+        self.assertEqual(gettext.c2py('n?1?2:3:4')(0), 4)
+        self.assertEqual(gettext.c2py('n?1?2:3:4')(1), 2)
+        self.assertEqual(gettext.c2py('n?1:3?4:5')(0), 4)
+        self.assertEqual(gettext.c2py('n?1:3?4:5')(1), 1)
+
+    def test_division(self):
+        f = gettext.c2py('2/n*3')
+        self.assertEqual(f(1), 6)
+        self.assertEqual(f(2), 3)
+        self.assertEqual(f(3), 0)
+        self.assertEqual(f(-1), -6)
+        self.assertRaises(ZeroDivisionError, f, 0)
+
+    def test_plural_number(self):
+        f = gettext.c2py('1')
+        self.assertEqual(f(1), 1)
+        self.assertRaises(ValueError, f, 1.0)
+        self.assertRaises(ValueError, f, '1')
+        self.assertRaises(ValueError, f, [])
+        self.assertRaises(ValueError, f, object())
+
 
 class GNUTranslationParsingTest(GettextBaseTest):
     def test_plural_form_error_issue17898(self):
diff --git a/Misc/NEWS b/Misc/NEWS
@@ -16,6 +16,10 @@ Core and Builtins
 Library
 -------
 
+- Issue #28563: Fixed possible DoS and arbitrary code execution when handling
+  plural form selections in the gettext module.  The expression parser now
+  supports the exact syntax supported by GNU gettext.
+
 - In the curses module, raise an error if window.getstr() or window.instr() is
   passed a negative value.