diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h index 25bb224921aa8b..6ab569393e5ce1 100644 --- a/Include/internal/pycore_dict.h +++ b/Include/internal/pycore_dict.h @@ -113,6 +113,8 @@ extern Py_ssize_t _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t has extern Py_ssize_t _Py_dict_lookup_threadsafe(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **value_addr); extern Py_ssize_t _Py_dict_lookup_threadsafe_stackref(PyDictObject *mp, PyObject *key, Py_hash_t hash, _PyStackRef *value_addr); +extern int _PyDict_GetMethodStackRef(PyDictObject *dict, PyObject *name, _PyStackRef *method); + extern Py_ssize_t _PyDict_LookupIndex(PyDictObject *, PyObject *); extern Py_ssize_t _PyDictKeys_StringLookup(PyDictKeysObject* dictkeys, PyObject *key); diff --git a/Include/internal/pycore_stackref.h b/Include/internal/pycore_stackref.h index 6bf82d8322f508..c4e8f10fe05276 100644 --- a/Include/internal/pycore_stackref.h +++ b/Include/internal/pycore_stackref.h @@ -839,6 +839,13 @@ _Py_TryXGetStackRef(PyObject **src, _PyStackRef *out) #endif +#define PyStackRef_XSETREF(dst, src) \ + do { \ + _PyStackRef _tmp_dst_ref = (dst); \ + (dst) = (src); \ + PyStackRef_XCLOSE(_tmp_dst_ref); \ + } while(0) + // Like Py_VISIT but for _PyStackRef fields #define _Py_VISIT_STACKREF(ref) \ do { \ diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 80d8644af86f74..45f58eb8ac93cf 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -1647,6 +1647,9 @@ def _block(self, count): """Round up a byte count by BLOCKSIZE and return it, e.g. _block(834) => 1024. """ + # Only non-negative offsets are allowed + if count < 0: + raise InvalidHeaderError("invalid offset") blocks, remainder = divmod(count, BLOCKSIZE) if remainder: blocks += 1 diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 7055e1ed147a9e..28914df6b010d0 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -55,6 +55,7 @@ def sha256sum(data): zstname = os.path.join(TEMPDIR, "testtar.tar.zst") tmpname = os.path.join(TEMPDIR, "tmp.tar") dotlessname = os.path.join(TEMPDIR, "testtar") +SPACE = b" " sha256_regtype = ( "e09e4bc8b3c9d9177e77256353b36c159f5f040531bbd4b024a8f9b9196c71ce" @@ -4602,6 +4603,161 @@ def extractall(self, ar): ar.extractall(self.testdir, filter='fully_trusted') +class OffsetValidationTests(unittest.TestCase): + tarname = tmpname + invalid_posix_header = ( + # name: 100 bytes + tarfile.NUL * tarfile.LENGTH_NAME + # mode, space, null terminator: 8 bytes + + b"000755" + SPACE + tarfile.NUL + # uid, space, null terminator: 8 bytes + + b"000001" + SPACE + tarfile.NUL + # gid, space, null terminator: 8 bytes + + b"000001" + SPACE + tarfile.NUL + # size, space: 12 bytes + + b"\xff" * 11 + SPACE + # mtime, space: 12 bytes + + tarfile.NUL * 11 + SPACE + # chksum: 8 bytes + + b"0011407" + tarfile.NUL + # type: 1 byte + + tarfile.REGTYPE + # linkname: 100 bytes + + tarfile.NUL * tarfile.LENGTH_LINK + # magic: 6 bytes, version: 2 bytes + + tarfile.POSIX_MAGIC + # uname: 32 bytes + + tarfile.NUL * 32 + # gname: 32 bytes + + tarfile.NUL * 32 + # devmajor, space, null terminator: 8 bytes + + tarfile.NUL * 6 + SPACE + tarfile.NUL + # devminor, space, null terminator: 8 bytes + + tarfile.NUL * 6 + SPACE + tarfile.NUL + # prefix: 155 bytes + + tarfile.NUL * tarfile.LENGTH_PREFIX + # padding: 12 bytes + + tarfile.NUL * 12 + ) + invalid_gnu_header = ( + # name: 100 bytes + tarfile.NUL * tarfile.LENGTH_NAME + # mode, null terminator: 8 bytes + + b"0000755" + tarfile.NUL + # uid, null terminator: 8 bytes + + b"0000001" + tarfile.NUL + # gid, space, null terminator: 8 bytes + + b"0000001" + tarfile.NUL + # size, space: 12 bytes + + b"\xff" * 11 + SPACE + # mtime, space: 12 bytes + + tarfile.NUL * 11 + SPACE + # chksum: 8 bytes + + b"0011327" + tarfile.NUL + # type: 1 byte + + tarfile.REGTYPE + # linkname: 100 bytes + + tarfile.NUL * tarfile.LENGTH_LINK + # magic: 8 bytes + + tarfile.GNU_MAGIC + # uname: 32 bytes + + tarfile.NUL * 32 + # gname: 32 bytes + + tarfile.NUL * 32 + # devmajor, null terminator: 8 bytes + + tarfile.NUL * 8 + # devminor, null terminator: 8 bytes + + tarfile.NUL * 8 + # padding: 167 bytes + + tarfile.NUL * 167 + ) + invalid_v7_header = ( + # name: 100 bytes + tarfile.NUL * tarfile.LENGTH_NAME + # mode, space, null terminator: 8 bytes + + b"000755" + SPACE + tarfile.NUL + # uid, space, null terminator: 8 bytes + + b"000001" + SPACE + tarfile.NUL + # gid, space, null terminator: 8 bytes + + b"000001" + SPACE + tarfile.NUL + # size, space: 12 bytes + + b"\xff" * 11 + SPACE + # mtime, space: 12 bytes + + tarfile.NUL * 11 + SPACE + # chksum: 8 bytes + + b"0010070" + tarfile.NUL + # type: 1 byte + + tarfile.REGTYPE + # linkname: 100 bytes + + tarfile.NUL * tarfile.LENGTH_LINK + # padding: 255 bytes + + tarfile.NUL * 255 + ) + valid_gnu_header = tarfile.TarInfo("filename").tobuf(tarfile.GNU_FORMAT) + data_block = b"\xff" * tarfile.BLOCKSIZE + + def _write_buffer(self, buffer): + with open(self.tarname, "wb") as f: + f.write(buffer) + + def _get_members(self, ignore_zeros=None): + with open(self.tarname, "rb") as f: + with tarfile.open( + mode="r", fileobj=f, ignore_zeros=ignore_zeros + ) as tar: + return tar.getmembers() + + def _assert_raises_read_error_exception(self): + with self.assertRaisesRegex( + tarfile.ReadError, "file could not be opened successfully" + ): + self._get_members() + + def test_invalid_offset_header_validations(self): + for tar_format, invalid_header in ( + ("posix", self.invalid_posix_header), + ("gnu", self.invalid_gnu_header), + ("v7", self.invalid_v7_header), + ): + with self.subTest(format=tar_format): + self._write_buffer(invalid_header) + self._assert_raises_read_error_exception() + + def test_early_stop_at_invalid_offset_header(self): + buffer = self.valid_gnu_header + self.invalid_gnu_header + self.valid_gnu_header + self._write_buffer(buffer) + members = self._get_members() + self.assertEqual(len(members), 1) + self.assertEqual(members[0].name, "filename") + self.assertEqual(members[0].offset, 0) + + def test_ignore_invalid_archive(self): + # 3 invalid headers with their respective data + buffer = (self.invalid_gnu_header + self.data_block) * 3 + self._write_buffer(buffer) + members = self._get_members(ignore_zeros=True) + self.assertEqual(len(members), 0) + + def test_ignore_invalid_offset_headers(self): + for first_block, second_block, expected_offset in ( + ( + (self.valid_gnu_header), + (self.invalid_gnu_header + self.data_block), + 0, + ), + ( + (self.invalid_gnu_header + self.data_block), + (self.valid_gnu_header), + 1024, + ), + ): + self._write_buffer(first_block + second_block) + members = self._get_members(ignore_zeros=True) + self.assertEqual(len(members), 1) + self.assertEqual(members[0].name, "filename") + self.assertEqual(members[0].offset, expected_offset) + + def setUpModule(): os_helper.unlink(TEMPDIR) os.makedirs(TEMPDIR) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-07-25-22-31-52.gh-issue-131338.zJDCMp.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-25-22-31-52.gh-issue-131338.zJDCMp.rst new file mode 100644 index 00000000000000..6c064e8f4a0339 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-25-22-31-52.gh-issue-131338.zJDCMp.rst @@ -0,0 +1,2 @@ +Disable computed stack limit checks on non-glibc linux platforms to fix +crashes on deep recursion. diff --git a/Misc/NEWS.d/next/Library/2025-07-23-00-35-29.gh-issue-130577.c7EITy.rst b/Misc/NEWS.d/next/Library/2025-07-23-00-35-29.gh-issue-130577.c7EITy.rst new file mode 100644 index 00000000000000..342cabbc865dc4 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-07-23-00-35-29.gh-issue-130577.c7EITy.rst @@ -0,0 +1,3 @@ +:mod:`tarfile` now validates archives to ensure member offsets are +non-negative. (Contributed by Alexander Enrique Urieles Nieto in +:gh:`130577`.) diff --git a/Modules/socketmodule.h b/Modules/socketmodule.h index 200b2b8c7d8310..7fd929af5f27b4 100644 --- a/Modules/socketmodule.h +++ b/Modules/socketmodule.h @@ -18,6 +18,10 @@ */ #ifdef AF_BTH # include +# ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wpragma-pack" +# endif # include /* @@ -51,7 +55,10 @@ struct SOCKADDR_BTH_REDEF { }; # include -#endif +# ifdef __clang__ +# pragma clang diagnostic pop +# endif +#endif /* AF_BTH */ /* Windows 'supports' CMSG_LEN, but does not follow the POSIX standard * interface at all, so there is no point including the code that diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 0ed52ac5e87b6e..928b905aaedb1b 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -1581,32 +1581,47 @@ _Py_dict_lookup_threadsafe(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyOb return ix; } +static Py_ssize_t +lookup_threadsafe_unicode(PyDictKeysObject *dk, PyObject *key, Py_hash_t hash, _PyStackRef *value_addr) +{ + assert(dk->dk_kind == DICT_KEYS_UNICODE); + assert(PyUnicode_CheckExact(key)); + + Py_ssize_t ix = unicodekeys_lookup_unicode_threadsafe(dk, key, hash); + if (ix == DKIX_EMPTY) { + *value_addr = PyStackRef_NULL; + return ix; + } + else if (ix >= 0) { + PyObject **addr_of_value = &DK_UNICODE_ENTRIES(dk)[ix].me_value; + PyObject *value = _Py_atomic_load_ptr(addr_of_value); + if (value == NULL) { + *value_addr = PyStackRef_NULL; + return DKIX_EMPTY; + } + if (_PyObject_HasDeferredRefcount(value)) { + *value_addr = (_PyStackRef){ .bits = (uintptr_t)value | Py_TAG_DEFERRED }; + return ix; + } + if (_Py_TryIncrefCompare(addr_of_value, value)) { + *value_addr = PyStackRef_FromPyObjectSteal(value); + return ix; + } + return DKIX_KEY_CHANGED; + } + assert(ix == DKIX_KEY_CHANGED); + return ix; +} + Py_ssize_t _Py_dict_lookup_threadsafe_stackref(PyDictObject *mp, PyObject *key, Py_hash_t hash, _PyStackRef *value_addr) { PyDictKeysObject *dk = _Py_atomic_load_ptr(&mp->ma_keys); if (dk->dk_kind == DICT_KEYS_UNICODE && PyUnicode_CheckExact(key)) { - Py_ssize_t ix = unicodekeys_lookup_unicode_threadsafe(dk, key, hash); - if (ix == DKIX_EMPTY) { - *value_addr = PyStackRef_NULL; + Py_ssize_t ix = lookup_threadsafe_unicode(dk, key, hash, value_addr); + if (ix != DKIX_KEY_CHANGED) { return ix; } - else if (ix >= 0) { - PyObject **addr_of_value = &DK_UNICODE_ENTRIES(dk)[ix].me_value; - PyObject *value = _Py_atomic_load_ptr(addr_of_value); - if (value == NULL) { - *value_addr = PyStackRef_NULL; - return DKIX_EMPTY; - } - if (_PyObject_HasDeferredRefcount(value)) { - *value_addr = (_PyStackRef){ .bits = (uintptr_t)value | Py_TAG_DEFERRED }; - return ix; - } - if (_Py_TryIncrefCompare(addr_of_value, value)) { - *value_addr = PyStackRef_FromPyObjectSteal(value); - return ix; - } - } } PyObject *obj; @@ -1646,6 +1661,46 @@ _Py_dict_lookup_threadsafe_stackref(PyDictObject *mp, PyObject *key, Py_hash_t h #endif +// Looks up the unicode key `key` in the dictionary. Note that `*method` may +// already contain a valid value! See _PyObject_GetMethodStackRef(). +int +_PyDict_GetMethodStackRef(PyDictObject *mp, PyObject *key, _PyStackRef *method) +{ + assert(PyUnicode_CheckExact(key)); + Py_hash_t hash = hash_unicode_key(key); + +#ifdef Py_GIL_DISABLED + PyDictKeysObject *dk = _Py_atomic_load_ptr_acquire(&mp->ma_keys); + if (dk->dk_kind == DICT_KEYS_UNICODE) { + _PyStackRef ref; + Py_ssize_t ix = lookup_threadsafe_unicode(dk, key, hash, &ref); + if (ix >= 0) { + assert(!PyStackRef_IsNull(ref)); + PyStackRef_XSETREF(*method, ref); + return 1; + } + else if (ix == DKIX_EMPTY) { + return 0; + } + assert(ix == DKIX_KEY_CHANGED); + } +#endif + + PyObject *obj; + Py_INCREF(mp); + Py_ssize_t ix = _Py_dict_lookup_threadsafe(mp, key, hash, &obj); + Py_DECREF(mp); + if (ix == DKIX_ERROR) { + PyStackRef_CLEAR(*method); + return -1; + } + else if (ix >= 0 && obj != NULL) { + PyStackRef_XSETREF(*method, PyStackRef_FromPyObjectSteal(obj)); + return 1; + } + return 0; // not found +} + int _PyDict_HasOnlyStringKeys(PyObject *dict) { diff --git a/Objects/object.c b/Objects/object.c index 3ed7d55593dffa..479f4176a46039 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -1753,20 +1753,15 @@ _PyObject_GetMethodStackRef(PyThreadState *ts, PyObject *obj, } } if (dict != NULL) { - // TODO: use _Py_dict_lookup_threadsafe_stackref - Py_INCREF(dict); - PyObject *value; - if (PyDict_GetItemRef(dict, name, &value) != 0) { - // found or error - Py_DECREF(dict); - PyStackRef_CLEAR(*method); - if (value != NULL) { - *method = PyStackRef_FromPyObjectSteal(value); - } + assert(PyUnicode_CheckExact(name)); + int found = _PyDict_GetMethodStackRef((PyDictObject *)dict, name, method); + if (found < 0) { + assert(PyStackRef_IsNull(*method)); + return -1; + } + else if (found) { return 0; } - // not found - Py_DECREF(dict); } if (meth_found) { diff --git a/Python/ceval.c b/Python/ceval.c index 291e753dec0ce5..9ccd42bdf0a55c 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -452,7 +452,11 @@ _Py_InitializeRecursionLimits(PyThreadState *tstate) _tstate->c_stack_soft_limit = _tstate->c_stack_hard_limit + _PyOS_STACK_MARGIN_BYTES; #else uintptr_t here_addr = _Py_get_machine_stack_pointer(); -# if defined(HAVE_PTHREAD_GETATTR_NP) && !defined(_AIX) && !defined(__NetBSD__) +/// XXX musl supports HAVE_PTHRED_GETATTR_NP, but the resulting stack size +/// (on alpine at least) is much smaller than expected and imposes undue limits +/// compared to the old stack size estimation. (We assume musl is not glibc.) +# if defined(HAVE_PTHREAD_GETATTR_NP) && !defined(_AIX) && \ + !defined(__NetBSD__) && (defined(__GLIBC__) || !defined(__linux__)) size_t stack_size, guard_size; void *stack_addr; pthread_attr_t attr;