From 635a694daf81d5296d25d85b93e1a8e90d115c9b Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 8 May 2026 13:41:50 -0700 Subject: [PATCH 1/2] Bounds-check IFD count to prevent DoS on crafted TIFFs `parse_ifd` previously trusted an entry's `count` field unconditionally and passed it straight into `struct.unpack_from(f'{bo}{count}{fmt}', ...)`. A crafted TIFF with `count=5e7` on a LONG tag and a value pointer aimed at a multi-MB padding region forced the parser to materialize a ~1.4 GB tuple of Python ints before any caller-side validation ran. This change adds two guards inside the per-entry loop: 1. Reject `count > MAX_IFD_ENTRY_COUNT` (10_000_000) for tags that are not pixel-data offset/byte-count arrays. STRIP_OFFSETS, STRIP_BYTE_COUNTS, TILE_OFFSETS, TILE_BYTE_COUNTS, and COLORMAP are exempt because their counts legitimately scale with image size. 2. For non-inline values, verify that `value_offset + count * type_size <= len(data)` before calling `_read_value`. Both checks raise `ValueError` before any allocation. Threat model is untrusted TIFF input (web download, fsspec source, third-party catalog, user upload). Tests added in test_parse_ifd_bounds.py: - count over the cap on a non-pixel tag is rejected - value range past EOF is rejected - pixel-array tag with a huge count is allowed - legal small IFDs still parse - TileByteCounts SHORT path exercised --- xrspatial/geotiff/_header.py | 39 ++++ .../geotiff/tests/test_parse_ifd_bounds.py | 194 ++++++++++++++++++ 2 files changed, 233 insertions(+) create mode 100644 xrspatial/geotiff/tests/test_parse_ifd_bounds.py diff --git a/xrspatial/geotiff/_header.py b/xrspatial/geotiff/_header.py index 1bcb68be..d32f9269 100644 --- a/xrspatial/geotiff/_header.py +++ b/xrspatial/geotiff/_header.py @@ -15,6 +15,15 @@ UNDEFINED, ) +# Maximum allowed `count` for IFD entries that aren't pixel-data offset or +# byte-count arrays. 
Pixel-data arrays scale with image dimensions and a +# legitimate large image may have millions of tiles; metadata tags +# (DateTime, Software, ImageDescription, GeoKeys, etc.) should never be +# anywhere near this size. Picked well above any realistic metadata +# payload but small enough to bound the allocation a parser can be forced +# to perform from a single malformed entry. +MAX_IFD_ENTRY_COUNT = 10_000_000 + # Well-known TIFF tag IDs TAG_NEW_SUBFILE_TYPE = 254 TAG_IMAGE_WIDTH = 256 @@ -451,6 +460,17 @@ def parse_ifd(data: bytes | memoryview, offset: int, inline_max = 8 if is_big else 4 entries = {} + # Tags whose `count` legitimately scales with image size (one entry per + # strip / tile / colormap slot). These are exempt from MAX_IFD_ENTRY_COUNT. + pixel_array_tags = { + TAG_STRIP_OFFSETS, + TAG_STRIP_BYTE_COUNTS, + TAG_TILE_OFFSETS, + TAG_TILE_BYTE_COUNTS, + TAG_COLORMAP, + } + data_len = len(data) + for i in range(num_entries): eo = entry_offset + i * entry_size @@ -465,6 +485,16 @@ def parse_ifd(data: bytes | memoryview, offset: int, count = struct.unpack_from(f'{bo}I', data, eo + 4)[0] value_area_offset = eo + 8 + # Reject absurd counts on non-pixel tags before any allocation. + # Pixel-data offset/byte-count arrays may legitimately be very + # large for high-resolution tiled images, so they're exempt. 
+ if tag not in pixel_array_tags and count > MAX_IFD_ENTRY_COUNT: + raise ValueError( + f"IFD entry tag={tag} count={count} exceeds " + f"MAX_IFD_ENTRY_COUNT={MAX_IFD_ENTRY_COUNT}; refusing to " + f"parse possibly malformed TIFF" + ) + type_size = TIFF_TYPE_SIZES.get(type_id, 1) total_size = count * type_size @@ -475,6 +505,15 @@ def parse_ifd(data: bytes | memoryview, offset: int, ptr = struct.unpack_from(f'{bo}Q', data, value_area_offset)[0] else: ptr = struct.unpack_from(f'{bo}I', data, value_area_offset)[0] + # Bound-check the value range against file length before + # _read_value triggers struct.unpack_from on a possibly huge + # count that would allocate a multi-GB tuple. + end = ptr + total_size + if ptr < 0 or end > data_len: + raise ValueError( + f"IFD entry tag={tag} value range " + f"[{ptr}, {end}) exceeds file length {data_len}" + ) value = _read_value(data, ptr, type_id, count, bo) entries[tag] = IFDEntry(tag=tag, type_id=type_id, count=count, value=value) diff --git a/xrspatial/geotiff/tests/test_parse_ifd_bounds.py b/xrspatial/geotiff/tests/test_parse_ifd_bounds.py new file mode 100644 index 00000000..aa201c62 --- /dev/null +++ b/xrspatial/geotiff/tests/test_parse_ifd_bounds.py @@ -0,0 +1,194 @@ +"""Bounds-checks for IFD `count` against file length and a hard cap. + +These tests cover the S2 audit finding: a crafted TIFF with a huge +`count` on a non-pixel tag could force `parse_ifd` to ask +`struct.unpack_from` for a multi-million-element tuple, allocating +hundreds of MB to GB of memory before the parser ever returns. The fix +adds two guards to `parse_ifd`: + +1. A hard cap (`MAX_IFD_ENTRY_COUNT`) on `count` for tags that aren't + pixel-data offset/byte-count arrays. +2. A bounds check that the value range + `[value_offset, value_offset + count * type_size)` falls inside the + file before `_read_value` is called. + +Legitimate large pixel-array tags (TileOffsets, StripOffsets, etc.) +must still parse without raising. 
+""" +from __future__ import annotations + +import struct + +import pytest + +from xrspatial.geotiff._dtypes import LONG, SHORT +from xrspatial.geotiff._header import ( + MAX_IFD_ENTRY_COUNT, + TAG_IMAGE_WIDTH, + TAG_TILE_OFFSETS, + TAG_TILE_BYTE_COUNTS, + parse_header, + parse_ifd, +) + + +def _build_tiff(entries: list[tuple[int, int, int, bytes]], + tail_padding: int = 0, + external_payloads: list[tuple[int, bytes]] | None = None + ) -> bytes: + """Build a classic little-endian TIFF with the given IFD entries. + + Each entry is `(tag, type_id, count, value_field_bytes)` where + `value_field_bytes` is exactly 4 bytes that go into the IFD entry's + value/offset slot. + + `external_payloads` is an optional list of `(absolute_offset, + bytes)` pairs that will be written to the file at those positions. + Used to place legal data referenced by an entry's value pointer. + """ + bo = '<' + n = len(entries) + # Layout: header (8) + IFD (2 + n*12 + 4) + padding + payloads. + ifd_offset = 8 + ifd_size = 2 + n * 12 + 4 + + # Determine total size needed. 
+ end_of_ifd = ifd_offset + ifd_size + file_size = end_of_ifd + tail_padding + if external_payloads: + for off, payload in external_payloads: + file_size = max(file_size, off + len(payload)) + + buf = bytearray(file_size) + buf[0:2] = b'II' + struct.pack_into(f'{bo}H', buf, 2, 42) # magic + struct.pack_into(f'{bo}I', buf, 4, ifd_offset) # first IFD offset + + struct.pack_into(f'{bo}H', buf, ifd_offset, n) + for i, (tag, type_id, count, value_bytes) in enumerate(entries): + eo = ifd_offset + 2 + i * 12 + struct.pack_into(f'{bo}H', buf, eo, tag) + struct.pack_into(f'{bo}H', buf, eo + 2, type_id) + struct.pack_into(f'{bo}I', buf, eo + 4, count) + assert len(value_bytes) == 4, "value field must be exactly 4 bytes" + buf[eo + 8:eo + 12] = value_bytes + # next IFD offset = 0 + struct.pack_into(f'{bo}I', buf, ifd_offset + 2 + n * 12, 0) + + if external_payloads: + for off, payload in external_payloads: + buf[off:off + len(payload)] = payload + + return bytes(buf) + + +def test_count_overflow_rejected(): + """A non-pixel tag with count > MAX_IFD_ENTRY_COUNT must raise.""" + bad_count = MAX_IFD_ENTRY_COUNT + 1 + # Tag IMAGE_WIDTH is not in the pixel-array exemption set. + # Use type LONG (4 bytes), value pointer = 0 (irrelevant; we never + # reach the read because the count check fires first). + data = _build_tiff( + entries=[(TAG_IMAGE_WIDTH, LONG, bad_count, b'\x00\x00\x00\x00')], + tail_padding=64, + ) + header = parse_header(data) + with pytest.raises(ValueError, match="count"): + parse_ifd(data, header.first_ifd_offset, header) + + +def test_value_offset_past_eof_rejected(): + """A non-inline value range that extends past EOF must raise.""" + # 5 LONGs = 20 bytes -> non-inline. Place value pointer at a spot + # such that ptr + 20 > file_size. + count = 5 + file_size_target = 200 # we'll build a small file + # Use a pointer that's just inside the file but with not enough + # room for count * type_size bytes following it. 
+ ptr = file_size_target - 4 # only 4 bytes follow, need 20 + value_bytes = struct.pack(' MAX_IFD_ENTRY_COUNT` parse fine. + + A 100k tile count is well within legitimate territory for a large + tiled COG, and is below the cap anyway. To exercise the exemption + path we additionally craft a TileOffsets entry whose count exceeds + MAX_IFD_ENTRY_COUNT but whose value pointer references a region + that is genuinely present in the file. The test makes sure no + ValueError is raised on the count check; we use a small but + above-cap value backed by real file bytes. + """ + # Use a count just above the cap. + count = MAX_IFD_ENTRY_COUNT + 10 + type_size = 4 # LONG + payload_size = count * type_size + # Place payload right after the IFD. + payload_offset = 8 + 2 + 12 * 2 + 4 # header + IFD with 2 entries + payload = b'\x00' * payload_size + value_bytes = struct.pack(' Date: Fri, 8 May 2026 18:21:06 -0700 Subject: [PATCH 2/2] Address Copilot review on #1527 Tighten the IFD-entry caps and clean up test scaffolding: - Lower MAX_IFD_ENTRY_COUNT 10_000_000 -> 100_000. Even at the lower bound a malformed entry can still force ~3 MB of Python tuple allocation for LONG values; 100K is well above any legitimate metadata count and bounds the worst case to ~30 MB. - Add MAX_IFD_ENTRY_BYTES = 256 KiB cap on `count * type_size` for non-pixel tags. Catches large-itemsize abuse (e.g. DOUBLE arrays) where the count alone passes but the bytes are excessive. Default caps are now independently useful: count cap stops integer-array attacks, byte cap stops large-element-size attacks. - Remove the dead `ptr < 0` branch on the value-range check; ptr is unpacked via unsigned `I`/`Q` so it can never be negative. - Reformat _build_tiff signature in the test to one-arg-per-line so the closing paren lines up cleanly (prior layout tripped E128/E124). 
- Replace test_huge_pixel_array_count_allowed (which allocated 40 MB and forced a 10M-int tuple in CI) with monkeypatch-based tests that exercise the cap logic with tiny payloads. Add a second test that proves the byte cap fires at default cap values, so the production thresholds stay covered. --- xrspatial/geotiff/_header.py | 65 ++++++--- .../geotiff/tests/test_parse_ifd_bounds.py | 134 ++++++++++++------ 2 files changed, 136 insertions(+), 63 deletions(-) diff --git a/xrspatial/geotiff/_header.py b/xrspatial/geotiff/_header.py index d32f9269..7a1492cc 100644 --- a/xrspatial/geotiff/_header.py +++ b/xrspatial/geotiff/_header.py @@ -15,14 +15,24 @@ UNDEFINED, ) -# Maximum allowed `count` for IFD entries that aren't pixel-data offset or -# byte-count arrays. Pixel-data arrays scale with image dimensions and a -# legitimate large image may have millions of tiles; metadata tags -# (DateTime, Software, ImageDescription, GeoKeys, etc.) should never be -# anywhere near this size. Picked well above any realistic metadata -# payload but small enough to bound the allocation a parser can be forced -# to perform from a single malformed entry. -MAX_IFD_ENTRY_COUNT = 10_000_000 +# Caps for IFD entries that aren't pixel-data offset or byte-count +# arrays. Pixel-data arrays (TileOffsets, StripOffsets, etc.) scale with +# image dimensions and are exempt; metadata tags (DateTime, Software, +# ImageDescription, GeoKeys, ColorMap when treated as pixel data, etc.) +# should never come close to either bound. Two caps are enforced: +# +# - MAX_IFD_ENTRY_COUNT bounds the *number of elements* the parser will +# allocate as a Python tuple. struct.unpack_from of N items produces a +# tuple of N PyLong objects; even at small per-element byte sizes, +# the heap cost is dominated by the tuple, not the bytes. +# - MAX_IFD_ENTRY_BYTES bounds the *value range read from the file*. +# This catches large-itemsize tags (DOUBLE = 8 bytes) where the count +# alone is misleading. 
+#
+# Together they bound a single malformed entry's worst-case heap impact
+# to the lower of (~tens of MB tuple) and (256 KiB raw bytes read).
+MAX_IFD_ENTRY_COUNT = 100_000
+MAX_IFD_ENTRY_BYTES = 1 << 18  # 256 KiB
 
 # Well-known TIFF tag IDs
 TAG_NEW_SUBFILE_TYPE = 254
@@ -485,19 +495,27 @@ def parse_ifd(data: bytes | memoryview, offset: int,
         count = struct.unpack_from(f'{bo}I', data, eo + 4)[0]
         value_area_offset = eo + 8
 
-        # Reject absurd counts on non-pixel tags before any allocation.
-        # Pixel-data offset/byte-count arrays may legitimately be very
-        # large for high-resolution tiled images, so they're exempt.
-        if tag not in pixel_array_tags and count > MAX_IFD_ENTRY_COUNT:
-            raise ValueError(
-                f"IFD entry tag={tag} count={count} exceeds "
-                f"MAX_IFD_ENTRY_COUNT={MAX_IFD_ENTRY_COUNT}; refusing to "
-                f"parse possibly malformed TIFF"
-            )
-
         type_size = TIFF_TYPE_SIZES.get(type_id, 1)
         total_size = count * type_size
 
+        # Reject absurd counts/sizes on non-pixel tags before any
+        # allocation. Pixel-data offset/byte-count arrays may legitimately
+        # be very large for high-resolution tiled images, so they're
+        # exempt. Both element-count and byte-range caps fire here.
+ if tag not in pixel_array_tags: + if count > MAX_IFD_ENTRY_COUNT: + raise ValueError( + f"IFD entry tag={tag} count={count} exceeds " + f"MAX_IFD_ENTRY_COUNT={MAX_IFD_ENTRY_COUNT}; refusing " + f"to parse possibly malformed TIFF" + ) + if total_size > MAX_IFD_ENTRY_BYTES: + raise ValueError( + f"IFD entry tag={tag} value size {total_size} bytes " + f"exceeds MAX_IFD_ENTRY_BYTES={MAX_IFD_ENTRY_BYTES}; " + f"refusing to parse possibly malformed TIFF" + ) + if total_size <= inline_max: value = _read_value(data, value_area_offset, type_id, count, bo) else: @@ -506,13 +524,14 @@ def parse_ifd(data: bytes | memoryview, offset: int, else: ptr = struct.unpack_from(f'{bo}I', data, value_area_offset)[0] # Bound-check the value range against file length before - # _read_value triggers struct.unpack_from on a possibly huge - # count that would allocate a multi-GB tuple. - end = ptr + total_size - if ptr < 0 or end > data_len: + # _read_value runs. ptr and total_size are both >= 0 (ptr is + # an unsigned uint32/uint64, total_size is count*type_size), + # so we only need the upper bound. + if ptr + total_size > data_len: raise ValueError( f"IFD entry tag={tag} value range " - f"[{ptr}, {end}) exceeds file length {data_len}" + f"[{ptr}, {ptr + total_size}) exceeds file length " + f"{data_len}" ) value = _read_value(data, ptr, type_id, count, bo) diff --git a/xrspatial/geotiff/tests/test_parse_ifd_bounds.py b/xrspatial/geotiff/tests/test_parse_ifd_bounds.py index aa201c62..34c80da7 100644 --- a/xrspatial/geotiff/tests/test_parse_ifd_bounds.py +++ b/xrspatial/geotiff/tests/test_parse_ifd_bounds.py @@ -1,19 +1,24 @@ -"""Bounds-checks for IFD `count` against file length and a hard cap. +"""Bounds-checks for IFD `count` and value range against file length. 
These tests cover the S2 audit finding: a crafted TIFF with a huge `count` on a non-pixel tag could force `parse_ifd` to ask `struct.unpack_from` for a multi-million-element tuple, allocating hundreds of MB to GB of memory before the parser ever returns. The fix -adds two guards to `parse_ifd`: +adds three guards to `parse_ifd`: -1. A hard cap (`MAX_IFD_ENTRY_COUNT`) on `count` for tags that aren't - pixel-data offset/byte-count arrays. -2. A bounds check that the value range - `[value_offset, value_offset + count * type_size)` falls inside the - file before `_read_value` is called. +1. A hard cap (`MAX_IFD_ENTRY_COUNT`) on `count` for non-pixel tags. +2. A hard cap (`MAX_IFD_ENTRY_BYTES`) on `count * type_size` for + non-pixel tags. Catches large-itemsize abuse (e.g. DOUBLE arrays) + where the count alone may pass. +3. A bounds check that `value_offset + count * type_size <= len(data)` + before `_read_value` is called, for any tag with a non-inline value. Legitimate large pixel-array tags (TileOffsets, StripOffsets, etc.) must still parse without raising. + +Tests monkeypatch the cap constants down to small values where helpful, +so they exercise the limits with tiny in-memory payloads instead of +allocating hundreds of MB in CI. 
""" from __future__ import annotations @@ -21,21 +26,24 @@ import pytest -from xrspatial.geotiff._dtypes import LONG, SHORT +from xrspatial.geotiff import _header +from xrspatial.geotiff._dtypes import DOUBLE, LONG, SHORT from xrspatial.geotiff._header import ( + MAX_IFD_ENTRY_BYTES, MAX_IFD_ENTRY_COUNT, TAG_IMAGE_WIDTH, - TAG_TILE_OFFSETS, TAG_TILE_BYTE_COUNTS, + TAG_TILE_OFFSETS, parse_header, parse_ifd, ) -def _build_tiff(entries: list[tuple[int, int, int, bytes]], - tail_padding: int = 0, - external_payloads: list[tuple[int, bytes]] | None = None - ) -> bytes: +def _build_tiff( + entries: list[tuple[int, int, int, bytes]], + tail_padding: int = 0, + external_payloads: list[tuple[int, bytes]] | None = None, +) -> bytes: """Build a classic little-endian TIFF with the given IFD entries. Each entry is `(tag, type_id, count, value_field_bytes)` where @@ -52,7 +60,6 @@ def _build_tiff(entries: list[tuple[int, int, int, bytes]], ifd_offset = 8 ifd_size = 2 + n * 12 + 4 - # Determine total size needed. end_of_ifd = ifd_offset + ifd_size file_size = end_of_ifd + tail_padding if external_payloads: @@ -86,8 +93,7 @@ def test_count_overflow_rejected(): """A non-pixel tag with count > MAX_IFD_ENTRY_COUNT must raise.""" bad_count = MAX_IFD_ENTRY_COUNT + 1 # Tag IMAGE_WIDTH is not in the pixel-array exemption set. - # Use type LONG (4 bytes), value pointer = 0 (irrelevant; we never - # reach the read because the count check fires first). + # value pointer = 0 is irrelevant; the count check fires first. data = _build_tiff( entries=[(TAG_IMAGE_WIDTH, LONG, bad_count, b'\x00\x00\x00\x00')], tail_padding=64, @@ -97,50 +103,84 @@ def test_count_overflow_rejected(): parse_ifd(data, header.first_ifd_offset, header) +def test_byte_size_overflow_rejected_with_legal_count(monkeypatch): + """A count under the count cap but bytes over the byte cap raises. + + Monkeypatches both caps to small values so the test exercises the + byte-cap path independently of the count-cap path. 
With count=33 + and DOUBLE (8 bytes), bytes=264 > 256 (byte cap) while count < 100 + (count cap), so only the byte cap fires. + """ + monkeypatch.setattr(_header, 'MAX_IFD_ENTRY_COUNT', 100) + monkeypatch.setattr(_header, 'MAX_IFD_ENTRY_BYTES', 256) + + count = 33 + data = _build_tiff( + entries=[(TAG_IMAGE_WIDTH, DOUBLE, count, b'\x00\x00\x00\x00')], + tail_padding=64, + ) + header = parse_header(data) + with pytest.raises(ValueError, match="bytes exceeds MAX_IFD_ENTRY_BYTES"): + parse_ifd(data, header.first_ifd_offset, header) + + +def test_byte_cap_fires_at_default_values(): + """Default caps catch a realistic large-itemsize abuse. + + With default caps (count=100K, bytes=256KiB), a non-pixel DOUBLE + tag with count=32_770 has bytes=262_160 > 262_144 (256KiB) but + count is well under 100K. This proves the byte cap is independently + useful at production values, not just under monkeypatch. + """ + count = (MAX_IFD_ENTRY_BYTES // 8) + 2 + assert count < MAX_IFD_ENTRY_COUNT, ( + "test must exercise byte cap, not count cap" + ) + data = _build_tiff( + entries=[(TAG_IMAGE_WIDTH, DOUBLE, count, b'\x00\x00\x00\x00')], + tail_padding=64, + ) + header = parse_header(data) + with pytest.raises(ValueError, match="bytes exceeds MAX_IFD_ENTRY_BYTES"): + parse_ifd(data, header.first_ifd_offset, header) + + def test_value_offset_past_eof_rejected(): """A non-inline value range that extends past EOF must raise.""" # 5 LONGs = 20 bytes -> non-inline. Place value pointer at a spot # such that ptr + 20 > file_size. count = 5 - file_size_target = 200 # we'll build a small file - # Use a pointer that's just inside the file but with not enough - # room for count * type_size bytes following it. + file_size_target = 200 ptr = file_size_target - 4 # only 4 bytes follow, need 20 value_bytes = struct.pack(' MAX_IFD_ENTRY_COUNT` parse fine. +def test_pixel_array_tag_exemption(monkeypatch): + """Pixel-array tags bypass both caps; a tiny lowered cap proves it. 
- A 100k tile count is well within legitimate territory for a large - tiled COG, and is below the cap anyway. To exercise the exemption - path we additionally craft a TileOffsets entry whose count exceeds - MAX_IFD_ENTRY_COUNT but whose value pointer references a region - that is genuinely present in the file. The test makes sure no - ValueError is raised on the count check; we use a small but - above-cap value backed by real file bytes. + Monkeypatches both caps to 10 so the test exercises the exemption + without allocating large buffers. With non-pixel tags, count=11 + would raise; with TileOffsets it must pass. """ - # Use a count just above the cap. - count = MAX_IFD_ENTRY_COUNT + 10 + monkeypatch.setattr(_header, 'MAX_IFD_ENTRY_COUNT', 10) + monkeypatch.setattr(_header, 'MAX_IFD_ENTRY_BYTES', 16) + + count = 11 type_size = 4 # LONG payload_size = count * type_size - # Place payload right after the IFD. - payload_offset = 8 + 2 + 12 * 2 + 4 # header + IFD with 2 entries + # Place payload right after the IFD with 2 entries. + payload_offset = 8 + 2 + 12 * 2 + 4 payload = b'\x00' * payload_size value_bytes = struct.pack('