diff --git a/xrspatial/geotiff/_header.py b/xrspatial/geotiff/_header.py
index 1bcb68be..7a1492cc 100644
--- a/xrspatial/geotiff/_header.py
+++ b/xrspatial/geotiff/_header.py
@@ -15,6 +15,25 @@
     UNDEFINED,
 )
 
+# Caps for IFD entries that aren't pixel-data offset or byte-count
+# arrays. Pixel-data arrays (TileOffsets, StripOffsets, ColorMap, etc.)
+# scale with image dimensions or bit depth and are exempt; metadata
+# tags (DateTime, Software, ImageDescription, GeoKeys, etc.) should
+# never come close to either bound. Two caps are enforced:
+#
+# - MAX_IFD_ENTRY_COUNT bounds the *number of elements* the parser will
+#   allocate as a Python tuple. struct.unpack_from of N items produces a
+#   tuple of N PyLong objects; even at small per-element byte sizes,
+#   the heap cost is dominated by the tuple, not the bytes.
+# - MAX_IFD_ENTRY_BYTES bounds the *value range read from the file*.
+#   This catches large-itemsize tags (DOUBLE = 8 bytes) where the count
+#   alone is misleading.
+#
+# Together they bound a single malformed entry's worst-case heap impact
+# to the smaller of a ~100,000-element tuple (a few MB) and 256 KiB read.
+MAX_IFD_ENTRY_COUNT = 100_000
+MAX_IFD_ENTRY_BYTES = 1 << 18  # 256 KiB
+
 # Well-known TIFF tag IDs
 TAG_NEW_SUBFILE_TYPE = 254
 TAG_IMAGE_WIDTH = 256
@@ -451,6 +470,17 @@ def parse_ifd(data: bytes | memoryview, offset: int,
     inline_max = 8 if is_big else 4
     entries = {}
 
+    # Tags whose `count` legitimately scales with image size (one entry per
+    # strip / tile / colormap slot). These are exempt from both caps.
+    pixel_array_tags = {
+        TAG_STRIP_OFFSETS,
+        TAG_STRIP_BYTE_COUNTS,
+        TAG_TILE_OFFSETS,
+        TAG_TILE_BYTE_COUNTS,
+        TAG_COLORMAP,
+    }
+    data_len = len(data)
+
     for i in range(num_entries):
         eo = entry_offset + i * entry_size
 
@@ -468,6 +498,24 @@
         type_size = TIFF_TYPE_SIZES.get(type_id, 1)
         total_size = count * type_size
 
+        # Reject absurd counts/sizes on non-pixel tags before any
+        # allocation. Pixel-data offset/byte-count arrays may legitimately
+        # be very large for high-resolution tiled images, so they're
+        # exempt. Both element-count and byte-range caps fire here.
+        if tag not in pixel_array_tags:
+            if count > MAX_IFD_ENTRY_COUNT:
+                raise ValueError(
+                    f"IFD entry tag={tag} count={count} exceeds "
+                    f"MAX_IFD_ENTRY_COUNT={MAX_IFD_ENTRY_COUNT}; refusing "
+                    f"to parse possibly malformed TIFF"
+                )
+            if total_size > MAX_IFD_ENTRY_BYTES:
+                raise ValueError(
+                    f"IFD entry tag={tag} value size {total_size} bytes "
+                    f"exceeds MAX_IFD_ENTRY_BYTES={MAX_IFD_ENTRY_BYTES}; "
+                    f"refusing to parse possibly malformed TIFF"
+                )
+
         if total_size <= inline_max:
             value = _read_value(data, value_area_offset, type_id, count, bo)
         else:
@@ -475,6 +523,16 @@
                 ptr = struct.unpack_from(f'{bo}Q', data, value_area_offset)[0]
             else:
                 ptr = struct.unpack_from(f'{bo}I', data, value_area_offset)[0]
+            # Bound-check the value range against file length before
+            # _read_value runs. ptr and total_size are both >= 0 (ptr is
+            # an unsigned uint32/uint64, total_size is count*type_size),
+            # so we only need the upper bound.
+            if ptr + total_size > data_len:
+                raise ValueError(
+                    f"IFD entry tag={tag} value range "
+                    f"[{ptr}, {ptr + total_size}) exceeds file length "
+                    f"{data_len}"
+                )
             value = _read_value(data, ptr, type_id, count, bo)
 
         entries[tag] = IFDEntry(tag=tag, type_id=type_id, count=count, value=value)
diff --git a/xrspatial/geotiff/tests/test_parse_ifd_bounds.py b/xrspatial/geotiff/tests/test_parse_ifd_bounds.py
new file mode 100644
index 00000000..34c80da7
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_parse_ifd_bounds.py
@@ -0,0 +1,248 @@
+"""Bounds-checks for IFD `count` and value range against file length.
+
+These tests cover the S2 audit finding: a crafted TIFF with a huge
+`count` on a non-pixel tag could force `parse_ifd` to ask
+`struct.unpack_from` for a multi-million-element tuple, allocating
+hundreds of MB to GB of memory before the parser ever returns. The fix
+adds three guards to `parse_ifd`:
+
+1. A hard cap (`MAX_IFD_ENTRY_COUNT`) on `count` for non-pixel tags.
+2. A hard cap (`MAX_IFD_ENTRY_BYTES`) on `count * type_size` for
+   non-pixel tags. Catches large-itemsize abuse (e.g. DOUBLE arrays)
+   where the count alone may pass.
+3. A bounds check that `value_offset + count * type_size <= len(data)`
+   before `_read_value` is called, for any tag with a non-inline value.
+
+Legitimate large pixel-array tags (TileOffsets, StripOffsets, etc.)
+must still parse without raising.
+
+Tests monkeypatch the cap constants down to small values where helpful,
+so they exercise the limits with tiny in-memory payloads instead of
+allocating hundreds of MB in CI.
+"""
+from __future__ import annotations
+
+import struct
+
+import pytest
+
+from xrspatial.geotiff import _header
+from xrspatial.geotiff._dtypes import DOUBLE, LONG, SHORT
+from xrspatial.geotiff._header import (
+    MAX_IFD_ENTRY_BYTES,
+    MAX_IFD_ENTRY_COUNT,
+    TAG_IMAGE_WIDTH,
+    TAG_TILE_BYTE_COUNTS,
+    TAG_TILE_OFFSETS,
+    parse_header,
+    parse_ifd,
+)
+
+
+def _build_tiff(
+    entries: list[tuple[int, int, int, bytes]],
+    tail_padding: int = 0,
+    external_payloads: list[tuple[int, bytes]] | None = None,
+) -> bytes:
+    """Build a classic little-endian TIFF with the given IFD entries.
+
+    Each entry is `(tag, type_id, count, value_field_bytes)` where
+    `value_field_bytes` is exactly 4 bytes that go into the IFD entry's
+    value/offset slot.
+
+    `external_payloads` is an optional list of `(absolute_offset,
+    bytes)` pairs that will be written to the file at those positions.
+    Used to place legal data referenced by an entry's value pointer.
+    """
+    bo = '<'
+    n = len(entries)
+    # Layout: header (8) + IFD (2 + n*12 + 4) + padding + payloads.
+    ifd_offset = 8
+    ifd_size = 2 + n * 12 + 4
+
+    end_of_ifd = ifd_offset + ifd_size
+    file_size = end_of_ifd + tail_padding
+    if external_payloads:
+        for off, payload in external_payloads:
+            file_size = max(file_size, off + len(payload))
+
+    buf = bytearray(file_size)
+    buf[0:2] = b'II'
+    struct.pack_into(f'{bo}H', buf, 2, 42)  # magic
+    struct.pack_into(f'{bo}I', buf, 4, ifd_offset)  # first IFD offset
+
+    struct.pack_into(f'{bo}H', buf, ifd_offset, n)
+    for i, (tag, type_id, count, value_bytes) in enumerate(entries):
+        eo = ifd_offset + 2 + i * 12
+        struct.pack_into(f'{bo}H', buf, eo, tag)
+        struct.pack_into(f'{bo}H', buf, eo + 2, type_id)
+        struct.pack_into(f'{bo}I', buf, eo + 4, count)
+        assert len(value_bytes) == 4, "value field must be exactly 4 bytes"
+        buf[eo + 8:eo + 12] = value_bytes
+    # next IFD offset = 0
+    struct.pack_into(f'{bo}I', buf, ifd_offset + 2 + n * 12, 0)
+
+    if external_payloads:
+        for off, payload in external_payloads:
+            buf[off:off + len(payload)] = payload
+
+    return bytes(buf)
+
+
+def test_count_overflow_rejected():
+    """A non-pixel tag with count > MAX_IFD_ENTRY_COUNT must raise."""
+    bad_count = MAX_IFD_ENTRY_COUNT + 1
+    # Tag IMAGE_WIDTH is not in the pixel-array exemption set.
+    # value pointer = 0 is irrelevant; the count check fires first.
+    data = _build_tiff(
+        entries=[(TAG_IMAGE_WIDTH, LONG, bad_count, b'\x00\x00\x00\x00')],
+        tail_padding=64,
+    )
+    header = parse_header(data)
+    with pytest.raises(ValueError, match="count"):
+        parse_ifd(data, header.first_ifd_offset, header)
+
+
+def test_byte_size_overflow_rejected_with_legal_count(monkeypatch):
+    """A count under the count cap but bytes over the byte cap raises.
+
+    Monkeypatches both caps to small values so the test exercises the
+    byte-cap path independently of the count-cap path. With count=33
+    and DOUBLE (8 bytes), bytes=264 > 256 (byte cap) while count < 100
+    (count cap), so only the byte cap fires.
+    """
+    monkeypatch.setattr(_header, 'MAX_IFD_ENTRY_COUNT', 100)
+    monkeypatch.setattr(_header, 'MAX_IFD_ENTRY_BYTES', 256)
+
+    count = 33
+    data = _build_tiff(
+        entries=[(TAG_IMAGE_WIDTH, DOUBLE, count, b'\x00\x00\x00\x00')],
+        tail_padding=64,
+    )
+    header = parse_header(data)
+    with pytest.raises(ValueError, match="bytes exceeds MAX_IFD_ENTRY_BYTES"):
+        parse_ifd(data, header.first_ifd_offset, header)
+
+
+def test_byte_cap_fires_at_default_values():
+    """Default caps catch a realistic large-itemsize abuse.
+
+    With default caps (count=100K, bytes=256KiB), a non-pixel DOUBLE
+    tag with count=32_770 has bytes=262_160 > 262_144 (256KiB) but
+    count is well under 100K. This proves the byte cap is independently
+    useful at production values, not just under monkeypatch.
+    """
+    count = (MAX_IFD_ENTRY_BYTES // 8) + 2
+    assert count < MAX_IFD_ENTRY_COUNT, (
+        "test must exercise byte cap, not count cap"
+    )
+    data = _build_tiff(
+        entries=[(TAG_IMAGE_WIDTH, DOUBLE, count, b'\x00\x00\x00\x00')],
+        tail_padding=64,
+    )
+    header = parse_header(data)
+    with pytest.raises(ValueError, match="bytes exceeds MAX_IFD_ENTRY_BYTES"):
+        parse_ifd(data, header.first_ifd_offset, header)
+
+
+def test_value_offset_past_eof_rejected():
+    """A non-inline value range that extends past EOF must raise."""
+    # 5 LONGs = 20 bytes -> non-inline. Place value pointer at a spot
+    # such that ptr + 20 > file_size.
+    count = 5
+    file_size_target = 200
+    ptr = file_size_target - 4  # only 4 bytes follow, need 20
+    value_bytes = struct.pack('