diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b79c72f3..6d1027b2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: include: - - {os: windows-latest, python: "3.11", dask-version: "2025.12.0", name: "min dask"} + - {os: windows-latest, python: "3.11", dask-version: "2026.3.0", name: "min dask"} - {os: windows-latest, python: "3.14", dask-version: "latest"} - {os: ubuntu-latest, python: "3.11", dask-version: "latest"} - {os: ubuntu-latest, python: "3.14", dask-version: "latest"} diff --git a/pyproject.toml b/pyproject.toml index 91aa0d70..353811c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,8 +26,8 @@ dependencies = [ "annsel>=0.1.2", "click", "dask-image", - "dask>=2025.12.0,<2026.1.2", - "distributed<2026.1.2", + "dask>=2026.3.0", + "distributed>=2026.3.0", "datashader", "fsspec[s3,http]", "geopandas>=0.14", @@ -35,7 +35,7 @@ dependencies = [ "networkx", "numba>=0.55.0", "numpy", - "ome_zarr>=0.14.0", + "ome_zarr>=0.16.0", "pandas", "pooch", "pyarrow", diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 760736c6..38a694f4 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -1108,6 +1108,7 @@ def write( update_sdata_path: bool = True, sdata_formats: SpatialDataFormatType | list[SpatialDataFormatType] | None = None, shapes_geometry_encoding: Literal["WKB", "geoarrow"] | None = None, + compressor: dict[Literal["lz4", "zstd"], int] | None = None, ) -> None: """ Write the `SpatialData` object to a Zarr store. @@ -1155,11 +1156,17 @@ def write( shapes_geometry_encoding Whether to use the WKB or geoarrow encoding for GeoParquet. See :meth:`geopandas.GeoDataFrame.to_parquet` for details. If None, uses the value from :attr:`spatialdata.settings.shapes_geometry_encoding`. 
+ compressor + A length-1 dictionary with as key the type of compression to use for images and labels and as value the + compression level which should be inclusive between 0 and 9. For compression, `lz4` and `zstd` are + supported. If not specified, the compression will be `lz4` with compression level 5. Bytes are automatically + ordered for more efficient compression. """ - from spatialdata._io._utils import _resolve_zarr_store + from spatialdata._io._utils import _resolve_zarr_store, _validate_compressor_args from spatialdata._io.format import _parse_formats parsed = _parse_formats(sdata_formats) + _validate_compressor_args(compressor) if isinstance(file_path, str): file_path = Path(file_path) @@ -1181,6 +1188,7 @@ def write( overwrite=False, parsed_formats=parsed, shapes_geometry_encoding=shapes_geometry_encoding, + compressor=compressor, ) if self.path != file_path and update_sdata_path: @@ -1198,6 +1206,7 @@ def _write_element( overwrite: bool, parsed_formats: dict[str, SpatialDataFormatType] | None = None, shapes_geometry_encoding: Literal["WKB", "geoarrow"] | None = None, + compressor: dict[Literal["lz4", "zstd"], int] | None = None, ) -> None: from spatialdata._io.io_zarr import _get_groups_for_element @@ -1236,6 +1245,7 @@ def _write_element( group=element_group, name=element_name, element_format=parsed_formats["raster"], + compressor=compressor, ) elif element_type == "labels": write_labels( @@ -1243,6 +1253,7 @@ def _write_element( group=root_group, name=element_name, element_format=parsed_formats["raster"], + compressor=compressor, ) elif element_type == "points": write_points( @@ -1273,6 +1284,7 @@ def write_element( overwrite: bool = False, sdata_formats: SpatialDataFormatType | list[SpatialDataFormatType] | None = None, shapes_geometry_encoding: Literal["WKB", "geoarrow"] | None = None, + compressor: dict[Literal["lz4", "zstd"], int] | None = None, ) -> None: """ Write a single element, or a list of elements, to the Zarr store used for backing. 
@@ -1291,6 +1303,11 @@ def write_element( shapes_geometry_encoding Whether to use the WKB or geoarrow encoding for GeoParquet. See :meth:`geopandas.GeoDataFrame.to_parquet` for details. If None, uses the value from :attr:`spatialdata.settings.shapes_geometry_encoding`. + compressor + A length-1 dictionary with as key the type of compression to use for images and labels and as value the + compression level which should be inclusive between 0 and 9. For compression, `lz4` and `zstd` are + supported. If not specified, the compression will be `lz4` with compression level 5. Bytes are automatically + ordered for more efficient compression. Notes ----- @@ -1309,6 +1326,7 @@ def write_element( overwrite=overwrite, sdata_formats=sdata_formats, shapes_geometry_encoding=shapes_geometry_encoding, + compressor=compressor, ) return @@ -1344,6 +1362,7 @@ def write_element( overwrite=overwrite, parsed_formats=parsed_formats, shapes_geometry_encoding=shapes_geometry_encoding, + compressor=compressor, ) # After every write, metadata should be consolidated, otherwise this can lead to IO problems like when deleting. if self.has_consolidated_metadata(): diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 6690d111..71bdb0da 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -545,3 +545,22 @@ def handle_read_errors( else: # on_bad_files == BadFileHandleMethod.ERROR # Let it raise exceptions yield + + +def _validate_compressor_args(compressor_dict: dict[Literal["lz4", "zstd"], int] | None) -> None: + if compressor_dict: + if not isinstance(compressor_dict, dict): + raise TypeError( + f"Expected a dictionary with as key the type of compression to use for images and labels and " + f"as value the compression level which should be inclusive between 0 and 9. 
" + f"Got type: {type(compressor_dict)}" + ) + if len(compressor_dict) != 1: + raise ValueError( + "Expected a dictionary with a single key indicating the type of compression, either 'lz4' or " + "'zstd' and an `int` inclusive between 1 and 9 as value representing the compression level." + ) + if (compression := list(compressor_dict.keys())[0]) not in ["lz4", "zstd"]: + raise ValueError(f"Compression must either be `lz4` or `zstd`, got: {compression}.") + if not isinstance(value := list(compressor_dict.values())[0], int) or not (0 <= value <= 9): + raise ValueError(f"The compression level must be an integer inclusive between 0 and 9. Got: {value}") diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index 4bba7887..a20ca6c2 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -2,7 +2,7 @@ from collections.abc import Sequence from pathlib import Path -from typing import Any, Literal, TypeGuard +from typing import Any, Literal, TypeGuard, cast import dask.array as da import numpy as np @@ -265,6 +265,7 @@ def _write_raster( name: str, raster_format: RasterFormatType, storage_options: JSONDict | list[JSONDict] | None = None, + compressor: dict[Literal["lz4", "zstd"], int] | None = None, label_metadata: JSONDict | None = None, **metadata: str | JSONDict | list[JSONDict], ) -> None: @@ -284,6 +285,8 @@ def _write_raster( The format used to write the raster data. storage_options Additional options for writing the raster data, like chunks and compression. + compressor + Compression settings as a len-1 dictionary with a single key-value {compression: compression level} pair label_metadata Label metadata which can only be defined when writing 'labels'. 
metadata @@ -313,6 +316,7 @@ def _write_raster( raster_data, raster_format, storage_options, + compressor=compressor, **metadata, ) elif isinstance(raster_data, DataTree): @@ -323,6 +327,7 @@ def _write_raster( raster_data, raster_format, storage_options, + compressor=compressor, **metadata, ) else: @@ -337,13 +342,94 @@ def _write_raster( group.attrs[ATTRS_KEY] = attrs +def _build_v3_codec( + compression: Literal["lz4", "zstd"], + compression_level: int, +) -> Any: + """Return the appropriate zarr v3 codec for the given compression type and level.""" + if compression == "zstd": + from zarr.codecs import ZstdCodec + + return ZstdCodec(level=compression_level) + # lz4: use the native zarr v3 BloscCodec + from zarr.codecs import BloscCodec + + return BloscCodec(cname="lz4", clevel=compression_level) + + +def _apply_compression( + storage_options: JSONDict | list[JSONDict], + compressor: dict[Literal["lz4", "zstd"], int] | None, + zarr_format: Literal[2, 3] = 3, +) -> JSONDict | list[JSONDict]: + """Apply compression settings to storage options. 
+ + Parameters + ---------- + storage_options + Storage options for zarr arrays + compressor + Compression settings as a dictionary with a single key-value pair + zarr_format + The zarr format version (2 or 3) + + Returns + ------- + Updated storage options with compression settings + """ + if not compressor: + return storage_options + + ((compression, compression_level),) = compressor.items() + + if zarr_format == 2: + from numcodecs import Blosc as BloscV2 + + assert BloscV2.SHUFFLE == 1 + codec_v2 = BloscV2(cname=compression, clevel=compression_level, shuffle=1) + + def _update_dict(d: dict[str, Any]) -> None: + d["compressor"] = codec_v2 + + if isinstance(storage_options, dict): + _update_dict(d=storage_options) + elif isinstance(storage_options, list): + for option in storage_options: + _update_dict(d=option) + elif storage_options is None: + return {"compressor": codec_v2} + else: + raise ValueError(f"storage_options must be a dict or list, not {type(storage_options)}") + else: + # zarr v3: use native codec objects via the "compressors" (plural) key. + # see https://github.com/ome/ome-zarr-py/blob/v0.16.0/ome_zarr/writer.py#L754 + # ome-zarr-py ≥ 0.16.0 with dask ≥ 2026.3.0 forwards this key to zarr_array_kwargs. 
+ codec_v3 = _build_v3_codec(compression, compression_level) + + def _update_dict_v3(d: dict[str, Any]) -> None: + d["compressors"] = [codec_v3] + + if isinstance(storage_options, dict): + _update_dict_v3(d=storage_options) + elif isinstance(storage_options, list): + for option in storage_options: + _update_dict_v3(d=option) + elif storage_options is None: + return {"compressors": [codec_v3]} + else: + raise ValueError(f"storage_options must be a dict or list, not {type(storage_options)}") + + return storage_options + + def _write_raster_dataarray( raster_type: Literal["image", "labels"], group: zarr.Group, element_name: str, raster_data: DataArray, raster_format: RasterFormatType, - storage_options: JSONDict | list[JSONDict] | None = None, + storage_options: JSONDict | list[JSONDict] | None, + compressor: dict[Literal["lz4", "zstd"], int] | None, **metadata: str | JSONDict | list[JSONDict], ) -> None: """Write raster data of type DataArray to disk. @@ -362,6 +448,8 @@ def _write_raster_dataarray( The format used to write the raster data. storage_options Additional options for writing the raster data, like chunks and compression. + compressor + Compression settings as a len-1 dictionary with a single key-value {compression: compression level} pair metadata Additional metadata for the raster element """ @@ -373,6 +461,11 @@ def _write_raster_dataarray( input_axes: tuple[str, ...] = tuple(raster_data.dims) parsed_axes = _get_valid_axes(axes=list(input_axes), fmt=raster_format) storage_options = _prepare_storage_options(storage_options) + # Apply compression if specified + storage_options = _apply_compression( + storage_options, compressor, zarr_format=cast(Literal[2, 3], raster_format.zarr_format) + ) + # Explicitly disable pyramid generation for single-scale rasters. Recent ome-zarr versions default # write_image()/write_labels() to scale_factors=(2, 4, 8, 16), which would otherwise write s0, s1, ... # even when the input is a plain DataArray. 
@@ -406,7 +499,8 @@ def _write_raster_datatree( element_name: str, raster_data: DataTree, raster_format: RasterFormatType, - storage_options: JSONDict | list[JSONDict] | None = None, + storage_options: JSONDict | list[JSONDict] | None, + compressor: dict[Literal["lz4", "zstd"], int] | None, **metadata: str | JSONDict | list[JSONDict], ) -> zarr.Group: """Write raster data of type DataTree to disk. @@ -425,6 +519,8 @@ def _write_raster_datatree( The format used to write the raster data. storage_options Additional options for writing the raster data, like chunks and compression. + compressor + Compression settings as a len-1 dictionary with a single key-value {compression: compression level} pair metadata Additional metadata for the raster element """ @@ -442,6 +538,10 @@ def _write_raster_datatree( parsed_axes = _get_valid_axes(axes=list(input_axes), fmt=raster_format) storage_options = _prepare_storage_options(storage_options) + + # Apply compression if specified + storage_options = _apply_compression(storage_options, compressor, zarr_format=raster_format.zarr_format) + ome_zarr_format = get_ome_zarr_format(raster_format) dask_delayed = write_multi_scale_ngff( pyramid=data, @@ -483,6 +583,7 @@ def write_image( name: str, element_format: RasterFormatType = CurrentRasterFormat(), storage_options: JSONDict | list[JSONDict] | None = None, + compressor: dict[Literal["lz4", "zstd"], int] | None = None, **metadata: str | JSONDict | list[JSONDict], ) -> None: _write_raster( @@ -492,6 +593,7 @@ def write_image( name=name, raster_format=element_format, storage_options=storage_options, + compressor=compressor, **metadata, ) @@ -503,6 +605,7 @@ def write_labels( element_format: RasterFormatType = CurrentRasterFormat(), storage_options: JSONDict | list[JSONDict] | None = None, label_metadata: JSONDict | None = None, + compressor: dict[Literal["lz4", "zstd"], int] | None = None, **metadata: JSONDict, ) -> None: _write_raster( @@ -512,6 +615,7 @@ def write_labels( name=name, 
raster_format=element_format, storage_options=storage_options, + compressor=compressor, label_metadata=label_metadata, **metadata, ) diff --git a/tests/io/test_readwrite.py b/tests/io/test_readwrite.py index f1dd22a3..07643000 100644 --- a/tests/io/test_readwrite.py +++ b/tests/io/test_readwrite.py @@ -244,6 +244,74 @@ def test_roundtrip( sdata2.write(tmpdir2, sdata_formats=sdata_container_format) _are_directories_identical(tmpdir, tmpdir2, exclude_regexp="[1-9][0-9]*.*") + def test_compression_roundtrip( + self, + tmp_path: str, + full_sdata: SpatialData, + sdata_container_format: SpatialDataContainerFormatType, + ): + tmpdir = Path(tmp_path) / "tmp.zarr" + with pytest.raises(TypeError, match="Expected a dictionary with as"): + full_sdata.write(tmpdir, compressor="faulty", sdata_formats=sdata_container_format) + with pytest.raises(ValueError, match="Expected a dictionary with a single"): + full_sdata.write(tmpdir, compressor={"zstd": 8, "other_item": 4}, sdata_formats=sdata_container_format) + with pytest.raises(ValueError, match="Compression must either"): + full_sdata.write(tmpdir, compressor={"faulty": 8}, sdata_formats=sdata_container_format) + with pytest.raises(ValueError, match="The compression level"): + full_sdata.write(tmpdir, compressor={"zstd": 10}, sdata_formats=sdata_container_format) + + full_sdata.write(tmpdir, compressor={"zstd": 8}, sdata_formats=sdata_container_format) + + # sourcery skip: no-loop-in-tests + for element in ["image2d", "image2d_multiscale", "labels2d", "labels2d_multiscale"]: + element_type = "images" if element.startswith("image") else "labels" + arr = zarr.open_group(tmpdir / element_type, mode="r")[element]["s0"] + compressor = arr.compressors[0] + + if sdata_container_format.zarr_format == 2: + assert compressor.cname == "zstd" + assert compressor.clevel == 8 + elif sdata_container_format.zarr_format == 3: + from zarr.codecs.zstd import ZstdCodec + + assert isinstance(compressor, ZstdCodec) + assert 
compressor.level == 8 + + @pytest.mark.parametrize("compressor", [{"lz4": 3}, {"zstd": 7}]) + @pytest.mark.parametrize("element", [("images", "image2d"), ("labels", "labels2d")]) + def test_write_element_compression( + self, + tmp_path: str, + full_sdata: SpatialData, + compressor: dict[Literal["lz4", "zstd"], int], + element: str, + sdata_container_format: SpatialDataContainerFormatType, + ): + tmpdir = Path(tmp_path) / "compression.zarr" + sdata = SpatialData() + sdata.write(tmpdir, sdata_formats=sdata_container_format) + + sdata["element"] = full_sdata[element[1]] + sdata.write_element("element", compressor=compressor, sdata_formats=sdata_container_format) + + arr = zarr.open_group(tmpdir / element[0], mode="r")["element"]["s0"] + compression = arr.compressors[0] + + if sdata_container_format.zarr_format == 2: + assert compression.cname == list(compressor.keys())[0] + assert compression.clevel == list(compressor.values())[0] + elif sdata_container_format.zarr_format == 3: + from zarr.codecs import BloscCodec, ZstdCodec + + compressor_name = list(compressor.keys())[0] + compressor_level = list(compressor.values())[0] + if compressor_name == "zstd": + assert isinstance(compression, ZstdCodec) + assert compression.level == compressor_level + elif compressor_name == "lz4": + assert isinstance(compression, BloscCodec) + assert compression.clevel == compressor_level + def test_incremental_io_list_of_elements( self, shapes: SpatialData,