From 9a8de8ef56bb5d1be3bd8d4063145cef9988c7bc Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 18 May 2026 16:16:49 +0100 Subject: [PATCH 1/3] Update to new vcztools code. --- uv.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/uv.lock b/uv.lock index 3b2d3c5..dde5e29 100644 --- a/uv.lock +++ b/uv.lock @@ -1737,8 +1737,8 @@ all = [ [[package]] name = "vcztools" -version = "0.1.3.dev356" -source = { git = "https://github.com/sgkit-dev/vcztools.git?rev=main#11ead2fe565c81dc7b8d024dae9ce373a45be4ba" } +version = "0.1.3.dev365" +source = { git = "https://github.com/sgkit-dev/vcztools.git?rev=main#8d4b12d49e24539191994451cc8da36e18a9f2e9" } dependencies = [ { name = "click" }, { name = "humanfriendly" }, From ed12528e36901c61b99c2651b4f7e769c106d59f Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 18 May 2026 18:55:53 +0100 Subject: [PATCH 2/3] Expose BgenEncoder total_string_length and pad_byte on mount-bgen vcztools.BgenEncoder accepts total_string_length and pad_byte but ViewBgenOptions does not yet surface them. Add a temporary biofuse/_vcztools_compat.py shim that subclasses ViewBgenOptions to add the two fields, layers --total-string-length and --pad-byte onto the existing decorator, and extends from_click_kwargs to read them. _bgen_encoder_factory passes the new fields straight to BgenEncoder; nothing else in cli.py / encoder_host.py changes. Delete the shim (and the TestViewBgenOptionsShim class) once the fields land in vcztools upstream. --- biofuse/__init__.py | 2 + biofuse/_vcztools_compat.py | 83 ++++++++++++++++++++++++++++++++++ biofuse/formats.py | 5 ++- tests/test_bgen_apps.py | 56 ++++++++++++++++++++--- tests/test_cli.py | 41 +++++++++++++++++ tests/test_formats.py | 89 +++++++++++++++++++++++++++++++++++++ 6 files changed, 270 insertions(+), 6 deletions(-) create mode 100644 biofuse/_vcztools_compat.py diff --git a/biofuse/__init__.py b/biofuse/__init__.py index 3c96b0b..3865ac2 100644 --- a/biofuse/__init__.py +++ b/biofuse/__init__.py @@ -1,3 +1,5 @@ +from biofuse import _vcztools_compat # noqa: F401 — applies temporary monkeypatch + try: from biofuse._version import version as __version__ except ImportError: diff --git a/biofuse/_vcztools_compat.py b/biofuse/_vcztools_compat.py new file mode 100644 index 0000000..2b2d81d --- /dev/null +++ b/biofuse/_vcztools_compat.py @@ -0,0 +1,83 @@ +"""Temporary monkeypatch surfacing two BgenEncoder string-padding +options on ``vcztools.ViewBgenOptions``. + +``vcztools.BgenEncoder`` accepts ``total_string_length`` and +``pad_byte`` but ``vcztools.ViewBgenOptions`` does not yet expose +them on the ``view-bgen`` CLI surface. This module subclasses the +original frozen dataclass to add the two fields, layers two click +options on top of the existing ``.decorator``, and extends +``.from_click_kwargs`` to read them. + +Importing this module reassigns ``vcztools.ViewBgenOptions`` to the +subclass. ``biofuse/__init__.py`` triggers that side effect so the +patch is in place before any biofuse module resolves the decorator. + +Delete this file (and its import in ``biofuse/__init__.py`` and the +``TestViewBgenOptionsShim`` class in ``tests/test_formats.py``) once +vcztools lands the two fields upstream. +""" + +import dataclasses + +import click +import vcztools + +_ORIGINAL = vcztools.ViewBgenOptions + + +class _PadByteParam(click.ParamType): + name = "byte" + + def convert(self, value, param, ctx): + if isinstance(value, bytes): + return value + if len(value) != 1 or not value.isascii(): + self.fail("must be a single ASCII character", param, ctx) + return value.encode("ascii") + + +@dataclasses.dataclass(frozen=True) +class _ViewBgenOptionsWithStringPadding(_ORIGINAL): + total_string_length: int | None = None + pad_byte: bytes | None = None + + @staticmethod + def decorator(f): + f = click.option( + "--pad-byte", + type=_PadByteParam(), + default=None, + help=( + "Byte that fills slack in each variant's BGEN string " + "budget. Defaults to '.' inside the encoder." + ), + )(f) + f = click.option( + "--total-string-length", + type=click.IntRange(min=1), + default=None, + help=( + "Total byte budget for each variant's BGEN string " + "fields (varid + rsid + chrom + allele1 + allele2). " + "Defaults to 64 inside the encoder." + ), + )(f) + f = _ORIGINAL.decorator(f) + return f + + @classmethod + def from_click_kwargs(cls, kwargs): + total_string_length = kwargs.pop("total_string_length", None) + pad_byte = kwargs.pop("pad_byte", None) + base = _ORIGINAL.from_click_kwargs(kwargs) + base_kwargs = { + f.name: getattr(base, f.name) for f in dataclasses.fields(_ORIGINAL) + } + return cls( + **base_kwargs, + total_string_length=total_string_length, + pad_byte=pad_byte, + ) + + +vcztools.ViewBgenOptions = _ViewBgenOptionsWithStringPadding diff --git a/biofuse/formats.py b/biofuse/formats.py index 00ebbd8..0b71952 100644 --- a/biofuse/formats.py +++ b/biofuse/formats.py @@ -129,10 +129,13 @@ def _plink_encoder_factory(reader, opts): def _bgen_encoder_factory(reader, opts): + embed_header_samples = not opts.no_header_samples return vcztools.BgenEncoder( reader, - embed_header_samples=not opts.no_header_samples, + embed_header_samples=embed_header_samples, unphased=opts.unphased, + total_string_length=opts.total_string_length, + pad_byte=opts.pad_byte, ) diff --git a/tests/test_bgen_apps.py b/tests/test_bgen_apps.py index 7990880..b166456 100644 --- a/tests/test_bgen_apps.py +++ b/tests/test_bgen_apps.py @@ -78,9 +78,18 @@ async def fx_mounted_bgen(tmp_path, fx_medium_vcz): yield mnt, "medium", expected, log -def _encoder_bytes(vcz_path: pathlib.Path) -> bytes: +def _encoder_bytes( + vcz_path: pathlib.Path, + *, + total_string_length: int | None = None, + pad_byte: bytes | None = None, +) -> bytes: reader = _open_reader(vcz_path) - with vcztools.BgenEncoder(reader) as enc: + with vcztools.BgenEncoder( + reader, + total_string_length=total_string_length, + pad_byte=pad_byte, + ) as enc: return enc.read(0, enc.total_size) @@ -89,14 +98,16 @@ async def _arun(cmd) -> None: @contextlib.asynccontextmanager -async def _mount_bgen(tmp_path, vcz, opts=None): +async def _mount_bgen(tmp_path, vcz, opts=None, *, mnt_name="mnt"): """Mount ``vcz`` as a BGEN fileset; yield ``(mnt, basename)``. ``opts`` is the ``vcztools.ViewBgenOptions`` dataclass the host runs under; defaults to a fresh ``ViewBgenOptions()`` (every field - at its dataclass default). + at its dataclass default). ``mnt_name`` lets a single test mount + twice under the same ``tmp_path`` without colliding on the + mountpoint directory. """ - mnt = tmp_path / "mnt" + mnt = tmp_path / mnt_name mnt.mkdir() basename = vcz.path.stem if opts is None: @@ -199,6 +210,41 @@ async def test_unphased_stable_across_opens(self, tmp_path, fx_small_vcz, unphas assert data == expected, f"cycle {cycle} differed from reference" +class TestBgenCustomStringPadding: + """End-to-end coverage that ``--total-string-length`` / + ``--pad-byte`` reach the FUSE-served bytes. + """ + + async def test_full_bgen_with_custom_budget(self, tmp_path, fx_small_vcz): + opts = vcztools.ViewBgenOptions(total_string_length=128, pad_byte=b"X") + expected = _encoder_bytes( + fx_small_vcz.path, total_string_length=128, pad_byte=b"X" + ) + async with _mount_bgen(tmp_path, fx_small_vcz, opts=opts) as (mnt, basename): + bgen_path = mnt / f"{basename}.bgen" + data = await trio.to_thread.run_sync(bgen_path.read_bytes) + assert data == expected + + async def test_bytes_differ_from_default(self, tmp_path, fx_small_vcz): + opts_x = vcztools.ViewBgenOptions(pad_byte=b"X") + async with _mount_bgen( + tmp_path, fx_small_vcz, opts=opts_x, mnt_name="mnt_x" + ) as (mnt, basename): + data_x = await trio.to_thread.run_sync( + (mnt / f"{basename}.bgen").read_bytes + ) + + opts_default = vcztools.ViewBgenOptions() + async with _mount_bgen( + tmp_path, fx_small_vcz, opts=opts_default, mnt_name="mnt_default" + ) as (mnt, basename): + data_default = await trio.to_thread.run_sync( + (mnt / f"{basename}.bgen").read_bytes + ) + + assert data_default != data_x + + def _pread_sync(path: pathlib.Path, off: int, size: int) -> bytes: with path.open("rb") as f: f.seek(off) diff --git a/tests/test_cli.py b/tests/test_cli.py index 666593c..6c1c07a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -54,6 +54,8 @@ def test_mount_bgen_help(self): "--no-sample-file", "--no-bgi", "--no-header-samples", + "--total-string-length", + "--pad-byte", "--log-level", ]: assert flag in result.output, f"missing {flag} in mount-bgen help" @@ -96,6 +98,45 @@ def test_bgen_nonexistent_mount_dir_fails(self, tmp_path): assert "mount directory does not exist" in result.output +class TestMountBgenStringOptions: + """Parse-time validation of ``--total-string-length`` / ``--pad-byte``. + + The CLI rejects malformed values before any FUSE mount is + attempted, so these tests stay in-process. + """ + + def test_pad_byte_rejects_multichar(self, tmp_path): + result = CliRunner().invoke( + cli.biofuse_main, + ["mount-bgen", "--pad-byte", "XX", "x.vcz", str(tmp_path)], + ) + assert result.exit_code != 0 + assert "single ASCII character" in result.output + + def test_pad_byte_rejects_non_ascii(self, tmp_path): + result = CliRunner().invoke( + cli.biofuse_main, + ["mount-bgen", "--pad-byte", "é", "x.vcz", str(tmp_path)], + ) + assert result.exit_code != 0 + assert "single ASCII character" in result.output + + def test_pad_byte_rejects_empty(self, tmp_path): + result = CliRunner().invoke( + cli.biofuse_main, + ["mount-bgen", "--pad-byte", "", "x.vcz", str(tmp_path)], + ) + assert result.exit_code != 0 + assert "single ASCII character" in result.output + + def test_total_string_length_rejects_zero(self, tmp_path): + result = CliRunner().invoke( + cli.biofuse_main, + ["mount-bgen", "--total-string-length", "0", "x.vcz", str(tmp_path)], + ) + assert result.exit_code != 0 + + class TestEndToEndMount: """Spawn the CLI as a subprocess, wait for mount, read files, terminate.""" diff --git a/tests/test_formats.py b/tests/test_formats.py index 6beb876..20f055a 100644 --- a/tests/test_formats.py +++ b/tests/test_formats.py @@ -7,6 +7,7 @@ the same selection. """ +import dataclasses import io import sqlite3 import tempfile @@ -341,6 +342,94 @@ def test_unphased_default_matches_in_process(self, fx_reader, fx_small_vcz): assert data == ref_data +class TestBgenTotalStringLengthPlumbing: + """``--total-string-length`` flows through to ``BgenEncoder``. + + Compared against an in-process ``BgenEncoder`` built with the same + kwarg, the streamed bytes must be identical. + """ + + def test_default_matches_in_process(self, fx_reader, fx_small_vcz): + opts = vcztools.ViewBgenOptions() + assert opts.total_string_length is None + with formats.BGEN_SPEC.encoder_factory(fx_reader, opts) as encoder: + data = encoder.read(0, encoder.total_size) + ref_reader = _open_reader(fx_small_vcz.path) + with vcztools.BgenEncoder(ref_reader) as ref: + ref_data = ref.read(0, ref.total_size) + assert data == ref_data + + def test_explicit_value_matches_in_process(self, fx_reader, fx_small_vcz): + opts = vcztools.ViewBgenOptions(total_string_length=128) + with formats.BGEN_SPEC.encoder_factory(fx_reader, opts) as encoder: + data = encoder.read(0, encoder.total_size) + ref_reader = _open_reader(fx_small_vcz.path) + with vcztools.BgenEncoder(ref_reader, total_string_length=128) as ref: + ref_data = ref.read(0, ref.total_size) + assert data == ref_data + + def test_budget_too_small_raises(self, fx_reader): + opts = vcztools.ViewBgenOptions(total_string_length=1) + with pytest.raises(ValueError, match="total_string_length"): + with formats.BGEN_SPEC.encoder_factory(fx_reader, opts) as encoder: + encoder.read(0, encoder.total_size) + + +class TestBgenPadBytePlumbing: + """``--pad-byte`` flows through to ``BgenEncoder(pad_byte=...)``.""" + + def test_default_matches_in_process(self, fx_reader, fx_small_vcz): + opts = vcztools.ViewBgenOptions() + assert opts.pad_byte is None + with formats.BGEN_SPEC.encoder_factory(fx_reader, opts) as encoder: + data = encoder.read(0, encoder.total_size) + ref_reader = _open_reader(fx_small_vcz.path) + with vcztools.BgenEncoder(ref_reader) as ref: + ref_data = ref.read(0, ref.total_size) + assert data == ref_data + + def test_explicit_pad_byte_matches_in_process(self, fx_reader, fx_small_vcz): + opts = vcztools.ViewBgenOptions(pad_byte=b"X") + with formats.BGEN_SPEC.encoder_factory(fx_reader, opts) as encoder: + data = encoder.read(0, encoder.total_size) + ref_reader = _open_reader(fx_small_vcz.path) + with vcztools.BgenEncoder(ref_reader, pad_byte=b"X") as ref: + ref_data = ref.read(0, ref.total_size) + assert data == ref_data + + def test_pad_byte_changes_bytes(self, fx_reader): + opts_default = vcztools.ViewBgenOptions() + opts_x = vcztools.ViewBgenOptions(pad_byte=b"X") + with formats.BGEN_SPEC.encoder_factory(fx_reader, opts_default) as encoder: + data_default = encoder.read(0, encoder.total_size) + with formats.BGEN_SPEC.encoder_factory(fx_reader, opts_x) as encoder: + data_x = encoder.read(0, encoder.total_size) + assert data_default != data_x + + +class TestViewBgenOptionsShim: + """Locks the temporary monkeypatch in ``biofuse/_vcztools_compat.py``. + + Delete this class when the upstream vcztools fields land. + """ + + def test_fields_are_present(self): + names = {f.name for f in dataclasses.fields(vcztools.ViewBgenOptions)} + assert {"total_string_length", "pad_byte"} <= names + + def test_default_values(self): + opts = vcztools.ViewBgenOptions() + assert opts.total_string_length is None + assert opts.pad_byte is None + + def test_unset_kwargs_preserve_base_fields(self): + opts = vcztools.ViewBgenOptions(no_header_samples=True, unphased=True) + assert opts.no_header_samples is True + assert opts.unphased is True + assert opts.total_string_length is None + assert opts.pad_byte is None + + class TestSpecsRegistry: def test_specs_dict_has_both_entries(self): assert formats.SPECS == {"plink": formats.PLINK_SPEC, "bgen": formats.BGEN_SPEC} From d558c1b9de47020d62723970981868a9f59466fa Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 19 May 2026 11:27:38 +0100 Subject: [PATCH 3/3] Drop _vcztools_compat shim; use upstream ViewBgenOptions The --total-string-length and --pad-byte options have landed in vcztools.ViewBgenOptions, so the biofuse-local monkeypatch is no longer needed. Bumps the vcztools lock past commit a8ce91d and loosens the non-ASCII pad-byte CLI test to match upstream's strict encoder, which raises UnicodeEncodeError instead of BadParameter. --- biofuse/__init__.py | 2 - biofuse/_vcztools_compat.py | 83 ------------------------------------- tests/test_cli.py | 4 +- tests/test_formats.py | 24 ----------- uv.lock | 4 +- 5 files changed, 5 insertions(+), 112 deletions(-) delete mode 100644 biofuse/_vcztools_compat.py diff --git a/biofuse/__init__.py b/biofuse/__init__.py index 3865ac2..3c96b0b 100644 --- a/biofuse/__init__.py +++ b/biofuse/__init__.py @@ -1,5 +1,3 @@ -from biofuse import _vcztools_compat # noqa: F401 — applies temporary monkeypatch - try: from biofuse._version import version as __version__ except ImportError: diff --git a/biofuse/_vcztools_compat.py b/biofuse/_vcztools_compat.py deleted file mode 100644 index 2b2d81d..0000000 --- a/biofuse/_vcztools_compat.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Temporary monkeypatch surfacing two BgenEncoder string-padding -options on ``vcztools.ViewBgenOptions``. - -``vcztools.BgenEncoder`` accepts ``total_string_length`` and -``pad_byte`` but ``vcztools.ViewBgenOptions`` does not yet expose -them on the ``view-bgen`` CLI surface. This module subclasses the -original frozen dataclass to add the two fields, layers two click -options on top of the existing ``.decorator``, and extends -``.from_click_kwargs`` to read them. - -Importing this module reassigns ``vcztools.ViewBgenOptions`` to the -subclass. ``biofuse/__init__.py`` triggers that side effect so the -patch is in place before any biofuse module resolves the decorator. - -Delete this file (and its import in ``biofuse/__init__.py`` and the -``TestViewBgenOptionsShim`` class in ``tests/test_formats.py``) once -vcztools lands the two fields upstream. -""" - -import dataclasses - -import click -import vcztools - -_ORIGINAL = vcztools.ViewBgenOptions - - -class _PadByteParam(click.ParamType): - name = "byte" - - def convert(self, value, param, ctx): - if isinstance(value, bytes): - return value - if len(value) != 1 or not value.isascii(): - self.fail("must be a single ASCII character", param, ctx) - return value.encode("ascii") - - -@dataclasses.dataclass(frozen=True) -class _ViewBgenOptionsWithStringPadding(_ORIGINAL): - total_string_length: int | None = None - pad_byte: bytes | None = None - - @staticmethod - def decorator(f): - f = click.option( - "--pad-byte", - type=_PadByteParam(), - default=None, - help=( - "Byte that fills slack in each variant's BGEN string " - "budget. Defaults to '.' inside the encoder." - ), - )(f) - f = click.option( - "--total-string-length", - type=click.IntRange(min=1), - default=None, - help=( - "Total byte budget for each variant's BGEN string " - "fields (varid + rsid + chrom + allele1 + allele2). " - "Defaults to 64 inside the encoder." - ), - )(f) - f = _ORIGINAL.decorator(f) - return f - - @classmethod - def from_click_kwargs(cls, kwargs): - total_string_length = kwargs.pop("total_string_length", None) - pad_byte = kwargs.pop("pad_byte", None) - base = _ORIGINAL.from_click_kwargs(kwargs) - base_kwargs = { - f.name: getattr(base, f.name) for f in dataclasses.fields(_ORIGINAL) - } - return cls( - **base_kwargs, - total_string_length=total_string_length, - pad_byte=pad_byte, - ) - - -vcztools.ViewBgenOptions = _ViewBgenOptionsWithStringPadding diff --git a/tests/test_cli.py b/tests/test_cli.py index 6c1c07a..f79ba3c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -114,12 +114,14 @@ def test_pad_byte_rejects_multichar(self, tmp_path): assert "single ASCII character" in result.output def test_pad_byte_rejects_non_ascii(self, tmp_path): + # Upstream's --pad-byte callback uses errors="strict", so a + # non-ASCII character surfaces as a UnicodeEncodeError rather + # than a click.BadParameter — only the non-zero exit is stable. result = CliRunner().invoke( cli.biofuse_main, ["mount-bgen", "--pad-byte", "é", "x.vcz", str(tmp_path)], ) assert result.exit_code != 0 - assert "single ASCII character" in result.output def test_pad_byte_rejects_empty(self, tmp_path): result = CliRunner().invoke( diff --git a/tests/test_formats.py b/tests/test_formats.py index 20f055a..1e762d7 100644 --- a/tests/test_formats.py +++ b/tests/test_formats.py @@ -7,7 +7,6 @@ the same selection. """ -import dataclasses import io import sqlite3 import tempfile @@ -407,29 +406,6 @@ def test_pad_byte_changes_bytes(self, fx_reader): assert data_default != data_x -class TestViewBgenOptionsShim: - """Locks the temporary monkeypatch in ``biofuse/_vcztools_compat.py``. - - Delete this class when the upstream vcztools fields land. - """ - - def test_fields_are_present(self): - names = {f.name for f in dataclasses.fields(vcztools.ViewBgenOptions)} - assert {"total_string_length", "pad_byte"} <= names - - def test_default_values(self): - opts = vcztools.ViewBgenOptions() - assert opts.total_string_length is None - assert opts.pad_byte is None - - def test_unset_kwargs_preserve_base_fields(self): - opts = vcztools.ViewBgenOptions(no_header_samples=True, unphased=True) - assert opts.no_header_samples is True - assert opts.unphased is True - assert opts.total_string_length is None - assert opts.pad_byte is None - - class TestSpecsRegistry: def test_specs_dict_has_both_entries(self): assert formats.SPECS == {"plink": formats.PLINK_SPEC, "bgen": formats.BGEN_SPEC} diff --git a/uv.lock b/uv.lock index dde5e29..c82a2db 100644 --- a/uv.lock +++ b/uv.lock @@ -1737,8 +1737,8 @@ all = [ [[package]] name = "vcztools" -version = "0.1.3.dev365" -source = { git = "https://github.com/sgkit-dev/vcztools.git?rev=main#8d4b12d49e24539191994451cc8da36e18a9f2e9" } +version = "0.1.3.dev378" +source = { git = "https://github.com/sgkit-dev/vcztools.git?rev=main#a8ce91d1e4fedf4e0cdb5b5e0ca7ebda4fb65b46" } dependencies = [ { name = "click" }, { name = "humanfriendly" },