Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ You can find a summary of the available variables and their defaults below.
| `MDIO__EXPORT__CPU_COUNT` | `int` | Number of logical CPUs available |
| `MDIO__GRID__SPARSITY_RATIO_WARN` | `float` | `2.0` |
| `MDIO__GRID__SPARSITY_RATIO_LIMIT` | `float` | `10.0` |
| `MDIO__IMPORT__SAVE_SEGY_FILE_HEADER` | `bool` | `False` |
| `MDIO__IMPORT__SAVE_SEGY_FILE_HEADER` | `int` | `0` |
| `MDIO__IMPORT__CLOUD_NATIVE` | `bool` | `False` |
| `MDIO__IMPORT__RAW_HEADERS` | `bool` | `False` |
| `MDIO_IGNORE_CHECKS` | `bool` | `False` |
Expand Down Expand Up @@ -71,13 +71,29 @@ $ export MDIO__GRID__SPARSITY_RATIO_LIMIT=15.0

### `MDIO__IMPORT__SAVE_SEGY_FILE_HEADER`

**Accepted values:** `true`, `false`, `1`, `0`, `yes`, `no`, `on`, `off`
**Accepted values:** `0`, `1`, `2`, `true`, `false`, `yes`, `no`, `on`, `off`

Controls preservation of the original SEG-Y textual file header during import.
The textual file header must be 40 lines of 80 printable characters per the
SEG-Y standard; lossy EBCDIC decoding can produce headers that violate this
layout. The variable selects how MDIO reacts:

| Value | Behavior |
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `0` / `false` | Do not save SEG-Y file headers (default). |
| `1` / `true` | Save SEG-Y file headers and raise `ValueError` if the text header is not exactly 40x80 ASCII-printable characters (rejects e.g. `U+FFFD` from a lossy EBCDIC decode). |
| `2` | Save SEG-Y file headers; if the text header is malformed, log a warning and correct it (non-ASCII or non-printable characters become spaces; rows are padded or truncated to the 40x80 layout). |

When enabled, preserves the original SEG-Y textual file header during import.
This is useful for maintaining full SEG-Y standard compliance and preserving survey metadata.
```{note}
On export, `mdio_to_segy` always defensively validates the stored text header
and, if it cannot be re-encoded as ASCII (for example because the store was
written by an older version of MDIO that accepted lossy EBCDIC decodes),
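repairs it on the fly and emits a warning. Re-ingest the source SEG-Y with
mode `2` to store a corrected header and silence the warning permanently
(mode `1` also works, but only when the source header passes validation;
otherwise it raises `ValueError`).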
```

```shell
$ export MDIO__IMPORT__SAVE_SEGY_FILE_HEADER=true
$ export MDIO__IMPORT__SAVE_SEGY_FILE_HEADER=1
$ mdio segy import input.segy output.mdio --header-locations 189,193
```

Expand Down
31 changes: 17 additions & 14 deletions src/mdio/converters/segy.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
from mdio.converters.exceptions import GridTraceCountError
from mdio.converters.exceptions import GridTraceSparsityError
from mdio.converters.type_converter import to_structured_type
from mdio.core.config import SAVE_SEGY_FILE_HEADER_LENIENT
from mdio.core.config import SAVE_SEGY_FILE_HEADER_OFF
from mdio.core.config import SAVE_SEGY_FILE_HEADER_STRICT
from mdio.core.config import MDIOSettings
from mdio.core.grid import Grid
from mdio.core.utils_write import MAX_COORDINATES_BYTES
Expand All @@ -39,6 +42,8 @@
from mdio.segy.file import get_segy_file_info
from mdio.segy.scalar import SCALE_COORDINATE_KEYS
from mdio.segy.scalar import _apply_coordinate_scalar
from mdio.segy.text_header import sanitize_text_header
from mdio.segy.text_header import validate_text_header
from mdio.segy.utilities import get_grid_plan

if TYPE_CHECKING:
Expand Down Expand Up @@ -537,28 +542,26 @@ def _populate_coordinates(

def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file_info: SegyFileInfo) -> xr_Dataset:
settings = MDIOSettings()
mode = settings.save_segy_file_header

if not settings.save_segy_file_header:
if mode == SAVE_SEGY_FILE_HEADER_OFF:
return xr_dataset

expected_rows = 40
expected_cols = 80
text_header = segy_file_info.text_header

text_header_rows = segy_file_info.text_header.splitlines()
text_header_cols_bad = [len(row) != expected_cols for row in text_header_rows]

if len(text_header_rows) != expected_rows:
err = f"Invalid text header count: expected {expected_rows}, got {len(segy_file_info.text_header)}"
raise ValueError(err)

if any(text_header_cols_bad):
err = f"Invalid text header columns: expected {expected_cols} per line."
raise ValueError(err)
if mode == SAVE_SEGY_FILE_HEADER_LENIENT:
try:
validate_text_header(text_header)
except ValueError as exc:
logger.warning("Correcting malformed SEG-Y text header on import: %s", exc)
text_header = sanitize_text_header(text_header)
elif mode == SAVE_SEGY_FILE_HEADER_STRICT:
validate_text_header(text_header)

xr_dataset["segy_file_header"] = ((), "")
xr_dataset["segy_file_header"].attrs.update(
{
"textHeader": segy_file_info.text_header,
"textHeader": text_header,
"binaryHeader": segy_file_info.binary_header_dict,
}
)
Expand Down
43 changes: 40 additions & 3 deletions src/mdio/core/config.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,26 @@
"""Environment variable management for MDIO operations."""

from typing import Literal

from psutil import cpu_count
from pydantic import Field
from pydantic import field_validator
from pydantic_settings import BaseSettings
from pydantic_settings import SettingsConfigDict

# Modes accepted by ``MDIO__IMPORT__SAVE_SEGY_FILE_HEADER``.
# 0: do not save SEG-Y file headers.
SAVE_SEGY_FILE_HEADER_OFF = 0
# 1: save SEG-Y file headers and raise on a malformed text header.
SAVE_SEGY_FILE_HEADER_STRICT = 1
# 2: save SEG-Y file headers and correct a malformed text header.
SAVE_SEGY_FILE_HEADER_LENIENT = 2

# ``Literal`` only accepts literal values for static type checkers (PEP 586), so
# the integers are spelled out here instead of referencing the constants above.
# Keep this in sync with SAVE_SEGY_FILE_HEADER_OFF / _STRICT / _LENIENT.
SaveSegyFileHeaderMode = Literal[0, 1, 2]

# Lower-cased string aliases: true-like values select STRICT, false-like select OFF.
_SAVE_HEADER_TRUE_STRINGS = frozenset({"true", "yes", "on"})
_SAVE_HEADER_FALSE_STRINGS = frozenset({"false", "no", "off"})


class MDIOSettings(BaseSettings):
"""MDIO environment configuration settings."""
Expand Down Expand Up @@ -34,9 +50,12 @@ class MDIOSettings(BaseSettings):
)

# Import configuration
save_segy_file_header: bool = Field(
default=False,
description="Whether to save SEG-Y file headers",
save_segy_file_header: SaveSegyFileHeaderMode = Field(
default=0,
description=(
"How to save SEG-Y file headers: 0 (or False) skips, 1 (or True) saves "
"and raises on malformed text header, 2 saves and corrects malformed text header."
),
alias="MDIO__IMPORT__SAVE_SEGY_FILE_HEADER",
)
raw_headers: bool = Field(
Expand All @@ -58,3 +77,21 @@ class MDIOSettings(BaseSettings):
)

model_config = SettingsConfigDict(case_sensitive=True)

@field_validator("save_segy_file_header", mode="before")
@classmethod
def _coerce_save_segy_file_header(cls, value: object) -> object:
    """Normalize raw input for the header mode before field validation runs.

    Booleans become ``0``/``1``. Strings are matched case-insensitively (after
    trimming whitespace) against the false-like and true-like aliases, then
    tried as an integer. Anything that cannot be coerced is returned untouched
    so the field's own validation decides whether to reject it.
    """
    if isinstance(value, bool):
        return int(value)

    if not isinstance(value, str):
        return value

    token = value.strip().lower()
    if token in _SAVE_HEADER_FALSE_STRINGS:
        return SAVE_SEGY_FILE_HEADER_OFF
    if token in _SAVE_HEADER_TRUE_STRINGS:
        return SAVE_SEGY_FILE_HEADER_STRICT

    try:
        return int(value)
    except ValueError:
        # Not an alias and not an integer: hand the raw string back unchanged.
        return value
20 changes: 20 additions & 0 deletions src/mdio/segy/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from mdio.api.io import open_mdio
from mdio.exceptions import MDIOMissingVariableError
from mdio.segy.compat import encode_segy_revision
from mdio.segy.text_header import sanitize_text_header
from mdio.segy.text_header import validate_text_header

if TYPE_CHECKING:
import xarray as xr
Expand All @@ -28,6 +30,23 @@
logger = logging.getLogger(__name__)


def _ensure_exportable_text_header(text_header: str) -> str:
    """Return a text header that passes validation, repairing the stored one if needed.

    The stored header is checked with :func:`validate_text_header`. If that check
    raises ``ValueError``, a warning carrying the reason is logged and the output
    of :func:`sanitize_text_header` is returned in its place.

    Args:
        text_header: The ``textHeader`` attribute as stored on the MDIO dataset.

    Returns:
        ``text_header`` unchanged when it validates, otherwise its sanitized form.
    """
    try:
        validate_text_header(text_header)
    except ValueError as reason:
        logger.warning("Stored MDIO text header is not exportable as-is and will be repaired: %s", reason)
        exportable = sanitize_text_header(text_header)
    else:
        exportable = text_header
    return exportable


def make_segy_factory(spec: SegySpec, binary_header: dict[str, int]) -> SegyFactory:
"""Generate SEG-Y factory from MDIO metadata."""
sample_interval = binary_header["sample_interval"]
Expand Down Expand Up @@ -88,6 +107,7 @@ def mdio_spec_to_segy(

factory = make_segy_factory(spec=segy_spec, binary_header=binary_header)

text_header = _ensure_exportable_text_header(text_header)
text_header_bytes = factory.create_textual_header(text_header)

# During MDIO SEGY import, TGSAI/segy always creates revision major/minor fields
Expand Down
92 changes: 92 additions & 0 deletions src/mdio/segy/text_header.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""SEG-Y textual file header validation and sanitization helpers."""

from __future__ import annotations

import re

# Card layout of a textual file header: 40 rows of 80 columns each.
EXPECTED_ROWS = 40
EXPECTED_COLS = 80
# Highest code point in the 7-bit ASCII range.
ASCII_MAX_ORD = 127

# Maximum number of offending rows / positions listed in a validation error message.
_REPORT_LIMIT = 5
# Matches a run of two or more consecutive newline characters.
_NEWLINE_RUN = re.compile(r"\n{2,}")


def _is_safe_char(char: str) -> bool:
"""Return True if char is 7-bit ASCII and printable."""
return ord(char) <= ASCII_MAX_ORD and char.isprintable()


def _summarize(mapping: dict[int, list[int]], limit: int = _REPORT_LIMIT) -> str:
    """Render ``{row: [positions]}`` as a short, human-readable error fragment.

    At most ``limit`` rows are listed, each with at most ``limit`` positions.
    When rows are left out, a ``(+N more rows)`` suffix reports how many.

    Args:
        mapping: Offending positions keyed by row index.
        limit: Cap applied to both the number of rows and positions per row.

    Returns:
        The formatted summary, or ``"{}"`` when ``mapping`` is empty.
    """
    if not mapping:
        return "{}"

    shown = list(mapping.items())[:limit]
    fragments = [f"row {row}: positions {positions[:limit]}" for row, positions in shown]
    summary = ", ".join(fragments)

    omitted = len(mapping) - len(shown)
    if omitted > 0:
        summary += f" (+{omitted} more rows)"
    return summary


def validate_text_header(text_header: str) -> None:
    r"""Check that a textual file header is 40 rows of 80 ASCII-printable characters.

    The checks run in order and stop at the first failure: row count, then row
    widths, then character safety.

    Args:
        text_header: Decoded text header in wrapped form (rows joined by ``\n``).

    Raises:
        ValueError: If the row count, any row width, or any character is invalid.
    """
    lines = text_header.split("\n")
    line_count = len(lines)
    if line_count != EXPECTED_ROWS:
        err = f"Invalid text header line count: expected {EXPECTED_ROWS}, got {line_count}"
        raise ValueError(err)

    # Collect (row index, actual width) for every row that is not exactly 80 wide.
    wrong_widths = []
    for index, line in enumerate(lines):
        width = len(line)
        if width != EXPECTED_COLS:
            wrong_widths.append((index, width))

    if wrong_widths:
        shown = wrong_widths[:_REPORT_LIMIT]
        hidden = len(wrong_widths) - len(shown)
        suffix = f" (+{hidden} more)" if hidden > 0 else ""
        err = f"Invalid text header line widths: expected {EXPECTED_COLS} columns; offending rows: {shown}{suffix}"
        raise ValueError(err)

    # Map each row index to the column positions of its unsafe characters.
    unsafe: dict[int, list[int]] = {
        index: columns
        for index, line in enumerate(lines)
        if (columns := [col for col, char in enumerate(line) if not _is_safe_char(char)])
    }

    if unsafe:
        err = f"Invalid text header characters: non-ASCII or non-printable at {_summarize(unsafe)}"
        raise ValueError(err)


def sanitize_text_header(text_header: str) -> str:
    r"""Force a textual file header into the 40x80 ASCII-printable card layout.

    Consecutive ``\n`` characters are first collapsed to a single ``\n``. The
    text is then split into rows and only the first 40 are kept. In each row,
    unsafe characters become spaces and the row is cut or space-padded to 80
    columns. Missing rows are filled with blank cards.

    Args:
        text_header: Decoded textual file header string.

    Returns:
        A header of exactly 40 rows of 80 characters joined by ``\n``.
    """
    blank_card = " " * EXPECTED_COLS
    collapsed = _NEWLINE_RUN.sub("\n", text_header)

    cards: list[str] = []
    for raw_row in collapsed.split("\n")[:EXPECTED_ROWS]:
        scrubbed = "".join(char if _is_safe_char(char) else " " for char in raw_row)
        clipped = scrubbed[:EXPECTED_COLS]
        cards.append(clipped.ljust(EXPECTED_COLS))

    # Top up with blank cards when the input had fewer than 40 rows.
    cards.extend([blank_card] * (EXPECTED_ROWS - len(cards)))

    return "\n".join(cards)
49 changes: 46 additions & 3 deletions tests/unit/test_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from unittest.mock import patch

import pytest
from pydantic import ValidationError

from mdio.core.config import MDIOSettings

Expand Down Expand Up @@ -32,7 +33,7 @@ def test_environment_isolation(self) -> None:
original_values = {
"cpus": MDIOSettings().export_cpus,
"ratio": MDIOSettings().grid_sparsity_ratio_warn,
"bool": MDIOSettings().save_segy_file_header,
"save_header": MDIOSettings().save_segy_file_header,
}

with patch.dict(
Expand All @@ -45,9 +46,51 @@ def test_environment_isolation(self) -> None:
):
assert MDIOSettings().export_cpus == 99
assert MDIOSettings().grid_sparsity_ratio_warn == 99.9
assert MDIOSettings().save_segy_file_header is True
assert MDIOSettings().save_segy_file_header == 1

# Values should be restored after context
assert MDIOSettings().export_cpus == original_values["cpus"]
assert MDIOSettings().grid_sparsity_ratio_warn == original_values["ratio"]
assert MDIOSettings().save_segy_file_header == original_values["bool"]
assert MDIOSettings().save_segy_file_header == original_values["save_header"]


class TestSaveSegyFileHeaderMode:
    """Coercion rules for the ``MDIO__IMPORT__SAVE_SEGY_FILE_HEADER`` setting."""

    @pytest.mark.parametrize(
        ("env_value", "expected"),
        [
            ("0", 0),
            ("1", 1),
            ("2", 2),
            ("false", 0),
            ("False", 0),
            ("FALSE", 0),
            ("no", 0),
            ("off", 0),
            ("true", 1),
            ("True", 1),
            ("TRUE", 1),
            ("yes", 1),
            ("on", 1),
        ],
    )
    def test_string_coercion(self, env_value: str, expected: int) -> None:
        """Environment strings, including bool-style aliases, resolve to 0, 1, or 2."""
        overrides = {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": env_value}
        with patch.dict(os.environ, overrides):
            resolved = MDIOSettings().save_segy_file_header
        assert resolved == expected

    @pytest.mark.parametrize("python_value", [False, True, 0, 1, 2])
    def test_native_python_values(self, python_value: bool | int) -> None:
        """Bool and int values passed as keyword arguments are accepted as-is."""
        init_kwargs = {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": python_value}
        resolved = MDIOSettings(**init_kwargs).save_segy_file_header
        assert resolved == int(python_value)

    @pytest.mark.parametrize("bad_value", ["3", "-1", "maybe", "tru"])
    def test_rejects_invalid_strings(self, bad_value: str) -> None:
        """Strings outside 0/1/2 and the bool aliases fail settings validation."""
        overrides = {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": bad_value}
        with (
            patch.dict(os.environ, overrides),
            pytest.raises(ValidationError),
        ):
            MDIOSettings()
Loading
Loading