Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ sandbox/
venv/
venvs/
.DS_Store
uv.lock
7 changes: 6 additions & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co

## Test Markers
- When adding AI-generated tests, mark them with `@pytest.mark.ai_generated`
- Any new pytest markers must be registered in `tox.ini` under `[pytest]` section in the `markers` list
- Any new pytest markers must be registered in `pytest_configure` function of `dandi/pytest_plugin.py`

## Code Style
- Code is formatted with Black (line length 100)
Expand All @@ -31,6 +31,11 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
- Prefer specific exceptions over generic ones
- For CLI, use click library patterns
- Imports organized: stdlib, third-party, local (alphabetical within groups)
- **Imports must be at the top of the file** — do NOT place imports inside
functions or methods unless there is a concrete reason (circular dependency,
or heavy transitive imports like `pynwb`/`h5py`/`nwbinspector` that would
slow down module load for unrelated code paths). When deferring an import
for weight, add the comment `# Avoid heavy import by importing within function:`.

## Documentation
- Keep docstrings updated when changing function signatures
Expand Down
42 changes: 41 additions & 1 deletion dandi/files/bases.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,23 @@
from dandischema.digests.dandietag import DandiETag
from dandischema.models import BareAsset, CommonModel
from dandischema.models import Dandiset as DandisetMeta
from dandischema.models import get_schema_version
from dandischema.models import StandardsType, get_schema_version, nwb_standard
from packaging.version import Version
from pydantic import ValidationError

# True when the installed dandischema exposes the per-asset dataStandard field
# and related StandardsType enhancements (version, extensions). All these
# fields ship together starting with dandischema 0.12.2.
# TODO: remove this guard (and all branches that check it) once the minimum
# required dandischema version is >= 0.12.2.
_SCHEMA_BAREASSET_HAS_DATASTANDARD = "dataStandard" in BareAsset.model_fields
if not _SCHEMA_BAREASSET_HAS_DATASTANDARD and Version(
dandischema.__version__
) >= Version("0.12.2"):
raise RuntimeError(
f"dandischema {dandischema.__version__} should have "
f"'dataStandard' field on BareAsset"
)
from pydantic_core import ErrorDetails
import requests

Expand Down Expand Up @@ -504,6 +518,32 @@ def get_metadata(
else:
raise
metadata.path = self.path
if _SCHEMA_BAREASSET_HAS_DATASTANDARD:
kwargs: dict[str, Any] = dict(nwb_standard)
# Avoid heavy import by importing within function:
from dandi.pynwb_utils import get_nwb_extensions

try:
nwb_exts = get_nwb_extensions(self.filepath)
except Exception:
lgr.debug(
"Failed to extract NWB extensions from %s",
self.filepath,
exc_info=True,
)
nwb_exts = {}
if nwb_exts:
kwargs["extensions"] = [
StandardsType(name=name, version=ver).model_dump(
mode="json", exclude_none=True
)
for name, ver in sorted(nwb_exts.items())
]
nwb = StandardsType(**kwargs)
if metadata.dataStandard is None:
metadata.dataStandard = [nwb]
elif nwb not in metadata.dataStandard:
metadata.dataStandard.append(nwb)
return metadata

# TODO: @validate_cache.memoize_path
Expand Down
83 changes: 80 additions & 3 deletions dandi/files/bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,27 @@
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime
import json
from pathlib import Path
from threading import Lock
import weakref

from dandischema.models import BareAsset
from dandischema.models import (
BareAsset,
StandardsType,
bids_standard,
ome_ngff_standard,
)

import dandi
from dandi.bids_validator_deno import bids_validate

from .bases import GenericAsset, LocalFileAsset, NWBAsset
from .bases import GenericAsset, LocalFileAsset, NWBAsset, _SCHEMA_BAREASSET_HAS_DATASTANDARD

Check notice

Code scanning / CodeQL

Cyclic import Note

Import of module
dandi.files.bases
begins an import cycle.

if _SCHEMA_BAREASSET_HAS_DATASTANDARD:
from dandischema.models import hed_standard
else:
hed_standard = None # type: ignore[assignment]
from .zarr import ZarrAsset
from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file
from ..metadata.core import add_common_metadata, prepare_metadata
Expand All @@ -23,10 +35,30 @@
ValidationResult,
)

lgr = dandi.get_logger()

BIDS_ASSET_ERRORS = ("BIDS.NON_BIDS_PATH_PLACEHOLDER",)
BIDS_DATASET_ERRORS = ("BIDS.MANDATORY_FILE_MISSING_PLACEHOLDER",)


def _add_standard(
    metadata: BareAsset,
    standard_dict: dict,
    version: str | None = None,
) -> None:
    """Add a data standard to asset metadata if the field is available.

    No-op when the installed dandischema predates the per-asset
    ``dataStandard`` field on ``BareAsset``.

    Parameters
    ----------
    metadata
        Asset metadata updated in place.
    standard_dict
        Template mapping for the standard (e.g. ``bids_standard``) used as
        keyword arguments to ``StandardsType``.
    version
        Version string to record on the standard; omitted when falsy.
    """
    if not _SCHEMA_BAREASSET_HAS_DATASTANDARD:
        return
    kwargs = dict(standard_dict)
    if version:
        kwargs["version"] = version
    standard = StandardsType(**kwargs)
    if metadata.dataStandard is None:
        metadata.dataStandard = [standard]
    elif standard not in metadata.dataStandard:
        # Avoid duplicate entries when called repeatedly for the same standard
        metadata.dataStandard.append(standard)


@dataclass
class BIDSDatasetDescriptionAsset(LocalFileAsset):
"""
Expand Down Expand Up @@ -192,7 +224,50 @@
assert self._dataset_errors is not None
return self._dataset_errors.copy()

# get_metadata(): inherit use of default metadata from LocalFileAsset
def get_metadata(
self,
digest: Digest | None = None,
ignore_errors: bool = True,
) -> BareAsset:
metadata = super().get_metadata(digest=digest, ignore_errors=ignore_errors)
try:
with open(self.filepath) as f:
desc = json.load(f)
except (OSError, json.JSONDecodeError) as e:
lgr.warning("Failed to read %s: %s", self.filepath, e)
_add_standard(metadata, bids_standard)
return metadata
_add_standard(metadata, bids_standard, version=desc.get("BIDSVersion"))
if hed_standard and (hed_version := desc.get("HEDVersion")):
# HEDVersion can be a string or list.
# List form: ["8.2.0", "sc:1.0.0"] where first element is base
# HED version and subsequent "prefix:version" entries are library
# schemas recorded as extensions.
if isinstance(hed_version, list):
version = hed_version[0] if hed_version else None
library_entries = hed_version[1:]
else:
version = hed_version
library_entries = []
kwargs: dict = dict(hed_standard)
if version:
kwargs["version"] = version
if library_entries:
extensions = []
for entry in library_entries:
# Format is "prefix:version" (e.g. "sc:1.0.0")
if ":" in str(entry):
lib_name, lib_ver = str(entry).split(":", 1)
else:
lib_name, lib_ver = str(entry), None
ext = StandardsType(name=lib_name, version=lib_ver)
extensions.append(
ext.model_dump(mode="json", exclude_none=True)
)
if extensions:
kwargs["extensions"] = extensions
_add_standard(metadata, kwargs)
return metadata


@dataclass
Expand Down Expand Up @@ -312,6 +387,8 @@
add_common_metadata(metadata, self.filepath, start_time, end_time, digest)
metadata.path = self.path
metadata.encodingFormat = ZARR_MIME_TYPE
if Path(self.path).suffixes == [".ome", ".zarr"]:
_add_standard(metadata, ome_ngff_standard)
return metadata


Expand Down
43 changes: 43 additions & 0 deletions dandi/pynwb_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,49 @@ def _sanitize(v: Any) -> str:
return None


# Namespaces bundled with NWB/HDMF core — not extensions.  Entries with these
# names under an NWB file's "specifications" group are skipped when
# collecting extensions in get_nwb_extensions() below.
_NWB_CORE_NAMESPACES = frozenset({"core", "hdmf-common", "hdmf-experimental"})


def get_nwb_extensions(filepath: str | Path | Readable) -> dict[str, str]:
    """Return NWB extensions embedded in an HDF5 file.

    Inspects the ``specifications`` group of an NWB HDF5 file and maps each
    non-core extension namespace name to the latest version embedded in the
    file.  Core NWB/HDMF namespaces are excluded.

    Parameters
    ----------
    filepath
        Path to an NWB ``.nwb`` HDF5 file, or a :class:`Readable`.

    Returns
    -------
    dict[str, str]
        ``{namespace_name: latest_version}`` for each non-core namespace
        found under ``specifications``; empty when the group is absent or
        contains no extensions.
    """
    found: dict[str, str] = {}
    with open_readable(filepath) as fp, h5py.File(fp, "r") as h5file:
        specs = h5file.get("specifications")
        if specs is None:
            return found
        for ns_name in specs:
            # Skip namespaces that ship with NWB/HDMF itself
            if ns_name in _NWB_CORE_NAMESPACES:
                continue
            group = specs[ns_name]
            if not isinstance(group, h5py.Group):
                continue
            try:
                # Member names are version strings; pick the highest one
                versions = sorted(group, key=Version)
            except Exception:
                lgr.debug("Failed to parse versions for NWB extension %s", ns_name)
                continue
            if versions:
                found[ns_name] = versions[-1]
    return found


def get_neurodata_types_to_modalities_map() -> dict[str, str]:
"""Return a dict to map neurodata types known to pynwb to "modalities"

Expand Down
87 changes: 87 additions & 0 deletions dandi/tests/test_files.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import json
from operator import attrgetter
import os
from pathlib import Path
Expand All @@ -16,6 +17,7 @@
from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file
from ..dandiapi import AssetType, RemoteZarrAsset
from ..exceptions import UnknownAssetError
from ..files.bases import _SCHEMA_BAREASSET_HAS_DATASTANDARD
from ..files import (
BIDSDatasetDescriptionAsset,
DandisetMetadataFile,
Expand Down Expand Up @@ -608,3 +610,88 @@ def test_validate_invalid_zarr3(path: str, expected_result_ids: set[str]) -> Non

result_ids = {r.id for r in zf.get_validation_errors()}
assert result_ids == expected_result_ids


@pytest.mark.ai_generated
class TestBIDSDatasetDescriptionDataStandard:
    """Tests for per-asset dataStandard population from dataset_description.json"""

    @staticmethod
    def _make_bids_dd(tmp_path: Path, content: dict) -> BIDSDatasetDescriptionAsset:
        # Write a dataset_description.json and wrap it in an asset object
        dd_path = tmp_path / "dataset_description.json"
        dd_path.write_text(json.dumps(content))
        return BIDSDatasetDescriptionAsset(
            filepath=dd_path,
            path="dataset_description.json",
            dandiset_path=tmp_path,
        )

    @staticmethod
    def _data_standards(metadata):  # type: ignore[no-untyped-def]
        """Return dataStandard entries, skipping when the schema lacks the field.

        Single place for the old-dandischema skip so every test handles it
        consistently.
        """
        if not _SCHEMA_BAREASSET_HAS_DATASTANDARD:
            pytest.skip("dandischema too old, no dataStandard on BareAsset")
        return list(metadata.dataStandard or [])

    @classmethod
    def _standard_names(cls, metadata):  # type: ignore[no-untyped-def]
        """Return just the names of the dataStandard entries."""
        return [s.name for s in cls._data_standards(metadata)]

    def test_bids_always_set(self, tmp_path: Path) -> None:
        asset = self._make_bids_dd(
            tmp_path,
            {"Name": "Test", "BIDSVersion": "1.9.0"},
        )
        names = self._standard_names(asset.get_metadata())
        assert "Brain Imaging Data Structure (BIDS)" in names

    def test_hed_detected_when_hedversion_present(self, tmp_path: Path) -> None:
        asset = self._make_bids_dd(
            tmp_path,
            {"Name": "Test", "BIDSVersion": "1.9.0", "HEDVersion": "8.2.0"},
        )
        names = self._standard_names(asset.get_metadata())
        assert "Hierarchical Event Descriptors (HED)" in names
        assert "Brain Imaging Data Structure (BIDS)" in names

    def test_hed_not_detected_when_hedversion_absent(self, tmp_path: Path) -> None:
        asset = self._make_bids_dd(
            tmp_path,
            {"Name": "Test", "BIDSVersion": "1.9.0"},
        )
        names = self._standard_names(asset.get_metadata())
        assert "Hierarchical Event Descriptors (HED)" not in names

    def test_hed_detected_with_list_hedversion(self, tmp_path: Path) -> None:
        """HEDVersion can be a list of strings per BIDS spec."""
        asset = self._make_bids_dd(
            tmp_path,
            {
                "Name": "Test",
                "BIDSVersion": "1.9.0",
                "HEDVersion": ["8.2.0", "sc:1.0.0"],
            },
        )
        names = self._standard_names(asset.get_metadata())
        assert "Hierarchical Event Descriptors (HED)" in names

    def test_hed_library_schemas_as_extensions(self, tmp_path: Path) -> None:
        """HED library schemas in list HEDVersion populate extensions."""
        asset = self._make_bids_dd(
            tmp_path,
            {
                "Name": "Test",
                "BIDSVersion": "1.9.0",
                "HEDVersion": ["8.2.0", "sc:1.0.0", "lang:1.1.0"],
            },
        )
        # Uses the shared helper so the old-dandischema skip is applied
        # consistently with the other tests
        standards = self._data_standards(asset.get_metadata())
        hed_standards = [
            s for s in standards
            if s.name == "Hierarchical Event Descriptors (HED)"
        ]
        assert len(hed_standards) == 1
        hed = hed_standards[0]
        assert hed.version == "8.2.0"
        assert hed.extensions is not None
        ext_map = {e.name: e.version for e in hed.extensions}
        assert ext_map == {"sc": "1.0.0", "lang": "1.1.0"}
Loading
Loading