From 6df14b808e74c11e7d47704d1e0f5d192551bbb8 Mon Sep 17 00:00:00 2001 From: Robert Carroll Date: Thu, 12 Mar 2026 10:53:53 -0500 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Datasets?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds in dataset representation. --- docs/schema/include_access_model.yaml | 115 ++++++++++++++++-- .../datamodel/include_access_model.py | 74 ++++++++++- .../include_access_model_pydantic.py | 54 ++++++-- .../schema/include_access_model.yaml | 38 +++++- 4 files changed, 260 insertions(+), 21 deletions(-) diff --git a/docs/schema/include_access_model.yaml b/docs/schema/include_access_model.yaml index b0a4b4e..e70593d 100644 --- a/docs/schema/include_access_model.yaml +++ b/docs/schema/include_access_model.yaml @@ -835,10 +835,11 @@ slots: title: DOI from_schema: https://includedcc.org/include-access-model slot_uri: includedcc:do_id - owner: DOI + owner: Dataset domain_of: - Study - DOI + - Dataset range: DOI multivalued: false subject_id: @@ -979,12 +980,13 @@ slots: title: Name from_schema: https://includedcc.org/include-access-model slot_uri: includedcc:name - owner: ActivityDefinition + owner: Dataset domain_of: - VirtualBiorepository - Investigator - EncounterDefinition - ActivityDefinition + - Dataset range: string required: false email: @@ -1176,9 +1178,10 @@ slots: title: Publication from_schema: https://includedcc.org/include-access-model slot_uri: includedcc:publication - owner: Study + owner: Dataset domain_of: - Study + - Dataset range: Publication multivalued: true inlined: true @@ -1737,10 +1740,11 @@ slots: title: Description from_schema: https://includedcc.org/include-access-model slot_uri: includedcc:description - owner: ActivityDefinition + owner: Dataset domain_of: - EncounterDefinition - ActivityDefinition + - Dataset range: string encounter_definition_id: name: encounter_definition_id @@ -1773,9 +1777,10 @@ slots: title: File ID from_schema: 
https://includedcc.org/include-access-model slot_uri: includedcc:file_id - owner: File + owner: Dataset domain_of: - File + - Dataset range: File filename: name: filename @@ -1891,6 +1896,39 @@ slots: domain_of: - FileHash range: string + dataset_id: + name: dataset_id + definition_uri: https://includedcc.org/include-access-model/dataset_id + description: Unique identifier for a Dataset. + title: Dataset ID + from_schema: https://includedcc.org/include-access-model + slot_uri: includedcc:dataset_id + owner: Dataset + domain_of: + - Dataset + range: Dataset + data_collection_start: + name: data_collection_start + definition_uri: https://includedcc.org/include-access-model/data_collection_start + description: The date that data collection started. May include only a year. + title: Data Collection Start + from_schema: https://includedcc.org/include-access-model + slot_uri: includedcc:data_collection_start + owner: Dataset + domain_of: + - Dataset + range: string + data_collection_end: + name: data_collection_end + definition_uri: https://includedcc.org/include-access-model/data_collection_end + description: The date that data collection ended. May include only a year. + title: Data Collection End + from_schema: https://includedcc.org/include-access-model + slot_uri: includedcc:data_collection_end + owner: Dataset + domain_of: + - Dataset + range: string Study_study_id: name: Study_study_id definition_uri: https://includedcc.org/include-access-model/study_id @@ -2235,6 +2273,41 @@ slots: usage_slot_name: sample_id range: Sample multivalued: true + Dataset_dataset_id: + name: Dataset_dataset_id + definition_uri: https://includedcc.org/include-access-model/dataset_id + description: Unique identifier for a Dataset. 
+ title: Dataset ID + from_schema: https://includedcc.org/include-access-model + is_a: dataset_id + domain: Dataset + slot_uri: includedcc:dataset_id + identifier: true + alias: dataset_id + owner: Dataset + domain_of: + - Dataset + is_usage_slot: true + usage_slot_name: dataset_id + range: string + required: true + Dataset_file_id: + name: Dataset_file_id + definition_uri: https://includedcc.org/include-access-model/file_id + description: The list of files comprising this dataset. + title: File ID + from_schema: https://includedcc.org/include-access-model + is_a: file_id + domain: Dataset + slot_uri: includedcc:file_id + alias: file_id + owner: Dataset + domain_of: + - Dataset + is_usage_slot: true + usage_slot_name: file_id + range: File + multivalued: true classes: Record: name: Record @@ -2643,9 +2716,35 @@ classes: - hash_type - hash_value class_uri: includedcc:FileHash + Dataset: + name: Dataset + definition_uri: https://includedcc.org/include-access-model/Dataset + description: Set of files grouped together for release. + title: Dataset + from_schema: https://includedcc.org/include-access-model + slots: + - Dataset_dataset_id + - name + - description + - do_id + - Dataset_file_id + - publication + - data_collection_start + - data_collection_end + slot_usage: + dataset_id: + name: dataset_id + identifier: true + range: string + required: true + file_id: + name: file_id + description: The list of files comprising this dataset. 
+ multivalued: true + class_uri: includedcc:Dataset metamodel_version: 1.7.0 source_file: include_access_model.yaml -source_file_date: '2026-03-09T16:30:20' -source_file_size: 35097 -generation_date: '2026-03-09T16:31:10' +source_file_date: '2026-03-12T10:52:04' +source_file_size: 36264 +generation_date: '2026-03-12T10:53:07' diff --git a/src/include_access_model/datamodel/include_access_model.py b/src/include_access_model/datamodel/include_access_model.py index 94ed38a..d1d9023 100644 --- a/src/include_access_model/datamodel/include_access_model.py +++ b/src/include_access_model/datamodel/include_access_model.py @@ -1,5 +1,5 @@ # Auto generated from include_access_model.yaml by pythongen.py version: 0.0.1 -# Generation date: 2026-03-09T16:30:56 +# Generation date: 2026-03-12T10:52:49 # Schema: include-access-model # # id: https://includedcc.org/include-access-model @@ -141,6 +141,10 @@ class FileFileId(extended_str): pass +class DatasetDatasetId(extended_str): + pass + + @dataclass(repr=False) class Record(YAMLRoot): """ @@ -994,6 +998,59 @@ def __post_init__(self, *_: str, **kwargs: Any): super().__post_init__(**kwargs) +@dataclass(repr=False) +class Dataset(YAMLRoot): + """ + Set of files grouped together for release. 
+ """ + _inherited_slots: ClassVar[list[str]] = [] + + class_class_uri: ClassVar[URIRef] = INCLUDEDCC["Dataset"] + class_class_curie: ClassVar[str] = "includedcc:Dataset" + class_name: ClassVar[str] = "Dataset" + class_model_uri: ClassVar[URIRef] = INCLUDEDCC.Dataset + + dataset_id: Union[str, DatasetDatasetId] = None + name: Optional[str] = None + description: Optional[str] = None + do_id: Optional[Union[str, DOIDoId]] = None + file_id: Optional[Union[Union[str, FileFileId], list[Union[str, FileFileId]]]] = empty_list() + publication: Optional[Union[Union[dict, Publication], list[Union[dict, Publication]]]] = empty_list() + data_collection_start: Optional[str] = None + data_collection_end: Optional[str] = None + + def __post_init__(self, *_: str, **kwargs: Any): + if self._is_empty(self.dataset_id): + self.MissingRequiredField("dataset_id") + if not isinstance(self.dataset_id, DatasetDatasetId): + self.dataset_id = DatasetDatasetId(self.dataset_id) + + if self.name is not None and not isinstance(self.name, str): + self.name = str(self.name) + + if self.description is not None and not isinstance(self.description, str): + self.description = str(self.description) + + if self.do_id is not None and not isinstance(self.do_id, DOIDoId): + self.do_id = DOIDoId(self.do_id) + + if not isinstance(self.file_id, list): + self.file_id = [self.file_id] if self.file_id is not None else [] + self.file_id = [v if isinstance(v, FileFileId) else FileFileId(v) for v in self.file_id] + + if not isinstance(self.publication, list): + self.publication = [self.publication] if self.publication is not None else [] + self.publication = [v if isinstance(v, Publication) else Publication(**as_dict(v)) for v in self.publication] + + if self.data_collection_start is not None and not isinstance(self.data_collection_start, str): + self.data_collection_start = str(self.data_collection_start) + + if self.data_collection_end is not None and not isinstance(self.data_collection_end, str): + 
self.data_collection_end = str(self.data_collection_end) + + super().__post_init__(**kwargs) + + # Enumerations class EnumProgram(EnumDefinitionImpl): """ @@ -1792,6 +1849,15 @@ class slots: slots.hash_value = Slot(uri=INCLUDEDCC.hash_value, name="hash_value", curie=INCLUDEDCC.curie('hash_value'), model_uri=INCLUDEDCC.hash_value, domain=None, range=Optional[str]) +slots.dataset_id = Slot(uri=INCLUDEDCC.dataset_id, name="dataset_id", curie=INCLUDEDCC.curie('dataset_id'), + model_uri=INCLUDEDCC.dataset_id, domain=None, range=Optional[Union[str, DatasetDatasetId]]) + +slots.data_collection_start = Slot(uri=INCLUDEDCC.data_collection_start, name="data_collection_start", curie=INCLUDEDCC.curie('data_collection_start'), + model_uri=INCLUDEDCC.data_collection_start, domain=None, range=Optional[str]) + +slots.data_collection_end = Slot(uri=INCLUDEDCC.data_collection_end, name="data_collection_end", curie=INCLUDEDCC.curie('data_collection_end'), + model_uri=INCLUDEDCC.data_collection_end, domain=None, range=Optional[str]) + slots.Study_study_id = Slot(uri=INCLUDEDCC.study_id, name="Study_study_id", curie=INCLUDEDCC.curie('study_id'), model_uri=INCLUDEDCC.Study_study_id, domain=Study, range=Union[str, StudyStudyId]) @@ -1848,3 +1914,9 @@ class slots: slots.File_sample_id = Slot(uri=INCLUDEDCC.sample_id, name="File_sample_id", curie=INCLUDEDCC.curie('sample_id'), model_uri=INCLUDEDCC.File_sample_id, domain=File, range=Optional[Union[Union[str, SampleSampleId], list[Union[str, SampleSampleId]]]]) + +slots.Dataset_dataset_id = Slot(uri=INCLUDEDCC.dataset_id, name="Dataset_dataset_id", curie=INCLUDEDCC.curie('dataset_id'), + model_uri=INCLUDEDCC.Dataset_dataset_id, domain=Dataset, range=Union[str, DatasetDatasetId]) + +slots.Dataset_file_id = Slot(uri=INCLUDEDCC.file_id, name="Dataset_file_id", curie=INCLUDEDCC.curie('file_id'), + model_uri=INCLUDEDCC.Dataset_file_id, domain=Dataset, range=Optional[Union[Union[str, FileFileId], list[Union[str, FileFileId]]]]) diff --git 
a/src/include_access_model/datamodel/include_access_model_pydantic.py b/src/include_access_model/datamodel/include_access_model_pydantic.py index 693c317..0b09a6f 100644 --- a/src/include_access_model/datamodel/include_access_model_pydantic.py +++ b/src/include_access_model/datamodel/include_access_model_pydantic.py @@ -458,10 +458,10 @@ class Study(Record): contact: list[Investigator] = Field(default=..., title="Contact Person", description="""The individual to contact with questions about this record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'VirtualBiorepository']} }) study_description: str = Field(default=..., title="Study Description", description="""Brief description of the study (2-4 sentences)""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study']} }) website: Optional[str] = Field(default=None, title="Website", description="""Website for the Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'VirtualBiorepository', 'Publication']} }) - publication: Optional[list[Publication]] = Field(default=[], title="Publication", description="""Publications associated with this Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study']} }) + publication: Optional[list[Publication]] = Field(default=[], title="Publication", description="""Publications associated with this Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'Dataset']} }) acknowledgments: Optional[str] = Field(default=None, title="Acknowledgments", description="""Funding statement and acknowledgments for this study""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study']} }) citation_statement: Optional[str] = Field(default=None, title="Citation Statement", description="""Statement that secondary data users should use to acknowledge use of this study or dataset. 
E.g., \"The results analyzed and here are based in whole or in part upon data generated by the INCLUDE (INvestigation of Co-occurring conditions across the Lifespan to Understand Down syndromE) Project , and were accessed from the INCLUDE Data Hub and .\"""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study']} }) - do_id: Optional[str] = Field(default=None, title="DOI", description="""Digital Object Identifier (DOI) for this Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'DOI']} }) + do_id: Optional[str] = Field(default=None, title="DOI", description="""Digital Object Identifier (DOI) for this Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'DOI', 'Dataset']} }) external_id: Optional[list[str]] = Field(default=[], title="External Identifiers", description="""Other identifiers for this entity, eg, from the submitting study or in systems like dbGaP""", json_schema_extra = { "linkml_meta": {'domain_of': ['Record']} }) @@ -501,7 +501,8 @@ class VirtualBiorepository(Record): name: Optional[str] = Field(default=None, title="Name", description="""Name of the entity.""", json_schema_extra = { "linkml_meta": {'domain_of': ['VirtualBiorepository', 'Investigator', 'EncounterDefinition', - 'ActivityDefinition']} }) + 'ActivityDefinition', + 'Dataset']} }) institution: Optional[str] = Field(default=None, title="Institution", description="""Name of the institution this record is associated with.""", json_schema_extra = { "linkml_meta": {'domain_of': ['VirtualBiorepository', 'Investigator']} }) contact: list[Investigator] = Field(default=..., title="Contact Person", description="""The individual to contact with questions about this record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'VirtualBiorepository']} }) website: Optional[str] = Field(default=None, title="Website", description="""Website for the Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'VirtualBiorepository', 
'Publication']} }) @@ -520,7 +521,7 @@ class DOI(Record): 'required': True}}, 'title': 'Digital Object Identifier (DOI)'}) - do_id: str = Field(default=..., title="DOI", description="""Digital Object Identifier (DOI) for this Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'DOI']} }) + do_id: str = Field(default=..., title="DOI", description="""Digital Object Identifier (DOI) for this Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'DOI', 'Dataset']} }) bibliographic_reference: Optional[str] = Field(default=None, title="Bibiliographic Reference", description="""Text use to reference this Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['DOI', 'Publication']} }) external_id: Optional[list[str]] = Field(default=[], title="External Identifiers", description="""Other identifiers for this entity, eg, from the submitting study or in systems like dbGaP""", json_schema_extra = { "linkml_meta": {'domain_of': ['Record']} }) @@ -535,7 +536,8 @@ class Investigator(Record): name: Optional[str] = Field(default=None, title="Name", description="""Name of the entity.""", json_schema_extra = { "linkml_meta": {'domain_of': ['VirtualBiorepository', 'Investigator', 'EncounterDefinition', - 'ActivityDefinition']} }) + 'ActivityDefinition', + 'Dataset']} }) institution: Optional[str] = Field(default=None, title="Institution", description="""Name of the institution this record is associated with.""", json_schema_extra = { "linkml_meta": {'domain_of': ['VirtualBiorepository', 'Investigator']} }) investigator_title: Optional[str] = Field(default=None, title="Investigator Title", description="""The title of the Investigator, eg, \"Assistant Professor\"""", json_schema_extra = { "linkml_meta": {'domain_of': ['Investigator']} }) email: Optional[str] = Field(default=None, title="Email Address", description="""An email address to reach the entity.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Investigator']} }) @@ -756,8 
+758,9 @@ class EncounterDefinition(Record): name: Optional[str] = Field(default=None, title="Name", description="""Name of the entity.""", json_schema_extra = { "linkml_meta": {'domain_of': ['VirtualBiorepository', 'Investigator', 'EncounterDefinition', - 'ActivityDefinition']} }) - description: Optional[str] = Field(default=None, title="Description", description="""Description for this entity.""", json_schema_extra = { "linkml_meta": {'domain_of': ['EncounterDefinition', 'ActivityDefinition']} }) + 'ActivityDefinition', + 'Dataset']} }) + description: Optional[str] = Field(default=None, title="Description", description="""Description for this entity.""", json_schema_extra = { "linkml_meta": {'domain_of': ['EncounterDefinition', 'ActivityDefinition', 'Dataset']} }) activity_definition_id: Optional[list[str]] = Field(default=[], title="Activity Definition ID", description="""Unique identifier for this Activity Definition.""", json_schema_extra = { "linkml_meta": {'domain_of': ['EncounterDefinition', 'ActivityDefinition']} }) external_id: Optional[list[str]] = Field(default=[], title="External Identifiers", description="""Other identifiers for this entity, eg, from the submitting study or in systems like dbGaP""", json_schema_extra = { "linkml_meta": {'domain_of': ['Record']} }) @@ -777,8 +780,9 @@ class ActivityDefinition(Record): name: Optional[str] = Field(default=None, title="Name", description="""Name of the entity.""", json_schema_extra = { "linkml_meta": {'domain_of': ['VirtualBiorepository', 'Investigator', 'EncounterDefinition', - 'ActivityDefinition']} }) - description: Optional[str] = Field(default=None, title="Description", description="""Description for this entity.""", json_schema_extra = { "linkml_meta": {'domain_of': ['EncounterDefinition', 'ActivityDefinition']} }) + 'ActivityDefinition', + 'Dataset']} }) + description: Optional[str] = Field(default=None, title="Description", description="""Description for this entity.""", json_schema_extra = { 
"linkml_meta": {'domain_of': ['EncounterDefinition', 'ActivityDefinition', 'Dataset']} }) external_id: Optional[list[str]] = Field(default=[], title="External Identifiers", description="""Other identifiers for this entity, eg, from the submitting study or in systems like dbGaP""", json_schema_extra = { "linkml_meta": {'domain_of': ['Record']} }) @@ -795,7 +799,7 @@ class File(Record): 'subject_id': {'multivalued': True, 'name': 'subject_id'}}, 'title': 'File'}) - file_id: str = Field(default=..., title="File ID", description="""Unique identifier for this File.""", json_schema_extra = { "linkml_meta": {'domain_of': ['File']} }) + file_id: str = Field(default=..., title="File ID", description="""Unique identifier for this File.""", json_schema_extra = { "linkml_meta": {'domain_of': ['File', 'Dataset']} }) subject_id: Optional[list[str]] = Field(default=[], title="Study ID", description="""INCLUDE Global ID for the Subject""", json_schema_extra = { "linkml_meta": {'domain_of': ['Subject', 'Demographics', 'SubjectAssertion', @@ -825,6 +829,35 @@ class FileHash(ConfiguredBaseModel): hash_value: Optional[str] = Field(default=None, title="File Hash Value", description="""The value of the file hash""", json_schema_extra = { "linkml_meta": {'domain_of': ['FileHash']} }) +class Dataset(ConfiguredBaseModel): + """ + Set of files grouped together for release. 
+ """ + linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'https://includedcc.org/include-access-model', + 'slot_usage': {'dataset_id': {'identifier': True, + 'name': 'dataset_id', + 'range': 'string', + 'required': True}, + 'file_id': {'description': 'The list of files comprising this ' + 'dataset.', + 'multivalued': True, + 'name': 'file_id'}}, + 'title': 'Dataset'}) + + dataset_id: str = Field(default=..., title="Dataset ID", description="""Unique identifier for a Dataset.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Dataset']} }) + name: Optional[str] = Field(default=None, title="Name", description="""Name of the entity.""", json_schema_extra = { "linkml_meta": {'domain_of': ['VirtualBiorepository', + 'Investigator', + 'EncounterDefinition', + 'ActivityDefinition', + 'Dataset']} }) + description: Optional[str] = Field(default=None, title="Description", description="""Description for this entity.""", json_schema_extra = { "linkml_meta": {'domain_of': ['EncounterDefinition', 'ActivityDefinition', 'Dataset']} }) + do_id: Optional[str] = Field(default=None, title="DOI", description="""Digital Object Identifier (DOI) for this Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'DOI', 'Dataset']} }) + file_id: Optional[list[str]] = Field(default=[], title="File ID", description="""The list of files comprising this dataset.""", json_schema_extra = { "linkml_meta": {'domain_of': ['File', 'Dataset']} }) + publication: Optional[list[Publication]] = Field(default=[], title="Publication", description="""Publications associated with this Record.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Study', 'Dataset']} }) + data_collection_start: Optional[str] = Field(default=None, title="Data Collection Start", description="""The date that data collection started. 
May include only a year.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Dataset']} }) + data_collection_end: Optional[str] = Field(default=None, title="Data Collection End", description="""The date that data collection ended. May include only a year.""", json_schema_extra = { "linkml_meta": {'domain_of': ['Dataset']} }) + + # Model rebuild # see https://pydantic-docs.helpmanual.io/usage/models/#rebuilding-a-model Record.model_rebuild() @@ -846,3 +879,4 @@ class FileHash(ConfiguredBaseModel): ActivityDefinition.model_rebuild() File.model_rebuild() FileHash.model_rebuild() +Dataset.model_rebuild() diff --git a/src/include_access_model/schema/include_access_model.yaml b/src/include_access_model/schema/include_access_model.yaml index 99145db..bd2104a 100644 --- a/src/include_access_model/schema/include_access_model.yaml +++ b/src/include_access_model/schema/include_access_model.yaml @@ -342,7 +342,28 @@ classes: slots: - hash_type - hash_value - + Dataset: + title: Dataset + description: Set of files grouped together for release. + slots: + - dataset_id + - name + - description + - do_id + - file_id + - publication + #TODO: Are these good elements for the core entity? + - data_collection_start + - data_collection_end + + slot_usage: + dataset_id: + range: string + required: true + identifier: true + file_id: + multivalued: true + description: The list of files comprising this dataset. slots: @@ -766,7 +787,20 @@ slots: title: File Hash Value description: The value of the file hash range: string - + dataset_id: + title: Dataset ID + description: Unique identifier for a Dataset. + range: Dataset + data_collection_start: + title: Data Collection Start + description: The date that data collection started. May include only a year. + #TODO: We could re-evaluate these as dates, but that may be too implementation specific + range: string + data_collection_end: + title: Data Collection End + description: The date that data collection ended. 
May include only a year. + #TODO: We could re-evaluate these as dates, but that may be too implementation specific + range: string enums: EnumProgram: