Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion osf/metadata/osf_gathering.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from django.contrib.contenttypes.models import ContentType
from django import db
from mimetypes import MimeTypes
import rdflib

from api.caching.tasks import get_storage_usage_total
Expand Down Expand Up @@ -44,6 +45,8 @@

logger = logging.getLogger(__name__)

# shared MimeTypes instance, used to guess a file's media type from its name
mime = MimeTypes()


##### BEGIN "public" api #####

Expand Down Expand Up @@ -373,7 +376,7 @@ def osf_iri(guid_or_model):
return OSFIO[guid._id]


def osfguid_from_iri(iri: str) -> str:
    """Return the part of `iri` that follows the OSFIO namespace.

    Raises:
        ValueError: if `iri` does not start with OSFIO.
    """
    # guard clause: reject anything outside the OSFIO namespace up front
    if not iri.startswith(OSFIO):
        raise ValueError(f'expected iri starting with "{OSFIO}" (got "{iri}")')
    return without_namespace(iri, OSFIO)
Expand Down Expand Up @@ -702,6 +705,16 @@ def gather_files(focus):
yield (DCTERMS.requires, file_focus)


@gather.er(DCAT.mediaType)
def gather_file_mediatype(focus):
    """Yield a DCAT.mediaType twople guessed from the name of `focus.dbmodel`.

    Falls back to 'application/octet-stream' when no type can be guessed.
    """
    # guess_type returns (type, encoding); only the type is used here
    guessed_type, _encoding = mime.guess_type(focus.dbmodel.name)
    if guessed_type is None:
        guessed_type = 'application/octet-stream'
    yield (DCAT.mediaType, guessed_type)


@gather.er(DCTERMS.hasPart, DCTERMS.isPartOf)
def gather_parts(focus):
if isinstance(focus.dbmodel, osfdb.AbstractNode):
Expand Down
3 changes: 3 additions & 0 deletions osf/metadata/serializers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@
from .datacite import DataciteJsonMetadataSerializer, DataciteXmlMetadataSerializer
from .google_dataset_json_ld import GoogleDatasetJsonLdSerializer
from .turtle import TurtleMetadataSerializer
from .linkset import SignpostLinkset, SignpostLinksetJSON


# metadata serializer classes, keyed by format name
# (the linkset serializers also use each key as the `format` query value
# when building "describedby" metadata urls)
METADATA_SERIALIZER_REGISTRY = {
    'turtle': TurtleMetadataSerializer,
    'datacite-json': DataciteJsonMetadataSerializer,
    'datacite-xml': DataciteXmlMetadataSerializer,
    'google-dataset-json-ld': GoogleDatasetJsonLdSerializer,
    'linkset': SignpostLinkset,
    'linkset-json': SignpostLinksetJSON,
}


Expand Down
148 changes: 148 additions & 0 deletions osf/metadata/serializers/linkset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""osf.metadata.serializers.signpost_linkset: FAIR signposting with osf metadata
FAIR signposting: https://signposting.org/FAIR/
definition of linkset mediatypes: https://www.rfc-editor.org/rfc/rfc9264.html
"""
from __future__ import annotations
import abc
from collections.abc import (
Iterable,
Iterator
)
from collections import defaultdict
import dataclasses
import json
from urllib.parse import urljoin, urlsplit, urlencode, urlunsplit

import rdflib

from ._base import MetadataSerializer
from osf.metadata.osf_gathering import osfguid_from_iri
from osf.metadata.rdfutils import DOI, DCTERMS, OWL, RDF, OSF, DCAT
from website.settings import DOMAIN
from website.util import web_url_for


@dataclasses.dataclass
class SignpostLink:
    """A single link in a signposting linkset.

    Describes a link from `anchor_uri` to `target_uri` with the relation
    type `relation`, plus optional (name, value) attributes of the target.
    """
    anchor_uri: str  # serialized as the link's `anchor`
    relation: str  # serialized as the link's `rel`
    target_uri: str  # serialized as the link target / `href`
    # extra target attributes, e.g. [('type', 'text/turtle')]
    target_attrs: Iterable[tuple[str, str]] = dataclasses.field(default=())


class BaseSignpostLinkset(MetadataSerializer, abc.ABC):
    """Shared link-gathering logic for the FAIR signposting linkset serializers.

    Subclasses render the links yielded by `_each_link` into a concrete
    linkset mediatype.
    """

    def _each_link(self) -> Iterator[SignpostLink]:
        """Yield each signposting link for the focus of `self.basket`.

        Links are yielded in a fixed relation order: collection (files only),
        author, type, cite-as, describes, describedby, license, item.
        Every target uri and target attribute value is converted to a plain
        `str`, matching the field annotations on `SignpostLink`.
        """
        focus_iri = self.basket.focus.iri
        is_file = (self.basket.focus.rdftype == OSF.File)
        type_path = DCTERMS.type | RDF.type

        if is_file:
            # collection (file's containing obj)
            for _collection_uri in self.basket[OSF.isContainedBy]:
                yield SignpostLink(focus_iri, 'collection', str(_collection_uri))

        # author
        for _creator_iri in self.basket[DCTERMS.creator]:
            yield SignpostLink(focus_iri, 'author', str(_creator_iri))

        # type
        # (for a file, skip types it shares with its parent project / registry / preprint)
        _parent_types = (
            set(self.basket[OSF.isContainedBy / type_path])
            if is_file
            else set()
        )
        for _type_iri in self.basket[type_path]:
            if _type_iri not in _parent_types:
                yield SignpostLink(focus_iri, 'type', str(_type_iri))

        # cite-as (first doi among the sameAs iris, falling back to the focus iri)
        _doi_iris = (
            _sameas_iri
            for _sameas_iri in self.basket[OWL.sameAs]
            if _sameas_iri.startswith(DOI)
        )
        yield SignpostLink(focus_iri, 'cite-as', str(next(_doi_iris, focus_iri)))

        base_metadata_url = urljoin(DOMAIN, web_url_for(
            'metadata_download',  # name of a view function mapped in website/routes.py
            guid=osfguid_from_iri(focus_iri),
        ))
        split_base_metadata_url = urlsplit(base_metadata_url)

        # describes
        yield SignpostLink(
            base_metadata_url,
            'describes',
            focus_iri,
        )

        # imported here, not at module level: `osf.metadata.serializers` itself
        # imports this module, so a top-level import would be circular
        from osf.metadata.serializers import METADATA_SERIALIZER_REGISTRY
        # describedby (one link per registered metadata format)
        for _format_key, _serializer in METADATA_SERIALIZER_REGISTRY.items():
            _metadata_url = urlunsplit(split_base_metadata_url._replace(
                query=urlencode({'format': _format_key}),
            ))
            yield SignpostLink(
                focus_iri,
                'describedby',
                _metadata_url,
                [('type', _serializer.mediatype)],
            )

        # license
        for _license_uri in self.basket[DCTERMS.rights]:
            # blank nodes have no dereferenceable uri to link to
            if not isinstance(_license_uri, rdflib.BNode):
                yield SignpostLink(focus_iri, 'license', str(_license_uri))

        # item (one link per media type found for each contained file)
        for _file_iri in self.basket[OSF.contains]:
            for _mime_type in self.basket[_file_iri:DCAT.mediaType]:
                yield SignpostLink(
                    focus_iri,
                    'item',
                    str(_file_iri),
                    [('type', str(_mime_type))],
                )


class SignpostLinkset(BaseSignpostLinkset):
    """Serializer for the text-based `application/linkset` mediatype."""
    mediatype = 'application/linkset'

    def filename_for_itemid(self, itemid: str):
        """Return the filename to use for the linkset of the given item id."""
        return f'{itemid}-metadata.linkset'

    def serialize(self) -> str | bytes:
        """serialize a linkset for FAIR signposting
        see example https://www.rfc-editor.org/rfc/rfc9264.html#section-7.1
        FAIR signposting: https://signposting.org/FAIR/

        Links are separated by a comma and newline; the result ends with a newline.
        """
        _serialized_links = (
            self._serialize_link(_link)
            for _link in self._each_link()
        )
        return ',\n'.join(_serialized_links) + '\n'

    def _serialize_link(self, link: SignpostLink) -> str:
        """Render one link as `<target> ; rel="..." ; anchor="..."` plus target attrs."""
        segments = [
            f'<{link.target_uri}>',
            f'rel={self._quoted(link.relation)}',
            f'anchor={self._quoted(link.anchor_uri)}',
        ]
        segments.extend(
            f'{_key}={self._quoted(_value)}'
            for _key, _value in link.target_attrs
        )
        return ' ; '.join(segments)

    @staticmethod
    def _quoted(value) -> str:
        """Wrap `value` in double quotes, backslash-escaping `\\` and `"`.

        Without escaping, a value that itself contains a double quote (e.g. a
        mediatype with a quoted parameter) would end the quoted-string early
        and produce a malformed link. Values without those characters are
        returned unchanged apart from the surrounding quotes.
        """
        # escape backslashes first so the backslashes added for quotes are not doubled
        _escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
        return f'"{_escaped}"'

class SignpostLinksetJSON(BaseSignpostLinkset):
    """Serializer for the `application/linkset+json` mediatype."""
    mediatype = 'application/linkset+json'

    def filename_for_itemid(self, itemid: str):
        """Return the filename to use for the json linkset of the given item id."""
        return f'{itemid}-metadata.linkset.json'

    def serialize(self) -> str | bytes:
        """serialize linkset json
        definition: https://www.rfc-editor.org/rfc/rfc9264.html#section-4.2
        example: https://www.rfc-editor.org/rfc/rfc9264.html#section-7.2

        Links are grouped by anchor, then by relation, keeping first-seen order.
        """
        # anchor uri -> relation -> list of target objects
        _targets_by_anchor: dict = {}
        for _link in self._each_link():
            _target = {'href': _link.target_uri}
            _target.update(_link.target_attrs)
            _relations = _targets_by_anchor.setdefault(_link.anchor_uri, {})
            _relations.setdefault(_link.relation, []).append(_target)

        _linkset = [
            {'anchor': _anchor, **_relations}
            for _anchor, _relations in _targets_by_anchor.items()
        ]
        return json.dumps({'linkset': _linkset}, indent=2)
10 changes: 10 additions & 0 deletions osf_tests/metadata/expected_metadata_files/file_basic.linkset
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<http://localhost:5000/w2ibb> ; rel="collection" ; anchor="http://localhost:5000/w3ibb",
<https://osf.io/vocab/2022/File> ; rel="type" ; anchor="http://localhost:5000/w3ibb",
<http://localhost:5000/w3ibb> ; rel="cite-as" ; anchor="http://localhost:5000/w3ibb",
<http://localhost:5000/w3ibb> ; rel="describes" ; anchor="http://localhost:5000/metadata/w3ibb/",
<http://localhost:5000/metadata/w3ibb/?format=turtle> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="text/turtle; charset=utf-8",
<http://localhost:5000/metadata/w3ibb/?format=datacite-json> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/json",
<http://localhost:5000/metadata/w3ibb/?format=datacite-xml> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/xml",
<http://localhost:5000/metadata/w3ibb/?format=google-dataset-json-ld> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/ld+json",
<http://localhost:5000/metadata/w3ibb/?format=linkset> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/linkset",
<http://localhost:5000/metadata/w3ibb/?format=linkset-json> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/linkset+json"
56 changes: 56 additions & 0 deletions osf_tests/metadata/expected_metadata_files/file_basic.linkset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"linkset": [
{
"anchor": "http://localhost:5000/w3ibb",
"collection": [
{
"href": "http://localhost:5000/w2ibb"
}
],
"type": [
{
"href": "https://osf.io/vocab/2022/File"
}
],
"cite-as": [
{
"href": "http://localhost:5000/w3ibb"
}
],
"describedby": [
{
"href": "http://localhost:5000/metadata/w3ibb/?format=turtle",
"type": "text/turtle; charset=utf-8"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=datacite-json",
"type": "application/json"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=datacite-xml",
"type": "application/xml"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=google-dataset-json-ld",
"type": "application/ld+json"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=linkset",
"type": "application/linkset"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=linkset-json",
"type": "application/linkset+json"
}
]
},
{
"anchor": "http://localhost:5000/metadata/w3ibb/",
"describes": [
{
"href": "http://localhost:5000/w3ibb"
}
]
}
]
}
10 changes: 10 additions & 0 deletions osf_tests/metadata/expected_metadata_files/file_full.linkset
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<http://localhost:5000/w2ibb> ; rel="collection" ; anchor="http://localhost:5000/w3ibb",
<https://osf.io/vocab/2022/File> ; rel="type" ; anchor="http://localhost:5000/w3ibb",
<http://localhost:5000/w3ibb> ; rel="cite-as" ; anchor="http://localhost:5000/w3ibb",
<http://localhost:5000/w3ibb> ; rel="describes" ; anchor="http://localhost:5000/metadata/w3ibb/",
<http://localhost:5000/metadata/w3ibb/?format=turtle> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="text/turtle; charset=utf-8",
<http://localhost:5000/metadata/w3ibb/?format=datacite-json> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/json",
<http://localhost:5000/metadata/w3ibb/?format=datacite-xml> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/xml",
<http://localhost:5000/metadata/w3ibb/?format=google-dataset-json-ld> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/ld+json",
<http://localhost:5000/metadata/w3ibb/?format=linkset> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/linkset",
<http://localhost:5000/metadata/w3ibb/?format=linkset-json> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/linkset+json"
56 changes: 56 additions & 0 deletions osf_tests/metadata/expected_metadata_files/file_full.linkset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"linkset": [
{
"anchor": "http://localhost:5000/w3ibb",
"collection": [
{
"href": "http://localhost:5000/w2ibb"
}
],
"type": [
{
"href": "https://osf.io/vocab/2022/File"
}
],
"cite-as": [
{
"href": "http://localhost:5000/w3ibb"
}
],
"describedby": [
{
"href": "http://localhost:5000/metadata/w3ibb/?format=turtle",
"type": "text/turtle; charset=utf-8"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=datacite-json",
"type": "application/json"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=datacite-xml",
"type": "application/xml"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=google-dataset-json-ld",
"type": "application/ld+json"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=linkset",
"type": "application/linkset"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=linkset-json",
"type": "application/linkset+json"
}
]
},
{
"anchor": "http://localhost:5000/metadata/w3ibb/",
"describes": [
{
"href": "http://localhost:5000/w3ibb"
}
]
}
]
}
11 changes: 11 additions & 0 deletions osf_tests/metadata/expected_metadata_files/preprint_basic.linkset
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<http://localhost:5000/w1ibb> ; rel="author" ; anchor="http://localhost:5000/w4ibb",
<https://schema.datacite.org/meta/kernel-4/#Preprint> ; rel="type" ; anchor="http://localhost:5000/w4ibb",
<https://osf.io/vocab/2022/Preprint> ; rel="type" ; anchor="http://localhost:5000/w4ibb",
<https://doi.org/11.pp/FK2osf.io/w4ibb_v1> ; rel="cite-as" ; anchor="http://localhost:5000/w4ibb",
<http://localhost:5000/w4ibb> ; rel="describes" ; anchor="http://localhost:5000/metadata/w4ibb/",
<http://localhost:5000/metadata/w4ibb/?format=turtle> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="text/turtle; charset=utf-8",
<http://localhost:5000/metadata/w4ibb/?format=datacite-json> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="application/json",
<http://localhost:5000/metadata/w4ibb/?format=datacite-xml> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="application/xml",
<http://localhost:5000/metadata/w4ibb/?format=google-dataset-json-ld> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="application/ld+json",
<http://localhost:5000/metadata/w4ibb/?format=linkset> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="application/linkset",
<http://localhost:5000/metadata/w4ibb/?format=linkset-json> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="application/linkset+json"
Loading