From a854e9f896a45f876953a6e40609ea2873b7b650 Mon Sep 17 00:00:00 2001 From: Paul van Genuchten Date: Fri, 10 Oct 2025 14:53:24 +0200 Subject: [PATCH] this introduces the option to trigger convert from a local metadata file or even a json or dict object directly it has some other branches merged, so may need to rebase --- csvwlib/converter/ModelConverter.py | 25 +++++++++++++++---------- csvwlib/converter/ToRDFConverter.py | 2 +- csvwlib/utils/MetadataLocator.py | 19 ++++++++++++++++--- csvwlib/utils/metadata.py | 14 ++++++++++---- 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/csvwlib/converter/ModelConverter.py b/csvwlib/converter/ModelConverter.py index 616b8ff..9717578 100644 --- a/csvwlib/converter/ModelConverter.py +++ b/csvwlib/converter/ModelConverter.py @@ -24,7 +24,7 @@ def __init__(self, csv_url=None, metadata_url=None): self.csvs = None self.values_valiator = None self.metadata_url = metadata_url - self.start_url = csv_url if csv_url is not None else metadata_url + self.start_url = csv_url if csv_url is not None else (metadata_url if not isinstance(metadata_url,dict) else metadata_url.get('url')) self.metadata = None self.atdm = {'@type': '@AnnotatedTableGroup'} self.mode = CONST_STANDARD_MODE @@ -34,6 +34,8 @@ def convert_to_atdm(self, mode=CONST_STANDARD_MODE): metadata_validator = MetadataValidator(self.start_url) self.mode = mode self.metadata = MetadataLocator.find_and_get(self.csv_url, self.metadata_url) + if self.metadata_url and (isinstance(self.metadata_url,dict) or not self.metadata_url.startswith('http')): + self.metadata_url = "http://example.com/metadata" self._normalize_metadata_base_url() self._normalize_metadata_csv_url() metadata_validator.validate_metadata(self.metadata) @@ -75,14 +77,17 @@ def _add_table_metadata(table_metadata, table): def _normalize_metadata_base_url(self): if self.metadata is None: return - for context_entry in self.metadata.get('@context',[]): - if type(context_entry) is dict and '@base' in context_entry: - original_url = self.metadata['url'] - if original_url.startswith('http'): - directory, file_name = original_url.rsplit('/', 1) - self.metadata['url'] = directory + '/' + context_entry['@base'] + file_name - else: - self.metadata['url'] = context_entry['@base'] + self.metadata['url'] + if isinstance(self.metadata,dict): + for context_entry in self.metadata.get('@context',[]): + if type(context_entry) is dict and '@base' in context_entry: + original_url = self.metadata["url"] + if original_url.startswith('http'): + directory, file_name = original_url.rsplit('/', 1) + self.metadata['url'] = directory + '/' + context_entry['@base'] + file_name + else: + self.metadata['url'] = context_entry['@base'] + self.metadata['url'] + else: + print(f"Error: not dict, {self.metadata}") def _normalize_metadata_csv_url(self): """ Expands 'url' properties if necessary """ @@ -108,7 +113,7 @@ def _fetch_csvs(self): CSVUtils.parse_csv_from_url_to_list(table['url'], self._delimiter(table)), self.metadata['tables'])) else: - self.csvs = [CSVUtils.parse_csv_from_url_to_list(self.metadata['url'], self._delimiter(self.metadata))] + self.csvs = [CSVUtils.parse_csv_from_url_to_list(self.metadata.get('url'), self._delimiter(self.metadata))] @staticmethod def _delimiter(metadata): diff --git a/csvwlib/converter/ToRDFConverter.py b/csvwlib/converter/ToRDFConverter.py index ed438db..945bf29 100644 --- a/csvwlib/converter/ToRDFConverter.py +++ b/csvwlib/converter/ToRDFConverter.py @@ -82,7 +82,7 @@ def parse_virtual_columns(self, row_node, atdm_row, table_metadata): self.graph.add((row_node, CSVW.describes, subject)) else: print(f"term {virtual_column['propertyUrl']} not in namespaces") -S + def _add_file_metadata(self, metadata, node): language = JSONLDUtils.language(self.metadata.get('@context',[])) for key, value in metadata.items(): diff --git a/csvwlib/utils/MetadataLocator.py b/csvwlib/utils/MetadataLocator.py index e9f4830..9b06752 100644 --- a/csvwlib/utils/MetadataLocator.py +++ b/csvwlib/utils/MetadataLocator.py @@ -1,6 +1,7 @@ import json as jsonlib -import requests +import requests, os + from csvwlib.utils.metadata import MetadataValidator from csvwlib.utils.url.WellKnownUriResolver import WellKnownUriResolver @@ -11,7 +12,20 @@ class MetadataLocator: @staticmethod def find_and_get(csv_url, metadata_url=None): if metadata_url is not None: - return jsonlib.loads(requests.get(metadata_url).content.decode()) + if isinstance(metadata_url, dict): # md already parsed + return jsonlib.loads(jsonlib.dumps(metadata_url)) + try: + md = jsonlib.loads(metadata_url) # expect json? + if not isinstance(md,dict): + raise Exception('metadata not dict') + return md + except ValueError as e: + if metadata_url.startswith('http'): # if url + return jsonlib.loads(requests.get(metadata_url).content.decode()) + elif os.path.exists(metadata_url): # expect local file? + with open(metadata_url,"r") as f: + return jsonlib.loads(f.read()) + return None response = requests.head(csv_url) if 'Link' in response.headers and 'describedby' in response.links: @@ -24,7 +38,6 @@ def find_and_get(csv_url, metadata_url=None): metadata = MetadataLocator._retrieve_from_site_wide_conf(csv_url) if metadata is not None: return metadata - if '?' in csv_url: csv_url, query = csv_url.split('?') metadata_url = csv_url + '-metadata.json' diff --git a/csvwlib/utils/metadata.py b/csvwlib/utils/metadata.py index d1b3fbb..bd5d5f9 100644 --- a/csvwlib/utils/metadata.py +++ b/csvwlib/utils/metadata.py @@ -75,22 +75,26 @@ class MetadataValidator: def __init__(self, start_url): MetadataValidator.instance = self self.metadata = {} - self.start_url = start_url + if isinstance(start_url,dict): + if 'url' in start_url.keys(): + self.start_url = start_url['url'] + else: + self.start_url = start_url self.warnings = [] self.table = {} def validate_metadata(self, metadata): - if metadata is None: + if metadata is None or not isinstance(metadata, dict): return self.metadata = metadata - if 'tableSchema' in metadata: + if 'tableSchema' in metadata.keys(): tables = [metadata] else: tables = metadata['tables'] for table in tables: self.table = table - if 'tables' in metadata: + if 'tables' in metadata.keys(): self.check_member_property('tableGroup', metadata) else: self.check_member_property('table', metadata) @@ -101,6 +105,8 @@ def validate_metadata(self, metadata): self.check_titles(table) def check_csv_reference(self, table, metadata): + if isinstance(self.start_url,dict): + return if not self.start_url.endswith('.csv'): return if table['url'] != self.start_url: