From b0f64bd346c6b8afc7a24899a021ef9d3520f047 Mon Sep 17 00:00:00 2001
From: Paul van Genuchten
Date: Fri, 10 Oct 2025 14:53:24 +0200
Subject: [PATCH] Allow triggering conversion from a local metadata file or
 from a JSON string or dict object directly

This introduces the option to trigger a conversion from a local metadata
file, or even from a JSON string or dict object passed in directly.

Note: this branch has some other branches merged in, so it may need a
rebase.
---
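Usage sketch (notes only, not part of the commit): this assumes the
ModelConverter API shown in the diff below (the csv_url / metadata_url
constructor arguments and convert_to_atdm()); the URLs and file name are
placeholders, and returning the annotated table group from convert_to_atdm()
is assumed for illustration.

    from csvwlib.converter.ModelConverter import ModelConverter

    # metadata already parsed into a dict
    metadata = {
        '@context': 'http://www.w3.org/ns/csvw',
        'url': 'http://example.com/countries.csv',  # placeholder CSV location
        'tableSchema': {'columns': [{'titles': 'country'}]},
    }
    atdm = ModelConverter(metadata_url=metadata).convert_to_atdm()

    # metadata read from a local file (placeholder path)
    atdm = ModelConverter(metadata_url='countries-metadata.json').convert_to_atdm()
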
 csvwlib/converter/ModelConverter.py    | 31 ++++++++++++++++----
 csvwlib/converter/ToRDFConverter.py    | 20 ++++++++++++-----
 csvwlib/utils/DOPUtils.py              |  2 +-
 csvwlib/utils/MetadataLocator.py       | 19 +++++++++++++---
 csvwlib/utils/json/CommonProperties.py | 12 ++++++++--
 csvwlib/utils/metadata.py              | 14 ++++++++----
 csvwlib/utils/rdf/OntologyUtils.py     |  3 ++-
 csvwlib/utils/url/UriTemplateUtils.py  |  1 -
 8 files changed, 72 insertions(+), 30 deletions(-)

diff --git a/csvwlib/converter/ModelConverter.py b/csvwlib/converter/ModelConverter.py
index ab18c8a..d75ff1a 100644
--- a/csvwlib/converter/ModelConverter.py
+++ b/csvwlib/converter/ModelConverter.py
@@ -24,7 +24,7 @@ def __init__(self, csv_url=None, metadata_url=None):
         self.csvs = None
         self.values_valiator = None
         self.metadata_url = metadata_url
-        self.start_url = csv_url if csv_url is not None else metadata_url
+        self.start_url = csv_url if csv_url is not None else (metadata_url if not isinstance(metadata_url,dict) else metadata_url.get('url'))
         self.metadata = None
         self.atdm = {'@type': '@AnnotatedTableGroup'}
         self.mode = CONST_STANDARD_MODE
@@ -34,6 +34,8 @@ def convert_to_atdm(self, mode=CONST_STANDARD_MODE):
         metadata_validator = MetadataValidator(self.start_url)
         self.mode = mode
         self.metadata = MetadataLocator.find_and_get(self.csv_url, self.metadata_url)
+        if self.metadata_url and (isinstance(self.metadata_url,dict) or not self.metadata_url.startswith('http')):
+            self.metadata_url = "http://example.com/metadata"
         self._normalize_metadata_base_url()
         self._normalize_metadata_csv_url()
         metadata_validator.validate_metadata(self.metadata)
@@ -75,14 +77,17 @@ def _add_table_metadata(table_metadata, table):
     def _normalize_metadata_base_url(self):
         if self.metadata is None:
             return
-        for context_entry in self.metadata['@context']:
-            if type(context_entry) is dict and '@base' in context_entry:
-                original_url = self.metadata['url']
-                if original_url.startswith('http'):
-                    directory, file_name = original_url.rsplit('/', 1)
-                    self.metadata['url'] = directory + '/' + context_entry['@base'] + file_name
-                else:
-                    self.metadata['url'] = context_entry['@base'] + self.metadata['url']
+        if isinstance(self.metadata,dict):
+            for context_entry in self.metadata.get('@context',[]):
+                if type(context_entry) is dict and '@base' in context_entry:
+                    original_url = self.metadata["url"]
+                    if original_url.startswith('http'):
+                        directory, file_name = original_url.rsplit('/', 1)
+                        self.metadata['url'] = directory + '/' + context_entry['@base'] + file_name
+                    else:
+                        self.metadata['url'] = context_entry['@base'] + self.metadata['url']
+        else:
+            print(f"{self.metadata} not dict")
 
     def _normalize_metadata_csv_url(self):
         """ Expands 'url' properties if necessary """
@@ -108,7 +113,7 @@ def _fetch_csvs(self):
                 CSVUtils.parse_csv_from_url_to_list(table['url'], self._delimiter(table)),
                 self.metadata['tables']))
         else:
-            self.csvs = [CSVUtils.parse_csv_from_url_to_list(self.metadata['url'], self._delimiter(self.metadata))]
+            self.csvs = [CSVUtils.parse_csv_from_url_to_list(self.metadata.get('url'), self._delimiter(self.metadata))]
 
     @staticmethod
     def _delimiter(metadata):
@@ -201,7 +206,9 @@ def _normalize_column_names(self):
             for i, column in enumerate(table['tableSchema']['columns'], start=1):
                 if 'name' not in column:
                     language = JSONLDUtils.language(self.metadata['@context'], table)
-                    titles = column['titles'] if type(column['titles']) is list else [column['titles']]
+                    titles = column.get('titles',[])
+                    if type(titles) is not list:
+                        titles = [titles]
                     if language is None:
                         column['name'] = DOPUtils.natural_language_first_value(titles)
                     else:
@@ -270,7 +277,7 @@ def _set_default_values(self):
             for i, column_metadata in enumerate(table_metadata['tableSchema']['columns']):
                 if 'default' in column_metadata:
                     for row in csv:
-                        if row[i] == '':
+                        if i < len(row) and row[i] == '':
                             row[i] = column_metadata['default']
 
     def _normalize_numbers_notation(self):
diff --git a/csvwlib/converter/ToRDFConverter.py b/csvwlib/converter/ToRDFConverter.py
index be66f54..93bb258 100644
--- a/csvwlib/converter/ToRDFConverter.py
+++ b/csvwlib/converter/ToRDFConverter.py
@@ -70,15 +70,23 @@ def parse_virtual_columns(self, row_node, atdm_row, table_metadata):
                 continue
 
             subject = URIRef(UriTemplateUtils.insert_value(virtual_column['aboutUrl'], atdm_row, '',
                                                            table_metadata['url']))
+            predicate = Namespaces.get_term(virtual_column['propertyUrl'])
-            obj = UriTemplateUtils.insert_value(virtual_column['valueUrl'], atdm_row, '', table_metadata['url'])
-            obj = CommonProperties.expand_property_if_possible(obj)
-            self.graph.add((subject, predicate, URIRef(obj)))
-            if self.mode == CONST_STANDARD_MODE:
-                self.graph.add((row_node, CSVW.describes, subject))
+            if predicate:
+
+                if 'valueUrl' in virtual_column:
+                    obj = UriTemplateUtils.insert_value(virtual_column['valueUrl'], atdm_row, '', table_metadata['url'])
+                    obj = CommonProperties.expand_property_if_possible(obj)
+                    self.graph.add((subject, predicate, URIRef(obj)))
+                elif 'default' in virtual_column:
+                    self.graph.add((subject, predicate, self._object_node(virtual_column['default'], virtual_column, atdm_row, '')))
+                if self.mode == CONST_STANDARD_MODE:
+                    self.graph.add((row_node, CSVW.describes, subject))
+            else:
+                print(f"term {virtual_column['propertyUrl']} not in namespaces")
 
     def _add_file_metadata(self, metadata, node):
-        language = JSONLDUtils.language(self.metadata['@context'])
+        language = JSONLDUtils.language(self.metadata.get('@context',[]))
         for key, value in metadata.items():
             if CommonProperties.is_common_property(key) or key == 'notes':
                 triples = CommonProperties.property_to_triples((key, metadata[key]), node, language)
diff --git a/csvwlib/utils/DOPUtils.py b/csvwlib/utils/DOPUtils.py
index 628179a..da932fc 100644
--- a/csvwlib/utils/DOPUtils.py
+++ b/csvwlib/utils/DOPUtils.py
@@ -7,4 +7,4 @@ def natural_language_first_value(property_value):
         if type(property_value) is str:
             return property_value
         elif type(property_value) is list:
-            return property_value[0]
+            return next(iter(property_value or []), None)
diff --git a/csvwlib/utils/MetadataLocator.py b/csvwlib/utils/MetadataLocator.py
index e9f4830..9b06752 100644
--- a/csvwlib/utils/MetadataLocator.py
+++ b/csvwlib/utils/MetadataLocator.py
@@ -1,6 +1,7 @@
 import json as jsonlib
 
-import requests
+import requests, os
+
 
 from csvwlib.utils.metadata import MetadataValidator
 from csvwlib.utils.url.WellKnownUriResolver import WellKnownUriResolver
@@ -11,7 +12,20 @@ class MetadataLocator:
     @staticmethod
     def find_and_get(csv_url, metadata_url=None):
         if metadata_url is not None:
-            return jsonlib.loads(requests.get(metadata_url).content.decode())
+            if isinstance(metadata_url, dict):  # md already parsed
+                return jsonlib.loads(jsonlib.dumps(metadata_url))
+            try:
+                md = jsonlib.loads(metadata_url)  # expect json?
+                if not isinstance(md,dict):
+                    raise Exception('metadata not dict')
+                return md
+            except ValueError as e:
+                if metadata_url.startswith('http'):  # if url
+                    return jsonlib.loads(requests.get(metadata_url).content.decode())
+                elif os.path.exists(metadata_url):  # expect local file?
+                    with open(metadata_url,"r") as f:
+                        return jsonlib.loads(f.read())
+            return None
 
         response = requests.head(csv_url)
         if 'Link' in response.headers and 'describedby' in response.links:
@@ -24,7 +38,6 @@ def find_and_get(csv_url, metadata_url=None):
         metadata = MetadataLocator._retrieve_from_site_wide_conf(csv_url)
         if metadata is not None:
             return metadata
-
         if '?' in csv_url:
             csv_url, query = csv_url.split('?')
             metadata_url = csv_url + '-metadata.json'
diff --git a/csvwlib/utils/json/CommonProperties.py b/csvwlib/utils/json/CommonProperties.py
index 0909592..6b9fda9 100644
--- a/csvwlib/utils/json/CommonProperties.py
+++ b/csvwlib/utils/json/CommonProperties.py
@@ -80,9 +80,17 @@ def expand_property_if_possible(prop):
             return prop
 
         prefix, prop = prop.split(':')
-        return Namespaces.get(prefix).term(prop)
+        return CommonProperties.ns_has_term(prefix, prop)
 
     @staticmethod
     def expand_property(prop):
         prefix, prop = prop.split(':')
-        return Namespaces.get(prefix).term(prop)
+        return CommonProperties.ns_has_term(prefix, prop)
+
+    @staticmethod
+    def ns_has_term(prefix, prop):
+        pf = Namespaces.get(prefix)
+        if pf:
+            return pf.term(prop)
+        else:
+            raise Exception(f'Namespace {prefix} for {prop} not registered')
\ No newline at end of file
diff --git a/csvwlib/utils/metadata.py b/csvwlib/utils/metadata.py
index d1b3fbb..bd5d5f9 100644
--- a/csvwlib/utils/metadata.py
+++ b/csvwlib/utils/metadata.py
@@ -75,22 +75,26 @@ class MetadataValidator:
     def __init__(self, start_url):
         MetadataValidator.instance = self
         self.metadata = {}
-        self.start_url = start_url
+        if isinstance(start_url,dict):
+            if 'url' in start_url.keys():
+                self.start_url = start_url['url']
+        else:
+            self.start_url = start_url
         self.warnings = []
         self.table = {}
 
     def validate_metadata(self, metadata):
-        if metadata is None:
+        if metadata is None or not isinstance(metadata, dict):
             return
         self.metadata = metadata
 
-        if 'tableSchema' in metadata:
+        if 'tableSchema' in metadata.keys():
             tables = [metadata]
         else:
             tables = metadata['tables']
         for table in tables:
             self.table = table
-            if 'tables' in metadata:
+            if 'tables' in metadata.keys():
                 self.check_member_property('tableGroup', metadata)
             else:
                 self.check_member_property('table', metadata)
@@ -101,6 +105,8 @@ def validate_metadata(self, metadata):
             self.check_titles(table)
 
     def check_csv_reference(self, table, metadata):
+        if isinstance(self.start_url,dict):
+            return
         if not self.start_url.endswith('.csv'):
             return
         if table['url'] != self.start_url:
diff --git a/csvwlib/utils/rdf/OntologyUtils.py b/csvwlib/utils/rdf/OntologyUtils.py
index c5e8afd..d6aa992 100644
--- a/csvwlib/utils/rdf/OntologyUtils.py
+++ b/csvwlib/utils/rdf/OntologyUtils.py
@@ -21,4 +21,5 @@ def type(column_metadata):
             return None
 
         datatype = OntologyUtils._name_mappings.get(datatype, datatype)
-        return OntologyUtils._type_mappings.get(datatype, XSD.term(datatype))
+        xsd_type = getattr(XSD, datatype)
+        return OntologyUtils._type_mappings.get(datatype, xsd_type)
diff --git a/csvwlib/utils/url/UriTemplateUtils.py b/csvwlib/utils/url/UriTemplateUtils.py
index b8a18ec..2d5fdcd 100644
--- a/csvwlib/utils/url/UriTemplateUtils.py
+++ b/csvwlib/utils/url/UriTemplateUtils.py
@@ -27,7 +27,6 @@ def expand_template(template: str, row: dict) -> str:
         """
         def replacer(match):
             var = match.group(1)
-            print('r',row)
             if var in row.keys():
                 return str(row[var])
             raise KeyError(f"Missing value for template variable '{var}'")
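
Reviewer note (not part of the patch): a quick, hypothetical way to exercise
the new MetadataLocator.find_and_get() input handling after applying this
change; the URL and file name below are placeholders.

    from csvwlib.utils.MetadataLocator import MetadataLocator

    md = {'url': 'http://example.com/data.csv'}

    # dict input is passed through (copied via a json round-trip)
    assert MetadataLocator.find_and_get(None, md) == md

    # a JSON string is parsed into a dict
    assert MetadataLocator.find_and_get(None, '{"url": "http://example.com/data.csv"}') == md

    # a local file path is read and parsed, assuming the file exists
    MetadataLocator.find_and_get(None, 'data-metadata.json')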