Skip to content
Snippets Groups Projects
Commit 9c488690 authored by Jens Bröder's avatar Jens Bröder
Browse files

Add logic to download external schemas for validation.

parent 4addc84b
No related branches found
No related tags found
3 merge requests!11Make release 1.0.0,!10Release tag 1.0.0 to main,!4Update OAI config
Pipeline #204485 failed
......@@ -10,6 +10,17 @@
Module containing the Data model for linked data close to the unhide project, which wraps the original data stores
metadata and provenance data together with derived data for the actual graph
"""
import json
from pathlib import Path
from typing import Optional
from typing import Tuple
from pyshacl import validate as shacl_validate
from rdflib import Graph
from data_mining.util.external_schemas import load_external_schema
SCHEMA_ORG_SHAPE = load_external_schema('schema_org_shacl')
class LinkedDataObject():
......@@ -18,11 +29,11 @@ class LinkedDataObject():
{
metadata: {}
prov: {}
original: {}
derived: {}
patch_stack: []
}
Each LinkedDataObject usually has a representative file on disk or data object in an object store
The derived data will make it into the Unhide graph.
# Provenance might/should be tracked somewhere externally, like through AiiDA
......@@ -31,9 +42,9 @@ class LinkedDataObject():
# like if it is stored in disk, or in an objectstore or some other database
"""
def __init__(self, orgi_data: dict, metadata=None, validate=True):
def __init__(self, orgi_data: dict, metadata: dict = None, validate=True):
"""
Initialize an UnhideData instance
Initialize a LinkedDataObject instance
:param orgi_data: jsonld dict of the original data instance
:param metadata: dict which contains further metadata to be stored
......@@ -44,10 +55,14 @@ class LinkedDataObject():
if metadata is None:
metadata = derive_metadata(data)
derived = derive_data(data)
# store original and derived as graphs
patch_stack, derived = derive_data(data)
self.patch_stack = patch_stack
data = {'original': orgi_data, 'metadata': metadata, 'derived': derived}
self.data = data
self.validate()
if validate:
self.validate()
def pack_jsonlddata(self, data: dict, metadata: dict):
"""
......@@ -76,11 +91,37 @@ class LinkedDataObject():
"""
self.data['derived'] = derived
def validate(self, shape_graph: Optional[Graph] = None, original_only: bool = False):
    """
    Run a SHACL validation on the stored original data, and by default also
    on the derived data.

    :param shape_graph: SHACL shapes to validate against; defaults to the
        schema.org SHACL shapes (SCHEMA_ORG_SHAPE)
    :param original_only: if True, validate only the original data and skip
        the derived data
    :return: True if everything validated conforms to the shape graph
    """
    shape_graph = shape_graph or SCHEMA_ORG_SHAPE
    # pyshacl returns a (conforms, results_graph, results_text) triple;
    # only the conformance flag is used here.
    conforms, _results_graph, _results_text = shacl_validate(self.data['original'], shacl_graph=shape_graph)
    if not original_only:
        conforms_derived, _results_graph, _results_text = shacl_validate(self.data['derived'], shacl_graph=shape_graph)
        conforms = conforms and conforms_derived
    return conforms
def serialize(self, destination: Path, graph_format='json-ld'):
    """
    Write this object to *destination* as a single JSON document.

    The metadata, original and derived graph data are embedded as-is, and
    each entry of the patch stack is stored in its string form.
    """
    document = {
        'metadata': self.get_meta(),
        'original': self.data['original'],
        'derived': self.data['derived'],
        'patch_stack': [entry.to_string() for entry in self.patch_stack],
        #RDF_patch_stack_ids: []
    }
    with open(destination, 'w', encoding='utf-8') as out_file:
        json.dump(document, out_file)
def derive_metadata(data: dict) -> dict:
......@@ -89,7 +130,7 @@ def derive_metadata(data: dict) -> dict:
return data
def derive_data(data: dict) -> Tuple[list, dict]:
    """
    Derive data from the given data and complete it.

    Currently a stub: it returns an empty patch stack and the data unchanged.

    :param data: jsonld dict of the original data
    :return: tuple of (patch_stack, derived_data)
    """
    # steps to do:
    # 2. Apply shacl rule list and infer triples
    # 3. store prov metadata on it in metadata
    patch_stack: list = []
    return patch_stack, data
# -*- coding: utf-8 -*-
#############################################################################################
# Copyright (c), Helmholtz Metadata Collaboration (HMC). All rights reserved. #
# This file is part of the data-mining package. #
# The code is hosted at https://codebase.helmholtz.cloud/hmc/hmc-public/unhide/data_mining #
# For further information on the license, see the LICENSE file #
# For further information please visit https://www.helmholtz-metadaten.de/en #
#############################################################################################
"""
Utility to download, load and process external schemas needed for example validation
"""
from pathlib import Path
import requests
from rdflib import Graph
# Local cache directory for downloaded schemas: <repo root>/external_schema
EXTERNAL_SCHEMAS_FOLDER = Path(__file__).resolve().parent.parent.parent / 'external_schema'
# Maps a schema short name to the canonical URL it is downloaded from.
KNOWN_SCHEMAS = {
    'schema_org': 'https://schema.org/version/latest/schemaorg-current-https.jsonld',
    'schema_org_shacl': 'https://datashapes.org/schema.jsonld',
    'codemeta': 'https://doi.org/10.5063/schema/codemeta-2.0'
}
def load_external_schema(schema_name: str = 'schema_org_shacl') -> Graph:
    """
    Return a known external schema as an rdflib Graph.

    The schema is read from the local cache folder if present; otherwise it
    is downloaded from its known URL and cached on disk first, so subsequent
    calls work offline.

    :param schema_name: key into KNOWN_SCHEMAS selecting the schema to load
    :raises ValueError: if schema_name is not a known schema
    :raises requests.HTTPError: if the download fails
    """
    if schema_name not in KNOWN_SCHEMAS:
        raise ValueError(f'Schema: {schema_name} not known. Could not be loaded.')
    schema_path = EXTERNAL_SCHEMAS_FOLDER / f'{schema_name}.jsonld'
    if not schema_path.exists():
        # Ensure the cache folder exists before writing into it.
        EXTERNAL_SCHEMAS_FOLDER.mkdir(parents=True, exist_ok=True)
        response = requests.get(KNOWN_SCHEMAS[schema_name], timeout=(10, 100))
        # Fail loudly instead of caching an HTTP error page as a schema.
        response.raise_for_status()
        schema_path.write_text(response.text, encoding='utf-8')
    schema = Graph().parse(schema_path)
    return schema
......@@ -20,7 +20,7 @@ def convert_xml_to_json(xml_data: str, to_replace: Optional[List[tuple]] = None)
"""Convert XML data given as a string to JSON.
:param xml_data: xml data as string
:type xml_data: [type]
:type xml_data: str
"""
if to_replace is None:
to_replace = [('r3d:', ''), (r' content="', r' content_save="')]
......
......@@ -4,6 +4,9 @@ In this folder you can put external schemas against which the data (json-ld) sho
for example the schema.org schema:
https://schema.org/version/latest/schemaorg-current-https.jsonld
shacl shapes of schema.org:
https://datashapes.org/schema.jsonld
or the codemeta.json schema:
https://doi.org/10.5063/schema/codemeta-2.0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment