From 1bcbb0fc1ff334e930fdee6bc172916976ff30f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Preu=C3=9F?= <gabriel.preuss@helmholtz-berlin.de> Date: Wed, 26 Mar 2025 17:01:23 +0100 Subject: [PATCH 1/4] issue 133 split up toolbox to 3 independent crons: harvesting, find linked data pubs, assessment --- .env.example | 2 + .pre-commit-config.yaml | 4 + REUSE.toml | 2 +- docker-compose.yaml | 2 +- poetry.lock | 89 ++-- pyproject.toml | 6 +- .../fuji_scorer/test_fuji_scorer.py | 46 +- toolbox/__init__.py | 3 - toolbox/config.py | 44 ++ toolbox/data_enricher/__init__.py | 6 - toolbox/data_enricher/fair_meter/__init__.py | 2 - .../fair_meter/fuji_scorer/__init__.py | 3 - .../fair_meter/fuji_scorer/fuji_conf.py | 22 - .../fair_meter/fuji_scorer/fuji_scorer.py | 92 ---- toolbox/data_enricher/linked_data_finder.py | 463 ++++++++---------- .../db_models => database}/config.py | 0 .../models}/DataPublication/data_pub_get.py | 30 +- .../DataPublication/data_pub_insert.py | 31 +- .../DataPublication/data_pub_update.py | 44 +- .../models}/LitPub/lit_pub_get.py | 58 ++- .../models}/LitPub/lit_pub_insert.py | 10 +- .../models}/LitPub/lit_pub_update.py | 10 +- .../models}/TableModules/fuji_score.py | 6 +- .../models}/TableModules/publication.py | 11 +- .../TableModules/publication_authors.py | 2 +- .../TableModules/publication_identifier.py | 11 +- .../models}/TableModules/reference.py | 0 .../TableModules/related_identifier.py | 0 toolbox/database/models/cron_assessment.py | 52 ++ .../database/models/cron_data_enrichment.py | 46 ++ toolbox/database/models/cron_last_run.py | 26 + .../exception/multiple_datapubs_error.py | 0 .../exception/publication_exists_error.py | 0 .../exception/publication_mismatch_error.py | 0 .../exception/publication_not_found_error.py | 8 +- toolbox/database/models/publication.py | 35 ++ .../mysql_connection.py | 31 +- .../db_models => database}/schema.sql | 36 +- toolbox/exporter/__init__.py | 3 - toolbox/exporter/db_exporter.py | 49 +- toolbox/exporter/db_models/__init__.py | 0 toolbox/exporter/db_models/publication_db.py | 55 --- toolbox/extractor/extract.py | 171 +++++++ .../metadata_extractor/__init__.py | 0 .../abstract_metadata_extractor.py | 0 .../dublin_core_xml_metadata_extractor.py | 0 .../marc_xml_metadata_extractor.py | 0 .../publication_validator.py | 0 .../fuji_scorer}/__init__.py | 0 toolbox/fair_meter/fuji_scorer/fuji_conf.py | 19 + toolbox/fair_meter/fuji_scorer/fuji_scorer.py | 68 +++ toolbox/toolbox.py | 352 ++++--------- .../type_definitions/data_metadata_dict.py | 57 ++- 53 files changed, 1044 insertions(+), 963 deletions(-) create mode 100644 toolbox/config.py delete mode 100644 toolbox/data_enricher/__init__.py delete mode 100644 toolbox/data_enricher/fair_meter/__init__.py delete mode 100644 toolbox/data_enricher/fair_meter/fuji_scorer/__init__.py delete mode 100644 toolbox/data_enricher/fair_meter/fuji_scorer/fuji_conf.py delete mode 100644 toolbox/data_enricher/fair_meter/fuji_scorer/fuji_scorer.py rename toolbox/{exporter/db_models => database}/config.py (100%) rename toolbox/{exporter/db_models => database/models}/DataPublication/data_pub_get.py (76%) rename toolbox/{exporter/db_models => database/models}/DataPublication/data_pub_insert.py (59%) rename toolbox/{exporter/db_models => database/models}/DataPublication/data_pub_update.py (66%) rename toolbox/{exporter/db_models => database/models}/LitPub/lit_pub_get.py (63%) rename toolbox/{exporter/db_models => database/models}/LitPub/lit_pub_insert.py (93%) rename toolbox/{exporter/db_models => 
database/models}/LitPub/lit_pub_update.py (88%) rename toolbox/{exporter/db_models => database/models}/TableModules/fuji_score.py (91%) rename toolbox/{exporter/db_models => database/models}/TableModules/publication.py (93%) rename toolbox/{exporter/db_models => database/models}/TableModules/publication_authors.py (96%) rename toolbox/{exporter/db_models => database/models}/TableModules/publication_identifier.py (96%) rename toolbox/{exporter/db_models => database/models}/TableModules/reference.py (100%) rename toolbox/{exporter/db_models => database/models}/TableModules/related_identifier.py (100%) create mode 100644 toolbox/database/models/cron_assessment.py create mode 100644 toolbox/database/models/cron_data_enrichment.py create mode 100644 toolbox/database/models/cron_last_run.py rename toolbox/{exporter/db_models => database/models}/exception/multiple_datapubs_error.py (100%) rename toolbox/{exporter/db_models => database/models}/exception/publication_exists_error.py (100%) rename toolbox/{exporter/db_models => database/models}/exception/publication_mismatch_error.py (100%) rename toolbox/{exporter/db_models => database/models}/exception/publication_not_found_error.py (54%) create mode 100644 toolbox/database/models/publication.py rename toolbox/{exporter/db_models => database}/mysql_connection.py (51%) rename toolbox/{exporter/db_models => database}/schema.sql (92%) delete mode 100644 toolbox/exporter/__init__.py delete mode 100644 toolbox/exporter/db_models/__init__.py delete mode 100644 toolbox/exporter/db_models/publication_db.py create mode 100644 toolbox/extractor/extract.py rename toolbox/{data_enricher => extractor}/metadata_extractor/__init__.py (100%) rename toolbox/{data_enricher => extractor}/metadata_extractor/abstract_metadata_extractor.py (100%) rename toolbox/{data_enricher => extractor}/metadata_extractor/dublin_core_xml_metadata_extractor.py (100%) rename toolbox/{data_enricher => extractor}/metadata_extractor/marc_xml_metadata_extractor.py (100%) rename toolbox/{data_enricher => extractor}/publication_validator.py (100%) rename toolbox/{exporter/db_models/TableModules => fair_meter/fuji_scorer}/__init__.py (100%) create mode 100644 toolbox/fair_meter/fuji_scorer/fuji_conf.py create mode 100644 toolbox/fair_meter/fuji_scorer/fuji_scorer.py diff --git a/.env.example b/.env.example index 9dfaf7a..95cf71f 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,5 @@ +DEBUG=0 + # toolbox setup MAX_THREADS=1 FUJI_HOST=fuji_server diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 175b997..4393ba7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,6 +50,7 @@ repos: - pymysql - pytest - pathvalidate + - python-dotenv args: [ '--extension-pkg-whitelist=lxml.etree', '--max-args=6', @@ -59,6 +60,9 @@ repos: rev: v3.14.0 hooks: - id: reorder-python-imports + args: [ + '--py39-plus' + ] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.15.0 diff --git a/REUSE.toml b/REUSE.toml index c36bfb5..0ba0dc6 100644 --- a/REUSE.toml +++ b/REUSE.toml @@ -18,7 +18,7 @@ SPDX-License-Identifier = "CC0-1.0" [[annotations]] path = [ "toolbox/**/*.py", - "toolbox/exporter/db_models/schema.sql", + "toolbox/database/schema.sql", "test/**/*.py", "test/**/*.json", "test/**/*.xml", diff --git a/docker-compose.yaml b/docker-compose.yaml index ed72125..234bd0e 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -35,7 +35,7 @@ services: - MARIADB_PASSWORD=${TOOLBOX_DB_PASS} - MARIADB_RANDOM_ROOT_PASSWORD=true volumes: - - 
./toolbox/exporter/db_models/schema.sql:/docker-entrypoint-initdb.d/add_schema.sql + - ./toolbox/database/schema.sql:/docker-entrypoint-initdb.d/add_schema.sql - ./mariadb:/var/lib/mysql networks: - toolbox_internal diff --git a/poetry.lock b/poetry.lock index 213dbfd..12e5286 100644 --- a/poetry.lock +++ b/poetry.lock @@ -184,14 +184,14 @@ test = ["pytest (>=6)"] [[package]] name = "filelock" -version = "3.17.0" +version = "3.18.0" description = "A platform independent file lock." optional = false python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"}, - {file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"}, + {file = "filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de"}, + {file = "filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2"}, ] [package.extras] @@ -201,14 +201,14 @@ typing = ["typing-extensions (>=4.12.2)"] [[package]] name = "identify" -version = "2.6.8" +version = "2.6.9" description = "File identification library for Python" optional = false python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "identify-2.6.8-py2.py3-none-any.whl", hash = "sha256:83657f0f766a3c8d0eaea16d4ef42494b39b34629a4b3192a9d020d349b3e255"}, - {file = "identify-2.6.8.tar.gz", hash = "sha256:61491417ea2c0c5c670484fd8abbb34de34cdae1e5f39a73ee65e48e4bb663fc"}, + {file = "identify-2.6.9-py2.py3-none-any.whl", hash = "sha256:c98b4322da415a8e5a70ff6e51fbc2d2932c015532d77e9f8537b4ba7813b150"}, + {file = "identify-2.6.9.tar.gz", hash = "sha256:d40dfe3142a1421d8518e3d3985ef5ac42890683e32306ad614a29490abeb6bf"}, ] [package.extras] @@ -231,26 +231,26 @@ all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2 [[package]] name = "iniconfig" -version = "2.0.0" +version = "2.1.0" description = "brain-dead simple config-ini parsing" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["test"] files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, + {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, + {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] [[package]] name = "jinja2" -version = "3.1.5" +version = "3.1.6" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" groups = ["dev"] files = [ - {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, - {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, + {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"}, + {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"}, ] [package.dependencies] @@ -627,20 +627,20 @@ dev = ["hypothesis", "mypy", "pdoc-pyo3-sample-library (==1.0.11)", "pygments (> [[package]] name = "platformdirs" -version = "4.3.6" +version = "4.3.7" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, - {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, + {file = "platformdirs-4.3.7-py3-none-any.whl", hash = "sha256:a03875334331946f13c549dbd8f4bac7a13a50a895a0eb1e8c6a8ace80d40a94"}, + {file = "platformdirs-4.3.7.tar.gz", hash = "sha256:eb437d586b6a0986388f0d6f74aa0cde27b48d0e3d66843640bfb6bdcdb6e351"}, ] [package.extras] -docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] -type = ["mypy (>=1.11.2)"] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.4)", "pytest-cov (>=6)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.14.1)"] [[package]] name = "pluggy" @@ -761,6 +761,21 @@ pytest = ">=6.2.5" [package.extras] dev = ["pre-commit", "pytest-asyncio", "tox"] +[[package]] +name = "python-dotenv" +version = "1.1.0" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d"}, + {file = "python_dotenv-1.1.0.tar.gz", hash = "sha256:41f90bc6f5f177fb41f53e87666db362025010eb28f60a01c9143bfa33a2b2d5"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "pyyaml" version = "6.0.2" @@ -866,14 +881,14 @@ fixture = ["fixtures"] [[package]] name = "rich" -version = "13.9.4" +version = "14.0.0" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false python-versions = ">=3.8.0" groups = ["main"] files = [ - {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, - {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, + {file = "rich-14.0.0-py3-none-any.whl", hash = "sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0"}, + {file = "rich-14.0.0.tar.gz", hash = "sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725"}, ] [package.dependencies] @@ -886,19 +901,19 @@ jupyter = ["ipywidgets 
(>=7.5.1,<9)"] [[package]] name = "setuptools" -version = "75.8.0" +version = "78.1.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3"}, - {file = "setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6"}, + {file = "setuptools-78.1.0-py3-none-any.whl", hash = "sha256:3e386e96793c8702ae83d17b853fb93d3e09ef82ec62722e61da5cd22376dcd8"}, + {file = "setuptools-78.1.0.tar.gz", hash = "sha256:18fd474d4a82a5f83dac888df697af65afa82dec7323d09c3e37d1f14288da54"}, ] [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.8.0)"] -core = ["importlib_metadata (>=6)", "jaraco.collections", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +core = ["importlib_metadata (>=6)", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] @@ -978,14 +993,14 @@ files = [ [[package]] name = "typer" -version = "0.15.1" +version = "0.15.2" description = "Typer, build great CLIs. Easy to code. Based on Python type hints." 
optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "typer-0.15.1-py3-none-any.whl", hash = "sha256:7994fb7b8155b64d3402518560648446072864beefd44aa2dc36972a5972e847"}, - {file = "typer-0.15.1.tar.gz", hash = "sha256:a0588c0a7fa68a1978a069818657778f86abe6ff5ea6abf472f940a08bfe4f0a"}, + {file = "typer-0.15.2-py3-none-any.whl", hash = "sha256:46a499c6107d645a9c13f7ee46c5d5096cae6f5fc57dd11eccbbb9ae3e44ddfc"}, + {file = "typer-0.15.2.tar.gz", hash = "sha256:ab2fab47533a813c49fe1f16b1a370fd5819099c00b119e0633df65f22144ba5"}, ] [package.dependencies] @@ -996,14 +1011,14 @@ typing-extensions = ">=3.7.4.3" [[package]] name = "typing-extensions" -version = "4.12.2" +version = "4.13.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, - {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, + {file = "typing_extensions-4.13.0-py3-none-any.whl", hash = "sha256:c8dd92cc0d6425a97c18fbb9d1954e5ff92c1ca881a309c45f06ebc0b79058e5"}, + {file = "typing_extensions-4.13.0.tar.gz", hash = "sha256:0a4ac55a5820789d87e297727d229866c9650f6521b64206413c4fbada24d95b"}, ] [[package]] @@ -1026,14 +1041,14 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "virtualenv" -version = "20.29.2" +version = "20.30.0" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.8" groups = ["dev"] files = [ - {file = "virtualenv-20.29.2-py3-none-any.whl", hash = "sha256:febddfc3d1ea571bdb1dc0f98d7b45d24def7428214d4fb73cc486c9568cce6a"}, - {file = "virtualenv-20.29.2.tar.gz", hash = "sha256:fdaabebf6d03b5ba83ae0a02cfe96f48a716f4fae556461d180825866f75b728"}, + {file = "virtualenv-20.30.0-py3-none-any.whl", hash = "sha256:e34302959180fca3af42d1800df014b35019490b119eba981af27f2fa486e5d6"}, + {file = "virtualenv-20.30.0.tar.gz", hash = "sha256:800863162bcaa5450a6e4d721049730e7f2dae07720e0902b0e4040bd6f9ada8"}, ] [package.dependencies] @@ -1048,4 +1063,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [metadata] lock-version = "2.1" python-versions = "^3.10" -content-hash = "35e15b16819a652f433f94c1b302c3eff947be56428f3fe9520615817334bfb0" +content-hash = "dfc0edbec28d7ba52622b0dd7684cd0ecb70157f33b8d0f3112315be8e45058e" diff --git a/pyproject.toml b/pyproject.toml index 720475c..6154861 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,10 +34,11 @@ pyyaml = "^6.0" pymarc = "~5.1.2" requests = "^2.32.3" sickle = "^0.7.0" -setuptools = "75.8.0" -typer = "^0.15.1" +setuptools = "^78.1.0" +typer = "^0.15.2" pymysql = "^1.1.1" pathvalidate = "^3.2.0" +python-dotenv = "^1.1.0" [tool.poetry.group.dev.dependencies] pre-commit = "^3.0.4" @@ -56,6 +57,7 @@ indent_width = 4 [tool.pylint] min-public-methods=0 max-line-length=120 +max-locals=16 min-similarity-lines=25 max-positional-arguments=6 disable='wrong-import-order, invalid-name' # conflicts with reorder_python_imports diff --git a/test/data_enricher/fair_meter/fuji_scorer/test_fuji_scorer.py b/test/data_enricher/fair_meter/fuji_scorer/test_fuji_scorer.py index d30f87c..149e8a4 100644 --- a/test/data_enricher/fair_meter/fuji_scorer/test_fuji_scorer.py +++ b/test/data_enricher/fair_meter/fuji_scorer/test_fuji_scorer.py @@ -10,10 +10,9 @@ from requests.exceptions import ConnectTimeout from 
requests.exceptions import ReadTimeout from requests_mock import Mocker -from toolbox.data_enricher.fair_meter.fuji_scorer import FUJI_API -from toolbox.data_enricher.fair_meter.fuji_scorer import FujiScorer -from toolbox.data_enricher.fair_meter.fuji_scorer import RETRY_WAITING_TIME -from toolbox.type_definitions import DataMetadata +from toolbox.fair_meter.fuji_scorer.fuji_conf import FUJI_API +from toolbox.fair_meter.fuji_scorer.fuji_conf import RETRY_WAITING_TIME +from toolbox.fair_meter.fuji_scorer.fuji_scorer import get_fuji_score class TestFujiScorer(unittest.TestCase): @@ -25,7 +24,6 @@ class TestFujiScorer(unittest.TestCase): with open(self.path, encoding='utf-8') as file: self.response = js.load(file) - self.scorer = FujiScorer() self.data_pid = "10.5281/zenodo.259703" self.data_pid_type = "doi" @@ -34,9 +32,10 @@ class TestFujiScorer(unittest.TestCase): with Mocker() as mock: mock.post(FUJI_API, json=self.response) - data = self.scorer.get_fuji_score(self.data_pid, self.data_pid_type) + data = get_fuji_score(self.data_pid, self.data_pid_type) # asserts method gives correct answers + assert data is not None self.assertEqual(data["pid"], self.data_pid) self.assertEqual(data["pid_type"], self.data_pid_type) self.assertEqual( @@ -66,7 +65,7 @@ class TestFujiScorer(unittest.TestCase): mock.post(FUJI_API, answers) # calls get_fuji_score function - answer = self.scorer.get_fuji_score(self.data_pid, self.data_pid_type) + answer = get_fuji_score(self.data_pid, self.data_pid_type) # asserts Function was called 5 times, with the right sleep time on the last call and gets an answer self.assertEqual(mock.call_count, 5) self.assertEqual(mock.request_history[4].timeout, 2200) @@ -79,7 +78,7 @@ class TestFujiScorer(unittest.TestCase): # stops normal mock object and instead uses a mock object that gives back Exceptions with Mocker() as mock: mock.post(FUJI_API, exc=ReadTimeout) - answer = self.scorer.get_fuji_score(self.data_pid, self.data_pid_type) + answer = get_fuji_score(self.data_pid, self.data_pid_type) # asserts Function was called 5 times, with the right timeout on the last call and gets an answer self.assertEqual(mock.call_count, 11) @@ -87,34 +86,3 @@ class TestFujiScorer(unittest.TestCase): sleep.assert_called_with(10 * RETRY_WAITING_TIME) self.assertIsNone(answer) - - def test_add_data(self): - """tests add_fuji_scores function""" - # generate entries for data_pubs in add_fuji_scores - entry1 = DataMetadata() - entry1.DataPID = "10.5281/zenodo.259703" - entry1.DataPIDType = "doi" - - entry2 = DataMetadata() - entry2.DataPID = "test" - entry2.DataPIDType = "doi" - - # fill data_pubs - data_pubs = [entry1, entry2] - - with Mocker() as mock: - mock.post(FUJI_API, json=self.response) - answer = self.scorer.add_fuji_scores(data_pubs) - - # asserts mock was called for each data entry - self.assertEqual(mock.call_count, len(data_pubs)) - - data_pubs[0].FAIRScores = self.scorer.get_fuji_score( - data_pubs[0].DataPID, data_pubs[0].DataPIDType - ) - data_pubs[1].FAIRScores = self.scorer.get_fuji_score( - data_pubs[1].DataPID, data_pubs[1].DataPIDType - ) - - # asserts add_fuji_scores gives back the right answer - self.assertEqual(answer, data_pubs) diff --git a/toolbox/__init__.py b/toolbox/__init__.py index 2c12ec7..43bb212 100644 --- a/toolbox/__init__.py +++ b/toolbox/__init__.py @@ -3,7 +3,4 @@ A package to harvest literature-publication metadata from publications by Helmho look for related data-publications, assess the F.A.I.R.-scores of these and output all results as JSON and/or 
CSV-files. """ -from . import data_enricher -from . import exporter -from . import harvester from . import toolbox diff --git a/toolbox/config.py b/toolbox/config.py new file mode 100644 index 0000000..91a500e --- /dev/null +++ b/toolbox/config.py @@ -0,0 +1,44 @@ +""" +CONFIG FILE +""" +import logging +import os +from dataclasses import dataclass + +from dotenv import load_dotenv + +load_dotenv() + + +@dataclass +class Config: + # pylint: disable=too-many-instance-attributes + """ + Configs are read from .env file + """ + DEBUG: bool = ( + os.environ.get("DEBUG") == "1" or os.environ.get("DEBUG", "").lower() == "true" + ) + + MAX_THREADS: int = int(os.environ.get("MAX_THREADS", "") or 1) + DAYS_UNTIL_REENRICHMENT: int = 14 + DB_USER: str = os.environ.get('TOOLBOX_DB_USER', '') + DB_PASS: str = os.environ.get('TOOLBOX_DB_PASS', '') + DB_HOST: str = os.environ.get('TOOLBOX_DB_HOST', '') + DB_NAME: str = os.environ.get('TOOLBOX_DB_NAME', '') + FUJI_HOST: str = os.environ.get('FUJI_HOST', '') + FUJI_PORT: str = os.environ.get('FUJI_PORT', '80') + FUJI_PASS: str = os.environ.get('FUJI_PASS', 'bWFydmVsOndvbmRlcndvbWFu') + FUJI_PROTOCOL: str = os.environ.get('FUJI_PROTOCOL', 'http') + + +config = Config() + +logging.basicConfig( + level=(logging.DEBUG if config.DEBUG else logging.INFO), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +if config.DEBUG: + logging.info("Following config found: %s", config) diff --git a/toolbox/data_enricher/__init__.py b/toolbox/data_enricher/__init__.py deleted file mode 100644 index 1afd95f..0000000 --- a/toolbox/data_enricher/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -""" -A package to enrich metadata from literature-publications of Helmholtz centers. -""" -from . import fair_meter -from . import linked_data_finder -from . import metadata_extractor diff --git a/toolbox/data_enricher/fair_meter/__init__.py b/toolbox/data_enricher/fair_meter/__init__.py deleted file mode 100644 index dfd5ba8..0000000 --- a/toolbox/data_enricher/fair_meter/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""A package to assess the F.A.I.R.-ness of data-publications.""" -from . import fuji_scorer diff --git a/toolbox/data_enricher/fair_meter/fuji_scorer/__init__.py b/toolbox/data_enricher/fair_meter/fuji_scorer/__init__.py deleted file mode 100644 index 411b526..0000000 --- a/toolbox/data_enricher/fair_meter/fuji_scorer/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""A package to assess the F.A.I.R.-ness of data-publications via the F-UJI Tool.""" -from .fuji_conf import * -from .fuji_scorer import FujiScorer diff --git a/toolbox/data_enricher/fair_meter/fuji_scorer/fuji_conf.py b/toolbox/data_enricher/fair_meter/fuji_scorer/fuji_conf.py deleted file mode 100644 index 915a76c..0000000 --- a/toolbox/data_enricher/fair_meter/fuji_scorer/fuji_conf.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -A collection of constants used by the class FujiScorer. 
-""" -import os - -FUJI_HOST = os.environ.get('FUJI_HOST', 'localhost') -FUJI_PORT = os.environ.get('FUJI_PORT', '1071') -FUJI_PROTOCOL = os.environ.get('FUJI_PROTOCOL', 'http') -FUJI_API = f"{FUJI_PROTOCOL}://{FUJI_HOST}:{FUJI_PORT}/fuji/api/v1/evaluate" - -REQ_DICT = {'test_debug': True, 'use_datacite': True} - -HEADERS = { - 'accept': 'application/json', - 'Authorization': 'Basic bWFydmVsOndvbmRlcndvbWFu', - 'Content-Type': 'application/json' -} - -INITIAL_TIMEOUT = 1000 -EXTRA_TIMEOUT = 300 -RETRY_WAITING_TIME = 60 -MAX_RETRY_COUNTER = 10 diff --git a/toolbox/data_enricher/fair_meter/fuji_scorer/fuji_scorer.py b/toolbox/data_enricher/fair_meter/fuji_scorer/fuji_scorer.py deleted file mode 100644 index b6f0bb2..0000000 --- a/toolbox/data_enricher/fair_meter/fuji_scorer/fuji_scorer.py +++ /dev/null @@ -1,92 +0,0 @@ -""" -A class to get a F.A.I.R.-assessment (Fairness, Accessibility, Interoperability, Reusability) from -the F-UJI-Server for a given data-publication. -""" -import logging -from typing import Optional - -from .fuji_conf import EXTRA_TIMEOUT -from .fuji_conf import FUJI_API -from .fuji_conf import HEADERS -from .fuji_conf import INITIAL_TIMEOUT -from .fuji_conf import MAX_RETRY_COUNTER -from .fuji_conf import REQ_DICT -from .fuji_conf import RETRY_WAITING_TIME -from toolbox.type_definitions import DataMetadata -from toolbox.utils.toolbox_request import ToolboxRequest - - -class FujiScorer: - """ - A class to get a F.A.I.R.-assessment (Fairness, Accessibility, Interoperability, Reusability) - from the F-UJI-Server for a given data-publication. - """ - def add_fuji_scores( - self, - data_pubs: list[DataMetadata], - ) -> list[DataMetadata]: - """ - Add the fuji score to each data publication contained in the input list, if it has a DOI. - :param data_pubs: a list of dictionaries representing data publications - :return: the input list, enriched with the FAir-scores obtained from the F-UJI API. - """ - for data_pub in data_pubs: - if data_pub.DataPID and data_pub.DataPIDType: - data_pub.FAIRScores = self.get_fuji_score( - data_pub.DataPID, data_pub.DataPIDType - ) - - return data_pubs - - def get_fuji_score( - self, - data_pid: str, - data_pid_type: str, - ) -> Optional[dict]: - """ - Assess the F.A.I.R.-ness of a data-publication via the F-UJI server and return the resulting - F-UJI scores in a dictionary. - :param data_pid: the PID of a data-publication - :param data_pid_type: the PID-type, e.g. DOI or HANDLE - :return: a dictionary of F-UJI metrics for the data-publication. - """ - json = REQ_DICT | {'object_identifier': data_pid} - logging.debug("FUJI scorer started for data_pid %s", data_pid) - request = ToolboxRequest( - initial_timeout=INITIAL_TIMEOUT, - extra_timeout=EXTRA_TIMEOUT, - retry_waiting_time=RETRY_WAITING_TIME, - max_retry_counter=MAX_RETRY_COUNTER, - ) - - fuji_response = request.post(FUJI_API, json=json, headers=HEADERS) - - return self._select_fuji_metrics(fuji_response, data_pid, data_pid_type) - - @staticmethod - def _select_fuji_metrics(rs_json: dict, pid: str, pid_type: str) -> Optional[dict]: - """ - Select important information from F-UJI response. - :param rs_json: the JSON got as a response from F-UJI - :param pid: the PID of the data-publication - :param pid: the PID-type, e.g. 
DOI or HANDLE - :return: - """ - try: - list_metric = { - 'pid': pid, - 'pid_type': pid_type, - 'metric_specification': rs_json['metric_specification'], - 'timestamp': rs_json['end_timestamp'] - } - - for score in rs_json['summary']: - for subscore in rs_json['summary'][score]: - metric = score + '_' + subscore - metric_score = rs_json['summary'][score][subscore] - list_metric[metric] = float(metric_score) - - return list_metric - - except KeyError: - return None diff --git a/toolbox/data_enricher/linked_data_finder.py b/toolbox/data_enricher/linked_data_finder.py index ff2675f..520ccbc 100644 --- a/toolbox/data_enricher/linked_data_finder.py +++ b/toolbox/data_enricher/linked_data_finder.py @@ -1,5 +1,5 @@ """ -A class to get related (i.e. linked) data-publications from the DOI of a literature publication. +A module to get related (i.e. linked) data-publications from the DOI of a literature publication. Presently the only source from which metadata on published datasets is pulled is the website Scholexplorer which is requested via API. The response comes as JSON and is further processed and returned as a special type of dictionary. @@ -17,272 +17,203 @@ from toolbox.type_definitions.data_metadata_dict import RelationshipType from toolbox.utils.mapping import map_data_publishers -class LinkedDataFinder: +def find_linked_data_publications( + lit_pid: str, + lit_pid_type: str = 'doi', +) -> list[DataMetadata]: """ - A class to get linked data-publications from the DOI of a literature publication. + Request the Scholexplorer API for dataset publications that are supplements (or otherwise + linked) to a literature publication. Process the response and return a list of objects + representing found data_publications. Add additional info via a request to the DataCite API if possible. + :param lit_pid: a pid for a literature publication + :param lit_pid_type: the pid type of the pid in lit_pid + :return: a list of dictionaries representing each a data publication """ - def find_linked_data_publications( - self, - lit_pid: str, - lit_pid_type: str = 'doi', - scholix_data: Optional[ScholexplorerRequest] = None - ) -> list[DataMetadata]: - """ - Request the Scholexplorer API for dataset publications that are supplements (or otherwise - linked) to a literature publication. Process the response and return a list of objects - representing found data_publications. Add additional info via a request to the DataCite API if possible. 
- :param lit_pid: a pid for a literature publication - :param lit_pid_type: the pid type of the pid in lit_pid - :param scholix_data If already requested scholix data can be submitted here - :return: a list of dictionaries representing each a data publication - """ - if scholix_data is None: - scholix_data = ScholexplorerRequest(lit_pid) - - data_pubs = scholix_data.get_data_pubs() - - if not data_pubs: - logging.debug("No data_pubs found for lit_pid %s", lit_pid) - - data_metadata = [ - self._to_data_metadata_dict(data_pub, lit_pid, lit_pid_type) - for data_pub in data_pubs - ] - - return self._add_datacite_info(self._merge_same_datasets(data_metadata)) - - @staticmethod - def _merge_same_datasets(datasets: list[DataMetadata]) -> list[DataMetadata]: - """ - Merges datasets if they are the same - :param datasets: list of datasets - :return: list without duplicates - """ - merged_datasets: list[DataMetadata] = [] - - for dataset in datasets: - found_dataset = next( - ( - ds for ds in merged_datasets if ds.DataPID == dataset.DataPID and - ds.DataPIDType == dataset.DataPIDType - ), None - ) - - if found_dataset: - found_dataset.RelationshipTypes += dataset.RelationshipTypes - continue - - merged_datasets.append(dataset) - - return merged_datasets - - def _to_data_metadata_dict( - self, dataset: dict[str, Any], lit_pid: str, lit_pid_type: str - ) -> DataMetadata: - """ - Convert a dictionary to a DataMetadataDict. - :param dataset: a dictionary with values form the Scholexplorer response. - :param lit_pid: the DOI of the literature publication the dataset belongs to - :return: an object representing a data publication - """ - data_metadata = DataMetadata() - data_metadata.Title = dataset['Title'] - data_metadata.Creators = self._get_creators(dataset) - data_metadata.Publishers = self._get_publishers(dataset) - data_metadata.PublicationDate = dataset['PublicationDate'] - data_metadata.Identifier = self._get_identifier(dataset) - data_metadata.DataPID, data_metadata.DataPIDType = self._get_data_pid(dataset) - data_metadata.LitPID = lit_pid - data_metadata.LitPIDType = lit_pid_type - data_metadata.RelationshipTypes = [ - self._get_relationship_type(dataset, lit_pid, lit_pid_type) - ] - data_metadata.FAIRScores = None - data_metadata.LastUpdated = datetime.today().strftime('%Y-%m-%d') - - return data_metadata - - @staticmethod - def _get_relationship_type( - dataset: dict[str, Any], lit_pid: str, lit_pid_type: str - ) -> RelationshipType: - """ - get the relationship type of the dataset to a literature - :param dataset: Date of publication - :param lit_pid: pid of a lit publication - :param lit_pid_type: type of the pid - :return: RelationshipType - """ - identifier = Identifier() - identifier.ID = lit_pid - identifier.IDScheme = lit_pid_type - - rel_type = RelationshipType() - rel_type.Name = dataset['RelationshipType']['Name'] - rel_type.SubType = dataset['RelationshipType']['SubType'] - rel_type.SubTypeSchema = dataset['RelationshipType']['SubTypeSchema'] - rel_type.RelatedLitPID = identifier - - return rel_type - - @staticmethod - def _get_creators(dataset: dict[str, Any]) -> list[str]: - """ - Get the dataset creators. 
- :param dataset: a dictionary representing a dataset - :return: a list of the creators - """ - creators = [] - - for creator in dataset['Creator']: + data_pubs = ScholexplorerRequest(lit_pid).get_data_pubs() + + if not data_pubs: + logging.debug("No data_pubs found for lit_pid %s", lit_pid) + + data_metadata = [ + _to_data_metadata_dict(data_pub, lit_pid, lit_pid_type) for data_pub in data_pubs + ] + + return _add_datacite_info(_merge_same_datasets(data_metadata)) + + +def _merge_same_datasets(datasets: list[DataMetadata]) -> list[DataMetadata]: + """ + Merges datasets if they are the same + :param datasets: list of datasets + :return: list without duplicates + """ + merged_datasets: list[DataMetadata] = [] + + for dataset in datasets: + found_dataset = next( + ( + ds for ds in merged_datasets + if ds.DataPID == dataset.DataPID and ds.DataPIDType == dataset.DataPIDType + ), None + ) + + if found_dataset: + found_dataset.RelationshipTypes += dataset.RelationshipTypes + continue + + merged_datasets.append(dataset) + + return merged_datasets + + +def _to_data_metadata_dict( + dataset: dict[str, Any], lit_pid: str, lit_pid_type: str +) -> DataMetadata: + """ + Convert a dictionary to a DataMetadataDict. + :param dataset: a dictionary with values form the Scholexplorer response. + :param lit_pid: the DOI of the literature publication the dataset belongs to + :return: an object representing a data publication + """ + data_metadata = DataMetadata() + data_metadata.Title = dataset['Title'] + data_metadata.Creators = _get_creators(dataset) + data_metadata.Publishers = _get_publishers(dataset) + data_metadata.PublicationDate = dataset['PublicationDate'] + data_metadata.Identifiers = _get_identifier(dataset) + data_metadata.LitPID = lit_pid + data_metadata.LitPIDType = lit_pid_type + data_metadata.RelationshipTypes = [ + _get_relationship_type(dataset, lit_pid, lit_pid_type) + ] + data_metadata.LastUpdated = datetime.today().strftime('%Y-%m-%d') + + return data_metadata + + +def _get_relationship_type( + dataset: dict[str, Any], lit_pid: str, lit_pid_type: str +) -> RelationshipType: + """ + get the relationship type of the dataset to a literature + :param dataset: Date of publication + :param lit_pid: pid of a lit publication + :param lit_pid_type: type of the pid + :return: RelationshipType + """ + identifier = Identifier() + identifier.ID = lit_pid + identifier.IDScheme = lit_pid_type + + rel_type = RelationshipType() + rel_type.Name = dataset['RelationshipType']['Name'] + rel_type.SubType = dataset['RelationshipType']['SubType'] + rel_type.SubTypeSchema = dataset['RelationshipType']['SubTypeSchema'] + rel_type.RelatedLitPID = identifier + + return rel_type + + +def _get_creators(dataset: dict[str, Any]) -> list[str]: + """ + Get the dataset creators. + :param dataset: a dictionary representing a dataset + :return: a list of the creators + """ + creators = [] + + for creator in dataset['Creator']: + try: + creators.append(creator['Name']) + except KeyError: try: - creators.append(creator['Name']) + creators.append(creator['name']) except KeyError: - try: - creators.append(creator['name']) - except KeyError: - pass - - return creators - - def _get_identifier(self, dataset: dict[str, Any]) -> list[Identifier]: - """ - Get the dataset identifier. 
- :param dataset: a dictionary representing a dataset - :return: a list of the identifier - """ - - identifier_list: list[Identifier] = [] - avoid_duplicates: dict[str, Optional[str]] = {} - - for ident_from_data in dataset['Identifier']: - new_ident = Identifier() - new_ident.ID = ident_from_data['ID'] - new_ident.IDScheme = ident_from_data['IDScheme'] - - if 'IDURL' in ident_from_data: - new_ident.IDURL = ident_from_data['IDURL'] - - if (ident_hash := f"{new_ident.ID}-{new_ident.IDScheme}") in avoid_duplicates: - if avoid_duplicates[ident_hash] is None and new_ident.IDURL: - self._update_identifier_url(new_ident, identifier_list) - elif new_ident.IDURL and avoid_duplicates[ident_hash] != new_ident.IDURL: - logging.debug( - "Got second mismatched identifier URL for identifier %s having old URL %s", - new_ident, avoid_duplicates[ident_hash] - ) - else: - identifier_list.append(new_ident) - avoid_duplicates[ident_hash] = new_ident.IDURL - - return identifier_list - - @staticmethod - def _update_identifier_url( - new_ident: Identifier, - identifier_list: list[Identifier], - ): - """ - Updates the dataset identifier url. - :param new_ident: new identifier - :param identifier_list: list of identifier - """ - for old_ident in identifier_list: - if old_ident.ID == new_ident.ID and old_ident.IDScheme == new_ident.IDScheme: - old_ident.IDURL = new_ident.IDURL - - @staticmethod - def _get_publishers(dataset: dict[str, Any]) -> list[str]: - """ - Get and infer the publisher names. - :param dataset: a dictionary representing a dataset - :return: a list of publisher-names - """ - publishers = [publisher['name'] for publisher in dataset['Publisher']] - publishers = map_data_publishers(publishers) - id_schemes = [identifier['IDScheme'] for identifier in dataset['Identifier']] - - if 'pdb' in id_schemes: - publishers.append('Protein Data Bank archive (PDB)') - - if 'ena' in id_schemes: - publishers.append('European Nucleotide Archive (ENA)') - - if 'uniprot' in id_schemes: - publishers.append('Universal Protein Knowledgebase (UniProt)') - - if {'genbank', 'ncbi-p', 'ncbi-n'} & set(id_schemes): - publishers.append('National Library of Medicine (NLM)') - - return publishers - - @staticmethod - def _extract_identifier(dataset: dict[str, Any]) -> dict[str, str]: - """ - Extract an identifier (PDB) from a dataset and return it as a dictionary. - :param dataset: a dictionary representing a dataset publication - :return: a dictionary containing one or more PIDs, the type of which is the key, while the PID is the - corresponding value. 
- """ - pid_dict: dict[str, str] = {} - - # first we collect all available PIDs - for identifier in dataset['Identifier']: - if isinstance(identifier['IDScheme'], - str) and isinstance(identifier['ID'], str): - if identifier['IDScheme'] not in pid_dict: - pid_dict = pid_dict | { - identifier['IDScheme'].lower(): identifier['ID'], - } - - # transform pdb to doi if none exists - if 'pdb' in pid_dict and 'doi' not in pid_dict: - pid_dict = pid_dict | {'doi': f"10.2210/pdb{pid_dict['pdb']}/pdb"} - - # use URLs if no other identifier exists - for identifier in dataset['Identifier']: - if isinstance(identifier['IDScheme'], - str) and isinstance(identifier['IDURL'], str): - if identifier['IDScheme'] not in pid_dict: - pid_dict = pid_dict | { - identifier['IDScheme'].lower(): identifier['IDURL'], - } - - return pid_dict - - def _get_data_pid( - self, - dataset: dict[str, Any], - ) -> tuple[Optional[str], Optional[str]]: - """ - Get the DOI or if none is found, converts non-DOI-identifier to DOI if possible - :param dataset: a dictionary representing a dataset - :return: the DOI or None - """ - pid_priority = ['doi', 'hdl'] - pid_list = self._extract_identifier(dataset) - - for pid_type in pid_priority: - if pid_type in pid_list: - return pid_list[pid_type], pid_type - - # if we don't have a pid from our priority list we return any pid we have - for pid_type in pid_list: - return pid_list[pid_type], pid_type - - logging.debug('No PID found for dataset %s', dataset) - - return None, None - - @staticmethod - def _add_datacite_info(data_metadata: list[DataMetadata]) -> list[DataMetadata]: - """ - Add additional information gathered from DataCite. - :param data_metadata: a dictionary representing a dataset - :return: list of datametadata - """ - for data_pub in data_metadata: - if data_pub.DataPID and data_pub.DataPIDType == 'doi': - DataCiteRequester(data_pub.DataPID).add_info_to_datapub(data_pub) - - return data_metadata + pass + + return creators + + +def _get_identifier(dataset: dict[str, Any]) -> list[Identifier]: + """ + Get the dataset identifier. + :param dataset: a dictionary representing a dataset + :return: a list of the identifier + """ + + identifier_list: list[Identifier] = [] + avoid_duplicates: dict[str, Optional[str]] = {} + + for ident_from_data in dataset['Identifier']: + new_ident = Identifier() + new_ident.ID = ident_from_data['ID'] + new_ident.IDScheme = ident_from_data['IDScheme'] + + if 'IDURL' in ident_from_data: + new_ident.IDURL = ident_from_data['IDURL'] + + if (ident_hash := f"{new_ident.ID}-{new_ident.IDScheme}") in avoid_duplicates: + if avoid_duplicates[ident_hash] is None and new_ident.IDURL: + _update_identifier_url(new_ident, identifier_list) + elif new_ident.IDURL and avoid_duplicates[ident_hash] != new_ident.IDURL: + logging.debug( + "Got second mismatched identifier URL for identifier %s having old URL %s", + new_ident, avoid_duplicates[ident_hash] + ) + else: + identifier_list.append(new_ident) + avoid_duplicates[ident_hash] = new_ident.IDURL + + return identifier_list + + +def _update_identifier_url( + new_ident: Identifier, + identifier_list: list[Identifier], +): + """ + Updates the dataset identifier url. + :param new_ident: new identifier + :param identifier_list: list of identifier + """ + for old_ident in identifier_list: + if old_ident.ID == new_ident.ID and old_ident.IDScheme == new_ident.IDScheme: + old_ident.IDURL = new_ident.IDURL + + +def _get_publishers(dataset: dict[str, Any]) -> list[str]: + """ + Get and infer the publisher names. 
+ :param dataset: a dictionary representing a dataset + :return: a list of publisher-names + """ + publishers = [publisher['name'] for publisher in dataset['Publisher']] + publishers = map_data_publishers(publishers) + id_schemes = [identifier['IDScheme'] for identifier in dataset['Identifier']] + + if 'pdb' in id_schemes: + publishers.append('Protein Data Bank archive (PDB)') + + if 'ena' in id_schemes: + publishers.append('European Nucleotide Archive (ENA)') + + if 'uniprot' in id_schemes: + publishers.append('Universal Protein Knowledgebase (UniProt)') + + if {'genbank', 'ncbi-p', 'ncbi-n'} & set(id_schemes): + publishers.append('National Library of Medicine (NLM)') + + return publishers + + +def _add_datacite_info(data_metadata: list[DataMetadata]) -> list[DataMetadata]: + """ + Add additional information gathered from DataCite. + :param data_metadata: a dictionary representing a dataset + :return: list of datametadata + """ + for data_pub in data_metadata: + if data_pub.DataPID and data_pub.DataPIDType == 'doi': + DataCiteRequester(data_pub.DataPID).add_info_to_datapub(data_pub) + + return data_metadata diff --git a/toolbox/exporter/db_models/config.py b/toolbox/database/config.py similarity index 100% rename from toolbox/exporter/db_models/config.py rename to toolbox/database/config.py diff --git a/toolbox/exporter/db_models/DataPublication/data_pub_get.py b/toolbox/database/models/DataPublication/data_pub_get.py similarity index 76% rename from toolbox/exporter/db_models/DataPublication/data_pub_get.py rename to toolbox/database/models/DataPublication/data_pub_get.py index f187d67..8d0514f 100644 --- a/toolbox/exporter/db_models/DataPublication/data_pub_get.py +++ b/toolbox/database/models/DataPublication/data_pub_get.py @@ -3,12 +3,11 @@ from typing import Optional from pymysql.cursors import DictCursor -from toolbox.exporter.db_models.config import DB_STRING_SEPERATOR -from toolbox.exporter.db_models.TableModules import fuji_score -from toolbox.exporter.db_models.TableModules import publication -from toolbox.exporter.db_models.TableModules import publication_authors -from toolbox.exporter.db_models.TableModules import publication_identifier -from toolbox.exporter.db_models.TableModules import related_identifier +from toolbox.database.config import DB_STRING_SEPERATOR +from toolbox.database.models.TableModules import publication +from toolbox.database.models.TableModules import publication_authors +from toolbox.database.models.TableModules import publication_identifier +from toolbox.database.models.TableModules import related_identifier from toolbox.type_definitions import DataMetadata from toolbox.type_definitions.data_metadata_dict import Identifier from toolbox.type_definitions.data_metadata_dict import RelationshipType @@ -16,30 +15,39 @@ from toolbox.type_definitions.publication_type import PublicationType def get(cursor: DictCursor, pid: Optional[str]) -> Optional[DataMetadata]: - """Get a data-publication from the database.""" + """Get a data-publication from the database by PID""" if not ( pub_id := publication_identifier.get_pub_id_by_pid(cursor, pid, PublicationType.DATASET) ): return None - data_pub = DataMetadata() - data_pub.Identifier = publication_identifier.get_identifier_by_pub_id(cursor, pub_id) + data_pub = get_by_pub_id(cursor, pub_id) + # ensure pids are aligned with the one we initially looked up pid_type = None - for identifier in data_pub.Identifier: + for identifier in data_pub.Identifiers: if pid == identifier.ID: pid_type = identifier.IDScheme 
data_pub.DataPID = pid data_pub.DataPIDType = pid_type + + return data_pub + + +def get_by_pub_id(cursor: DictCursor, pub_id: int) -> DataMetadata: + """Get a data-publication from the database by id from the publication table""" + data_pub = DataMetadata() + # DataPID and DataPIDType is set here implicitly! + data_pub.Identifiers = publication_identifier.get_identifier_by_pub_id(cursor, pub_id) + data_pub.Title, _, publisher_list, data_pub.PublicationDate, \ data_pub.DataInternalID, data_pub.DataInternalIDType, data_pub.LastUpdated = publication.select(cursor, pub_id) data_pub.Publishers = publisher_list.split(DB_STRING_SEPERATOR) \ if publisher_list else [] data_pub.Creators = publication_authors.select_by_pub_id(cursor, pub_id) data_pub.RelationshipTypes = _get_relationship_types(cursor, pub_id) - data_pub.FAIRScores = fuji_score.get_latest(cursor, pub_id, pid, pid_type) data_pub.RelatedIdentifiers = related_identifier.get(cursor, pub_id) if data_pub.RelationshipTypes: diff --git a/toolbox/exporter/db_models/DataPublication/data_pub_insert.py b/toolbox/database/models/DataPublication/data_pub_insert.py similarity index 59% rename from toolbox/exporter/db_models/DataPublication/data_pub_insert.py rename to toolbox/database/models/DataPublication/data_pub_insert.py index b6eef7c..bc199f3 100644 --- a/toolbox/exporter/db_models/DataPublication/data_pub_insert.py +++ b/toolbox/database/models/DataPublication/data_pub_insert.py @@ -5,13 +5,12 @@ from pymysql.constants.ER import DUP_ENTRY from pymysql.cursors import DictCursor from pymysql.err import IntegrityError -from toolbox.exporter.db_models.exception.publication_exists_error import PublicationExistsError -from toolbox.exporter.db_models.TableModules import fuji_score -from toolbox.exporter.db_models.TableModules import publication -from toolbox.exporter.db_models.TableModules import publication_authors -from toolbox.exporter.db_models.TableModules import publication_identifier -from toolbox.exporter.db_models.TableModules import reference -from toolbox.exporter.db_models.TableModules import related_identifier +from toolbox.database.models.exception.publication_exists_error import PublicationExistsError +from toolbox.database.models.TableModules import publication +from toolbox.database.models.TableModules import publication_authors +from toolbox.database.models.TableModules import publication_identifier +from toolbox.database.models.TableModules import reference +from toolbox.database.models.TableModules import related_identifier from toolbox.type_definitions import DataMetadata from toolbox.type_definitions.publication_type import PublicationType @@ -26,10 +25,6 @@ def insert(cursor: DictCursor, data_pub: DataMetadata) -> int: raise PublicationExistsError from error if error.args[0] == DUP_ENTRY else error publication_authors.insert(cursor, data_pub, data_pub_id) - if data_pub.FAIRScores: - fuji_score.insert(cursor, data_pub_id, data_pub.FAIRScores) - _insert_publication_metric(cursor, data_pub_id, 'Fuji Score') - for rel_type in data_pub.RelationshipTypes: lit_pub_id = publication_identifier.get_pub_id_by_pid( cursor, rel_type.RelatedLitPID.ID, PublicationType.LITERATURE @@ -47,17 +42,3 @@ def insert(cursor: DictCursor, data_pub: DataMetadata) -> int: related_identifier.insert(cursor, data_pub_id, data_pub.RelatedIdentifiers) return data_pub_id - - -def _insert_publication_metric( - cursor: DictCursor, - pub_id: int, - metric_type: str, -) -> None: - cursor.execute( - """ - INSERT INTO metric (publication_id, type) - VALUES (%s, %s) 
- """, - [pub_id, metric_type], - ) diff --git a/toolbox/exporter/db_models/DataPublication/data_pub_update.py b/toolbox/database/models/DataPublication/data_pub_update.py similarity index 66% rename from toolbox/exporter/db_models/DataPublication/data_pub_update.py rename to toolbox/database/models/DataPublication/data_pub_update.py index 8eea1a3..d7689ef 100644 --- a/toolbox/exporter/db_models/DataPublication/data_pub_update.py +++ b/toolbox/database/models/DataPublication/data_pub_update.py @@ -3,15 +3,14 @@ import logging from pymysql.cursors import DictCursor -from toolbox.exporter.db_models.DataPublication import data_pub_get -from toolbox.exporter.db_models.exception.multiple_datapubs_error import MultipleDataPubsError -from toolbox.exporter.db_models.exception.publication_not_found_error import PublicationNotFoundError -from toolbox.exporter.db_models.TableModules import fuji_score -from toolbox.exporter.db_models.TableModules import publication -from toolbox.exporter.db_models.TableModules import publication_authors -from toolbox.exporter.db_models.TableModules import publication_identifier -from toolbox.exporter.db_models.TableModules import reference -from toolbox.exporter.db_models.TableModules import related_identifier +from toolbox.database.models.DataPublication import data_pub_get +from toolbox.database.models.exception.multiple_datapubs_error import MultipleDataPubsError +from toolbox.database.models.exception.publication_not_found_error import PublicationNotFoundError +from toolbox.database.models.TableModules import publication +from toolbox.database.models.TableModules import publication_authors +from toolbox.database.models.TableModules import publication_identifier +from toolbox.database.models.TableModules import reference +from toolbox.database.models.TableModules import related_identifier from toolbox.type_definitions import DataMetadata from toolbox.type_definitions.publication_type import PublicationType @@ -49,10 +48,6 @@ def update( logging.debug("RelatedIdentifiers changed, updating") related_identifier.update(cursor, pub_id, new_data_pub.RelatedIdentifiers) - if new_data_pub.FAIRScores and _fuji_scores_differ(db_data_pub, new_data_pub): - logging.debug("FAIRScores changed, updating") - fuji_score.insert(cursor, pub_id, new_data_pub.FAIRScores) - for new_rel_type in new_data_pub.RelationshipTypes: rel_type_unknown = new_rel_type not in db_data_pub.RelationshipTypes lit_pub_id = publication_identifier.get_pub_id_by_pid( @@ -103,26 +98,3 @@ def _prop_changed( return False return old_prop_value != new_prop_value - - -def _fuji_scores_differ(data_pub1: DataMetadata, data_pub2: DataMetadata) -> bool: - fuji_score_1 = data_pub1.FAIRScores.copy() if data_pub1.FAIRScores else None - fuji_score_2 = data_pub2.FAIRScores.copy() if data_pub2.FAIRScores else None - - if fuji_score_1 and fuji_score_2: - # we only want to compare the plain fuji_scores + current metric - del fuji_score_2['timestamp'] - del fuji_score_1['timestamp'] - - del fuji_score_2['pid'] - del fuji_score_1['pid'] - - del fuji_score_2['pid_type'] - del fuji_score_1['pid_type'] - - return fuji_score_1 != fuji_score_2 - - if fuji_score_2 and fuji_score_1 is None or fuji_score_1 and fuji_score_2 is None: - return True - - return False diff --git a/toolbox/exporter/db_models/LitPub/lit_pub_get.py b/toolbox/database/models/LitPub/lit_pub_get.py similarity index 63% rename from toolbox/exporter/db_models/LitPub/lit_pub_get.py rename to toolbox/database/models/LitPub/lit_pub_get.py index cf565f2..2feb591 100644 
--- a/toolbox/exporter/db_models/LitPub/lit_pub_get.py +++ b/toolbox/database/models/LitPub/lit_pub_get.py @@ -3,11 +3,11 @@ from typing import Optional from pymysql.cursors import DictCursor -from toolbox.exporter.db_models.config import DB_STRING_SEPERATOR -from toolbox.exporter.db_models.config import UNKNOWN_FIELD_CONTENT -from toolbox.exporter.db_models.TableModules import publication -from toolbox.exporter.db_models.TableModules import publication_authors -from toolbox.exporter.db_models.TableModules import publication_identifier +from toolbox.database.config import DB_STRING_SEPERATOR +from toolbox.database.config import UNKNOWN_FIELD_CONTENT +from toolbox.database.models.TableModules import publication +from toolbox.database.models.TableModules import publication_authors +from toolbox.database.models.TableModules import publication_identifier from toolbox.type_definitions import LitMetadata from toolbox.type_definitions.publication_type import PublicationType @@ -18,38 +18,36 @@ def get(cursor: DictCursor, pid: Optional[str]) -> Optional[LitMetadata]: cursor, pid, PublicationType.LITERATURE ) - if pub_id is not None: - lit_pub = LitMetadata() + return get_by_pub_id(cursor, pub_id) if pub_id else None - if lit_identifier := publication_identifier.get_identifier_by_pub_id( - cursor, pub_id - ): - lit_pub.LitPID = lit_identifier[0].ID - lit_pub.LitPIDType = lit_identifier[0].IDScheme - lit_pub.LitTitle, lit_pub.LitJournal, \ - publisher_list, lit_pub.LitPublicationYear, \ - lit_pub.LitInternalID, lit_pub.LitInternalIDType, \ - lit_pub.LastUpdated = publication.select(cursor, pub_id) +def get_by_pub_id(cursor: DictCursor, pub_id: int) -> LitMetadata: + """Get a literature publication from the database by id from the publication table""" + lit_pub = LitMetadata() - publisher = publisher_list.split(DB_STRING_SEPERATOR) if publisher_list else [] + if lit_identifier := publication_identifier.get_identifier_by_pub_id(cursor, pub_id): + lit_pub.LitPID = lit_identifier[0].ID + lit_pub.LitPIDType = lit_identifier[0].IDScheme - if UNKNOWN_FIELD_CONTENT in publisher: - publisher.remove(UNKNOWN_FIELD_CONTENT) + lit_pub.LitTitle, lit_pub.LitJournal, \ + publisher_list, lit_pub.LitPublicationYear, \ + lit_pub.LitInternalID, lit_pub.LitInternalIDType, \ + lit_pub.LastUpdated = publication.select(cursor, pub_id) - lit_pub.LitPublisher = publisher - lit_pub.LitAuthors = publication_authors.select_by_pub_id(cursor, pub_id) - lit_pub.LitHelmholtzCenter = _get_helmholtz_center(cursor, pub_id) - lit_pub.HelmholtzResearchField = _get_research_field(cursor, pub_id) - lit_pub.HelmholtzResearchSubField = _get_research_sub_field(cursor, pub_id) - lit_pub.HelmholtzLargeScaleFacilities = _get_large_scale_facilities( - cursor, pub_id - ) - lit_pub.HelmholtzInstruments = _get_instruments(cursor, pub_id) + publisher = publisher_list.split(DB_STRING_SEPERATOR) if publisher_list else [] - return lit_pub + if UNKNOWN_FIELD_CONTENT in publisher: + publisher.remove(UNKNOWN_FIELD_CONTENT) - return None + lit_pub.LitPublisher = publisher + lit_pub.LitAuthors = publication_authors.select_by_pub_id(cursor, pub_id) + lit_pub.LitHelmholtzCenter = _get_helmholtz_center(cursor, pub_id) + lit_pub.HelmholtzResearchField = _get_research_field(cursor, pub_id) + lit_pub.HelmholtzResearchSubField = _get_research_sub_field(cursor, pub_id) + lit_pub.HelmholtzLargeScaleFacilities = _get_large_scale_facilities(cursor, pub_id) + lit_pub.HelmholtzInstruments = _get_instruments(cursor, pub_id) + + return lit_pub def 
_get_helmholtz_center(cursor: DictCursor, pub_id: int) -> Optional[str]: diff --git a/toolbox/exporter/db_models/LitPub/lit_pub_insert.py b/toolbox/database/models/LitPub/lit_pub_insert.py similarity index 93% rename from toolbox/exporter/db_models/LitPub/lit_pub_insert.py rename to toolbox/database/models/LitPub/lit_pub_insert.py index a9986c1..a611055 100644 --- a/toolbox/exporter/db_models/LitPub/lit_pub_insert.py +++ b/toolbox/database/models/LitPub/lit_pub_insert.py @@ -6,11 +6,11 @@ from pymysql.constants.ER import DUP_ENTRY from pymysql.cursors import DictCursor from pymysql.err import IntegrityError -from toolbox.exporter.db_models.config import UNKNOWN_FIELD_CONTENT -from toolbox.exporter.db_models.exception.publication_exists_error import PublicationExistsError -from toolbox.exporter.db_models.TableModules import publication -from toolbox.exporter.db_models.TableModules import publication_authors -from toolbox.exporter.db_models.TableModules import publication_identifier +from toolbox.database.config import UNKNOWN_FIELD_CONTENT +from toolbox.database.models.exception.publication_exists_error import PublicationExistsError +from toolbox.database.models.TableModules import publication +from toolbox.database.models.TableModules import publication_authors +from toolbox.database.models.TableModules import publication_identifier from toolbox.type_definitions import LitMetadata diff --git a/toolbox/exporter/db_models/LitPub/lit_pub_update.py b/toolbox/database/models/LitPub/lit_pub_update.py similarity index 88% rename from toolbox/exporter/db_models/LitPub/lit_pub_update.py rename to toolbox/database/models/LitPub/lit_pub_update.py index 1ca9a2c..24bb0b8 100644 --- a/toolbox/exporter/db_models/LitPub/lit_pub_update.py +++ b/toolbox/database/models/LitPub/lit_pub_update.py @@ -3,11 +3,11 @@ import logging from pymysql.cursors import DictCursor -from toolbox.exporter.db_models.exception.publication_not_found_error import PublicationNotFoundError -from toolbox.exporter.db_models.LitPub import lit_pub_get -from toolbox.exporter.db_models.TableModules import publication -from toolbox.exporter.db_models.TableModules import publication_authors -from toolbox.exporter.db_models.TableModules import publication_identifier +from toolbox.database.models.exception.publication_not_found_error import PublicationNotFoundError +from toolbox.database.models.LitPub import lit_pub_get +from toolbox.database.models.TableModules import publication +from toolbox.database.models.TableModules import publication_authors +from toolbox.database.models.TableModules import publication_identifier from toolbox.type_definitions import LitMetadata diff --git a/toolbox/exporter/db_models/TableModules/fuji_score.py b/toolbox/database/models/TableModules/fuji_score.py similarity index 91% rename from toolbox/exporter/db_models/TableModules/fuji_score.py rename to toolbox/database/models/TableModules/fuji_score.py index ecb9688..2fff5a8 100644 --- a/toolbox/exporter/db_models/TableModules/fuji_score.py +++ b/toolbox/database/models/TableModules/fuji_score.py @@ -31,9 +31,7 @@ def insert( return cursor.lastrowid -def get_latest( - cursor: DictCursor, pub_id: int, pid: Optional[str], pid_type: Optional[str] -) -> Optional[dict]: +def get_latest(cursor: DictCursor, pub_id: int) -> Optional[dict]: """ Selects the latest inserted fuji score from the database. 
""" @@ -58,7 +56,7 @@ def get_latest( if 'publication_id' in fair_scores: del fair_scores['publication_id'] - return fair_scores | {'pid': pid, 'pid_type': pid_type} + return fair_scores return None diff --git a/toolbox/exporter/db_models/TableModules/publication.py b/toolbox/database/models/TableModules/publication.py similarity index 93% rename from toolbox/exporter/db_models/TableModules/publication.py rename to toolbox/database/models/TableModules/publication.py index cf59c8b..6847786 100644 --- a/toolbox/exporter/db_models/TableModules/publication.py +++ b/toolbox/database/models/TableModules/publication.py @@ -6,10 +6,11 @@ from typing import Optional from pymysql.cursors import DictCursor -from toolbox.exporter.db_models.config import DB_STRING_SEPERATOR -from toolbox.exporter.db_models.config import UNKNOWN_FIELD_CONTENT -from toolbox.exporter.db_models.exception.publication_mismatch_error import PublicationMismatchError -from toolbox.exporter.db_models.TableModules import publication_identifier +from toolbox.database.config import DB_STRING_SEPERATOR +from toolbox.database.config import UNKNOWN_FIELD_CONTENT +from toolbox.database.models.exception.publication_mismatch_error import PublicationMismatchError +from toolbox.database.models.exception.publication_not_found_error import PublicationNotFoundError +from toolbox.database.models.TableModules import publication_identifier from toolbox.type_definitions import DataMetadata from toolbox.type_definitions import LitMetadata @@ -32,7 +33,7 @@ def select( if result := cursor.fetchone(): return tuple(result.values()) - return (None, ) * 7 + raise PublicationNotFoundError(pub_id) def insert(cursor: DictCursor, pub: LitMetadata | DataMetadata) -> int: diff --git a/toolbox/exporter/db_models/TableModules/publication_authors.py b/toolbox/database/models/TableModules/publication_authors.py similarity index 96% rename from toolbox/exporter/db_models/TableModules/publication_authors.py rename to toolbox/database/models/TableModules/publication_authors.py index 1572e69..3d097f8 100644 --- a/toolbox/exporter/db_models/TableModules/publication_authors.py +++ b/toolbox/database/models/TableModules/publication_authors.py @@ -3,7 +3,7 @@ A module representing operations on the table publication_authors in the Databas """ from pymysql.cursors import DictCursor -from toolbox.exporter.db_models.config import DB_STRING_SEPERATOR +from toolbox.database.config import DB_STRING_SEPERATOR from toolbox.type_definitions import DataMetadata from toolbox.type_definitions import LitMetadata diff --git a/toolbox/exporter/db_models/TableModules/publication_identifier.py b/toolbox/database/models/TableModules/publication_identifier.py similarity index 96% rename from toolbox/exporter/db_models/TableModules/publication_identifier.py rename to toolbox/database/models/TableModules/publication_identifier.py index 2ea3e2f..27db99f 100644 --- a/toolbox/exporter/db_models/TableModules/publication_identifier.py +++ b/toolbox/database/models/TableModules/publication_identifier.py @@ -6,7 +6,7 @@ from typing import Optional from pymysql.cursors import DictCursor -from toolbox.exporter.db_models.config import UNKNOWN_FIELD_CONTENT +from toolbox.database.config import UNKNOWN_FIELD_CONTENT from toolbox.type_definitions import DataMetadata from toolbox.type_definitions import LitMetadata from toolbox.type_definitions.data_metadata_dict import Identifier @@ -60,7 +60,7 @@ def get_multiple_for_publication( ) -> list[int]: """Get the list of found DB-ids for a data publication 
resp. its identifier list""" pub_ids = [] - identifiers = pub.Identifier.copy() + identifiers = pub.Identifiers.copy() if pub.DataPID and pub.DataPIDType: identifier = Identifier() @@ -121,7 +121,6 @@ def insert( VALUES (%(publication_id)s, %(identifier_id)s, %(identifier_type)s, %(identifier_url)s) """ - if isinstance(pub, LitMetadata) and pub.LitPID and pub.LitPIDType: cursor.execute( pub_identifier_query, { @@ -136,7 +135,7 @@ def insert( query_data = [] found_data_pid = False - for identifier in pub.Identifier: + for identifier in pub.Identifiers: if pub.DataPID == identifier.ID \ and pub.DataPIDType == identifier.IDScheme: found_data_pid = True @@ -176,8 +175,8 @@ def add( new_identifier: list[Identifier] = [] - for new_ident in new_pub.Identifier: - for old_ident in db_pub.Identifier: + for new_ident in new_pub.Identifiers: + for old_ident in db_pub.Identifiers: if new_ident.ID == old_ident.ID and new_ident.IDScheme == old_ident.IDScheme and old_ident.IDURL: if new_ident.IDURL: logging.debug( diff --git a/toolbox/exporter/db_models/TableModules/reference.py b/toolbox/database/models/TableModules/reference.py similarity index 100% rename from toolbox/exporter/db_models/TableModules/reference.py rename to toolbox/database/models/TableModules/reference.py diff --git a/toolbox/exporter/db_models/TableModules/related_identifier.py b/toolbox/database/models/TableModules/related_identifier.py similarity index 100% rename from toolbox/exporter/db_models/TableModules/related_identifier.py rename to toolbox/database/models/TableModules/related_identifier.py diff --git a/toolbox/database/models/cron_assessment.py b/toolbox/database/models/cron_assessment.py new file mode 100644 index 0000000..61ed209 --- /dev/null +++ b/toolbox/database/models/cron_assessment.py @@ -0,0 +1,52 @@ +"""Module for getting cron info for assessment cron""" +from datetime import datetime + +from pymysql.cursors import DictCursor + + +def add(cursor: DictCursor, pub_id: int, date: datetime = datetime.now()) -> None: + """Add an assessment time for a publication via database id. 
If already one exists we take the earlier one.""" + cursor.execute( + """ + INSERT INTO cron_assessment + VALUES (%s, %s, '0000-00-00 00:00:00') + ON DUPLICATE KEY UPDATE next_run = LEAST(next_run, VALUES(next_run)) + """, [pub_id, date] + ) + + +def delete_next_assessment(cursor: DictCursor, pub_id: int) -> None: + """Delete the pending assessment entry for a publication via its database id""" + cursor.execute( + """ + DELETE FROM cron_assessment + WHERE publication_id = %s + """, pub_id + ) + + +def set_start_assessment(cursor: DictCursor, pub_ids: list[int]) -> None: + """Record the assessment start time for publications via a list of publication ids""" + cursor.execute( + f""" + UPDATE cron_assessment + SET last_assessment_start = NOW() + WHERE publication_id in ({ ','.join(['%s'] * len(pub_ids))}) + """, pub_ids + ) + + +def get_pub_ids_to_assess(cursor: DictCursor, limit: int) -> list[int]: + """Get a limited list of publication ids which need to be assessed""" + cursor.execute( + """ + SELECT publication_id + FROM cron_assessment + WHERE next_run <= NOW() + AND last_assessment_start < NOW() - INTERVAL 12 HOUR + ORDER BY next_run DESC, publication_id LIMIT %s + FOR UPDATE + """, limit + ) + + return [row['publication_id'] for row in cursor.fetchall()] diff --git a/toolbox/database/models/cron_data_enrichment.py b/toolbox/database/models/cron_data_enrichment.py new file mode 100644 index 0000000..43c1eb0 --- /dev/null +++ b/toolbox/database/models/cron_data_enrichment.py @@ -0,0 +1,46 @@ +"""Module for getting cron info for data enrichment cron""" +from datetime import datetime + +from pymysql.cursors import DictCursor + + +def add(cursor: DictCursor, pub_id: int, date: datetime = datetime.now()) -> None: + """Add an enrichment time for a publication via database id. 
If already one exists we take the earlier one.""" + cursor.execute( + """ + INSERT INTO cron_data_enrichment + VALUES (%s, %s) + ON DUPLICATE KEY UPDATE next_run = LEAST(next_run, VALUES(next_run)) + """, [pub_id, date] + ) + + +def set_next_data_enrichment( + cursor: DictCursor, pub_ids: list[int], date: datetime = datetime.now() +) -> None: + """Set next enrichment time for publications via a list of publication ids""" + values = [(pub_id, date) for pub_id in pub_ids] + + cursor.executemany( + """ + INSERT INTO cron_data_enrichment + VALUES (%s, %s) + ON DUPLICATE KEY UPDATE next_run = VALUES(next_run) + """, values + ) + + +def get_pub_ids_to_enrich(cursor: DictCursor, limit: int = 10) -> list[int]: + """Get a limited list of publication ids which need to be enriched""" + cursor.execute( + """ + SELECT publication_id + FROM cron_data_enrichment + WHERE next_run < NOW() + ORDER BY next_run DESC, publication_id LIMIT %s + FOR UPDATE + """, + limit, + ) + + return [row['publication_id'] for row in cursor.fetchall()] diff --git a/toolbox/database/models/cron_last_run.py b/toolbox/database/models/cron_last_run.py new file mode 100644 index 0000000..a6ff8b8 --- /dev/null +++ b/toolbox/database/models/cron_last_run.py @@ -0,0 +1,26 @@ +"""Module for getting last run info for harvesting cron""" +from datetime import datetime + +from pymysql.cursors import DictCursor + + +def set_centers_last_run(cursor: DictCursor, data: dict[str, datetime]) -> None: + """Set last run time for center""" + cursor.executemany( + """ + INSERT INTO cron_last_run + VALUES (%s, %s) + ON DUPLICATE KEY UPDATE last_run = VALUES(last_run) + """, list(data.items()) + ) + + +def get_last_run_dates(cursor: DictCursor) -> dict[str, datetime]: + """Get last run time for all centers""" + cursor.execute("SELECT center, last_run FROM cron_last_run FOR UPDATE") + center_last_run: dict[str, datetime] = {} + + for data in cursor.fetchall(): + center_last_run[data['center']] = data['last_run'].date() + + return center_last_run diff --git a/toolbox/exporter/db_models/exception/multiple_datapubs_error.py b/toolbox/database/models/exception/multiple_datapubs_error.py similarity index 100% rename from toolbox/exporter/db_models/exception/multiple_datapubs_error.py rename to toolbox/database/models/exception/multiple_datapubs_error.py diff --git a/toolbox/exporter/db_models/exception/publication_exists_error.py b/toolbox/database/models/exception/publication_exists_error.py similarity index 100% rename from toolbox/exporter/db_models/exception/publication_exists_error.py rename to toolbox/database/models/exception/publication_exists_error.py diff --git a/toolbox/exporter/db_models/exception/publication_mismatch_error.py b/toolbox/database/models/exception/publication_mismatch_error.py similarity index 100% rename from toolbox/exporter/db_models/exception/publication_mismatch_error.py rename to toolbox/database/models/exception/publication_mismatch_error.py diff --git a/toolbox/exporter/db_models/exception/publication_not_found_error.py b/toolbox/database/models/exception/publication_not_found_error.py similarity index 54% rename from toolbox/exporter/db_models/exception/publication_not_found_error.py rename to toolbox/database/models/exception/publication_not_found_error.py index 90b03d2..768c4f6 100644 --- a/toolbox/exporter/db_models/exception/publication_not_found_error.py +++ b/toolbox/database/models/exception/publication_not_found_error.py @@ -1,13 +1,11 @@ """ An exception raised, when there is no Publication found for the given 
pid """ -from typing import Optional class PublicationNotFoundError(Exception): """ - An exception raised, when there is no Publication found for the given pid + An exception raised, when there is no Publication found for the given id """ - def __init__(self, pid: Optional[str]): - self.pid = pid - super().__init__(f"Publication not found for pid {pid}") + def __init__(self, pub_id): + super().__init__(f"Publication not found for pid {pub_id}") diff --git a/toolbox/database/models/publication.py b/toolbox/database/models/publication.py new file mode 100644 index 0000000..a851a1e --- /dev/null +++ b/toolbox/database/models/publication.py @@ -0,0 +1,35 @@ +""" +A model class representing a literature publication in a database. +""" +from pymysql.cursors import DictCursor + +from toolbox.database.models import cron_assessment +from toolbox.database.models import cron_data_enrichment +from toolbox.database.models.DataPublication import data_pub_insert +from toolbox.database.models.DataPublication import data_pub_update +from toolbox.database.models.LitPub import lit_pub_insert +from toolbox.database.models.LitPub import lit_pub_update +from toolbox.type_definitions import DataMetadata +from toolbox.type_definitions import LitMetadata + +Publication = LitMetadata | DataMetadata + + +def insert(cursor: DictCursor, pub: Publication) -> int: + """ Insert a literature publication into the database. """ + if isinstance(pub, LitMetadata): + publication_id = lit_pub_insert.insert(cursor=cursor, lit_pub=pub) + cron_data_enrichment.add(cursor, publication_id) + else: + publication_id = data_pub_insert.insert(cursor=cursor, data_pub=pub) + cron_assessment.add(cursor, publication_id) + + return publication_id + + +def update(cursor: DictCursor, pub: Publication) -> None: + """Update all data for a literature publication in the database.""" + if isinstance(pub, LitMetadata): + lit_pub_update.update(cursor=cursor, new_lit_pub=pub) + else: + data_pub_update.update(cursor=cursor, new_data_pub=pub) diff --git a/toolbox/exporter/db_models/mysql_connection.py b/toolbox/database/mysql_connection.py similarity index 51% rename from toolbox/exporter/db_models/mysql_connection.py rename to toolbox/database/mysql_connection.py index f403424..897f073 100644 --- a/toolbox/exporter/db_models/mysql_connection.py +++ b/toolbox/database/mysql_connection.py @@ -1,17 +1,18 @@ """ A class representing a connection to a MySQL database, e.g. MariaDB. """ +from collections.abc import Iterator +from contextlib import contextmanager +from typing import Optional + import pymysql from pymysql.cursors import DictCursor +from toolbox.config import config + class DbConfig: """Typing object for a DbConfig of Toolbox MysqlConnection""" - user: str - password: str - host: str - database: str - def __init__(self, user: str, password: str, host: str, database: str): self.user = user self.password = password @@ -23,8 +24,16 @@ class MysqlConnection: """ A class representing a connection to a MySQL database, e.g. MariaDB. 
""" - def __init__(self, db_config: DbConfig): + def __init__(self, db_config: Optional[DbConfig] = None): """Connect to Database.""" + + db_config = db_config or DbConfig( + user=config.DB_USER, + password=config.DB_PASS, + host=config.DB_HOST, + database=config.DB_NAME, + ) + self.connection = pymysql.connect( user=db_config.user, password=db_config.password, @@ -33,3 +42,13 @@ class MysqlConnection: cursorclass=DictCursor, charset="utf8mb4", ) + + @property + @contextmanager + def cursor(self) -> Iterator[DictCursor]: + """Provide a cursor for the current connection in a context manager.""" + with self.connection as connection, connection.cursor() as cursor: + connection.ping(reconnect=True) + connection.begin() + yield cursor + connection.commit() diff --git a/toolbox/exporter/db_models/schema.sql b/toolbox/database/schema.sql similarity index 92% rename from toolbox/exporter/db_models/schema.sql rename to toolbox/database/schema.sql index d6d9dc2..703990b 100644 --- a/toolbox/exporter/db_models/schema.sql +++ b/toolbox/database/schema.sql @@ -37,6 +37,32 @@ CREATE TABLE `center_has_publication` ( CONSTRAINT `fk_center_has_publication_publication1` FOREIGN KEY (`publication_id`) REFERENCES `publication` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; +CREATE TABLE `cron_last_run` ( + `center` varchar(255) NOT NULL, + `last_run` DATETIME NOT NULL, + PRIMARY KEY (`center`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; + +CREATE TABLE `cron_data_enrichment` ( + `publication_id` int(10) unsigned NOT NULL, + `next_run` DATETIME NOT NULL, + PRIMARY KEY (`publication_id`), + CONSTRAINT `fk_cron_data_enrichment_publication1` FOREIGN KEY (`publication_id`) REFERENCES `publication` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; + +CREATE INDEX cron_data_enrichment_next_run_index + ON cron_data_enrichment (next_run); + +CREATE TABLE `cron_assessment` ( + `publication_id` int(10) unsigned NOT NULL, + `next_run` DATETIME NOT NULL, + `last_assessment_start` DATETIME NOT NULL, + PRIMARY KEY (`publication_id`), + CONSTRAINT `fk_cron_assessment_publication1` FOREIGN KEY (`publication_id`) REFERENCES `publication` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; + +CREATE INDEX cron_assessment_next_run_index + ON cron_assessment (next_run); CREATE TABLE `fuji_score` ( `id` int(10) unsigned NOT NULL AUTO_INCREMENT, @@ -191,16 +217,6 @@ CREATE TABLE `large_scale_facility_has_publication` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; -CREATE TABLE `metric` ( - `id` int(10) unsigned NOT NULL AUTO_INCREMENT, - `publication_id` int(10) unsigned NOT NULL, - `type` varchar(64) NOT NULL, - PRIMARY KEY (`id`), - KEY `fk_metric_publication1_idx` (`publication_id`), - CONSTRAINT `fk_metric_publication1` FOREIGN KEY (`publication_id`) REFERENCES `publication` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; - - CREATE TABLE `publication_authors` ( `id` int(10) unsigned NOT NULL AUTO_INCREMENT, `full_names` longtext NOT NULL, diff --git a/toolbox/exporter/__init__.py b/toolbox/exporter/__init__.py deleted file mode 100644 index acee01c..0000000 --- a/toolbox/exporter/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""A package for exporting the results of the HMC Toolbox.""" -from .db_exporter import DBExporter 
-from .json_exporter import JSONExporter diff --git a/toolbox/exporter/db_exporter.py b/toolbox/exporter/db_exporter.py index 22e5fdd..b12443a 100644 --- a/toolbox/exporter/db_exporter.py +++ b/toolbox/exporter/db_exporter.py @@ -2,51 +2,34 @@ Takes a dictionary and/or a list of dictionaries and writes them directly into a database. """ import logging -import os -from typing import Optional -from .db_models.mysql_connection import DbConfig -from .db_models.mysql_connection import MysqlConnection -from toolbox.exporter.db_models.exception.publication_exists_error import PublicationExistsError -from toolbox.exporter.db_models.publication_db import PublicationDb +import toolbox.database.models.publication as publication_db +from toolbox.database.models.exception.publication_exists_error import PublicationExistsError +from toolbox.database.mysql_connection import MysqlConnection from toolbox.type_definitions import DataMetadata from toolbox.type_definitions import LitMetadata -class DBExporter: +def export(pubs: list[LitMetadata | DataMetadata]) -> None: """ - Takes a dictionary and/or a list of dictionaries and writes them directly into a database + Write data for literature publication and associated data publications into the database. + :param pubs: list of the pubs + :return: None """ - def __init__(self, publication_model: Optional[PublicationDb] = None): - self.publication_model = publication_model if publication_model is not None else PublicationDb( - MysqlConnection( - DbConfig( - user=os.environ.get('TOOLBOX_DB_USER', ''), - password=os.environ.get('TOOLBOX_DB_PASS', ''), - host=os.environ.get('TOOLBOX_DB_HOST', ''), - database=os.environ.get('TOOLBOX_DB_NAME', '') - ) - ) - ) - def export(self, pubs: list[LitMetadata | DataMetadata]) -> None: - """ - Write data for literature publication and associated data publications into the database. - :param pubs: list of the pubs - :return: None - """ + # we put the lit pubs first to database, so we can link the data pubs later + for lit_pub in [pub for pub in pubs if isinstance(pub, LitMetadata)]: + _insert_or_update(lit_pub) - # we put the lit pubs first to database, so we can link the data pubs later - for lit_pub in [pub for pub in pubs if isinstance(pub, LitMetadata)]: - self._insert_or_update(lit_pub) + for data_pub in [pub for pub in pubs if isinstance(pub, DataMetadata)]: + _insert_or_update(data_pub) - for data_pub in [pub for pub in pubs if isinstance(pub, DataMetadata)]: - self._insert_or_update(data_pub) - def _insert_or_update(self, pub: LitMetadata | DataMetadata) -> None: +def _insert_or_update(pub: LitMetadata | DataMetadata) -> None: + with MysqlConnection().cursor as cursor: try: logging.debug("Insert pub %s", pub) - self.publication_model.insert(pub) + publication_db.insert(cursor, pub) except PublicationExistsError: logging.debug("Pub already exists, updating instead") - self.publication_model.update(pub) + publication_db.update(cursor, pub) diff --git a/toolbox/exporter/db_models/__init__.py b/toolbox/exporter/db_models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/toolbox/exporter/db_models/publication_db.py b/toolbox/exporter/db_models/publication_db.py deleted file mode 100644 index d586e0a..0000000 --- a/toolbox/exporter/db_models/publication_db.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -A model class representing a literature publication in a database. 
-""" -from toolbox.exporter.db_models.DataPublication import data_pub_insert -from toolbox.exporter.db_models.DataPublication import data_pub_update -from toolbox.exporter.db_models.LitPub import lit_pub_insert -from toolbox.exporter.db_models.LitPub import lit_pub_update -from toolbox.exporter.db_models.mysql_connection import MysqlConnection -from toolbox.type_definitions import DataMetadata -from toolbox.type_definitions import LitMetadata - -Publication = LitMetadata | DataMetadata - - -class PublicationDb: - """ - A model class representing a literature publication in a database. - """ - def __init__(self, connection: MysqlConnection): - """Connect to Database.""" - self.connection = connection.connection - - def insert(self, pub: Publication) -> int: - """ Insert a literature publication into the database. """ - with self.connection as connection: - connection.ping(reconnect=True) - connection.begin() - with connection.cursor() as cursor: - if isinstance(pub, LitMetadata): - publication_id = lit_pub_insert.insert( - cursor=cursor, - lit_pub=pub, - ) - - else: - publication_id = data_pub_insert.insert( - cursor=cursor, - data_pub=pub, - ) - - connection.commit() - return publication_id - - def update(self, pub: Publication) -> None: - """Update all data for a literature publication in the database.""" - with self.connection as connection: - connection.ping(reconnect=True) - connection.begin() - with connection.cursor() as cursor: - if isinstance(pub, LitMetadata): - lit_pub_update.update(cursor=cursor, new_lit_pub=pub) - else: - data_pub_update.update(cursor=cursor, new_data_pub=pub) - - connection.commit() diff --git a/toolbox/extractor/extract.py b/toolbox/extractor/extract.py new file mode 100644 index 0000000..7659b5e --- /dev/null +++ b/toolbox/extractor/extract.py @@ -0,0 +1,171 @@ +""" +Module to extract and export data from harvested xml files. +""" +import logging +import os +import shutil +import traceback +from concurrent.futures import as_completed +from concurrent.futures import Future +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +from pymysql import DatabaseError + +from toolbox.exporter import db_exporter +from toolbox.extractor.metadata_extractor import DublinCoreExtractor +from toolbox.extractor.metadata_extractor import MarcXMLExtractor +from toolbox.extractor.publication_validator import LitPubValidator +from toolbox.requester.scholexplorer_request import ScholexplorerRequest +from toolbox.type_definitions import DataMetadata +from toolbox.type_definitions import LitMetadata + +DCParseError = DublinCoreExtractor.FileParseError +MarcParseError = MarcXMLExtractor.FileParseError + + +def extract_center( + center_acr: str, + output_path: Path, + executor: ThreadPoolExecutor, + keep_sources: bool, + zip_output: bool, +) -> None: + """ + Process and assess the harvested data for the centers using Linked Data Finder + :param center_acr: center name + :param output_path: where to save files + :param executor: Thread + :param keep_sources: keep the source file + :param zip_output: compress data to zip + :return: None + """ + + center_path = output_path / center_acr + + try: + _extract_center(center_path, center_acr, executor) + except FileNotFoundError as error: + logging.error("No data found for center %s. 
%s", center_acr, error) + + logging.info("Processing done for center %s", center_acr) + + if zip_output: + logging.info("Zipping center folder %s", center_acr) + shutil.make_archive(str(center_path), 'zip', center_path) + + if not keep_sources: + logging.info("Deleting center folder %s", center_acr) + shutil.rmtree(center_path) + + +def _extract_center( + center_path: Path, + center_acr: str, + executor: ThreadPoolExecutor, +) -> None: + """ + Extract literature-publication metadata from xml-files + :param center_path: the path to the xml-files + :param center_acr: acronym of the Helmholtz center + :return: the results as a nested dictionary + """ + logging.info("Extracting center %s", center_acr) + schemas = os.listdir(center_path) + extractor: DublinCoreExtractor | MarcXMLExtractor + + if 'marcxml' in schemas: + schema_path = center_path / 'marcxml' + extractor = MarcXMLExtractor() + elif 'marc' in schemas: + schema_path = center_path / 'marc' + extractor = MarcXMLExtractor() + elif 'oai_dc' in schemas: + schema_path = center_path / 'oai_dc' + extractor = DublinCoreExtractor() + else: + raise FileNotFoundError( + "No oai_dc schema folder found but must exist for OAI-PMH." + ) + + records = [schema_path / str(file) for file in os.listdir(schema_path)] + total_records = len(records) + logging.info("Found %s xml files to extract metadata.", total_records) + + futures = [] + future_record_map: dict[Future, Path] = {} + + for record in records: + future = executor.submit( + extract_record, + extractor=extractor, + record=record, + center_acr=center_acr, + ) + future_record_map[future] = record + futures.append(future) + # DDOS protection for Crossref + + percentage_bound = 0.1 + + for index, future in enumerate(as_completed(futures)): + record = future_record_map.pop(future) + + if future.exception(): + logging.error( + "Error while extracting record %s: %s", record, + "".join(traceback.format_exception(future.exception())) + ) + + if index / total_records > percentage_bound: + logging.info("%s%% xml files extracted", round(percentage_bound * 100)) + percentage_bound += 0.1 + + +def extract_record( + extractor: MarcXMLExtractor | DublinCoreExtractor, + record: Path, + center_acr: str, +) -> None: + """ + Extracts the metadata from the record and exports using the (given) exporters. 
+ :param extractor: The extractor which should be used to extract metadata from the given record + :param record: Path to a record holding metadata for a literature publication + :param center_acr: acronym of the Helmholtz center + :return: None + """ + try: + lit_pub = extractor.extract(record, center=center_acr) + except (DCParseError, MarcParseError): + logging.warning("File %s could not be parsed, malformed xml", record) + return + + if not lit_pub.LitPID: + logging.debug('No LitPID found for literature %s', record) + return + + logging.debug("Extracting done for record %s", record) + + pub_val = LitPubValidator(lit_pub) + lit_pub.Validated = pub_val.is_validated() + + if pub_val.doi_not_registered: + logging.info( + "Found doi %s for record %s but it's not registered anywhere", + lit_pub.LitPID, + record, + ) + pub_list: list[DataMetadata | LitMetadata] = [] + if pub_val.is_dataset(): + pub_list.extend( + ScholexplorerRequest(lit_pub.LitPID, as_target=True).get_literature() + ) + pub_list.append(pub_val.convert()) + else: + pub_list = [lit_pub] + + try: + db_exporter.export(pub_list) + except DatabaseError as error: + logging.error("Error while exporting record %s to database.", record) + logging.info("%s", error) diff --git a/toolbox/data_enricher/metadata_extractor/__init__.py b/toolbox/extractor/metadata_extractor/__init__.py similarity index 100% rename from toolbox/data_enricher/metadata_extractor/__init__.py rename to toolbox/extractor/metadata_extractor/__init__.py diff --git a/toolbox/data_enricher/metadata_extractor/abstract_metadata_extractor.py b/toolbox/extractor/metadata_extractor/abstract_metadata_extractor.py similarity index 100% rename from toolbox/data_enricher/metadata_extractor/abstract_metadata_extractor.py rename to toolbox/extractor/metadata_extractor/abstract_metadata_extractor.py diff --git a/toolbox/data_enricher/metadata_extractor/dublin_core_xml_metadata_extractor.py b/toolbox/extractor/metadata_extractor/dublin_core_xml_metadata_extractor.py similarity index 100% rename from toolbox/data_enricher/metadata_extractor/dublin_core_xml_metadata_extractor.py rename to toolbox/extractor/metadata_extractor/dublin_core_xml_metadata_extractor.py diff --git a/toolbox/data_enricher/metadata_extractor/marc_xml_metadata_extractor.py b/toolbox/extractor/metadata_extractor/marc_xml_metadata_extractor.py similarity index 100% rename from toolbox/data_enricher/metadata_extractor/marc_xml_metadata_extractor.py rename to toolbox/extractor/metadata_extractor/marc_xml_metadata_extractor.py diff --git a/toolbox/data_enricher/publication_validator.py b/toolbox/extractor/publication_validator.py similarity index 100% rename from toolbox/data_enricher/publication_validator.py rename to toolbox/extractor/publication_validator.py diff --git a/toolbox/exporter/db_models/TableModules/__init__.py b/toolbox/fair_meter/fuji_scorer/__init__.py similarity index 100% rename from toolbox/exporter/db_models/TableModules/__init__.py rename to toolbox/fair_meter/fuji_scorer/__init__.py diff --git a/toolbox/fair_meter/fuji_scorer/fuji_conf.py b/toolbox/fair_meter/fuji_scorer/fuji_conf.py new file mode 100644 index 0000000..641b35a --- /dev/null +++ b/toolbox/fair_meter/fuji_scorer/fuji_conf.py @@ -0,0 +1,19 @@ +""" +A collection of constants used by the class FujiScorer. 
+""" +from toolbox.config import config + +FUJI_API = f"{config.FUJI_PROTOCOL}://{config.FUJI_HOST}:{config.FUJI_PORT}/fuji/api/v1/evaluate" + +REQ_DICT = {'test_debug': True, 'use_datacite': True} + +HEADERS = { + 'accept': 'application/json', + 'Authorization': f"Basic {config.FUJI_PASS}", + 'Content-Type': 'application/json' +} + +INITIAL_TIMEOUT = 2400 +EXTRA_TIMEOUT = 600 +RETRY_WAITING_TIME = 60 +MAX_RETRY_COUNTER = 10 diff --git a/toolbox/fair_meter/fuji_scorer/fuji_scorer.py b/toolbox/fair_meter/fuji_scorer/fuji_scorer.py new file mode 100644 index 0000000..bcac5ae --- /dev/null +++ b/toolbox/fair_meter/fuji_scorer/fuji_scorer.py @@ -0,0 +1,68 @@ +""" +A module to get a F.A.I.R.-assessment (Fairness, Accessibility, Interoperability, Reusability) from +the F-UJI-Server for a given data-publication. +""" +import logging +from typing import Optional + +from .fuji_conf import EXTRA_TIMEOUT +from .fuji_conf import FUJI_API +from .fuji_conf import HEADERS +from .fuji_conf import INITIAL_TIMEOUT +from .fuji_conf import MAX_RETRY_COUNTER +from .fuji_conf import REQ_DICT +from .fuji_conf import RETRY_WAITING_TIME +from toolbox.utils.toolbox_request import ToolboxRequest + + +def get_fuji_score( + data_pid: str, + data_pid_type: str, +) -> Optional[dict]: + """ + Assess the F.A.I.R.-ness of a data-publication via the F-UJI server and return the resulting + F-UJI scores in a dictionary. + :param data_pid: the PID of a data-publication + :param data_pid_type: the PID-type, e.g. DOI or HANDLE + :return: a dictionary of F-UJI metrics for the data-publication. + """ + json = REQ_DICT | {'object_identifier': data_pid} + logging.debug("FUJI scorer started for data_pid %s", data_pid) + request = ToolboxRequest( + initial_timeout=INITIAL_TIMEOUT, + extra_timeout=EXTRA_TIMEOUT, + retry_waiting_time=RETRY_WAITING_TIME, + max_retry_counter=MAX_RETRY_COUNTER, + ) + + fuji_response = request.post(FUJI_API, json=json, headers=HEADERS) + + return _select_fuji_metrics(fuji_response, data_pid, data_pid_type) + + +def _select_fuji_metrics(rs_json: dict, pid: str, pid_type: str) -> Optional[dict]: + """ + Select important information from F-UJI response. + :param rs_json: the JSON got as a response from F-UJI + :param pid: the PID of the data-publication + :param pid: the PID-type, e.g. DOI or HANDLE + :return: + """ + try: + list_metric = { + 'pid': pid, + 'pid_type': pid_type, + 'metric_specification': rs_json['metric_specification'], + 'timestamp': rs_json['end_timestamp'] + } + + for score in rs_json['summary']: + for subscore in rs_json['summary'][score]: + metric = score + '_' + subscore + metric_score = rs_json['summary'][score][subscore] + list_metric[metric] = float(metric_score) + + return list_metric + + except KeyError: + return None diff --git a/toolbox/toolbox.py b/toolbox/toolbox.py index 58a3fe3..d13b96f 100644 --- a/toolbox/toolbox.py +++ b/toolbox/toolbox.py @@ -3,15 +3,12 @@ The Toolbox harvests literature-publication metadata from publications by Helmho looks for related data-publications, assesses the F.A.I.R.-scores of these and outputs all results as JSON and/or CSV-files. 
""" -import datetime import logging -import logging.config as logconfig -import os -import shutil -import time from concurrent.futures import as_completed from concurrent.futures import Future from concurrent.futures import ThreadPoolExecutor +from datetime import datetime +from datetime import timedelta from pathlib import Path from typing import Optional @@ -19,34 +16,33 @@ import typer import yaml from pymysql import DatabaseError -from .data_enricher.fair_meter.fuji_scorer import FujiScorer -from .data_enricher.linked_data_finder import LinkedDataFinder -from .data_enricher.metadata_extractor import DublinCoreExtractor -from .data_enricher.metadata_extractor import MarcXMLExtractor -from .exporter import JSONExporter -from .harvester import SickleHarvester -from .type_definitions import DataMetadata -from .type_definitions import LitMetadata -from toolbox.data_enricher.publication_validator import LitPubValidator -from toolbox.exporter.db_exporter import DBExporter -from toolbox.requester.scholexplorer_request import ScholexplorerRequest - -DCParseError = DublinCoreExtractor.FileParseError -MarcParseError = MarcXMLExtractor.FileParseError - -log_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'logging.conf') -logconfig.fileConfig(log_file_path) +from toolbox.config import config +from toolbox.data_enricher.linked_data_finder import find_linked_data_publications +from toolbox.database.models.cron_assessment import delete_next_assessment +from toolbox.database.models.cron_assessment import get_pub_ids_to_assess +from toolbox.database.models.cron_assessment import set_start_assessment +from toolbox.database.models.cron_data_enrichment import get_pub_ids_to_enrich +from toolbox.database.models.cron_data_enrichment import set_next_data_enrichment +from toolbox.database.models.cron_last_run import get_last_run_dates +from toolbox.database.models.cron_last_run import set_centers_last_run +from toolbox.database.models.DataPublication import data_pub_get +from toolbox.database.models.LitPub import lit_pub_get +from toolbox.database.models.TableModules import fuji_score +from toolbox.database.mysql_connection import MysqlConnection +from toolbox.exporter import db_exporter +from toolbox.extractor.extract import extract_center +from toolbox.extractor.metadata_extractor import DublinCoreExtractor +from toolbox.extractor.metadata_extractor import MarcXMLExtractor +from toolbox.fair_meter.fuji_scorer.fuji_scorer import get_fuji_score +from toolbox.harvester.sickle_harvester import SickleHarvester +from toolbox.type_definitions import DataMetadata +from toolbox.type_definitions import LitMetadata app = typer.Typer() app.add_typer(SickleHarvester.app, name='sickle-harvester') app.add_typer(DublinCoreExtractor.app, name='dc-xml-extractor') app.add_typer(MarcXMLExtractor.app, name='marc-xml-extractor') -try: - max_threads = int(os.environ.get('MAX_THREADS', 1)) -except ValueError: - max_threads = 1 - class ToolboxException(Exception): """Is raised when toolbox fails.""" @@ -58,44 +54,37 @@ class Center(dict): endpoint: str -@app.command('run') -def main( +@app.command() +def harvest( output_path: Path = Path('./output'), - from_date: Optional[str] = None, centers_conf: Path = Path('centers.yaml'), + from_date: Optional[str] = None, keep_sources: bool = True, zip_output: bool = False, - skip_harvest: bool = False, ) -> None: """ - Harvest literature-publication metadata from publications by Helmholtz centers, look - for related data-publications, assesses the F.A.I.R.-scores of these 
and output all results as - JSON and/or CSV-files. - - \b - :param: output_path: the output directory for the harvested data - :param: from_date: the date from which onwards data are to be harvested, if not given, all - available data are harvested - :param: centers_conf: the config file specifying which centers to harvest and how - :return: None + Harvest from OAI-PMH-APIs, extract the metadata and export to database. """ - program_start = time.time() + harvest_start = datetime.now() - with open(centers_conf, 'r', encoding='utf8') as file: - helmholtz_centers = yaml.safe_load(file) + with ( + open(centers_conf, 'r', encoding='utf8') as conf_file, MysqlConnection().cursor as + cursor, ThreadPoolExecutor(max_workers=config.MAX_THREADS) as executor + ): + helmholtz_centers = yaml.safe_load(conf_file) + center_last_run = get_last_run_dates(cursor) + logging.debug("LAST RUN: %s", center_last_run) - with ThreadPoolExecutor(max_workers=max_threads) as executor: futures = [] future_center_map: dict[Future, str] = {} for center_acr, center in helmholtz_centers.items(): + logging.info("Starting harvesting for center %s", center_acr) future = executor.submit( - harvest_center, - center_acr=center_acr, - center=center, - output_path=output_path, - from_date=from_date, - skip_harvest=skip_harvest + SickleHarvester.harvest, + url=center["endpoint"], + output_path=output_path / center_acr, + from_date=from_date or center_last_run.get(center_acr), ) futures.append(future) future_center_map[future] = center_acr @@ -108,224 +97,93 @@ def main( "Error while harvesting center %s: %s", center_acr, future.exception() ) - process_center(center_acr, output_path, executor, keep_sources, zip_output) - - logging.info( - "Toolbox execution time: %s.", - str(datetime.timedelta(seconds=time.time() - program_start)) - ) - - -def harvest_center( - center_acr: str, - center: Center, - output_path: Path, - from_date: Optional[str], - skip_harvest: bool, -) -> None: - """ - Harvesting given center using SickleHarvester - :param center_acr: center name - :param output_path: where to save files - :param center: center to harvest - :param from_date: date of publication - :param skip_harvest: harvest skip - :return: None - """ - - if skip_harvest: - logging.info("Skipping harvesting for center %s", center_acr) - return - - center_path = output_path / center_acr - logging.info("Starting harvesting for center %s", center_acr) - - SickleHarvester.harvest(center["endpoint"], center_path, from_date=from_date) - - logging.info("Harvesting done for center %s", center_acr) - - -def process_center( - center_acr: str, - output_path: Path, - executor: ThreadPoolExecutor, - keep_sources: bool, - zip_output: bool, -) -> None: - """ - Process and assess the harvested data for the centers using Linked Data Finder - :param center_acr: center name - :param output_path: where to save files - :param executor: Thread - :param keep_sources: keep the source file - :param zip_output: compress data to zip - :return: None - """ - - center_path = output_path / center_acr - - try: - _process_center(center_path, center_acr, executor) - except FileNotFoundError as error: - logging.error("No data found for center %s. 
%s", center_acr, error) - - logging.info("Processing done for center %s", center_acr) + logging.info( + "Harvesting done for center %s after %s", center_acr, + datetime.now() - harvest_start + ) - if zip_output: - logging.info("Zipping center folder %s", center_acr) - shutil.make_archive(str(center_path), 'zip', center_path) + extract_center(center_acr, output_path, executor, keep_sources, zip_output) + center_last_run[center_acr] = harvest_start - if not keep_sources: - logging.info("Deleting center folder %s", center_acr) - shutil.rmtree(center_path) + set_centers_last_run(cursor, center_last_run) + logging.info("Harvest done") -def _process_center( - center_path: Path, - center_acr: str, - executor: ThreadPoolExecutor, -) -> None: +@app.command() +def enrich(limit: int = 10): """ - Extract literature-publication metadata from xml-files, get linked data-publications, assess - the F.A.I.R.-ness of these and return the results. - :param center_path: the path to the xml-files - :param center_acr: acronym of the Helmholtz center - :return: the results as a nested dictionary + Get (literature) publications ready for enrichment (from cron_data_enrichment table in database), + find linked data and store them in database. """ - logging.info("Processing center %s", center_acr) - schemas = os.listdir(center_path) - extractor: DublinCoreExtractor | MarcXMLExtractor - - if 'marcxml' in schemas: - schema_path = center_path / 'marcxml' - extractor = MarcXMLExtractor() - elif 'marc' in schemas: - schema_path = center_path / 'marc' - extractor = MarcXMLExtractor() - elif 'oai_dc' in schemas: - schema_path = center_path / 'oai_dc' - extractor = DublinCoreExtractor() - else: - raise FileNotFoundError( - "No oai_dc schema folder found but must exist for OAI-PMH." - ) + with MysqlConnection().cursor as cursor: + pub_ids = get_pub_ids_to_enrich(cursor, limit) + logging.info("Got %s publications for enrichment", len(pub_ids)) + logging.debug("IDs from publications: %s", pub_ids) - records = [schema_path / str(file) for file in os.listdir(schema_path)] - total_records = len(records) - logging.info("Found %s xml files to process.", total_records) + data_pubs: list[LitMetadata | DataMetadata] = [] - futures = [] - future_record_map: dict[Future, Path] = {} + for pub_id in pub_ids: + lit_pub = lit_pub_get.get_by_pub_id(cursor, pub_id) - for record in records: - future = executor.submit( - process_record, - extractor=extractor, - record=record, - center_acr=center_acr, - center_path=center_path, - ) - future_record_map[future] = record - futures.append(future) - # DDOS protection for Crossref - - percentage_bound = 0.1 + if lit_pub.LitPID and lit_pub.LitPIDType: + logging.debug( + "Requesting DataPubs for litPID %s:%s", lit_pub.LitPIDType, + lit_pub.LitPID + ) + data_pubs.extend( + find_linked_data_publications(lit_pub.LitPID, lit_pub.LitPIDType) + ) - for index, future in enumerate(as_completed(futures)): - record = future_record_map.pop(future) + logging.info("Found %s data publications via enrichment", len(data_pubs)) - if future.exception(): + try: + db_exporter.export(data_pubs) + except DatabaseError as error: logging.error( - "Error while processing record %s: %s", record, future.exception() + "Error while exporting data pubs to database for pub_ids %s", pub_ids ) + logging.info("%s", error) - if index / total_records > percentage_bound: - logging.info("%s%% xml files processed", round(percentage_bound * 100)) - percentage_bound += 0.1 + set_next_data_enrichment( + cursor, + pub_ids, + datetime.now() + 
timedelta(days=config.DAYS_UNTIL_REENRICHMENT), + ) + logging.info("Enrichment done") -def process_record( - extractor: MarcXMLExtractor | DublinCoreExtractor, - record: Path, - center_acr: str, - center_path: Path, -) -> None: + +@app.command() +def assess(limit: int = 10): """ - Extracts the metadata from the record and looks for data publications related to the litpub. - Finally, everything is exported using the (given) exporters. - :param extractor: The extractor which should be used to extract metadata from the given record - :param record: Path to a record holding metadata for a literature publication - :param center_path: the path to the xml-files - :param center_acr: acronym of the Helmholtz center - :return: None + Get (data) publications ready for assessment (from cron_assessment table in database), + assess the F.A.I.R.-ness via F-UJI and store data in database. """ - try: - lit_pub = extractor.extract(record, center=center_acr) - except (DCParseError, MarcParseError): - logging.warning("File %s could not be parsed, malformed xml", record) - return - - if not lit_pub.LitPID: - logging.debug('No LitPID found for literature %s', record) - return + with MysqlConnection().cursor as cursor: - logging.debug("Extracting done for record %s", record) + if not (pub_ids := get_pub_ids_to_assess(cursor, limit)): + logging.info("No publications for assessment, exiting…") + return - fjs = FujiScorer() - ldf = LinkedDataFinder() + start_time = datetime.now() + logging.info("Got %s publications for assessing", len(pub_ids)) + logging.debug("IDs from publications: %s", pub_ids) + set_start_assessment(cursor, pub_ids) - pub_val = LitPubValidator(lit_pub) - lit_pub.Validated = pub_val.is_validated() + with ThreadPoolExecutor(max_workers=config.MAX_THREADS) as executor: + for pub_id in pub_ids: + executor.submit(_assess_datapub, pub_id=pub_id) - if pub_val.doi_not_registered: - logging.info( - "Found doi %s for record %s but it's not registered anywhere", - lit_pub.LitPID, - record, - ) - - if pub_val.is_dataset(): - lit_pubs = ScholexplorerRequest(lit_pub.LitPID, as_target=True).get_literature() - - if not lit_pubs: - data_pub = pub_val.convert() - export([data_pub], record, center_path) + logging.info("Assessment done for start time: %s", start_time) - for lit_pub in lit_pubs: - data_pubs = fjs.add_fuji_scores( - ldf.find_linked_data_publications(lit_pub.LitPID, lit_pub.LitPIDType) - ) - export([lit_pub, *data_pubs], record, center_path) +def _assess_datapub(pub_id: int): + with MysqlConnection().cursor as cursor: + data_pub = data_pub_get.get_by_pub_id(cursor, pub_id) - else: - data_pubs = fjs.add_fuji_scores( - ldf.find_linked_data_publications( - lit_pub.LitPID, lit_pub.LitPIDType, pub_val.response_scholix - ) - ) - - export([lit_pub, *data_pubs], record, center_path) - - -def export( - pubs: list[LitMetadata | DataMetadata], - record: Path, - center_path: Path, - as_json: bool = False -): - """ - Exports a literature publication, and it's related datasets - :param pubs: publications to be exported - :param record: given from process_record - :param center_path: Path given from process_record - :param as_json: flag for exporting data as JSON as well - """ - try: - DBExporter().export(pubs) - except DatabaseError as error: - logging.error("Error while exporting record %s to database.", record) - logging.info("%s", error) - - if as_json: - JSONExporter().export(pubs, center_path / 'output', record.stem) + if data_pub.DataPID and data_pub.DataPIDType and ( + fair_scores := 
get_fuji_score(data_pub.DataPID, data_pub.DataPIDType) + ): + fuji_score.insert(cursor, pub_id, fair_scores) + # no reassessment for now + delete_next_assessment(cursor, pub_id) diff --git a/toolbox/type_definitions/data_metadata_dict.py b/toolbox/type_definitions/data_metadata_dict.py index 2e58b42..021b185 100644 --- a/toolbox/type_definitions/data_metadata_dict.py +++ b/toolbox/type_definitions/data_metadata_dict.py @@ -40,7 +40,7 @@ class DataMetadata(AbstractMetadata): Creators: list[str] = field(default_factory=list) Publishers: list[str] = field(default_factory=list) PublicationDate: Optional[str] = None - Identifier: list[Identifier] = field(default_factory=list) + Identifiers: list[Identifier] = field(default_factory=list) DataPID: Optional[str] = None DataPIDType: Optional[str] = None DataInternalID: Optional[str] = None @@ -50,4 +50,57 @@ class DataMetadata(AbstractMetadata): LitPID: Optional[str] = None LitPIDType: Optional[str] = None RelationshipTypes: list[RelationshipType] = field(default_factory=list) - FAIRScores: Optional[dict] = None + + def __setattr__(self, name, value): + super().__setattr__(name, value) + + if name == 'Identifiers': + self.DataPID, self.DataPIDType = self._get_data_pid() + + def _get_data_pid(self) -> tuple[Optional[str], Optional[str]]: + """ + Get the DOI or if none is found, converts non-DOI-identifier to DOI if possible + :return: the DOI or None + """ + pid_priority = ['doi', 'hdl'] + pid_list = self._extract_identifier(self.Identifiers) + + for pid_type in pid_priority: + if pid_type in pid_list: + return pid_list[pid_type], pid_type + + # if we don't have a pid from our priority list we return any pid we have + for pid_type in pid_list: + return pid_list[pid_type], pid_type + + return None, None + + @staticmethod + def _extract_identifier(identifiers: list[Identifier]) -> dict[str, str]: + """ + Extract an identifier (PDB) from a dataset and return it as a dictionary. + :param identifiers: a list of Identifiers + :return: a dictionary containing one or more PIDs, the type of which is the key, while the PID is the + corresponding value. 
+ """ + pid_dict: dict[str, str] = {} + + # first we collect all available PIDs + for identifier in identifiers: + if identifier.IDScheme and identifier.ID and identifier.IDScheme not in pid_dict: + pid_dict = pid_dict | { + identifier.IDScheme.lower(): identifier.ID, + } + + # transform pdb to doi if none exists + if 'pdb' in pid_dict and 'doi' not in pid_dict: + pid_dict = pid_dict | {'doi': f"10.2210/pdb{pid_dict['pdb']}/pdb"} + + # use URLs if no other identifier exists + for identifier in identifiers: + if identifier.IDScheme and identifier.IDURL and identifier.IDScheme not in pid_dict: + pid_dict = pid_dict | { + identifier.IDScheme.lower(): identifier.IDURL, + } + + return pid_dict -- GitLab From a02920f0a6c141b69bb8f95d4ce30be9b3f753fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Preu=C3=9F?= <gabriel.preuss@helmholtz-berlin.de> Date: Thu, 3 Apr 2025 10:35:41 +0200 Subject: [PATCH 2/4] fix tests --- .../fair_meter/fuji_scorer/test_fuji_scorer.py | 16 ++++++++++++---- toolbox/config.py | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/test/data_enricher/fair_meter/fuji_scorer/test_fuji_scorer.py b/test/data_enricher/fair_meter/fuji_scorer/test_fuji_scorer.py index 149e8a4..7479017 100644 --- a/test/data_enricher/fair_meter/fuji_scorer/test_fuji_scorer.py +++ b/test/data_enricher/fair_meter/fuji_scorer/test_fuji_scorer.py @@ -10,7 +10,9 @@ from requests.exceptions import ConnectTimeout from requests.exceptions import ReadTimeout from requests_mock import Mocker +from toolbox.fair_meter.fuji_scorer.fuji_conf import EXTRA_TIMEOUT from toolbox.fair_meter.fuji_scorer.fuji_conf import FUJI_API +from toolbox.fair_meter.fuji_scorer.fuji_conf import INITIAL_TIMEOUT from toolbox.fair_meter.fuji_scorer.fuji_conf import RETRY_WAITING_TIME from toolbox.fair_meter.fuji_scorer.fuji_scorer import get_fuji_score @@ -66,9 +68,12 @@ class TestFujiScorer(unittest.TestCase): # calls get_fuji_score function answer = get_fuji_score(self.data_pid, self.data_pid_type) - # asserts Function was called 5 times, with the right sleep time on the last call and gets an answer + # function was called 5 times, with the right sleep time on the last call and gets an answer self.assertEqual(mock.call_count, 5) - self.assertEqual(mock.request_history[4].timeout, 2200) + self.assertEqual( + mock.request_history[4].timeout, + INITIAL_TIMEOUT + 4 * EXTRA_TIMEOUT, + ) sleep.assert_called_with(4 * RETRY_WAITING_TIME) self.assertEqual(answer["pid"], self.data_pid) @@ -80,9 +85,12 @@ class TestFujiScorer(unittest.TestCase): mock.post(FUJI_API, exc=ReadTimeout) answer = get_fuji_score(self.data_pid, self.data_pid_type) - # asserts Function was called 5 times, with the right timeout on the last call and gets an answer + # function was called 10 times, with the right timeout on the last call and gets no answer self.assertEqual(mock.call_count, 11) - self.assertEqual(mock.request_history[10].timeout, 4000) + self.assertEqual( + mock.request_history[10].timeout, + INITIAL_TIMEOUT + 10 * EXTRA_TIMEOUT, + ) sleep.assert_called_with(10 * RETRY_WAITING_TIME) self.assertIsNone(answer) diff --git a/toolbox/config.py b/toolbox/config.py index 91a500e..aad0f00 100644 --- a/toolbox/config.py +++ b/toolbox/config.py @@ -26,7 +26,7 @@ class Config: DB_PASS: str = os.environ.get('TOOLBOX_DB_PASS', '') DB_HOST: str = os.environ.get('TOOLBOX_DB_HOST', '') DB_NAME: str = os.environ.get('TOOLBOX_DB_NAME', '') - FUJI_HOST: str = os.environ.get('FUJI_HOST', '') + FUJI_HOST: str = os.environ.get('FUJI_HOST', 'localhost') 
FUJI_PORT: str = os.environ.get('FUJI_PORT', '80') FUJI_PASS: str = os.environ.get('FUJI_PASS', 'bWFydmVsOndvbmRlcndvbWFu') FUJI_PROTOCOL: str = os.environ.get('FUJI_PROTOCOL', 'http') -- GitLab From 21b3c04f73fa4bbf39ef931739a7829fdcfdba27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Preu=C3=9F?= <gabriel.preuss@helmholtz-berlin.de> Date: Thu, 3 Apr 2025 11:08:59 +0200 Subject: [PATCH 3/4] fix code format issue --- toolbox/toolbox.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/toolbox/toolbox.py b/toolbox/toolbox.py index d13b96f..93bb958 100644 --- a/toolbox/toolbox.py +++ b/toolbox/toolbox.py @@ -68,8 +68,9 @@ def harvest( harvest_start = datetime.now() with ( - open(centers_conf, 'r', encoding='utf8') as conf_file, MysqlConnection().cursor as - cursor, ThreadPoolExecutor(max_workers=config.MAX_THREADS) as executor + open(centers_conf, 'r', encoding='utf8') as conf_file, + MysqlConnection().cursor as cursor, + ThreadPoolExecutor(max_workers=config.MAX_THREADS) as executor, ): helmholtz_centers = yaml.safe_load(conf_file) center_last_run = get_last_run_dates(cursor) -- GitLab From 1a298525293bd8abf99ac426f37e45c272a87503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Preu=C3=9F?= <gabriel.preuss@helmholtz-berlin.de> Date: Thu, 3 Apr 2025 12:57:01 +0200 Subject: [PATCH 4/4] add somesy.toml to REUSE.toml --- REUSE.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/REUSE.toml b/REUSE.toml index 0ba0dc6..950fb31 100644 --- a/REUSE.toml +++ b/REUSE.toml @@ -28,7 +28,9 @@ path = [ "pyproject.toml", "docker/run.sh", ".gitlab-ci.yml", - "docker-compose.yml"] + "docker-compose.yml", + "somesy.toml", +] SPDX-FileCopyrightText = [ "2022 Helmholtz-Zentrum Berlin für Materialien und Energie GmbH, Berlin, Germany" ] -- GitLab
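Taken together, the series splits the former single run command into three independent entry points: harvest (OAI-PMH harvesting, extraction and export), enrich (linked-data lookup for queued literature publications) and assess (F-UJI scoring for queued data publications). Below is a minimal sketch of how the three crons could be driven from Python; it assumes the package is installed, the TOOLBOX_DB_* and FUJI_* variables from .env.example are set, and that the Typer commands are invoked directly as functions. The batch sizes and the idea of one driver call per cron job are illustrative assumptions, not part of the patch.

    """Illustrative sketch: driving the three independent toolbox crons (assumed setup)."""
    from pathlib import Path

    from toolbox.toolbox import assess
    from toolbox.toolbox import enrich
    from toolbox.toolbox import harvest

    # Cron 1: harvest OAI-PMH records per center, extract metadata and export to the
    # database; from_date falls back to the per-center cron_last_run entry when omitted.
    harvest(output_path=Path('./output'), centers_conf=Path('centers.yaml'))

    # Cron 2: take queued literature publications from cron_data_enrichment and look
    # for linked data publications (batch size is an assumed value).
    enrich(limit=10)

    # Cron 3: take queued data publications from cron_assessment and store their
    # F-UJI scores in the fuji_score table (batch size is an assumed value).
    assess(limit=10)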