From 10f6f60b86f710f0e4f35345afba5215b4be62e1 Mon Sep 17 00:00:00 2001
From: Paul Millar <paul.millar@desy.de>
Date: Thu, 2 Jan 2025 23:53:14 +0100
Subject: [PATCH] update_oai-pmh Add support for querying DataCite resourceType

Motivation:

OAI-PMH, by itself, doesn't identify the nature of the resource; rather,
this is achieved by the metadata record itself.

Just to mention it, OAI-PMH sets don't provide any guaranteed semantics;
such semantics can be added through the set description, but there's no
consensus or practice in doing this.

Therefore, in order to categorise OAI-PMH items by type, we need to
obtain records: listing identifiers isn't sufficient.  Moreover, Dublin
Core (as used currently) doesn't support the fine-grained type semantics
we would like to present.

The DataCite metadata schema provides `resourceType` metadata, with the
`resourceTypeGeneral` providing the coarse-grained type of the resource.
This is what we would like to use.

Modification:

Add support for querying all records in the DataCite metadata format.  This
task is very similar to the existing code that lists all identifiers of
records with Dublin Core.

The patch adds support for querying DataCite metadata as mostly a
copy-and-paste of the existing code.  This is technical debt that future
patches MUST address, through refactoring.

The OAI-PMH client code is updated to support ListRecords requests.
This is also a copy-n-paste, inducing further technical debt that future
patches must address.

Result:

The facilities YAML file now includes a breakdown of OAI-PMH items based
on their DataCite resourceTypeGeneral.
---
 _data/facilities.yml      | 158 +++++++++++++++++++++++
 scripts/update_oai-pmh.rb | 265 +++++++++++++++++++++++++++++++++++---
 2 files changed, 406 insertions(+), 17 deletions(-)

diff --git a/_data/facilities.yml b/_data/facilities.yml
index ec97dc1..2c04bbc 100644
--- a/_data/facilities.yml
+++ b/_data/facilities.yml
@@ -33,15 +33,21 @@
       - mis@cells.es
       items:
         count: 25
+        resource_types:
+          Dataset: 25
         sets:
           count: 2
           details:
             lifesci:
               name: Life Science
               count: 2
+              resource_types:
+                Dataset: 2
             spec:
               name: Spectroscopy
               count: 22
+              resource_types:
+                Dataset: 22
     pan-search-api:
       link: https://data.cells.es/iws/panosc/search-api
       status: Active
@@ -145,6 +151,8 @@
       - datapolicy@esrf.fr
       items:
         count: 7202
+        resource_types:
+          Collection: 7202
     pan-search-api:
       link: https://icatplus.esrf.fr/api
       status: Active
@@ -249,18 +257,27 @@
       - icatmaster@helmholtz-berlin.de
       items:
         count: 28953
+        resource_types:
+          Collection: 209
+          Dataset: 28744
         sets:
           count: 3
           details:
             pub:
               name: Data Publication
               count: 11
+              resource_types:
+                Dataset: 11
             raw_ds:
               name: Raw Dataset
               count: 28733
+              resource_types:
+                Dataset: 28733
             raw_inv:
               name: Investigation Raw Data
               count: 209
+              resource_types:
+                Collection: 209
 - short-name: HZDR
   pid:
   - type: ROR
@@ -291,128 +308,269 @@
       - rodare-admin@hzdr.de
       items:
         count: 1021
+        resource_types:
+          Audiovisual: 12
+          Dataset: 740
+          Image: 36
+          Other: 47
+          Software: 180
+          Text: 6
         sets:
           count: 39
           details:
             openaire_data:
               name: OpenAIRE data sets
               count: 1055
+              resource_types:
+                Dataset: 1055
             software:
               name: Software
               count: 212
+              resource_types:
+                Software: 212
             user-athena:
               name: ATHENA (Accelerator Technology HElmholtz iNfrAstructure)
               count: 3
+              resource_types:
+                Dataset: 3
             user-casus:
               name: CASUS
               count: 17
+              resource_types:
+                Audiovisual: 1
+                Dataset: 13
+                Software: 3
             user-direct-electron-beam-at-elbe:
               name: Electron-beam testing station for detectors (at ELBE)
               count: 2
+              resource_types:
+                Dataset: 2
             user-draco-elbe:
               name: High-power ultra-short pulse laser DRACO (at ELBE)
               count: 14
+              resource_types:
+                Dataset: 12
+                Software: 2
             user-dresdyn:
               name: DRESDYN (DREsden Sodium facility for DYNamo and thermohydraulic
                 studies)
               count: 1
+              resource_types:
+                Image: 1
             user-ecfunded:
               name: OpenAIRE
               count: 93
+              resource_types:
+                Audiovisual: 4
+                Dataset: 49
+                Image: 3
+                Software: 37
             user-elbe:
               name: " ELBE (Electron Linac for beams with high Brilliance and low
                 Emittance)"
               count: 83
+              resource_types:
+                Dataset: 72
+                Image: 2
+                Other: 1
+                Software: 8
             user-energy:
               name: 'Research field: Energy'
               count: 103
+              resource_types:
+                Dataset: 53
+                Image: 1
+                Other: 3
+                Software: 46
             user-felbe:
               name: Free-Electron Laser (FELBE)
               count: 5
+              resource_types:
+                Dataset: 4
+                Other: 1
             user-fwc:
               name: Department of Information Services and Computing
               count: 7
+              resource_types:
+                Dataset: 1
+                Image: 1
+                Other: 1
+                Software: 4
             user-fwd:
               name: Institute of Fluid Dynamics
               count: 160
+              resource_types:
+                Audiovisual: 2
+                Dataset: 103
+                Other: 6
+                Software: 47
+                Text: 2
             user-fwg:
               name: Helmholtz Institute Freiberg for Resource Technology
               count: 2
+              resource_types:
+                Dataset: 2
             user-fwi:
               name: Institute of Ion Beam Physics and Materials Research
               count: 100
+              resource_types:
+                Audiovisual: 1
+                Dataset: 86
+                Image: 8
+                Other: 4
+                Software: 1
             user-fwk:
               name: Institute of Radiation Physics
               count: 14
+              resource_types:
+                Dataset: 9
+                Other: 4
+                Software: 1
             user-fwkk:
               name: Nuclear Physics Department
               count: 1
+              resource_types:
+                Dataset: 1
             user-fwm:
               name: Institute of Radiooncology – OncoRay
               count: 1
+              resource_types:
+                Other: 1
             user-fwo:
               name: Institute of Resource Ecology
               count: 7
+              resource_types:
+                Dataset: 7
             user-health:
               name: 'Research field: Health'
               count: 35
+              resource_types:
+                Audiovisual: 1
+                Dataset: 28
+                Image: 2
+                Other: 1
+                Software: 3
             user-hibef:
               name: Helmholtz International Beamline for Extreme Fields (HIBEF)
               count: 4
+              resource_types:
+                Dataset: 2
+                Other: 2
             user-hzdr:
               name: Helmholtz-Zentrum Dresden-Rossendorf
               count: 216
+              resource_types:
+                Audiovisual: 7
+                Dataset: 124
+                Image: 7
+                Other: 19
+                Software: 54
+                Text: 5
             user-ibc:
               name: Ion Beam Center
               count: 80
+              resource_types:
+                Audiovisual: 1
+                Dataset: 68
+                Image: 8
+                Other: 3
             user-matter:
               name: 'Research field: Matter'
               count: 89
+              resource_types:
+                Audiovisual: 3
+                Dataset: 68
+                Image: 5
+                Other: 4
+                Software: 8
+                Text: 1
             user-mu2e:
               name: Mu2e
               count: 7
+              resource_types:
+                Dataset: 5
+                Software: 2
             user-nelbe:
               name: Neutron Time-Of-Flight Measurements
               count: 1
+              resource_types:
+                Dataset: 1
             user-oncoray:
               name: Institute of Radiooncology - OncoRay
               count: 15
+              resource_types:
+                Dataset: 14
+                Other: 1
             user-openfoam:
               name: OpenFOAM
               count: 29
+              resource_types:
+                Software: 29
             user-panosc:
               name: The Photon and Neutron Open Science Cluster (PaNOSC)
               count: 3
+              resource_types:
+                Dataset: 3
             user-pelbe:
               name: Positrons (pELBE)
               count: 8
+              resource_types:
+                Dataset: 8
             user-pet-center:
               name: Center for Positron Emission Tomography
               count: 6
+              resource_types:
+                Dataset: 6
             user-robl:
               name: ROBL – The Rossendorf Beamline at ESRF
               count: 2
+              resource_types:
+                Dataset: 2
             user-rodare:
               name: Rodare
               count: 990
+              resource_types:
+                Audiovisual: 12
+                Dataset: 721
+                Image: 36
+                Other: 47
+                Software: 168
+                Text: 6
             user-rofex:
               name: ROFEX - Ultrafast electron beam X-ray computed tomography
               count: 15
+              resource_types:
+                Dataset: 12
+                Other: 3
             user-superconducting-electron-linear-accelerator-elbe:
               name: The Superconducting Electron Linear Accelerator (at ELBE)
               count: 4
+              resource_types:
+                Dataset: 4
             user-telbe:
               name: Superradiant THz source (TELBE)
               count: 11
+              resource_types:
+                Dataset: 9
+                Other: 1
+                Software: 1
             user-topflow:
               name: TOPFLOW -Transient Two Phase FlowTest Facility
               count: 79
+              resource_types:
+                Dataset: 76
+                Other: 1
+                Software: 2
             user-zrt:
               name: 'ZRT - Institute of Radiopharmaceutical Cancer Research '
               count: 4
+              resource_types:
+                Dataset: 4
             user-γelbe:
               name: 'Bremsstrahlung (γELBE) '
               count: 2
+              resource_types:
+                Dataset: 2
     pan-search-api:
       link: https://scicat.hzdr.de/panosc-api
       status: Active
diff --git a/scripts/update_oai-pmh.rb b/scripts/update_oai-pmh.rb
index 9bfd000..a784d6a 100644
--- a/scripts/update_oai-pmh.rb
+++ b/scripts/update_oai-pmh.rb
@@ -6,6 +6,19 @@ require 'date'
 require 'nokogiri'
 require 'persistent_httparty'
 
+# OAI-PMH Metadata format namespaces for DataCite where the DataCite record
+# is the metadata.
+DATACITE_NAMESPACES = [ "http://datacite.org/schema/kernel-3",
+    "http://datacite.org/schema/kernel-3/",
+    "http://datacite.org/schema/kernel-4"]
+
+
+# OAI-PMH Metadata format namespaces for DataCite where the DataCite record
+# is wrapped and appears as the payload.
+DATACITE_WRAPPED_NAMESPACES = [ "http://schema.datacite.org/oai/oai-1.1/" ]
+
+
+
 class FailedOaiPmhRequest < StandardError
     def initialize(type, msg)
         super(type + ": " + msg)
@@ -118,6 +131,68 @@ class OaiPmhClient
         handle_response("ListIdentifiers", response)
     end
 
+    def list_records(metadata_prefix: nil, resumption_token: nil)
+        raise ArgumentError, 'Need to specify either metadata_prefix or resumption_token' unless metadata_prefix || resumption_token
+        raise ArgumentError, 'Must specify one of metadata_prefix or resumption_token' if metadata_prefix && resumption_token
+
+        if metadata_prefix
+            arg="metadataPrefix="+metadata_prefix
+        else
+            arg="resumptionToken="+resumption_token
+        end
+        # Whether the server has already responded successfully in the past.
+        # Using the presence of the resumptionToken to determine this is,
+        # perhaps, questionable.
+        hasServerResponded = resumption_token
+
+        response = nil
+        attempts = 0
+        while !response
+            begin
+                attempts += 1
+
+                starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+                response = self.class.get(@base_uri + "?verb=ListRecords&" + arg)
+                ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+                $stats.accept(ending - starting)
+
+                if response.code >= 500
+                    # 5xx status code indicates a server error.  Retrying might
+                    # help.
+                    raise HTTParty::Error.new "Received status code #{response.code}"
+                elsif response.code >= 400
+                    # 4xx status code indicates a client error.  Retrying with
+                    # the same request will not help.
+                    raise FailedOaiPmhRequest.new("ListRecords", "Received status code #{response.code}")
+                end
+            rescue HTTParty::Error, SocketError, Timeout::Error => e
+                # FIXME output needs to take into account whether to
+                # insert a new-line.  As a quick hack, use the presence of
+                # resumptionToken in the URL to indicate a newline is needed
+                # (this works most of the time, but not always)
+                if hasServerResponded && attempts == 1
+                    print "\n"
+                end
+                print_with_time("Attempt #{attempts} of #{MAX_HTTP_GET_RETRIES} failed: #{e.message}")
+
+                if attempts >= MAX_HTTP_GET_RETRIES
+                    # REVISIT include the URL in the message?
+                    raise FailedOaiPmhRequest.new("ListRecords", "Too many GET requests failed")
+                end
+
+                delay = FAILED_REQUEST_INITIAL
+                if hasServerResponded
+                    delay += attempts * FAILED_REQUEST_BACKOFF
+                end
+
+                sleep(delay);
+                response = nil
+            end
+        end
+
+        handle_response("ListRecords", response)
+    end
+
     def handle_response(request_type, response)
         if !response.success?
             raise FailedOaiPmhRequest.new(request_type, "HTTP status code #{response.code}")
@@ -314,15 +389,56 @@ def list_sets(endpoint)
 end
 
 
-def metadata_prefix_of(endpoint, namespace)
+def metadata_formats()
     xml_response = $client.list_metadata_formats
 
-    format_description = xml_response.at_xpath("//xmlns:metadataFormat[xmlns:metadataNamespace=\"#{namespace}\"]/xmlns:metadataPrefix/text()")
-    if !format_description
-        raise StandardError.new "ListMetadataFormats missing #{namespace}"
+    formats = {}
+
+    metadataFormats = xml_response.xpath("/xmlns:OAI-PMH/xmlns:ListMetadataFormats/xmlns:metadataFormat")
+    if !metadataFormats
+        raise FailedOaiPmhRequest.new "ListMetadataFormats", "missing metadataFormat elements."
+    end
+
+    metadataFormats.each do |metadataFormat|
+        prefix = nil
+        schema = nil
+        namespace = nil
+
+        metadataFormat.children.each do |child|
+            next unless child.element?
+            if child.name == "metadataPrefix"
+                prefix = child.content
+            elsif child.name == "schema"
+                schema = child.content
+            elsif child.name == "metadataNamespace"
+                namespace = child.content
+            end
+        end
+
+        if prefix
+            formats[prefix] = {schema: schema, namespace: namespace}
+        else
+            puts "Found metadataFormat without a prefix"
+        end
     end
 
-    return format_description.text
+    return formats
+end
+
+
+def select_datacite(formats)
+    formats.each do |(key,value)| # REVISIT, given a choice, which one to pick?
+        ns = value[:namespace]
+        if DATACITE_NAMESPACES.include? ns
+            return key, false
+        end
+
+        if DATACITE_WRAPPED_NAMESPACES.include? ns
+            return key, true
+        end
+    end
+
+    return nil, nil
 end
 
 
@@ -388,7 +504,7 @@ def count_identifiers(endpoint, prefix)
                 overall_stats.accept_from $stats
                 $stats = SeriesStatistics.new
             end
-            print "    Harvesting: "
+            print "    Harvesting Dublin Core identifiers: "
         end
         i += 1
         print "."
@@ -407,7 +523,98 @@ def count_identifiers(endpoint, prefix)
         end
     end
     overall_stats.accept_from $stats
-    print_with_time("Overall timing/s #{overall_stats}")
+    print_with_time("Dublin Core identifiers overall timing/s #{overall_stats}")
+
+    return overall_total, overall_set_counts
+end
+
+RESOURCE_TYPE_EXPRESSION_WRAPPED = '*[local-name()="metadata"]/*[local-name()="oai_datacite"]/*[local-name()="payload"]/*[local-name()="resource"]/*[local-name()="resourceType"]'
+RESOURCE_TYPE_EXPRESSION_UNWRAPPED = '*[local-name()="metadata"]/*[local-name()="resource"]/*[local-name()="resourceType"]'
+
+def list_datacite_records(is_wrapped: false, metadata_prefix: nil, resumption_token: nil)
+
+    xml_response = $client.list_records(metadata_prefix: metadata_prefix, resumption_token: resumption_token)
+
+    total = {}
+    set_counts = {}
+    resumptionToken = nil
+
+    resource_type_expression = is_wrapped ? RESOURCE_TYPE_EXPRESSION_WRAPPED : RESOURCE_TYPE_EXPRESSION_UNWRAPPED
+    records = xml_response.xpath("/o:OAI-PMH/o:ListRecords/o:record",
+            {'o' => 'http://www.openarchives.org/OAI/2.0/'})
+    if records
+        records.each do |record|
+            resourceTypeElements = record.xpath(resource_type_expression)
+            next if resourceTypeElements.count != 1
+            resourceType = resourceTypeElements[0].attr('resourceTypeGeneral')
+            next if !resourceType
+
+            total[resourceType] ||= 0
+            total[resourceType] += 1
+
+            setSpecElements = record.xpath('o:header/o:setSpec', {'o' => 'http://www.openarchives.org/OAI/2.0/'})
+            next if setSpecElements.empty?
+
+            setSpecElements.each do |setSpecElement|
+                setSpec = setSpecElement.content
+                set_counts[setSpec] ||= {}
+                set_counts[setSpec] [resourceType] ||= 0
+                set_counts[setSpec] [resourceType] += 1
+            end
+        end
+    end
+
+    token = xml_response.at_xpath('/o:OAI-PMH/o:ListRecords/o:resumptionToken/text()',
+            {'o' => 'http://www.openarchives.org/OAI/2.0/'})
+    resumptionToken = token ? token.text : nil
+
+    return total, set_counts, resumptionToken
+end
+
+def count_datacite_records(prefix, wrapped)
+    overall_stats = SeriesStatistics.new
+    $stats = SeriesStatistics.new
+    overall_total, overall_set_counts, resumptionToken = list_datacite_records(is_wrapped: wrapped, metadata_prefix: prefix)
+
+    i = 0
+    while resumptionToken
+        if i % 40 == 0
+            if i > 0
+                print("\n")
+                print_with_time("#{overall_total.values.reduce(:+)} records so far; timing/s #{$stats}")
+                overall_stats.accept_from $stats
+                $stats = SeriesStatistics.new
+            end
+            print "    Harvesting DataCite records: "
+        end
+        i += 1
+        print "."
+        delta_total, delta_set_counts, resumptionToken = list_datacite_records(is_wrapped: wrapped, resumption_token: resumptionToken)
+
+        delta_total.each do |(type,count)|
+            overall_total[type] ||= 0
+            overall_total[type] += count
+        end
+
+        delta_set_counts.each do |(set,set_counts)|
+            overall_set_counts[set] ||= {}
+            overall_set = overall_set_counts[set]
+
+            set_counts.each do |(type,count)|
+                overall_set[type] ||= 0
+                overall_set[type] += count
+            end
+        end
+    end
+
+    if i > 0
+        print "\n"
+        if overall_stats.count > 0
+            print_with_time("#{overall_total.values.reduce(:+)} records so far; timing/s #{$stats}")
+        end
+    end
+    overall_stats.accept_from $stats
+    print_with_time("DataCite records overall timing/s #{overall_stats}")
 
     return overall_total, overall_set_counts
 end
@@ -419,13 +626,6 @@ def query_oai_pmh_endpoint(endpoint, do_harvesting)
         return status, [], {}, 0, {}
     end
 
-    begin
-        dc_prefix = metadata_prefix_of(endpoint, 'http://www.openarchives.org/OAI/2.0/oai_dc/')
-    rescue StandardError => e
-        print_with_time("Error: metadata_prefix_of failed: #{e.message}")
-        return "Error", [], {}, 0, {}
-    end
-
     set_names = list_sets(endpoint)
 
     if !do_harvesting
@@ -433,14 +633,30 @@ def query_oai_pmh_endpoint(endpoint, do_harvesting)
     end
 
     begin
-        total_count, set_counts = count_identifiers(endpoint, dc_prefix)
+        total_count, set_counts = count_identifiers(endpoint, "oai_dc")
     rescue StandardError => e
         print_with_time("Error: ListIdentifiers failed: #{e.message}")
         total_count = 0
         set_counts = {}
     end
 
-    return status, adminAddress, set_names, total_count, set_counts
+    formats = metadata_formats()
+    prefix, is_wrapped = select_datacite(formats)
+    if prefix
+        begin
+            datacite_totals, set_datacite_counts = count_datacite_records(prefix, is_wrapped)
+        rescue StandardError => e
+            print_with_time("Error: DataCite harvesting failed: #{e.message}")
+            datacite_totals = {}
+            set_datacite_counts = {}
+        end
+    else
+        puts "    Skipping DataCite harvesting as DataCite is not supported."
+        datacite_totals = {}
+        set_datacite_counts = {}
+    end
+
+    return status, adminAddress, set_names, total_count, set_counts, datacite_totals, set_datacite_counts
 end
 
 
@@ -472,7 +688,7 @@ facilities.each do |facility|
     name = facility['short-name']
     puts "Checking OAI-PMH endpoint for #{name}: #{oai_pmh_endpoint}"
     do_harvesting = !oai_pmh['skip-harvesting']
-    status, adminAddress, set_names, total_count, set_count = query_oai_pmh_endpoint(oai_pmh_endpoint, do_harvesting)
+    status, adminAddress, set_names, total_count, set_count, datacite_totals, set_datacite_counts = query_oai_pmh_endpoint(oai_pmh_endpoint, do_harvesting)
 
     oai_pmh['status'] = status
 
@@ -487,6 +703,13 @@ facilities.each do |facility|
         oai_pmh['items'] = items
         items['count'] = total_count
 
+        if !datacite_totals.empty?
+            items['resource_types'] = {}
+            datacite_totals.sort_by { |k, _| k }.each do |(key,value)|
+                items['resource_types'][key] = value
+            end
+        end
+
         if !set_count.empty?
             sets = {}
             items['sets'] = sets
@@ -504,6 +727,14 @@ facilities.each do |facility|
                     set['name'] = set_names[set_id]
                 end
                 set['count'] = count
+
+                if set_datacite_counts.key?(set_id)
+                    set['resource_types'] = {}
+
+                    set_datacite_counts[set_id].sort_by { |k, _| k }.each do |(key,value)|
+                        set['resource_types'][key] = value
+                    end
+                end
             end
         end
     end
-- 
GitLab