Add DataCite resource-type statistics to the OAI-PMH harvester and the site.

What this patch carries:
  * _data/facilities.yml       per-endpoint and per-set `resource_types` maps
  * open-data-resources.html   "Types" column/table plus Dataset/Collection totals
  * scripts/update_oai-pmh.rb  ListRecords harvesting of DataCite records

Review fixes folded into the added (+) lines:
  * open-data-resources.html: dropped a stray </b> in the totals row.
  * metadata_formats: test the NodeSet with empty? (xpath never returns nil).
  * query_oai_pmh_endpoint: ListMetadataFormats is rescued again, so one
    failing endpoint no longer aborts the whole run.
  * facilities loop: tolerate the nil DataCite results produced by the
    early "return status, [], {}, 0, {}" exit.
  * list_records: URL-escape the resumption token.
  * New Ruby methods carry doc comments; hunk headers are adjusted to match.

Note: the blob ids on the "index" lines are those of the original change.

diff --git a/_data/facilities.yml b/_data/facilities.yml
index ec97dc1bb3a73f2de0cad43b3b4585689b6a246a..2c04bbcc53bfcb97d89388b8bddf2d259b6f1fff 100644
--- a/_data/facilities.yml
+++ b/_data/facilities.yml
@@ -33,15 +33,21 @@
       - mis@cells.es
       items:
         count: 25
+        resource_types:
+          Dataset: 25
         sets:
           count: 2
           details:
             lifesci:
               name: Life Science
               count: 2
+              resource_types:
+                Dataset: 2
             spec:
               name: Spectroscopy
               count: 22
+              resource_types:
+                Dataset: 22
     pan-search-api:
       link: https://data.cells.es/iws/panosc/search-api
       status: Active
@@ -145,6 +151,8 @@
       - datapolicy@esrf.fr
       items:
         count: 7202
+        resource_types:
+          Collection: 7202
     pan-search-api:
       link: https://icatplus.esrf.fr/api
       status: Active
@@ -249,18 +257,27 @@
       - icatmaster@helmholtz-berlin.de
       items:
         count: 28953
+        resource_types:
+          Collection: 209
+          Dataset: 28744
         sets:
           count: 3
           details:
             pub:
               name: Data Publication
               count: 11
+              resource_types:
+                Dataset: 11
             raw_ds:
               name: Raw Dataset
               count: 28733
+              resource_types:
+                Dataset: 28733
             raw_inv:
               name: Investigation Raw Data
               count: 209
+              resource_types:
+                Collection: 209
 - short-name: HZDR
   pid:
   - type: ROR
@@ -291,128 +308,269 @@
       - rodare-admin@hzdr.de
       items:
         count: 1021
+        resource_types:
+          Audiovisual: 12
+          Dataset: 740
+          Image: 36
+          Other: 47
+          Software: 180
+          Text: 6
         sets:
           count: 39
           details:
             openaire_data:
               name: OpenAIRE data sets
               count: 1055
+              resource_types:
+                Dataset: 1055
             software:
               name: Software
               count: 212
+              resource_types:
+                Software: 212
             user-athena:
               name: ATHENA (Accelerator Technology HElmholtz iNfrAstructure)
               count: 3
+              resource_types:
+                Dataset: 3
             user-casus:
               name: CASUS
               count: 17
+              resource_types:
+                Audiovisual: 1
+                Dataset: 13
+                Software: 3
             user-direct-electron-beam-at-elbe:
               name: Electron-beam testing station for detectors (at ELBE)
               count: 2
+              resource_types:
+                Dataset: 2
             user-draco-elbe:
               name: High-power ultra-short pulse laser DRACO (at ELBE)
               count: 14
+              resource_types:
+                Dataset: 12
+                Software: 2
             user-dresdyn:
               name: DRESDYN (DREsden Sodium facility for DYNamo and thermohydraulic
                 studies)
               count: 1
+              resource_types:
+                Image: 1
             user-ecfunded:
               name: OpenAIRE
               count: 93
+              resource_types:
+                Audiovisual: 4
+                Dataset: 49
+                Image: 3
+                Software: 37
             user-elbe:
               name: " ELBE (Electron Linac for beams with high Brilliance and low
                 Emittance)"
               count: 83
+              resource_types:
+                Dataset: 72
+                Image: 2
+                Other: 1
+                Software: 8
             user-energy:
               name: 'Research field: Energy'
               count: 103
+              resource_types:
+                Dataset: 53
+                Image: 1
+                Other: 3
+                Software: 46
             user-felbe:
               name: Free-Electron Laser (FELBE)
               count: 5
+              resource_types:
+                Dataset: 4
+                Other: 1
             user-fwc:
               name: Department of Information Services and Computing
               count: 7
+              resource_types:
+                Dataset: 1
+                Image: 1
+                Other: 1
+                Software: 4
             user-fwd:
               name: Institute of Fluid Dynamics
               count: 160
+              resource_types:
+                Audiovisual: 2
+                Dataset: 103
+                Other: 6
+                Software: 47
+                Text: 2
             user-fwg:
               name: Helmholtz Institute Freiberg for Resource Technology
               count: 2
+              resource_types:
+                Dataset: 2
             user-fwi:
               name: Institute of Ion Beam Physics and Materials Research
               count: 100
+              resource_types:
+                Audiovisual: 1
+                Dataset: 86
+                Image: 8
+                Other: 4
+                Software: 1
             user-fwk:
               name: Institute of Radiation Physics
               count: 14
+              resource_types:
+                Dataset: 9
+                Other: 4
+                Software: 1
             user-fwkk:
               name: Nuclear Physics Department
               count: 1
+              resource_types:
+                Dataset: 1
             user-fwm:
               name: Institute of Radiooncology – OncoRay
               count: 1
+              resource_types:
+                Other: 1
             user-fwo:
               name: Institute of Resource Ecology
               count: 7
+              resource_types:
+                Dataset: 7
             user-health:
               name: 'Research field: Health'
               count: 35
+              resource_types:
+                Audiovisual: 1
+                Dataset: 28
+                Image: 2
+                Other: 1
+                Software: 3
             user-hibef:
               name: Helmholtz International Beamline for Extreme Fields (HIBEF)
               count: 4
+              resource_types:
+                Dataset: 2
+                Other: 2
             user-hzdr:
               name: Helmholtz-Zentrum Dresden-Rossendorf
               count: 216
+              resource_types:
+                Audiovisual: 7
+                Dataset: 124
+                Image: 7
+                Other: 19
+                Software: 54
+                Text: 5
             user-ibc:
               name: Ion Beam Center
               count: 80
+              resource_types:
+                Audiovisual: 1
+                Dataset: 68
+                Image: 8
+                Other: 3
             user-matter:
               name: 'Research field: Matter'
               count: 89
+              resource_types:
+                Audiovisual: 3
+                Dataset: 68
+                Image: 5
+                Other: 4
+                Software: 8
+                Text: 1
             user-mu2e:
               name: Mu2e
               count: 7
+              resource_types:
+                Dataset: 5
+                Software: 2
             user-nelbe:
               name: Neutron Time-Of-Flight Measurements
               count: 1
+              resource_types:
+                Dataset: 1
             user-oncoray:
               name: Institute of Radiooncology - OncoRay
               count: 15
+              resource_types:
+                Dataset: 14
+                Other: 1
             user-openfoam:
               name: OpenFOAM
               count: 29
+              resource_types:
+                Software: 29
             user-panosc:
               name: The Photon and Neutron Open Science Cluster (PaNOSC)
               count: 3
+              resource_types:
+                Dataset: 3
             user-pelbe:
               name: Positrons (pELBE)
               count: 8
+              resource_types:
+                Dataset: 8
             user-pet-center:
               name: Center for Positron Emission Tomography
               count: 6
+              resource_types:
+                Dataset: 6
             user-robl:
               name: ROBL – The Rossendorf Beamline at ESRF
               count: 2
+              resource_types:
+                Dataset: 2
             user-rodare:
               name: Rodare
               count: 990
+              resource_types:
+                Audiovisual: 12
+                Dataset: 721
+                Image: 36
+                Other: 47
+                Software: 168
+                Text: 6
             user-rofex:
               name: ROFEX - Ultrafast electron beam X-ray computed tomography
               count: 15
+              resource_types:
+                Dataset: 12
+                Other: 3
             user-superconducting-electron-linear-accelerator-elbe:
               name: The Superconducting Electron Linear Accelerator (at ELBE)
               count: 4
+              resource_types:
+                Dataset: 4
             user-telbe:
               name: Superradiant THz source (TELBE)
               count: 11
+              resource_types:
+                Dataset: 9
+                Other: 1
+                Software: 1
             user-topflow:
               name: TOPFLOW -Transient Two Phase FlowTest Facility
               count: 79
+              resource_types:
+                Dataset: 76
+                Other: 1
+                Software: 2
             user-zrt:
               name: 'ZRT - Institute of Radiopharmaceutical Cancer Research '
               count: 4
+              resource_types:
+                Dataset: 4
             user-γelbe:
               name: 'Bremsstrahlung (γELBE) '
               count: 2
+              resource_types:
+                Dataset: 2
     pan-search-api:
       link: https://scicat.hzdr.de/panosc-api
       status: Active
diff --git a/open-data-resources.html b/open-data-resources.html
index ee1f6b5410e0d3e4b52c6ce11c987d0f658f629f..16019af878a2f12e539f0caa1595560f59667474 100644
--- a/open-data-resources.html
+++ b/open-data-resources.html
@@ -36,6 +36,8 @@ title: Open data resources
   {% assign odr-facilities = site.data.facilities | where_exp:"facility", "facility.odr" %}
   {% assign total_search_api_dataset_count = 0 %}
   {% assign total_oai_pmh_dataset_count = 0 %}
+  {% assign total_oai_pmh_datasets = 0 %}
+  {% assign total_oai_pmh_collections = 0 %}
   {% for facility in odr-facilities %}
   <tr>
     <td>{{ facility.short-name }}</td>
@@ -75,15 +77,49 @@ title: Open data resources
           <table>
             <tr>
               <th>Set</th><th>Items</th>
+              {% if facility.odr.oai-pmh-endpoint.items.resource_types %}
+              <th>Types</th>
+              {% endif %}
             </tr>
             {% for set in facility.odr.oai-pmh-endpoint.items.sets.details %}
             <tr>
               <td>{{set[1].name}}</td><td>{{set[1].count | thousands_separated}}</td>
+              {% if facility.odr.oai-pmh-endpoint.items.resource_types %}
+              <td>
+                {% for set_type in set[1].resource_types %}
+                {{set_type[0]}}: {{set_type[1] | thousands_separated}}<br/>
+                {% endfor %}
+              </td>
+              {% endif %}
             </tr>
             {% endfor %}
           </table>
         </div>{%
         endif %}{%
+        if facility.odr.oai-pmh-endpoint.items.resource_types%}
+        <br/>
+        <span class="caret">Types </span>
+        <div class="toggled">
+          <table>
+            <tr>
+              <th>Type</th>
+              <th>Items</th>
+            </tr>
+            {% for type in facility.odr.oai-pmh-endpoint.items.resource_types %}{%
+              if type[0] == 'Dataset' %}{%
+                assign total_oai_pmh_datasets = total_oai_pmh_datasets | plus: type[1] %}{%
+              endif %}{%
+              if type[0] == 'Collection' %}{%
+                assign total_oai_pmh_collections = total_oai_pmh_collections | plus: type[1] %}{%
+              endif %}
+            <tr>
+              <td>{{ type[0] }}</td>
+              <td>{{ type[1] | thousands_separated }}</td>
+            </tr>
+            {% endfor %}
+          </table>
+        </div>{%
+        endif %}{%
         endif %}
       </div>
       <span class="tooltip">last check: {{ facility.odr.oai-pmh-endpoint.last-check | date: "%Y-%m-%d" }}</span>
@@ -116,10 +152,13 @@ title: Open data resources
   </tr>
   {% endfor %}
   <tr>
-    <td><b>Totals</b></td>
-    <td></td>
-    <td><b>{{total_oai_pmh_dataset_count | thousands_separated}}</b></td>
-    <td><b>{{total_search_api_dataset_count | thousands_separated}}</b></td>
+    <td colspan="2"><b>Totals</b></td>
+    <td>Items: <b>{{total_oai_pmh_dataset_count | thousands_separated}}</b><br/>
+      Datasets: <b>{{total_oai_pmh_datasets | thousands_separated }}</b><br/>
+      Collections: <b>{{total_oai_pmh_collections | thousands_separated }}</b><br/>
+      Datasets<br/>+Collections: <b>{{total_oai_pmh_collections | plus: total_oai_pmh_datasets | thousands_separated }}</b><br/>
+    </td>
+    <td>Datasets: <b>{{total_search_api_dataset_count | thousands_separated}}</b></td>
   </tr>
 </table>
 
diff --git a/scripts/update_oai-pmh.rb b/scripts/update_oai-pmh.rb
index 9bfd0006740fc3d5d2d0d796cfb063541df52d12..a784d6a1e28ba0c1ceeb4d797667a8698d0e0f1a 100644
--- a/scripts/update_oai-pmh.rb
+++ b/scripts/update_oai-pmh.rb
@@ -6,6 +6,19 @@ require 'date'
 require 'nokogiri'
 require 'persistent_httparty'
 
+# OAI-PMH Metadata format namespaces for DataCite where the DataCite record
+# is the metadata.
+DATACITE_NAMESPACES = [ "http://datacite.org/schema/kernel-3",
+                        "http://datacite.org/schema/kernel-3/",
+                        "http://datacite.org/schema/kernel-4"]
+
+
+# OAI-PMH Metadata format namespaces for DataCite where the DataCite record
+# is wrapped and appears as the payload.
+DATACITE_WRAPPED_NAMESPACES = [ "http://schema.datacite.org/oai/oai-1.1/" ]
+
+
+
 class FailedOaiPmhRequest < StandardError
   def initialize(type, msg)
     super(type + ": " + msg)
@@ -118,6 +131,72 @@ class OaiPmhClient
     handle_response("ListIdentifiers", response)
   end
 
+  # Issue one ListRecords request, retrying failed GETs up to
+  # MAX_HTTP_GET_RETRIES times. Exactly one of metadata_prefix (first page)
+  # or resumption_token (subsequent pages) must be given.
+  def list_records(metadata_prefix: nil, resumption_token: nil)
+    raise ArgumentError, 'Need to specify either metadata_prefix or resumption_token' unless metadata_prefix || resumption_token
+    raise ArgumentError, 'Must specify one of metadata_prefix or resumption_token' if metadata_prefix && resumption_token
+
+    if metadata_prefix
+      arg="metadataPrefix="+metadata_prefix
+    else
+      # The token is opaque server data; escape it before building the URL.
+      arg="resumptionToken="+URI.encode_www_form_component(resumption_token)
+    end
+    # Whether the server has already responded successfully in the past.
+    # Using the presence of the resumptionToken to determine this is,
+    # perhaps, questionable.
+    hasServerResponded = resumption_token
+
+    response = nil
+    attempts = 0
+    while !response
+      begin
+        attempts += 1
+
+        starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+        response = self.class.get(@base_uri + "?verb=ListRecords&" + arg)
+        ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+        $stats.accept(ending - starting)
+
+        if response.code >= 500
+          # 5xx status code indicates a server error. Retrying might
+          # help.
+          raise HTTParty::Error.new "Received status code #{response.code}"
+        elsif response.code >= 400
+          # 4xx status code indicates a client error. Retrying with
+          # the same request will not help.
+          raise FailedOaiPmhRequest.new("ListRecords", "Received status code #{response.code}")
+        end
+      rescue HTTParty::Error, SocketError, Timeout::Error => e
+        # FIXME output needs to take into account current whether to
+        # insert a new-line. As a quick hack, use the presence of
+        # resumptionToken in the URL to indicate a newline is needed
+        # (this works most of the time, but not always)
+        if hasServerResponded && attempts == 1
+          print "\n"
+        end
+        print_with_time("Attempt #{attempts} of #{MAX_HTTP_GET_RETRIES} failed: #{e.message}")
+
+        if attempts >= MAX_HTTP_GET_RETRIES
+          # REVISIT include the URL in the message?
+          raise FailedOaiPmhRequest.new("ListRecords", "Too many GET requests failed")
+        end
+
+        delay = FAILED_REQUEST_INITIAL
+        if hasServerResponded
+          delay += attempts * FAILED_REQUEST_BACKOFF
+        end
+
+        sleep(delay);
+        response = nil
+      end
+    end
+
+    handle_response("ListRecords", response)
+  end
+
   def handle_response(request_type, response)
     if !response.success?
       raise FailedOaiPmhRequest.new(request_type, "HTTP status code #{response.code}")
@@ -314,15 +393,61 @@ def list_sets(endpoint)
 end
 
 
-def metadata_prefix_of(endpoint, namespace)
+# Query ListMetadataFormats and return a Hash mapping each metadataPrefix to
+# {schema:, namespace:}. Raises FailedOaiPmhRequest if no format is listed.
+def metadata_formats()
   xml_response = $client.list_metadata_formats
 
-  format_description = xml_response.at_xpath("//xmlns:metadataFormat[xmlns:metadataNamespace=\"#{namespace}\"]/xmlns:metadataPrefix/text()")
-  if !format_description
-    raise StandardError.new "ListMetadataFormats missing #{namespace}"
+  formats = {}
+
+  metadataFormats = xml_response.xpath("/xmlns:OAI-PMH/xmlns:ListMetadataFormats/xmlns:metadataFormat")
+  # xpath returns a (possibly empty) NodeSet, never nil, so test for emptiness.
+  if metadataFormats.empty?
+    raise FailedOaiPmhRequest.new "ListMetadataFormats", "missing metadataFormat elements."
+  end
+
+  metadataFormats.each do |metadataFormat|
+    prefix = nil
+    schema = nil
+    namespace = nil
+
+    metadataFormat.children.each do |child|
+      next unless child.element?
+      if child.name == "metadataPrefix"
+        prefix = child.content
+      elsif child.name == "schema"
+        schema = child.content
+      elsif child.name == "metadataNamespace"
+        namespace = child.content
+      end
+    end
+
+    if prefix
+      formats[prefix] = {schema: schema, namespace: namespace}
+    else
+      puts "Found metadataFormat without a prefix"
+    end
   end
 
-  return format_description.text
+  return formats
+end
+
+
+# Pick a DataCite metadata format. Returns [prefix, is_wrapped] for the
+# first format whose namespace is a known DataCite one, else [nil, nil].
+def select_datacite(formats)
+  formats.each do |(key,value)| # REVISIT, given a choice, which one to pick?
+    ns = value[:namespace]
+    if DATACITE_NAMESPACES.include? ns
+      return key, false
+    end
+
+    if DATACITE_WRAPPED_NAMESPACES.include? ns
+      return key, true
+    end
+  end
+
+  return nil, nil
 end
 
 
@@ -388,7 +513,7 @@ def count_identifiers(endpoint, prefix)
         overall_stats.accept_from $stats
         $stats = SeriesStatistics.new
       end
-      print " Harvesting: "
+      print " Harvesting Dublin Core identifiers: "
     end
     i += 1
     print "."
@@ -407,7 +532,105 @@ def count_identifiers(endpoint, prefix)
     end
   end
   overall_stats.accept_from $stats
-  print_with_time("Overall timing/s #{overall_stats}")
+  print_with_time("Dublin Core identifiers overall timing/s #{overall_stats}")
+
+  return overall_total, overall_set_counts
+end
+
+# XPath expressions (relative to an OAI-PMH <record>) locating the DataCite
+# resourceType element, with and without the oai_datacite wrapper.
+RESOURCE_TYPE_EXPRESSION_WRAPPED = '*[local-name()="metadata"]/*[local-name()="oai_datacite"]/*[local-name()="payload"]/*[local-name()="resource"]/*[local-name()="resourceType"]'
+RESOURCE_TYPE_EXPRESSION_UNWRAPPED = '*[local-name()="metadata"]/*[local-name()="resource"]/*[local-name()="resourceType"]'
+
+# Fetch one page of ListRecords and tally resourceTypeGeneral values.
+# Returns [totals by type, per-set totals by type, resumption token or nil].
+# Records without exactly one resourceType element are not counted.
+def list_datacite_records(is_wrapped: false, metadata_prefix: nil, resumption_token: nil)
+
+  xml_response = $client.list_records(metadata_prefix: metadata_prefix, resumption_token: resumption_token)
+
+  total = {}
+  set_counts = {}
+  resumptionToken = nil
+
+  resource_type_expression = is_wrapped ? RESOURCE_TYPE_EXPRESSION_WRAPPED : RESOURCE_TYPE_EXPRESSION_UNWRAPPED
+  records = xml_response.xpath("/o:OAI-PMH/o:ListRecords/o:record",
+                               {'o' => 'http://www.openarchives.org/OAI/2.0/'})
+  if records
+    records.each do |record|
+      resourceTypeElements = record.xpath(resource_type_expression)
+      next if resourceTypeElements.count != 1
+      resourceType = resourceTypeElements[0].attr('resourceTypeGeneral')
+      next if !resourceType
+
+      total[resourceType] ||= 0
+      total[resourceType] += 1
+
+      setSpecElements = record.xpath('o:header/o:setSpec', {'o' => 'http://www.openarchives.org/OAI/2.0/'})
+      next if setSpecElements.empty?
+
+      setSpecElements.each do |setSpecElement|
+        setSpec = setSpecElement.content
+        set_counts[setSpec] ||= {}
+        set_counts[setSpec] [resourceType] ||= 0
+        set_counts[setSpec] [resourceType] += 1
+      end
+    end
+  end
+
+  token = xml_response.at_xpath('/o:OAI-PMH/o:ListRecords/o:resumptionToken/text()',
+                                {'o' => 'http://www.openarchives.org/OAI/2.0/'})
+  resumptionToken = token ? token.text : nil
+
+  return total, set_counts, resumptionToken
+end
+
+# Harvest every page of DataCite records for the given metadata prefix.
+# Returns [totals by resource type, per-set totals by resource type].
+def count_datacite_records(prefix, wrapped)
+  overall_stats = SeriesStatistics.new
+  $stats = SeriesStatistics.new
+  overall_total, overall_set_counts, resumptionToken = list_datacite_records(is_wrapped: wrapped, metadata_prefix: prefix)
+
+  i = 0
+  while resumptionToken
+    if i % 40 == 0
+      if i > 0
+        print("\n")
+        print_with_time("#{overall_total.values.reduce(:+)} records so far; timing/s #{$stats}")
+        overall_stats.accept_from $stats
+        $stats = SeriesStatistics.new
+      end
+      print " Harvesting DataCite records: "
+    end
+    i += 1
+    print "."
+    delta_total, delta_set_counts, resumptionToken = list_datacite_records(is_wrapped: wrapped, resumption_token: resumptionToken)
+
+    delta_total.each do |(type,count)|
+      overall_total[type] ||= 0
+      overall_total[type] += count
+    end
+
+    delta_set_counts.each do |(set,set_counts)|
+      overall_set_counts[set] ||= {}
+      overall_set = overall_set_counts[set]
+
+      set_counts.each do |(type,count)|
+        overall_set[type] ||= 0
+        overall_set[type] += count
+      end
+    end
+  end
+
+  if i > 0
+    print "\n"
+    if overall_stats.count > 0
+      print_with_time("#{overall_total.values.reduce(:+)} records so far; timing/s #{$stats}")
+    end
+  end
+  overall_stats.accept_from $stats
+  print_with_time("DataCite records overall timing/s #{overall_stats}")
 
   return overall_total, overall_set_counts
 end
@@ -419,13 +642,6 @@ def query_oai_pmh_endpoint(endpoint, do_harvesting)
     return status, [], {}, 0, {}
   end
 
-  begin
-    dc_prefix = metadata_prefix_of(endpoint, 'http://www.openarchives.org/OAI/2.0/oai_dc/')
-  rescue StandardError => e
-    print_with_time("Error: metadata_prefix_of failed: #{e.message}")
-    return "Error", [], {}, 0, {}
-  end
-
   set_names = list_sets(endpoint)
 
   if !do_harvesting
@@ -433,14 +649,30 @@ def query_oai_pmh_endpoint(endpoint, do_harvesting)
   end
 
   begin
-    total_count, set_counts = count_identifiers(endpoint, dc_prefix)
+    total_count, set_counts = count_identifiers(endpoint, "oai_dc")
   rescue StandardError => e
     print_with_time("Error: ListIdentifiers failed: #{e.message}")
     total_count = 0
     set_counts = {}
   end
 
-  return status, adminAddress, set_names, total_count, set_counts
+  # DataCite statistics are optional; a failure here must not abort the run.
+  begin
+    prefix, is_wrapped = select_datacite(metadata_formats())
+    if prefix
+      datacite_totals, set_datacite_counts = count_datacite_records(prefix, is_wrapped)
+    else
+      puts " Skipping DataCite harvesting as DataCite is not supported."
+      datacite_totals = {}
+      set_datacite_counts = {}
+    end
+  rescue StandardError => e
+    print_with_time("Error: DataCite harvesting failed: #{e.message}")
+    datacite_totals = {}
+    set_datacite_counts = {}
+  end
+
+  return status, adminAddress, set_names, total_count, set_counts, datacite_totals, set_datacite_counts
 end
 
 
@@ -472,7 +704,7 @@ facilities.each do |facility|
     name = facility['short-name']
     puts "Checking OAI-PMH endpoint for #{name}: #{oai_pmh_endpoint}"
     do_harvesting = !oai_pmh['skip-harvesting']
-    status, adminAddress, set_names, total_count, set_count = query_oai_pmh_endpoint(oai_pmh_endpoint, do_harvesting)
+    status, adminAddress, set_names, total_count, set_count, datacite_totals, set_datacite_counts = query_oai_pmh_endpoint(oai_pmh_endpoint, do_harvesting)
 
     oai_pmh['status'] = status
 
@@ -487,6 +719,13 @@ facilities.each do |facility|
       oai_pmh['items'] = items
       items['count'] = total_count
 
+      if datacite_totals && !datacite_totals.empty?
+        items['resource_types'] = {}
+        datacite_totals.sort_by { |k, _| k }.each do |(key,value)|
+          items['resource_types'][key] = value
+        end
+      end
+
       if !set_count.empty?
         sets = {}
         items['sets'] = sets
@@ -504,6 +743,14 @@ facilities.each do |facility|
             set['name'] = set_names[set_id]
           end
           set['count'] = count
+
+          if set_datacite_counts && set_datacite_counts.key?(set_id)
+            set['resource_types'] = {}
+
+            set_datacite_counts[set_id].sort_by { |k, _| k }.each do |(key,value)|
+              set['resource_types'][key] = value
+            end
+          end
         end
       end
     end