Skip to content
Snippets Groups Projects
Commit 1e3b1751 authored by Paul Millar's avatar Paul Millar
Browse files

Enhance the OAI-PMH update script to count datasets

parent e05f5c15
No related branches found
No related tags found
1 merge request!55Enhance the OAI-PMH update script to count datasets
Pipeline #474897 passed with warnings
......@@ -28,7 +28,6 @@ def check_oai_pmh_endpoint(endpoint_url)
return "Error"
end
puts " Endpoint is active."
return "Active"
rescue StandardError => e
puts " Error: Identify request failed: #{e.message}"
......@@ -36,6 +35,165 @@ def check_oai_pmh_endpoint(endpoint_url)
end
end
# Fetch the sets advertised by an OAI-PMH endpoint using the ListSets verb.
#
# Failures (HTTP errors, an empty body, XML/XPath errors) are reported on
# stdout rather than raised; whatever was gathered before the failure
# (normally nothing) is returned.
#
# @param endpoint [String] base URL of the endpoint; "?verb=ListSets" is
#   appended to it
# @return [Hash{String => String}] setSpec => setName, with the setSpec
#   standing in as the name for sets that have no setName element
def list_sets(endpoint)
  sets_info = {}
  list_sets_url = endpoint + "?verb=ListSets"
  begin
    response = HTTParty.get(list_sets_url)
    if response.body.nil? || response.body.empty?
      # Name the request that actually failed (this used to say "Identify").
      raise StandardError, "ListSets response is empty."
    end

    xml_response = Nokogiri::XML(response.body)
    xml_response.xpath('//xmlns:set').each do |set|
      set_name = nil
      set_spec = nil
      set.elements.each do |child|
        case child.name
        when 'setName' then set_name = child.content
        when 'setSpec' then set_spec = child.content
        # TODO: parse 'setDescription' to extract DublinCore dc:description
        end
      end
      # The result is keyed by setSpec, so a set without one is dropped.
      sets_info[set_spec] = set_name || set_spec if set_spec
    end
  rescue StandardError => e
    puts " Error: ListSets request failed: #{e.message}"
  end
  sets_info
end
# Look up the metadataPrefix an endpoint uses for a given metadata namespace.
#
# Sends a ListMetadataFormats request and picks the metadataFormat whose
# metadataNamespace equals +namespace+.
#
# @param endpoint [String] base URL of the OAI-PMH endpoint
# @param namespace [String] metadata namespace URI to search for
# @return [String] text of the matching metadataPrefix element
# @raise [StandardError] if the response body is nil or empty, or if no
#   format with that namespace is listed; errors raised by the HTTP request
#   or the XML lookup propagate as well
def metadata_prefix_of(endpoint, namespace)
  response = HTTParty.get(endpoint + "?verb=ListMetadataFormats") # raises StandardError
  body = response.body
  raise StandardError, "ListMetadataFormats response is empty." if body.nil? || body.empty?

  prefix_xpath = "//xmlns:metadataFormat[xmlns:metadataNamespace=\"#{namespace}\"]/xmlns:metadataPrefix/text()"
  prefix_node = Nokogiri::XML(body).at_xpath(prefix_xpath)
  raise StandardError, "ListMetadataFormats missing #{namespace}" unless prefix_node

  prefix_node.text
end
# Fetch one page of a ListIdentifiers response and tally its record headers.
#
# @param url [String] complete request URL (initial request or resumption)
# @return [Array] three values: the number of headers on this page whose
#   status is not 'deleted', a Hash of setSpec => count for those headers,
#   and the resumptionToken text (nil when the page carries none)
# @raise [StandardError] if the response body is nil or empty; errors raised
#   by the HTTP request or the XML lookups propagate as well
def list_identifiers(url)
  response = HTTParty.get(url) # raises StandardError
  if response.body.nil? || response.body.empty?
    raise StandardError, "response is empty."
  end
  document = Nokogiri::XML(response.body)

  # NB: as of 2024-12-20 the ILL and PSI endpoints return malformed responses
  # in which the expected "header" elements are wrapped in rogue "record"
  # elements, for example:
  #
  #       [...]
  #       <record>
  #         <header>
  #           <identifier>10.5291/ILL-DATA.TEST-3172</identifier>
  #           <datestamp>2021-09-23T12:08:08Z</datestamp>
  #         </header>
  #       </record>
  #     </ListIdentifiers>
  #   </OAI-PMH>
  #
  # The XPath below selects only "header" elements that are direct children
  # of ListIdentifiers, so headers wrapped like this are not selected.
  page_headers = document.xpath("/xmlns:OAI-PMH/xmlns:ListIdentifiers/xmlns:header")

  live_count = 0
  per_set = {}
  page_headers&.each do |header|
    next if header.attr('status') == 'deleted'

    live_count += 1
    # A header may carry several setSpec children; each one is counted.
    header.children.each do |node|
      next unless node.element? && node.name == "setSpec"

      spec = node.content
      per_set[spec] = per_set.fetch(spec, 0) + 1
    end
  end

  token_node = document.at_xpath('/xmlns:OAI-PMH/xmlns:ListIdentifiers/xmlns:resumptionToken/text()')
  return live_count, per_set, token_node&.text
end
# URI supplies the percent-encoding used for resumption tokens below;
# requiring it here keeps this block self-contained (require is idempotent).
require 'uri'

# Count the non-deleted records an endpoint lists for one metadata prefix,
# following resumption tokens until the list is exhausted.
#
# Progress is printed to stdout: one "." per follow-up request, with a
# timestamped running total after every 40 requests.
#
# @param endpoint [String] base URL of the OAI-PMH endpoint
# @param prefix [String] metadataPrefix to request
# @return [Array] two values: the overall record count and a Hash of
#   setSpec => count accumulated over all pages
# @raise [StandardError] whatever list_identifiers raises for any page
def count_identifiers(endpoint, prefix)
  url = endpoint + "?verb=ListIdentifiers&metadataPrefix=" + prefix
  overall_total, overall_set_counts, resumption_token = list_identifiers(url)
  follow_ups = 0
  while resumption_token
    if (follow_ups % 40).zero?
      if follow_ups.positive?
        print "\n #{Time.now.strftime("%d/%m/%Y %H:%M:%S")} harvested #{overall_total} records so far.\n"
      end
      print " Harvesting: "
    end
    follow_ups += 1
    print "."
    # Tokens are opaque strings chosen by the server and may contain URL
    # special characters ("+", "/", "&", "=", ...), so they must be
    # percent-encoded before going into the query string. A literal "+" has
    # already become "%2B", so any "+" left over stands for a space.
    encoded_token = URI.encode_www_form_component(resumption_token).gsub("+", "%20")
    url = endpoint + "?verb=ListIdentifiers&resumptionToken=" + encoded_token
    delta_total, delta_set_counts, resumption_token = list_identifiers(url)
    overall_total += delta_total
    overall_set_counts.merge!(delta_set_counts) { |_set_id, so_far, delta| so_far + delta }
  end
  print "\n" if follow_ups.positive?
  return overall_total, overall_set_counts
end
# Probe an OAI-PMH endpoint and, when it responds, collect its set names and
# record counts.
#
# Failures while looking up the metadata prefix or counting identifiers are
# reported on stdout and leave the counts at 0 / {}. The return value
# therefore always has the same shape, and callers may call Hash methods on
# the last element without checking for nil.
#
# @param endpoint [String] base URL of the OAI-PMH endpoint
# @return [Array] four values: the status from check_oai_pmh_endpoint, a Hash
#   of setSpec => setName from list_sets, the total record count, and a Hash
#   of setSpec => record count
def query_oai_pmh_endpoint(endpoint)
  status = check_oai_pmh_endpoint(endpoint)
  return status, {}, 0, {} if status == "Error"

  # Defaults mirror the "Error" return above; they survive if a step below fails.
  total_count = 0
  set_counts = {}

  begin
    dc_prefix = metadata_prefix_of(endpoint, 'http://www.openarchives.org/OAI/2.0/oai_dc/')
  rescue StandardError => e
    # dc_prefix stays nil, which skips the count below.
    puts " Error: ListMetadataFormats failed #{e.message}"
  end

  set_names = list_sets(endpoint)

  if dc_prefix
    begin
      total_count, set_counts = count_identifiers(endpoint, dc_prefix)
    rescue StandardError => e
      puts " Error: ListIdentifiers failed #{e.message}"
    end
  end

  return status, set_names, total_count, set_counts
end
# Script state shared by the facility loop below.

# Starts true; the loop sets it to false when a facility's recorded OAI-PMH
# status changes.
up_to_date = true
# YAML file holding the facility records: read here, rewritten after the loop.
file_path = '_data/facilities.yml'
# Date is explicitly permitted so that safe_load accepts Date values in the
# file (presumably the 'last-check' entries this script writes — confirm).
facilities = YAML.safe_load(File.read(file_path), permitted_classes: [Date])
......@@ -54,14 +212,34 @@ facilities.each do |facility|
name = facility['short-name']
puts "Checking OAI-PMH endpoint for #{name}: #{oai_pmh_endpoint}"
status = check_oai_pmh_endpoint(oai_pmh_endpoint)
oai_pmh['last-check'] = Date.today #Time.now.strftime("%Y-%m-%d").to_date
next if expected_status == status
status, set_names, total_count, set_count = query_oai_pmh_endpoint(oai_pmh_endpoint)
oai_pmh['status'] = status
puts " #{facility['short-name']}: was #{expected_status}, now #{status}."
up_to_date = false
oai_pmh.delete('datasets')
if status == "Active"
datasets = {}
oai_pmh['datasets'] = datasets
datasets['count'] = total_count
if !set_count.empty?
sets = {}
datasets['sets'] = sets
set_count.sort_by { |k, _| k }.each do |set_id, count|
set = {}
sets[set_id] = set
if set_names.key?(set_id)
set['name'] = set_names[set_id]
end
set['count'] = count
end
end
end
oai_pmh['last-check'] = Date.today
end
File.open(file_path, 'w') {|f| f.write facilities.to_yaml }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment