From aee25f9fd4369ccb78fea906fa65b37149f32d73 Mon Sep 17 00:00:00 2001
From: Paul Millar <paul.millar@desy.de>
Date: Thu, 2 Jan 2025 11:53:21 +0100
Subject: [PATCH] oai-pmh: add custom HTTParty client

Motivation:

Currently, there is a lot of HTTP/networking code mixed in with
application code. It would be good to separate these concerns,
particularly as we want to introduce connection reuse.

Modification:

Introduce new class that uses HTTParty as a mixin.

Update code to take advantage of this new class.

Percent-encode the argument values of ListIdentifiers requests.

Result:

Reduced code duplication; now easier to add new features.
---
Reviewer note: this copy of the patch was rebuilt from a version whose
line breaks had been lost. The indentation and the position of blank
context lines are a best-effort reconstruction; if a hunk fails to
apply, check those first.

 scripts/update_oai-pmh.rb | 264 +++++++++++++++++++++++++-------------
 1 file changed, 174 insertions(+), 90 deletions(-)

diff --git a/scripts/update_oai-pmh.rb b/scripts/update_oai-pmh.rb
index 0786cab..c57fd2f 100644
--- a/scripts/update_oai-pmh.rb
+++ b/scripts/update_oai-pmh.rb
@@ -5,6 +5,169 @@ require 'yaml'
 require 'date'
 require 'nokogiri'
+require 'uri'
 
+# Raised when an OAI-PMH request cannot be completed: the server replied
+# with a non-success HTTP status or an empty entity, the response could
+# not be parsed, or too many attempts failed.
+class FailedOaiPmhRequest < StandardError
+    # type:: the OAI-PMH verb of the failed request (e.g. "ListSets").
+    # msg::  a description of what went wrong.
+    def initialize(type, msg)
+        super("#{type}: #{msg}")
+    end
+end
+
+# A small OAI-PMH client built on the HTTParty mixin.
+#
+# Each public method sends one OAI-PMH verb to the endpoint passed to the
+# constructor and returns the response body parsed by Nokogiri.  Problems
+# with the response are reported by raising FailedOaiPmhRequest.
+#
+# NOTE: base_uri is an HTTParty class-level setting, so the endpoint is
+# shared by all instances; creating a second client redirects the first.
+class OaiPmhClient
+    include HTTParty
+
+    # The parentheses matter: "headers {...}" would be parsed as a call
+    # with a block rather than a call with a Hash argument.
+    headers("User-Agent" => "LEAPS-WG3-client/0.1")
+
+    # oai_pmh_endpoint:: URL of the OAI-PMH endpoint to query.
+    # from_id::          not used yet; see the TODO below.
+    def initialize(oai_pmh_endpoint, from_id=nil)
+        self.class.base_uri oai_pmh_endpoint
+        # TODO add from_id support as 'From' request header
+    end
+
+    # Send an Identify request and return the parsed XML document.
+    def identify()
+        simple_request("Identify")
+    end
+
+    # Send a ListSets request and return the parsed XML document.
+    def list_sets()
+        simple_request("ListSets")
+    end
+
+    # Send a ListMetadataFormats request and return the parsed XML document.
+    def list_metadata_formats()
+        simple_request("ListMetadataFormats")
+    end
+
+    # Send a ListIdentifiers request, retrying failures that may be
+    # transient.
+    #
+    # Exactly one keyword argument must be supplied:
+    # metadata_prefix::  start a new listing using this metadata format.
+    # resumption_token:: continue an earlier listing from this token.
+    #
+    # Network errors and 5xx responses are retried, with a delay, for up
+    # to MAX_HTTP_GET_RETRIES attempts; a 4xx response fails at once.  The
+    # duration of every completed GET is passed to $stats.
+    #
+    # Returns the parsed XML document.  Raises ArgumentError for a bad
+    # argument combination and FailedOaiPmhRequest if the request fails.
+    def list_identifiers(metadata_prefix: nil, resumption_token: nil)
+        raise ArgumentError, 'Need to specify either metadata_prefix or resumption_token' unless metadata_prefix || resumption_token
+        raise ArgumentError, 'Must specify one of metadata_prefix or resumption_token' if metadata_prefix && resumption_token
+
+        # Percent-encode the value: a resumption token is opaque and may
+        # contain characters that are not allowed unescaped in a URL.
+        if metadata_prefix
+            arg = "metadataPrefix=" + URI.encode_www_form_component(metadata_prefix)
+        else
+            arg = "resumptionToken=" + URI.encode_www_form_component(resumption_token)
+        end
+
+        # Whether the server has already responded successfully in the past.
+        # Using the presence of the resumptionToken to determine this is,
+        # perhaps, questionable.
+        has_server_responded = resumption_token
+
+        response = nil
+        attempts = 0
+        while !response
+            begin
+                attempts += 1
+
+                starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+                response = self.class.get("?verb=ListIdentifiers&" + arg)
+                ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+                $stats.accept(ending - starting)
+
+                if response.code >= 500
+                    # 5xx status code indicates a server error. Retrying might
+                    # help.
+                    raise HTTParty::Error.new "Received status code #{response.code}"
+                elsif response.code >= 400
+                    # 4xx status code indicates a client error. Retrying with
+                    # the same request will not help.
+                    raise FailedOaiPmhRequest.new("ListIdentifiers", "Received status code #{response.code}")
+                end
+            rescue HTTParty::Error, SocketError, Timeout::Error => e
+                # FIXME output needs to take into account current whether to
+                # insert a new-line. As a quick hack, use the presence of
+                # resumptionToken in the URL to indicate a newline is needed
+                # (this works most of the time, but not always)
+                if has_server_responded && attempts == 1
+                    print "\n"
+                end
+                print_with_time("Attempt #{attempts} of #{MAX_HTTP_GET_RETRIES} failed: #{e.message}")
+
+                if attempts >= MAX_HTTP_GET_RETRIES
+                    # REVISIT include the URL in the message?
+                    raise FailedOaiPmhRequest.new("ListIdentifiers", "Too many GET requests failed")
+                end
+
+                delay = FAILED_REQUEST_INITIAL
+                if has_server_responded
+                    delay += attempts * FAILED_REQUEST_BACKOFF
+                end
+
+                sleep(delay)
+                # Discard any 5xx response so that the loop tries again.
+                response = nil
+            end
+        end
+
+        handle_response("ListIdentifiers", response)
+    end
+
+    # Check +response+ and parse its body as XML.
+    #
+    # request_type:: the OAI-PMH verb, used to prefix error messages.
+    # response::     the HTTParty response to examine.
+    #
+    # Returns the parsed document.  Raises FailedOaiPmhRequest if the
+    # status is not a success, the body is empty, or the parser raises.
+    def handle_response(request_type, response)
+        if !response.success?
+            raise FailedOaiPmhRequest.new(request_type, "HTTP status code #{response.code}")
+        end
+
+        if response.body.nil? || response.body.empty?
+            raise FailedOaiPmhRequest.new(request_type, "empty entity")
+        end
+
+        # NOTE(review): Nokogiri's default options appear to recover from
+        # malformed XML rather than raise, so this rescue may rarely fire;
+        # confirm whether strict parsing is wanted.
+        begin
+            Nokogiri::XML(response.body)
+        rescue StandardError => e
+            raise FailedOaiPmhRequest.new(request_type, "XML parsing failed: #{e.message}")
+        end
+    end
+
+    private
+
+    # Send a request that takes no arguments besides the verb (no retry)
+    # and return the parsed XML document.
+    def simple_request(verb)
+        handle_response(verb, self.class.get("?verb=" + verb))
+    end
+end
+
 MAX_HTTP_GET_RETRIES = 10
 FAILED_REQUEST_INITIAL = 60
 
@@ -108,18 +271,8 @@ end
 def check_oai_pmh_endpoint(endpoint_url)
     queryIdentify_url = endpoint_url + "?verb=Identify"
     begin
-        response = HTTParty.head(queryIdentify_url)
-
-        if !response.success?
-            raise StandardError.new "HTTP status code #{response.code}"
-        end
-
-        response = HTTParty.get(queryIdentify_url)
-        if response.body.nil? || response.body.empty?
-            raise StandardError.new "empty entity"
-        end
+        xml_response = $client.identify()
 
-        xml_response = Nokogiri::XML(response.body)
         oai_pmh_tag = xml_response.at_xpath('//*[name()="OAI-PMH"]')
         if !oai_pmh_tag
             raise StandardError.new "no OAI-PMH tag."
@@ -134,7 +287,7 @@ def check_oai_pmh_endpoint(endpoint_url)
 
         return "Active", addresses
     rescue StandardError => e
-        print_with_time("Error: Identify request failed: #{e.message}")
+        print_with_time("Error: #{e.message}")
         return "Error", []
     end
 end
@@ -142,15 +295,8 @@ end
 
 def list_sets(endpoint)
     sets_info = {}
-    querySets_url = endpoint + "?verb=ListSets"
     begin
-        response = HTTParty.get(querySets_url)
-
-        if response.body.nil? || response.body.empty?
-            raise StandardError.new "Identify response is empty."
-        end
-
-        xml_response = Nokogiri::XML(response.body)
+        xml_response = $client.list_sets
         sets = xml_response.xpath('//xmlns:set')
 
         sets.each do |set|
@@ -178,14 +324,7 @@ end
 
 
 def metadata_prefix_of(endpoint, namespace)
-    listMetadataFormats_url = endpoint + "?verb=ListMetadataFormats"
-
-    response = HTTParty.get(listMetadataFormats_url) # raises StandardError
-    if response.body.nil? || response.body.empty?
-        raise StandardError.new "ListMetadataFormats response is empty."
-    end
-
-    xml_response = Nokogiri::XML(response.body)
+    xml_response = $client.list_metadata_formats
 
     format_description = xml_response.at_xpath("//xmlns:metadataFormat[xmlns:metadataNamespace=\"#{namespace}\"]/xmlns:metadataPrefix/text()")
     if !format_description
@@ -196,68 +335,13 @@ def metadata_prefix_of(endpoint, namespace)
 end
 
 
-def list_identifiers(url)
-    resumptionToken = nil
-    total = 0
-    set_counts = {}
-
-    # Whether the server has already responded successfully in the past. Using
-    # the presence of the resumptionToken to determine this is, perhaps,
-    # questionable.
-    hasServerResponded = url.include?('resumptionToken=')
-
-    response = nil
-    attempts = 0
-    while !response
-        begin
-            attempts += 1
-
-            starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-            response = HTTParty.get(url)
-            ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-            $stats.accept(ending - starting)
-
-            if response.code >= 500
-                # 5xx status code indicates a server error. Retrying might
-                # help.
-                raise HTTParty::Error.new "Received status code #{response.code}"
-            elsif response.code >= 400
-                # 4xx status code indicates a client error. Retrying with the
-                # same request will not help.
-                raise StandardError.new "Received status code #{response.code}"
-            end
-        rescue HTTParty::Error, SocketError, Timeout::Error => e
-            # FIXME output needs to take into account current whether to
-            # insert a new-line. As a quick hack, use the presence of
-            # resumptionToken in the URL to indicate a newline is needed (this
-            # works most of the time, but not always)
-            if hasServerResponded && attempts == 1
-                print "\n"
-            end
-            print_with_time("Attempt #{attempts} of #{MAX_HTTP_GET_RETRIES} failed: #{e.message}")
-
-            if attempts >= MAX_HTTP_GET_RETRIES
-                raise StandardError.new "Too many GET requests failed for #{url}"
-            end
-
-            delay = FAILED_REQUEST_INITIAL
-            if hasServerResponded
-                delay += attempts * FAILED_REQUEST_BACKOFF
-            end
+def list_identifiers(metadata_prefix: nil, resumption_token: nil)
 
-            sleep(delay);
-            response = nil
-        end
-    end
-
-    if response.body.nil? || response.body.empty?
-        raise StandardError.new "response is empty."
-    end
-
-    xml_response = Nokogiri::XML(response.body)
+    xml_response = $client.list_identifiers(metadata_prefix: metadata_prefix, resumption_token: resumption_token)
 
     total = 0
     set_counts = {}
+    resumptionToken = nil
 
     # NB At 2024-12-20, ILL and PSI endpoints returns malformed responses with
     # the expected "header" elements contained within rogue "record" elements;
@@ -302,8 +386,7 @@ end
 def count_identifiers(endpoint, prefix)
     overall_stats = SeriesStatistics.new
     $stats = SeriesStatistics.new
-    url = endpoint + "?verb=ListIdentifiers&metadataPrefix=" + prefix
-    overall_total, overall_set_counts, resumptionToken = list_identifiers(url)
+    overall_total, overall_set_counts, resumptionToken = list_identifiers(metadata_prefix: prefix)
 
     i = 0
     while resumptionToken
@@ -318,8 +401,7 @@ def count_identifiers(endpoint, prefix)
         end
         i += 1
         print "."
-        url = endpoint + "?verb=ListIdentifiers&resumptionToken=" + resumptionToken
-        delta_total, delta_set_counts, resumptionToken = list_identifiers(url)
+        delta_total, delta_set_counts, resumptionToken = list_identifiers(resumption_token: resumptionToken)
         overall_total += delta_total
 
         delta_set_counts.each do |set_id, delta_count|
@@ -385,6 +467,8 @@ facilities.each do |facility|
     oai_pmh_endpoint = oai_pmh['link']
     next unless oai_pmh_endpoint
 
+    $client = OaiPmhClient.new(oai_pmh_endpoint)
+
     expected_status = oai_pmh['status']
     name = facility['short-name']
 
-- 
GitLab