Skip to content
Snippets Groups Projects
Commit aee25f9f authored by Paul Millar's avatar Paul Millar
Browse files

oai-pmh: add custom HTTParty client

Motivation:

Currently, there is a lot of HTTP/networking code mixed in with
application code.  It would be good to separate these concerns,
particularly as we want to introduce connection reuse.

Modification:

Introduce new class that uses HTTParty as a mixin.  Update code to take
advantage of this new class.

Result:

Reduced code duplication; now easier to add new features.
parent e3f28fde
No related branches found
No related tags found
No related merge requests found
......@@ -5,6 +5,118 @@ require 'yaml'
require 'date'
require 'erb'
require 'nokogiri'
# Error raised when an OAI-PMH request fails or returns an unusable response.
#
# The exception message has the form "<type>: <msg>".
class FailedOaiPmhRequest < StandardError
  # @param type [#to_s] the kind of request that failed (e.g. "Identify")
  # @param msg [#to_s] description of what went wrong
  def initialize(type, msg)
    # Interpolation (rather than String#+) so that non-String arguments such
    # as a Symbol or an Integer status code do not raise TypeError while the
    # error itself is being constructed.
    super("#{type}: #{msg}")
  end
end
# Small OAI-PMH client built on the HTTParty mixin.
#
# Each public verb method issues one GET request relative to the endpoint
# passed to the constructor and returns the response body parsed with
# Nokogiri::XML. Failures are reported as FailedOaiPmhRequest.
class OaiPmhClient
  include HTTParty

  # The hash is passed without braces on purpose: `headers {"k" => "v"}` is
  # parsed by Ruby as a call with a block, and the `=>` inside it is a syntax
  # error.
  headers 'User-Agent' => 'LEAPS-WG3-client/0.1'

  # @param oai_pmh_endpoint [String] URL of the OAI-PMH endpoint
  # @param from_id [Object, nil] currently unused (see TODO below)
  #
  # NOTE(review): base_uri is configured on the class, not on the instance,
  # so creating a second client presumably redirects every existing instance
  # to the new endpoint -- confirm this is acceptable for all callers.
  def initialize(oai_pmh_endpoint, from_id = nil)
    self.class.base_uri oai_pmh_endpoint
    # TODO add from_id support as 'From' request header
  end

  # Issues an Identify request.
  #
  # @return [Object] the result of Nokogiri::XML on the response body
  # @raise [FailedOaiPmhRequest] see #handle_response
  def identify
    simple_request('Identify')
  end

  # Issues a ListSets request.
  #
  # @return [Object] the result of Nokogiri::XML on the response body
  # @raise [FailedOaiPmhRequest] see #handle_response
  def list_sets
    simple_request('ListSets')
  end

  # Issues a ListMetadataFormats request.
  #
  # @return [Object] the result of Nokogiri::XML on the response body
  # @raise [FailedOaiPmhRequest] see #handle_response
  def list_metadata_formats
    simple_request('ListMetadataFormats')
  end

  # Issues a ListIdentifiers request, retrying on server (5xx) and network
  # errors. Exactly one of the two keyword arguments must be given.
  #
  # @param metadata_prefix [String, nil] value for the metadataPrefix argument
  # @param resumption_token [String, nil] value for the resumptionToken
  #   argument
  # @return [Object] the result of Nokogiri::XML on the response body
  # @raise [ArgumentError] if neither or both arguments are supplied
  # @raise [FailedOaiPmhRequest] on a 4xx status, after
  #   MAX_HTTP_GET_RETRIES failed attempts, or if the response is unusable
  def list_identifiers(metadata_prefix: nil, resumption_token: nil)
    raise ArgumentError, 'Need to specify either metadata_prefix or resumption_token' unless metadata_prefix || resumption_token
    raise ArgumentError, 'Must specify one of metadata_prefix or resumption_token' if metadata_prefix && resumption_token

    # Argument values must be URL-encoded: resumption tokens in particular
    # are opaque strings that may contain reserved characters.
    arg = if metadata_prefix
            "metadataPrefix=#{ERB::Util.url_encode(metadata_prefix)}"
          else
            "resumptionToken=#{ERB::Util.url_encode(resumption_token)}"
          end

    # Whether the server has already responded successfully in the past.
    # Using the presence of the resumptionToken to determine this is,
    # perhaps, questionable.
    server_has_responded = !resumption_token.nil?

    response = get_with_retries('ListIdentifiers', "?verb=ListIdentifiers&#{arg}",
                                server_has_responded: server_has_responded)
    handle_response('ListIdentifiers', response)
  end

  # Validates an HTTP response and parses its body as XML.
  #
  # @param request_type [String] label used as the prefix of error messages
  # @param response [Object] the object returned by the HTTParty request
  # @return [Object] the result of Nokogiri::XML on the response body
  # @raise [FailedOaiPmhRequest] if the response is not successful, has an
  #   empty body, or parsing raises
  def handle_response(request_type, response)
    raise FailedOaiPmhRequest.new(request_type, "HTTP status code #{response.code}") unless response.success?

    body = response.body
    raise FailedOaiPmhRequest.new(request_type, 'empty entity') if body.nil? || body.empty?

    begin
      # NOTE(review): Nokogiri's default parse options are believed to be
      # lenient, so malformed XML may not reach the rescue below -- confirm.
      Nokogiri::XML(body)
    rescue StandardError => e
      raise FailedOaiPmhRequest.new(request_type, "XML parsing failed: #{e.message}")
    end
  end

  private

  # Performs a single, non-retried GET for an argument-less verb and hands
  # the response to #handle_response.
  def simple_request(verb)
    handle_response(verb, self.class.get("?verb=#{verb}"))
  end

  # Performs a GET, retrying up to MAX_HTTP_GET_RETRIES times on 5xx
  # responses and on HTTParty/socket/timeout errors. A 4xx response is not
  # retried. The duration of each completed GET is fed to $stats.
  def get_with_retries(request_type, path, server_has_responded:)
    attempts = 0
    begin
      attempts += 1
      started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      response = self.class.get(path)
      finished = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      $stats.accept(finished - started)

      # 5xx status code indicates a server error. Retrying might help, so
      # raise an error class that the rescue clause below handles.
      raise HTTParty::Error, "Received status code #{response.code}" if response.code >= 500

      # 4xx status code indicates a client error. Retrying with the same
      # request will not help, so this error bypasses the rescue clause.
      raise FailedOaiPmhRequest.new(request_type, "Received status code #{response.code}") if response.code >= 400

      response
    rescue HTTParty::Error, SocketError, Timeout::Error => e
      # FIXME output needs to take into account current whether to
      # insert a new-line. As a quick hack, use the presence of
      # resumptionToken in the URL to indicate a newline is needed
      # (this works most of the time, but not always)
      print "\n" if server_has_responded && attempts == 1
      print_with_time("Attempt #{attempts} of #{MAX_HTTP_GET_RETRIES} failed: #{e.message}")

      # REVISIT include the URL in the message?
      raise FailedOaiPmhRequest.new(request_type, 'Too many GET requests failed') if attempts >= MAX_HTTP_GET_RETRIES

      delay = FAILED_REQUEST_INITIAL
      delay += attempts * FAILED_REQUEST_BACKOFF if server_has_responded
      sleep(delay)
      retry
    end
  end
end
# Maximum number of GET attempts made for a ListIdentifiers request before
# giving up with FailedOaiPmhRequest.
MAX_HTTP_GET_RETRIES = 10
# Base delay, passed to sleep (so in seconds), applied after a failed
# ListIdentifiers attempt before the next one.
FAILED_REQUEST_INITIAL = 60
......@@ -108,18 +220,8 @@ end
def check_oai_pmh_endpoint(endpoint_url)
queryIdentify_url = endpoint_url + "?verb=Identify"
begin
response = HTTParty.head(queryIdentify_url)
if !response.success?
raise StandardError.new "HTTP status code #{response.code}"
end
response = HTTParty.get(queryIdentify_url)
if response.body.nil? || response.body.empty?
raise StandardError.new "empty entity"
end
xml_response = $client.identify()
xml_response = Nokogiri::XML(response.body)
oai_pmh_tag = xml_response.at_xpath('//*[name()="OAI-PMH"]')
if !oai_pmh_tag
raise StandardError.new "no OAI-PMH tag."
......@@ -134,7 +236,7 @@ def check_oai_pmh_endpoint(endpoint_url)
return "Active", addresses
rescue StandardError => e
print_with_time("Error: Identify request failed: #{e.message}")
print_with_time("Error: #{e.message}")
return "Error", []
end
end
......@@ -142,15 +244,8 @@ end
def list_sets(endpoint)
sets_info = {}
querySets_url = endpoint + "?verb=ListSets"
begin
response = HTTParty.get(querySets_url)
if response.body.nil? || response.body.empty?
raise StandardError.new "Identify response is empty."
end
xml_response = Nokogiri::XML(response.body)
xml_response = $client.list_sets
sets = xml_response.xpath('//xmlns:set')
sets.each do |set|
......@@ -178,14 +273,7 @@ end
def metadata_prefix_of(endpoint, namespace)
listMetadataFormats_url = endpoint + "?verb=ListMetadataFormats"
response = HTTParty.get(listMetadataFormats_url) # raises StandardError
if response.body.nil? || response.body.empty?
raise StandardError.new "ListMetadataFormats response is empty."
end
xml_response = Nokogiri::XML(response.body)
xml_response = $client.list_metadata_formats
format_description = xml_response.at_xpath("//xmlns:metadataFormat[xmlns:metadataNamespace=\"#{namespace}\"]/xmlns:metadataPrefix/text()")
if !format_description
......@@ -196,68 +284,13 @@ def metadata_prefix_of(endpoint, namespace)
end
def list_identifiers(url)
resumptionToken = nil
total = 0
set_counts = {}
# Whether the server has already responded successfully in the past. Using
# the presence of the resumptionToken to determine this is, perhaps,
# questionable.
hasServerResponded = url.include?('resumptionToken=')
response = nil
attempts = 0
while !response
begin
attempts += 1
starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
response = HTTParty.get(url)
ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
$stats.accept(ending - starting)
if response.code >= 500
# 5xx status code indicates a server error. Retrying might
# help.
raise HTTParty::Error.new "Received status code #{response.code}"
elsif response.code >= 400
# 4xx status code indicates a client error. Retrying with the
# same request will not help.
raise StandardError.new "Received status code #{response.code}"
end
rescue HTTParty::Error, SocketError, Timeout::Error => e
# FIXME output needs to take into account current whether to
# insert a new-line. As a quick hack, use the presence of
# resumptionToken in the URL to indicate a newline is needed (this
# works most of the time, but not always)
if hasServerResponded && attempts == 1
print "\n"
end
print_with_time("Attempt #{attempts} of #{MAX_HTTP_GET_RETRIES} failed: #{e.message}")
if attempts >= MAX_HTTP_GET_RETRIES
raise StandardError.new "Too many GET requests failed for #{url}"
end
delay = FAILED_REQUEST_INITIAL
if hasServerResponded
delay += attempts * FAILED_REQUEST_BACKOFF
end
def list_identifiers(metadata_prefix: nil, resumption_token: nil)
sleep(delay);
response = nil
end
end
if response.body.nil? || response.body.empty?
raise StandardError.new "response is empty."
end
xml_response = Nokogiri::XML(response.body)
xml_response = $client.list_identifiers(metadata_prefix: metadata_prefix, resumption_token: resumption_token)
total = 0
set_counts = {}
resumptionToken = nil
# NB At 2024-12-20, ILL and PSI endpoints returns malformed responses with
# the expected "header" elements contained within rogue "record" elements;
......@@ -302,8 +335,7 @@ end
def count_identifiers(endpoint, prefix)
overall_stats = SeriesStatistics.new
$stats = SeriesStatistics.new
url = endpoint + "?verb=ListIdentifiers&metadataPrefix=" + prefix
overall_total, overall_set_counts, resumptionToken = list_identifiers(url)
overall_total, overall_set_counts, resumptionToken = list_identifiers(metadata_prefix: prefix)
i = 0
while resumptionToken
......@@ -318,8 +350,7 @@ def count_identifiers(endpoint, prefix)
end
i += 1
print "."
url = endpoint + "?verb=ListIdentifiers&resumptionToken=" + resumptionToken
delta_total, delta_set_counts, resumptionToken = list_identifiers(url)
delta_total, delta_set_counts, resumptionToken = list_identifiers(resumption_token: resumptionToken)
overall_total += delta_total
delta_set_counts.each do |set_id, delta_count|
......@@ -385,6 +416,8 @@ facilities.each do |facility|
oai_pmh_endpoint = oai_pmh['link']
next unless oai_pmh_endpoint
$client = OaiPmhClient.new(oai_pmh_endpoint)
expected_status = oai_pmh['status']
name = facility['short-name']
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment