From aee25f9fd4369ccb78fea906fa65b37149f32d73 Mon Sep 17 00:00:00 2001
From: Paul Millar <paul.millar@desy.de>
Date: Thu, 2 Jan 2025 11:53:21 +0100
Subject: [PATCH] oai-pmh: add custom HTTParty client

Motivation:

Currently, there is a lot of HTTP/networking code mixed in with
application code.  It would be good to separate these concerns,
particularly as we want to introduce connection reuse.

Modification:

Introduce a new class that uses HTTParty as a mixin.  Update the code to
take advantage of this new class.

Result:

Reduced code duplication; now easier to add new features.
---
 scripts/update_oai-pmh.rb | 213 ++++++++++++++++++++++----------------
 1 file changed, 123 insertions(+), 90 deletions(-)

diff --git a/scripts/update_oai-pmh.rb b/scripts/update_oai-pmh.rb
index 0786cab..c57fd2f 100644
--- a/scripts/update_oai-pmh.rb
+++ b/scripts/update_oai-pmh.rb
@@ -5,6 +5,118 @@ require 'yaml'
 require 'date'
 require 'nokogiri'
 
+# Raised when an OAI-PMH request cannot be completed.  The message has the
+# form "<request type>: <detail>".
+class FailedOaiPmhRequest < StandardError
+    def initialize(type, msg)
+        super("#{type}: #{msg}")
+    end
+end
+
+# HTTParty-based client for an OAI-PMH endpoint.  Each request method returns
+# the response body parsed by Nokogiri::XML; unusable responses raise
+class OaiPmhClient
+    include HTTParty
+
+    # No braces around the hash: `headers {...}` is parsed as a method call
+    # with a block, which makes the `=>` a syntax error.
+    headers "User-Agent" => "LEAPS-WG3-client/0.1"
+
+    # NB base_uri is set on the class, not on this instance, so the endpoint
+    # given here applies to every OaiPmhClient object.
+    def initialize(oai_pmh_endpoint, from_id=nil)
+        self.class.base_uri oai_pmh_endpoint
+        # TODO add from_id support as 'From' request header
+    end
+
+    # Send an Identify request (one attempt) and return the parsed XML.
+    def identify
+        handle_response("Identify", self.class.get("?verb=Identify"))
+    end
+
+    # Send a ListSets request (one attempt) and return the parsed XML.
+    def list_sets
+        handle_response("ListSets", self.class.get("?verb=ListSets"))
+    end
+
+    # Send a ListMetadataFormats request (one attempt) and return the parsed XML.
+    def list_metadata_formats
+        handle_response("ListMetadataFormats", self.class.get("?verb=ListMetadataFormats"))
+    end
+
+    # Send a ListIdentifiers request, given exactly one of metadata_prefix or
+    # resumption_token, and return the parsed XML.  HTTParty/socket/timeout
+    # errors and 5xx responses are retried (up to MAX_HTTP_GET_RETRIES attempts,
+    # sleeping in between); a 4xx response raises FailedOaiPmhRequest at once.
+    def list_identifiers(metadata_prefix: nil, resumption_token: nil)
+        raise ArgumentError, 'Need to specify either metadata_prefix or resumption_token' unless metadata_prefix || resumption_token
+        raise ArgumentError, 'Must specify one of metadata_prefix or resumption_token' if metadata_prefix && resumption_token
+
+        # NOTE(review): the value is appended without URL-encoding -- confirm.
+        arg = metadata_prefix ? "metadataPrefix=" + metadata_prefix : "resumptionToken=" + resumption_token
+
+        # Whether the server has already responded successfully in the past.
+        # Using the presence of the resumptionToken to determine this is,
+        # perhaps, questionable.
+        server_responded = resumption_token
+
+        response = nil
+        attempts = 0
+        until response
+            begin
+                attempts += 1
+                starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+                response = self.class.get("?verb=ListIdentifiers&" + arg)
+                ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+                $stats.accept(ending - starting)
+
+                if response.code >= 500
+                    # 5xx status code indicates a server error.  Retrying might
+                    # help.
+                    raise HTTParty::Error, "Received status code #{response.code}"
+                elsif response.code >= 400
+                    # 4xx status code indicates a client error.  Retrying with
+                    # the same request will not help.
+                    raise FailedOaiPmhRequest.new("ListIdentifiers", "Received status code #{response.code}")
+                end
+            rescue HTTParty::Error, SocketError, Timeout::Error => e
+                # FIXME output needs to take into account whether to insert a
+                # new-line.  As a quick hack, use the presence of
+                # resumptionToken in the request to indicate a newline is
+                # needed (this works most of the time, but not always)
+                print "\n" if server_responded && attempts == 1
+                print_with_time("Attempt #{attempts} of #{MAX_HTTP_GET_RETRIES} failed: #{e.message}")
+
+                if attempts >= MAX_HTTP_GET_RETRIES
+                    # REVISIT include the URL in the message?
+                    raise FailedOaiPmhRequest.new("ListIdentifiers", "Too many GET requests failed")
+                end
+
+                delay = FAILED_REQUEST_INITIAL
+                delay += attempts * FAILED_REQUEST_BACKOFF if server_responded
+
+                sleep(delay)
+                response = nil
+            end
+        end
+
+        handle_response("ListIdentifiers", response)
+    end
+
+    # Check an HTTParty response and parse its body with Nokogiri::XML.  Raises
+    # FailedOaiPmhRequest on a non-success status, an empty body or a parse error.
+    def handle_response(request_type, response)
+        raise FailedOaiPmhRequest.new(request_type, "HTTP status code #{response.code}") unless response.success?
+        raise FailedOaiPmhRequest.new(request_type, "empty entity") if response.body.nil? || response.body.empty?
+
+        begin
+            Nokogiri::XML(response.body)
+        rescue StandardError => e
+            raise FailedOaiPmhRequest.new(request_type, "XML parsing failed: #{e.message}")
+        end
+    end
+end
+
 MAX_HTTP_GET_RETRIES = 10
 
 FAILED_REQUEST_INITIAL = 60
@@ -108,18 +220,8 @@ end
 def check_oai_pmh_endpoint(endpoint_url)
     queryIdentify_url = endpoint_url + "?verb=Identify"
     begin
-        response = HTTParty.head(queryIdentify_url)
-
-        if !response.success?
-            raise StandardError.new "HTTP status code #{response.code}"
-        end
-
-        response = HTTParty.get(queryIdentify_url)
-        if response.body.nil? || response.body.empty?
-            raise StandardError.new "empty entity"
-        end
+        xml_response = $client.identify()
 
-        xml_response = Nokogiri::XML(response.body)
         oai_pmh_tag = xml_response.at_xpath('//*[name()="OAI-PMH"]')
         if !oai_pmh_tag
             raise StandardError.new "no OAI-PMH tag."
@@ -134,7 +236,7 @@ def check_oai_pmh_endpoint(endpoint_url)
 
         return "Active", addresses
     rescue StandardError => e
-        print_with_time("Error: Identify request failed: #{e.message}")
+        print_with_time("Error: #{e.message}")
         return "Error", []
     end
 end
@@ -142,15 +244,8 @@ end
 
 def list_sets(endpoint)
     sets_info = {}
-    querySets_url = endpoint + "?verb=ListSets"
     begin
-        response = HTTParty.get(querySets_url)
-
-        if response.body.nil? || response.body.empty?
-            raise StandardError.new "Identify response is empty."
-        end
-
-        xml_response = Nokogiri::XML(response.body)
+        xml_response = $client.list_sets
 
         sets = xml_response.xpath('//xmlns:set')
         sets.each do |set|
@@ -178,14 +273,7 @@ end
 
 
 def metadata_prefix_of(endpoint, namespace)
-    listMetadataFormats_url = endpoint + "?verb=ListMetadataFormats"
-
-    response = HTTParty.get(listMetadataFormats_url) # raises StandardError
-    if response.body.nil? || response.body.empty?
-        raise StandardError.new "ListMetadataFormats response is empty."
-    end
-
-    xml_response = Nokogiri::XML(response.body)
+    xml_response = $client.list_metadata_formats
 
     format_description = xml_response.at_xpath("//xmlns:metadataFormat[xmlns:metadataNamespace=\"#{namespace}\"]/xmlns:metadataPrefix/text()")
     if !format_description
@@ -196,68 +284,13 @@ def metadata_prefix_of(endpoint, namespace)
 end
 
 
-def list_identifiers(url)
-    resumptionToken = nil
-    total = 0
-    set_counts = {}
-
-    # Whether the server has already responded successfully in the past.  Using
-    # the presence of the resumptionToken to determine this is, perhaps,
-    # questionable.
-    hasServerResponded = url.include?('resumptionToken=')
-
-    response = nil
-    attempts = 0
-    while !response
-        begin
-            attempts += 1
-
-            starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-            response = HTTParty.get(url)
-            ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-            $stats.accept(ending - starting)
-
-            if response.code >= 500
-                # 5xx status code indicates a server error.  Retrying might
-                # help.
-                raise HTTParty::Error.new "Received status code #{response.code}"
-            elsif response.code >= 400
-                # 4xx status code indicates a client error.  Retrying with the
-                # same request will not help.
-                raise StandardError.new "Received status code #{response.code}"
-            end
-        rescue HTTParty::Error, SocketError, Timeout::Error => e
-            # FIXME output needs to take into account current whether to
-            # insert a new-line.  As a quick hack, use the presence of
-            # resumptionToken in the URL to indicate a newline is needed (this
-            # works most of the time, but not always)
-            if hasServerResponded && attempts == 1
-                print "\n"
-            end
-            print_with_time("Attempt #{attempts} of #{MAX_HTTP_GET_RETRIES} failed: #{e.message}")
-
-            if attempts >= MAX_HTTP_GET_RETRIES
-                raise StandardError.new "Too many GET requests failed for #{url}"
-            end
-
-            delay = FAILED_REQUEST_INITIAL
-            if hasServerResponded
-                delay += attempts * FAILED_REQUEST_BACKOFF
-            end
+def list_identifiers(metadata_prefix: nil, resumption_token: nil)
 
-            sleep(delay);
-            response = nil
-        end
-    end
-
-    if response.body.nil? || response.body.empty?
-        raise StandardError.new "response is empty."
-    end
-
-    xml_response = Nokogiri::XML(response.body)
+    xml_response = $client.list_identifiers(metadata_prefix: metadata_prefix, resumption_token: resumption_token)
 
     total = 0
     set_counts = {}
+    resumptionToken = nil
 
     # NB At 2024-12-20, ILL and PSI endpoints returns malformed responses with
     # the expected "header" elements contained within rogue "record" elements;
@@ -302,8 +335,7 @@ end
 def count_identifiers(endpoint, prefix)
     overall_stats = SeriesStatistics.new
     $stats = SeriesStatistics.new
-    url = endpoint + "?verb=ListIdentifiers&metadataPrefix=" + prefix
-    overall_total, overall_set_counts, resumptionToken = list_identifiers(url)
+    overall_total, overall_set_counts, resumptionToken = list_identifiers(metadata_prefix: prefix)
 
     i = 0
     while resumptionToken
@@ -318,8 +350,7 @@ def count_identifiers(endpoint, prefix)
         end
         i += 1
         print "."
-        url = endpoint + "?verb=ListIdentifiers&resumptionToken=" + resumptionToken
-        delta_total, delta_set_counts, resumptionToken = list_identifiers(url)
+        delta_total, delta_set_counts, resumptionToken = list_identifiers(resumption_token: resumptionToken)
 
         overall_total += delta_total
         delta_set_counts.each do |set_id, delta_count|
@@ -385,6 +416,8 @@ facilities.each do |facility|
     oai_pmh_endpoint = oai_pmh['link']
     next unless oai_pmh_endpoint
 
+    $client = OaiPmhClient.new(oai_pmh_endpoint)
+
     expected_status = oai_pmh['status']
 
     name = facility['short-name']
-- 
GitLab