From 6f6b4c75e62713f957e26e9eee78bb722f916233 Mon Sep 17 00:00:00 2001
From: Paul Millar <paul.millar@desy.de>
Date: Thu, 2 Jan 2025 13:49:07 +0100
Subject: [PATCH] oai-pmh Add connection caching

Motivation:

The OAI-PMH endpoints being queried behave so that the client makes a
large number of requests, each returning a relatively small amount of
data.

When requests are processed by the OAI-PMH server quickly, the overhead
for establishing the TCP and TLS connections can be very significant.

Connection caching (sometimes called HTTP Keep Alive) involves sending
multiple HTTP requests over a single TCP connection, allowing us to
ameliorate the connection overhead by (effectively) spreading the cost
over all OAI-PMH requests.

Modification:

Update client to use `persistent_http` connection pool, via the
`persistent_httparty` adapter.

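A bug was discovered where the target host is cached between successive
requests, so requests for a new endpoint are sent to the previous
endpoint's host.  As a work-around, the cached connection pool is
cleared each time a client is created.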

Result:

OAI-PMH requests are now faster.  Some observed speedups per request are
(0.12 +/- 0.02) s, (0.16 +/- 0.01) s and (0.16 +/- 0.03) s for ESRF, HZB
and HZDR respectively (measured with ListIdentifiers request on Dublin
Core, following the resumptionToken).

The overall impact of this improvement depends on how long the OAI-PMH
endpoint takes to process a request.  For the above endpoints, the
percentage improvements (per request) are 12%, 42% and 70% respectively.
---
 Gemfile                   |  1 +
 Gemfile.lock              |  8 ++++++++
 scripts/update_oai-pmh.rb | 17 +++++++++++++++++
 3 files changed, 26 insertions(+)

diff --git a/Gemfile b/Gemfile
index 3bfb88e..b2a1231 100644
--- a/Gemfile
+++ b/Gemfile
@@ -5,6 +5,7 @@ gem "jekyll"
 
 # add dependencies for HTTP requests
 gem "httparty"
+gem 'persistent_httparty'
 
 # add dependencies to work with yaml and xml files
 gem "yaml"
diff --git a/Gemfile.lock b/Gemfile.lock
index c232de9..3cc110a 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -11,6 +11,8 @@ GEM
     eventmachine (1.2.7)
     ffi (1.15.5)
     forwardable-extended (2.6.0)
+    gene_pool (1.5.0)
+      concurrent-ruby (>= 1.0)
     google-protobuf (3.23.3-x86_64-linux)
     http_parser.rb (0.8.0)
     httparty (0.21.0)
@@ -53,6 +55,11 @@ GEM
       racc (~> 1.4)
     pathutil (0.16.2)
       forwardable-extended (~> 2.6)
+    persistent_http (1.0.6)
+      gene_pool (>= 1.3)
+    persistent_httparty (0.1.2)
+      httparty (~> 0.9)
+      persistent_http (< 2)
     public_suffix (5.0.1)
     racc (1.7.3)
     rake (13.0.6)
@@ -78,6 +85,7 @@ DEPENDENCIES
   httparty
   jekyll
   nokogiri
+  persistent_httparty
   yaml
 
 BUNDLED WITH
diff --git a/scripts/update_oai-pmh.rb b/scripts/update_oai-pmh.rb
index c57fd2f..d94a368 100644
--- a/scripts/update_oai-pmh.rb
+++ b/scripts/update_oai-pmh.rb
@@ -4,6 +4,7 @@ require 'httparty'
 require 'yaml'
 require 'date'
 require 'nokogiri'
+require 'persistent_httparty'
 
 class FailedOaiPmhRequest < StandardError
     def initialize(type, msg)
@@ -14,11 +15,27 @@ end
 class OaiPmhClient
     include HTTParty
 
+    persistent_connection_adapter
+
     headers {"User-Agent" => "LEAPS-WG3-client/0.1"}
 
     def initialize(oai_pmh_endpoint, from_id=nil)
         self.class.base_uri oai_pmh_endpoint
+
         # TODO add from_id support as 'From' request header
+
+        # WORK_AROUND: The connection adapter is persistent across multiple
+        # URLs (which is good), but the adapter+connection-caching somehow
+        # results in all requests being sent to the same host.
+        #
+        # To work around this, we discard the adapter's cached connection,
+        # forcing a new one to be set up for this endpoint.
+        #
+        # This is acceptable, as we process all requests for a single endpoint
+        # before moving on to the next endpoint, and we process endpoints
+        # sequentially.
+        conn_adapter = self.class.default_options[:connection_adapter]
+        conn_adapter.persistent_http = nil
     end
 
     def identify()
-- 
GitLab