From 08af574cb14ec889ea49a89926a57ef7926f749b Mon Sep 17 00:00:00 2001 From: Paul Millar <paul.millar@desy.de> Date: Thu, 2 Jan 2025 15:01:42 +0100 Subject: [PATCH] update_oai-pmh Improve HTTP request headers Motivation: The `User-Agent` request header was not being sent due to a bug. The harvester lacks support for the `From` request header. Both HTTP request headers are recommended by the Harvester's guidelines: https://www.openarchives.org/OAI/2.0/guidelines-harvester.htm Modification: Fix bug with `User-Agent` header. Add support for `From` header. The `From` header takes an email address as an argument. Rather than requiring a configuration file, the code looks for likely places where the user may have already configured their email address. Result: The OAI-PMH harvester more closely follows the corresponding guidelines. --- scripts/update_oai-pmh.rb | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/scripts/update_oai-pmh.rb b/scripts/update_oai-pmh.rb index d94a368..e98ab25 100644 --- a/scripts/update_oai-pmh.rb +++ b/scripts/update_oai-pmh.rb @@ -17,12 +17,11 @@ class OaiPmhClient persistent_connection_adapter - headers {"User-Agent" => "LEAPS-WG3-client/0.1"} + headers 'User-Agent' => 'LEAPS-WG3-client/0.1' def initialize(oai_pmh_endpoint, from_id=nil) self.class.base_uri oai_pmh_endpoint - - # TODO add from_id support as 'From' request header + self.class.headers "From" => from_id if from_id # WORK_AROUND: The connection adapter is persistent across multiple # URLs (which is good), but the adapter+connection-caching somehow @@ -233,6 +232,27 @@ class SeriesStatistics end end +# Try to discover someone's email address by looking in various likely +# configuration files. Returns the email address if it's unambiguous. 
+def find_email_address + candidate_emails = [] + + # Look in Zoom + zoomConfig = File.expand_path("~/.config/zoomus.conf") + if File.file?(zoomConfig) + File.open( zoomConfig ).each do |line| + if line =~ /^userEmailAddress=(.*?)$/ + candidate_emails.append($1) + end + end + end + + # TODO Look in Thunderbird + #find ~.thunderbird -name prefs.js | xargs sed -n 's/user_pref."mail.identity.id1.useremail", "\([^"]*\)".*/\1/p' + + # Return email address if there is no ambiguity. + return candidate_emails.uniq.count == 1 ? candidate_emails[0] : nil +end def check_oai_pmh_endpoint(endpoint_url) queryIdentify_url = endpoint_url + "?verb=Identify" @@ -423,6 +443,13 @@ up_to_date = true file_path = '_data/facilities.yml' facilities = YAML.safe_load(File.read(file_path), permitted_classes: [Date]) +email_address = find_email_address +if email_address + puts "INFO: identifying OAI-PMH activity as from <#{email_address}>" +else + puts "WARNING: unable to discover your email address. Carrying on under a cloak of anonymity." +end + facilities.each do |facility| odr = facility['odr'] next unless odr && odr.key?('oai-pmh-endpoint') @@ -433,7 +460,7 @@ facilities.each do |facility| oai_pmh_endpoint = oai_pmh['link'] next unless oai_pmh_endpoint - $client = OaiPmhClient.new(oai_pmh_endpoint) + $client = OaiPmhClient.new(oai_pmh_endpoint, email_address) expected_status = oai_pmh['status'] -- GitLab