From 078a74ace5ca83c9899129b8bb7c62948920aeb9 Mon Sep 17 00:00:00 2001
From: Martin Voigt <m.voigt@hzdr.de>
Date: Tue, 25 Nov 2025 12:46:55 +0100
Subject: [PATCH] Add OAI-PMH Ingestors

---
 lib/ingestors/ingestor_factory.rb            |   4 +-
 lib/ingestors/oai_pmh_bioschemas_ingestor.rb | 157 +++++++++++++++++++
 lib/ingestors/oai_pmh_ingestor.rb            | 104 ++++++++++++
 3 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 lib/ingestors/oai_pmh_bioschemas_ingestor.rb
 create mode 100644 lib/ingestors/oai_pmh_ingestor.rb

diff --git a/lib/ingestors/ingestor_factory.rb b/lib/ingestors/ingestor_factory.rb
index 67e818d02..913deacb6 100644
--- a/lib/ingestors/ingestor_factory.rb
+++ b/lib/ingestors/ingestor_factory.rb
@@ -10,6 +10,8 @@ def self.ingestors
         Ingestors::MaterialCsvIngestor,
         Ingestors::TessEventIngestor,
         Ingestors::ZenodoIngestor,
+        Ingestors::OaiPmhIngestor,
+        Ingestors::OaiPmhBioschemasIngestor
       ] + taxila_ingestors + llm_ingestors
     end
 
@@ -35,7 +37,7 @@ def self.taxila_ingestors
         Ingestors::Taxila::OsciIngestor,
         Ingestors::Taxila::DccIngestor,
         Ingestors::Taxila::SenseIngestor,
-        Ingestors::Taxila::VuMaterialIngestor,
+        Ingestors::Taxila::VuMaterialIngestor
       ]
     end
 
diff --git a/lib/ingestors/oai_pmh_bioschemas_ingestor.rb b/lib/ingestors/oai_pmh_bioschemas_ingestor.rb
new file mode 100644
index 000000000..925b982f6
--- /dev/null
+++ b/lib/ingestors/oai_pmh_bioschemas_ingestor.rb
@@ -0,0 +1,157 @@
+require 'open-uri'
+require 'tess_rdf_extractors'
+
+module Ingestors
+  class OaiPmhBioschemasIngestor < Ingestor
+    DUMMY_URL = 'https://example.com'
+
+    attr_reader :verbose
+
+    def self.config
+      {
+        key: 'oai_pmh_bioschemas',
+        title: 'OAI-PMH (Bioschemas RDF)',
+        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
+        mail: Rails.configuration.tess['contact_email']
+      }
+    end
+
+    def read(source_url)
+      provider_events = []
+      provider_materials = []
+      totals = Hash.new(0)
+
+      client = OAI::Client.new source_url, headers: { 'From' => config[:mail] }
+      client.list_records(metadata_prefix: 'rdf').full.each do |record|
+        metadata_tag = Nokogiri::XML(record.metadata.to_s)
+        bioschemas_xml = metadata_tag.at_xpath('metadata/rdf:RDF', 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#')&.to_s
+        output = read_content(bioschemas_xml)
+        next unless output
+
+        provider_events += output[:resources][:events]
+        provider_materials += output[:resources][:materials]
+        output[:totals].each do |key, value|
+          totals[key] += value
+        end
+      end
+
+      if totals.keys.any?
+        bioschemas_summary = "Bioschemas summary:\n"
+        totals.each do |type, count|
+          bioschemas_summary << "\n - #{type}: #{count}"
+        end
+        @messages << bioschemas_summary
+      end
+
+      deduplicate(provider_events).each do |event_params|
+        add_event(event_params)
+      end
+
+      deduplicate(provider_materials).each do |material_params|
+        add_material(material_params)
+      end
+    end
+
+    def read_content(content)
+      output = {
+        resources: {
+          events: [],
+          materials: []
+        },
+        totals: Hash.new(0)
+      }
+
+      return output unless content
+
+      begin
+        events = Tess::Rdf::EventExtractor.new(content, :rdfxml).extract do |p|
+          convert_params(p)
+        end
+        courses = Tess::Rdf::CourseExtractor.new(content, :rdfxml).extract do |p|
+          convert_params(p)
+        end
+        course_instances = Tess::Rdf::CourseInstanceExtractor.new(content, :rdfxml).extract do |p|
+          convert_params(p)
+        end
+        learning_resources = Tess::Rdf::LearningResourceExtractor.new(content, :rdfxml).extract do |p|
+          convert_params(p)
+        end
+        output[:totals]['Events'] += events.count
+        output[:totals]['Courses'] += courses.count
+        output[:totals]['CourseInstances'] += course_instances.count
+        output[:totals]['LearningResources'] += learning_resources.count
+
+        deduplicate(events + courses + course_instances).each do |event|
+          output[:resources][:events] << event
+        end
+
+        deduplicate(learning_resources).each do |material|
+          output[:resources][:materials] << material
+        end
+      rescue StandardError => e
+        Rails.logger.error("#{e.class}: #{e.message}")
+        Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any?
+        error = 'An error'
+        comment = nil
+        if e.is_a?(RDF::ReaderError)
+          error = 'A parsing error'
+          comment = 'Please check your page contains valid RDF/XML.'
+        end
+        message = "#{error} occurred while reading the source."
+        message << " #{comment}" if comment
+        @messages << message
+      end
+
+      output
+    end
+
+    # ---- This is copied unchanged from bioschemas_ingestor.rb and needs to be refactored. ----
+
+    # If duplicate resources have been extracted, prefer ones with the most metadata.
+    def deduplicate(resources)
+      return [] unless resources.any?
+
+      puts "De-duplicating #{resources.count} resources" if verbose
+      hash = {}
+      scores = {}
+      resources.each do |resource|
+        resource_url = resource[:url]
+        puts "  Considering: #{resource_url}" if verbose
+        if hash[resource_url]
+          score = metadata_score(resource)
+          # Replace the resource if this resource has a higher metadata score
+          puts "    Duplicate! Comparing #{score} vs. #{scores[resource_url]}" if verbose
+          if score > scores[resource_url]
+            puts '    Replacing resource' if verbose
+            hash[resource_url] = resource
+            scores[resource_url] = score
+          end
+        else
+          puts '    Not present, adding' if verbose
+          hash[resource_url] = resource
+          scores[resource_url] = metadata_score(resource)
+        end
+      end
+
+      puts "#{hash.values.count} resources after de-duplication" if verbose
+
+      hash.values
+    end
+
+    # Score based on number of metadata fields available
+    def metadata_score(resource)
+      score = 0
+      resource.each_value do |value|
+        score += 1 unless value.nil? || value == {} || value == [] || (value.is_a?(String) && value.strip == '')
+      end
+
+      score
+    end
+
+    def convert_params(params)
+      params[:description] = convert_description(params[:description]) if params.key?(:description)
+
+      params
+    end
+  end
+end
diff --git a/lib/ingestors/oai_pmh_ingestor.rb b/lib/ingestors/oai_pmh_ingestor.rb
new file mode 100644
index 000000000..c8edab5a9
--- /dev/null
+++ b/lib/ingestors/oai_pmh_ingestor.rb
@@ -0,0 +1,104 @@
+require 'open-uri'
+require 'tess_rdf_extractors'
+require 'oai'
+require 'nokogiri'
+
+module Ingestors
+  class OaiPmhIngestor < Ingestor
+    DUMMY_URL = 'https://example.com'
+
+    attr_reader :verbose
+
+    def self.config
+      {
+        key: 'oai_pmh',
+        title: 'OAI-PMH',
+        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
+        mail: Rails.configuration.tess['contact_email']
+      }
+    end
+
+    def ns
+      {
+        'dc' => 'http://purl.org/dc/elements/1.1/',
+        'oai_dc' => 'http://www.openarchives.org/OAI/2.0/oai_dc/'
+      }
+    end
+
+    def read(source_url)
+      client = OAI::Client.new source_url, headers: { 'From' => config[:mail] }
+      count = 0
+      client.list_records.full.each do |record|
+        read_dublin_core(record.metadata.to_s)
+        count += 1
+      end
+      @messages << "found #{count} records"
+    end
+
+    def read_dublin_core(xml_string)
+      doc = Nokogiri::XML(xml_string)
+
+      types = doc.xpath('//dc:type', ns).map(&:text)
+      if types.include?('http://purl.org/dc/dcmitype/Event')
+        read_dublin_core_event(doc)
+      else
+        read_dublin_core_material(doc)
+      end
+    end
+
+    def read_dublin_core_material(xml_doc)
+      material = OpenStruct.new
+      material.title        = xml_doc.at_xpath('//dc:title', ns)&.text
+      material.description  = convert_description(xml_doc.at_xpath('//dc:description', ns)&.text)
+      material.authors      = xml_doc.xpath('//dc:creator', ns).map(&:text)
+      material.contributors = xml_doc.xpath('//dc:contributor', ns).map(&:text)
+      material.licence      = xml_doc.at_xpath('//dc:rights', ns)&.text
+
+      dates = xml_doc.xpath('//dc:date', ns).map(&:text)
+      parsed_dates = dates.map do |d|
+        Date.parse(d)
+      rescue StandardError
+        nil
+      end.compact
+      material.date_created = parsed_dates.first
+      material.date_modified = parsed_dates.last if parsed_dates.size > 1
+
+      identifiers = xml_doc.xpath('//dc:identifier', ns).map(&:text)
+      doi = identifiers.find { |id| id.start_with?('10.') || id.include?('doi.org') }
+      if doi
+        doi = doi&.sub(%r{https?://doi\.org/}, '')
+        material.doi = "https://doi.org/#{doi}"
+      end
+      material.url = identifiers.find { |id| id.start_with?('http://', 'https://') }
+
+      material.keywords = xml_doc.xpath('//dc:subject', ns).map(&:text)
+      material.resource_type = xml_doc.xpath('//dc:type', ns).map(&:text)
+      material.contact = xml_doc.at_xpath('//dc:publisher', ns)&.text
+
+      add_material material
+    end
+
+    def read_dublin_core_event(_xml_doc)
+      event = OpenStruct.new
+
+      event.title       = doc.at_xpath('//dc:title', ns)&.text
+      event.description = convert_description(doc.at_xpath('//dc:description', ns)&.text)
+      event.url         = doc.xpath('//dc:identifier', ns).map(&:text).find { |id| id.start_with?('http://', 'https://') }
+      event.contact     = doc.at_xpath('//dc:publisher', ns)&.text
+      event.organizer   = doc.at_xpath('//dc:creator', ns)&.text
+      event.keywords = doc.xpath('//dc:subject', ns).map(&:text)
+      event.event_types = types
+
+      dates = doc.xpath('//dc:date', ns).map(&:text)
+      parsed_dates = dates.map do |d|
+        Date.parse(d)
+      rescue StandardError
+        nil
+      end.compact
+      event.start = parsed_dates.first
+      event.end   = parsed_dates.last
+
+      add_event event
+    end
+  end
+end