From 5af12ee676a406033f2af086e6f32c5549b21172 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Tue, 7 Apr 2026 18:39:19 +0200 Subject: [PATCH 01/14] Refactor Dublin Core ingestion from OAI-PMH ingestor --- lib/ingestors/dublin_core_ingestion.rb | 84 ++++++++++++++++++++++++++ lib/ingestors/oai_pmh_ingestor.rb | 81 +++++++------------------ 2 files changed, 105 insertions(+), 60 deletions(-) create mode 100644 lib/ingestors/dublin_core_ingestion.rb diff --git a/lib/ingestors/dublin_core_ingestion.rb b/lib/ingestors/dublin_core_ingestion.rb new file mode 100644 index 000000000..39cdca7fd --- /dev/null +++ b/lib/ingestors/dublin_core_ingestion.rb @@ -0,0 +1,84 @@ +module Ingestors + module DublinCoreIngestion + def build_material_from_dublin_core_data(dc) + material = OpenStruct.new + + material.title = dc[:title] + material.description = convert_description(dc[:description]) + material.authors = normalize_dublin_core_values(dc[:creators]) + material.contributors = normalize_dublin_core_values(dc[:contributors]) + + rights = normalize_dublin_core_values(dc[:rights]) + material.licence = rights.find { |r| r.start_with?('http://', 'https://') } || rights.first || 'notspecified' + + parsed_dates = parse_dublin_core_dates(dc[:dates]) + material.date_created = parsed_dates.first + material.date_modified = parsed_dates.last if parsed_dates.size > 1 + + identifiers = normalize_dublin_core_values(dc[:identifiers]) + material.doi = extract_dublin_core_doi(identifiers) + material.url = identifiers.find { |id| id.start_with?('http://', 'https://') } + + material.keywords = normalize_dublin_core_values(dc[:subjects]) + material.resource_type = normalize_dublin_core_values(dc[:types]) + material.contact = dublin_core_text(dc[:publisher]) + + material + end + + def build_event_from_dublin_core_data(dc) + event = OpenStruct.new + + event.title = dc[:title] + event.description = convert_description(dc[:description]) + event.organizer = 
normalize_dublin_core_values(dc[:creators]).first + event.contact = dublin_core_text(dc[:publisher]) || event.organizer + event.keywords = normalize_dublin_core_values(dc[:subjects]) + event.event_types = normalize_dublin_core_values(dc[:types]) + + dates = parse_dublin_core_dates(dc[:dates]) + event.start = dates.first + event.end = dates.last || dates.first + + identifiers = normalize_dublin_core_values(dc[:identifiers]) + event.url = identifiers.find { |id| id.start_with?('http://', 'https://') } + + event + end + + def parse_dublin_core_dates(dates) + normalize_dublin_core_values(dates).map do |date_value| + Date.parse(date_value) + rescue StandardError + nil + end.compact + end + + def extract_dublin_core_doi(identifiers) + doi = normalize_dublin_core_values(identifiers).find do |id| + id.start_with?('10.') || id.start_with?('https://doi.org/') || id.start_with?('http://doi.org/') + end + return nil unless doi + + normalized = doi.sub(%r{https?://doi\.org/}, '') + "https://doi.org/#{normalized}" + end + + def normalize_dublin_core_values(values) + Array(values).map { |v| dublin_core_text(v) } + .map(&:to_s) + .map(&:strip) + .reject(&:blank?) + .uniq + end + + # this method is also used by RSS ingestion under an alias + def dublin_core_text(value) + return nil if value.nil? 
+ return value.content if value.respond_to?(:content) + return value.text if value.respond_to?(:text) && !value.is_a?(String) + + value.to_s + end + end +end diff --git a/lib/ingestors/oai_pmh_ingestor.rb b/lib/ingestors/oai_pmh_ingestor.rb index a4261fe9d..9ce325e67 100644 --- a/lib/ingestors/oai_pmh_ingestor.rb +++ b/lib/ingestors/oai_pmh_ingestor.rb @@ -2,6 +2,8 @@ module Ingestors class OaiPmhIngestor < Ingestor + include DublinCoreIngestion + def self.config { key: 'oai_pmh', @@ -36,20 +38,36 @@ def ns } end + def extract_dublin_core_from_xml(xml_doc) + { + title: xml_doc.at_xpath('//dc:title', ns)&.text, + description: xml_doc.at_xpath('//dc:description', ns)&.text, + creators: xml_doc.xpath('//dc:creator', ns).map(&:text), + contributors: xml_doc.xpath('//dc:contributor', ns).map(&:text), + rights: xml_doc.xpath('//dc:rights', ns).map(&:text), + dates: xml_doc.xpath('//dc:date', ns).map(&:text), + identifiers: xml_doc.xpath('//dc:identifier', ns).map(&:text), + subjects: xml_doc.xpath('//dc:subject', ns).map(&:text), + types: xml_doc.xpath('//dc:type', ns).map(&:text), + publisher: xml_doc.at_xpath('//dc:publisher', ns)&.text + } + end + def read_oai_dublin_core(client) count = 0 client.list_records(metadata_prefix: 'oai_dc').full.each do |record| xml_string = record.metadata.to_s doc = Nokogiri::XML(xml_string) + dc = extract_dublin_core_from_xml(doc) - types = doc.xpath('//dc:type', ns).map(&:text) + types = normalize_dublin_core_values(dc[:types]) # this event detection heuristic captures in particular # - http://purl.org/dc/dcmitype/Event (the standard way of typing an event in dublin core) # - https://schema.org/Event if types.any? { |t| t.downcase.include? 
'event' } - read_dublin_core_event(doc) + add_event(build_event_from_dublin_core_data(dc)) else - read_dublin_core_material(doc) + add_material(build_material_from_dublin_core_data(dc)) end count += 1 @@ -57,63 +75,6 @@ def read_oai_dublin_core(client) @messages << "found #{count} records" end - def read_dublin_core_material(xml_doc) - material = OpenStruct.new - material.title = xml_doc.at_xpath('//dc:title', ns)&.text - material.description = convert_description(xml_doc.at_xpath('//dc:description', ns)&.text) - material.authors = xml_doc.xpath('//dc:creator', ns).map(&:text) - material.contributors = xml_doc.xpath('//dc:contributor', ns).map(&:text) - - rights = xml_doc.xpath('//dc:rights', ns).map { |n| n.text&.strip }.reject(&:empty?) - material.licence = rights.find { |r| r.start_with?('http://', 'https://') } || rights.first || 'notspecified' - - dates = xml_doc.xpath('//dc:date', ns).map(&:text) - parsed_dates = dates.map do |d| - Date.parse(d) - rescue StandardError - nil - end.compact - material.date_created = parsed_dates.first - material.date_modified = parsed_dates.last if parsed_dates.size > 1 - - identifiers = xml_doc.xpath('//dc:identifier', ns).map(&:text) - doi = identifiers.find { |id| id.start_with?('10.') || id.start_with?('https://doi.org/') || id.start_with?('http://doi.org/') } - if doi - doi = doi&.sub(%r{https?://doi\.org/}, '') - material.doi = "https://doi.org/#{doi}" - end - material.url = identifiers.find { |id| id.start_with?('http://', 'https://') } - - material.keywords = xml_doc.xpath('//dc:subject', ns).map(&:text) - material.resource_type = xml_doc.xpath('//dc:type', ns).map(&:text) - material.contact = xml_doc.at_xpath('//dc:publisher', ns)&.text - - add_material material - end - - def read_dublin_core_event(xml_doc) - event = OpenStruct.new - - event.title = xml_doc.at_xpath('//dc:title', ns)&.text - event.description = convert_description(xml_doc.at_xpath('//dc:description', ns)&.text) - event.url = 
xml_doc.xpath('//dc:identifier', ns).map(&:text).find { |id| id.start_with?('http://', 'https://') } - event.contact = xml_doc.at_xpath('//dc:publisher', ns)&.text - event.organizer = xml_doc.at_xpath('//dc:creator', ns)&.text - event.keywords = xml_doc.xpath('//dc:subject', ns).map(&:text) - event.event_types = xml_doc.xpath('//dc:type', ns).map(&:text) - - dates = xml_doc.xpath('//dc:date', ns).map(&:text) - parsed_dates = dates.map do |d| - Date.parse(d) - rescue StandardError - nil - end.compact - event.start = parsed_dates.first - event.end = parsed_dates.last - - add_event event - end - def read_oai_rdf(client) provider_events = [] provider_materials = [] From e36dbc27d5a53c78f5316700a4c293becfdb0791 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Tue, 7 Apr 2026 18:39:52 +0200 Subject: [PATCH 02/14] Add RSS ingestion for materials and events --- lib/ingestors/event_rss_ingestor.rb | 107 +++++++++++++++++ lib/ingestors/material_rss_ingestor.rb | 105 +++++++++++++++++ lib/ingestors/rss_ingestion.rb | 152 +++++++++++++++++++++++++ 3 files changed, 364 insertions(+) create mode 100644 lib/ingestors/event_rss_ingestor.rb create mode 100644 lib/ingestors/material_rss_ingestor.rb create mode 100644 lib/ingestors/rss_ingestion.rb diff --git a/lib/ingestors/event_rss_ingestor.rb b/lib/ingestors/event_rss_ingestor.rb new file mode 100644 index 000000000..f53267d4e --- /dev/null +++ b/lib/ingestors/event_rss_ingestor.rb @@ -0,0 +1,107 @@ +require 'rss' +require 'tess_rdf_extractors' + +module Ingestors + class EventRssIngestor < Ingestor + include RssIngestion + + def initialize + super + + @bioschemas_manager = BioschemasIngestor.new + end + + def self.config + { + key: 'event_rss', + title: 'RSS / Atom Feed', + category: :events + } + end + + def read(url) + feed, content = fetch_feed(url) + return if feed.nil? 
+ + if feed.is_a?(RSS::Rss) + @messages << "Parsing RSS feed: #{feed_title(feed)}" + feed.items.each { |item| add_event(build_event_from_rss_item(item)) } + elsif feed.is_a?(RSS::RDF) + @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}" + rss_events = feed.items.map { |item| build_event_from_rss_item(item).to_h } + bioschemas_events = extract_rdf_bioschemas_events(content) + merge_with_bioschemas_priority(bioschemas_events, rss_events).each do |event| + add_event(event) + end + elsif feed.is_a?(RSS::Atom::Feed) + @messages << "Parsing ATOM feed: #{feed_title(feed)}" + feed.items.each { |item| add_event(build_event_from_atom_item(item)) } + else + @messages << "Parsing UNKNOWN feed: #{feed_title(feed)}" + @messages << 'unsupported feed format' + end + end + + private + + def extract_rdf_bioschemas_events(content) + return [] unless content.present? + + events = Tess::Rdf::EventExtractor.new(content, :rdfxml).extract do |params| + @bioschemas_manager.convert_params(params) + end + courses = Tess::Rdf::CourseExtractor.new(content, :rdfxml).extract do |params| + @bioschemas_manager.convert_params(params) + end + course_instances = Tess::Rdf::CourseInstanceExtractor.new(content, :rdfxml).extract do |params| + @bioschemas_manager.convert_params(params) + end + + @bioschemas_manager.deduplicate(events + courses + course_instances) + rescue StandardError => e + Rails.logger.error("#{e.class}: #{e.message}") + Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any? + @messages << 'An error occurred while extracting Bioschemas Events.' + [] + end + + def build_event_from_rss_item(item) + event = build_event_from_dublin_core_data(extract_dublin_core(item)) + + event.title ||= text_value(item.title) + native_url = text_value(item.link) + event.url = native_url if native_url.present? 
+ event.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded)) + event.keywords = merge_unique(event.keywords, extract_rss_keywords(item)) + organizer = text_value(item.respond_to?(:author) ? item.author : nil) + event.organizer ||= organizer + event.contact ||= organizer + + item_date = parse_time(item.respond_to?(:pubDate) ? item.pubDate : nil) || parse_time(item.respond_to?(:date) ? item.date : nil) + event.start = prefer_precise_time(event.start, item_date) + event.end = prefer_precise_time(event.end, item_date) + + event + end + + def build_event_from_atom_item(item) + event = build_event_from_dublin_core_data(extract_dublin_core(item)) + + event.title ||= text_value(item.title) + native_url = extract_atom_link(item) + event.url = native_url if native_url.present? + event.description ||= convert_description(text_value(item.summary) || text_value(item.content)) + event.keywords = merge_unique(event.keywords, extract_atom_keywords(item)) + organizer = extract_atom_authors(item).first + event.organizer ||= organizer + event.contact ||= organizer + + published = parse_time(item.respond_to?(:published) ? item.published : nil) + updated = parse_time(item.respond_to?(:updated) ? 
item.updated : nil) + event.start = prefer_precise_time(event.start, published || updated) + event.end = prefer_precise_time(event.end, updated || published) + + event + end + end +end diff --git a/lib/ingestors/material_rss_ingestor.rb b/lib/ingestors/material_rss_ingestor.rb new file mode 100644 index 000000000..e445c8141 --- /dev/null +++ b/lib/ingestors/material_rss_ingestor.rb @@ -0,0 +1,105 @@ +require 'rss' +require 'tess_rdf_extractors' + +module Ingestors + class MaterialRssIngestor < Ingestor + include RssIngestion + + def initialize + super + + @bioschemas_manager = BioschemasIngestor.new + end + + def self.config + { + key: 'material_rss', + title: 'RSS / Atom Feed', + category: :materials + } + end + + def read(url) + feed, content = fetch_feed(url) + return if feed.nil? + + if feed.is_a?(RSS::Rss) + @messages << "Parsing RSS feed: #{feed_title(feed)}" + feed.items.each { |item| add_material(build_material_from_rss_item(item)) } + elsif feed.is_a?(RSS::RDF) + @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}" + rss_materials = feed.items.map { |item| build_material_from_rss_item(item).to_h } + bioschemas_materials = extract_rdf_bioschemas_materials(content) + merge_with_bioschemas_priority(bioschemas_materials, rss_materials).each do |material| + add_material(material) + end + elsif feed.is_a?(RSS::Atom::Feed) + @messages << "Parsing ATOM feed: #{feed_title(feed)}" + feed.items.each { |item| add_material(build_material_from_atom_item(item)) } + else + @messages << "Parsing UNKNOWN feed: #{feed_title(feed)}" + @messages << 'unsupported feed format' + end + end + + private + + def extract_rdf_bioschemas_materials(content) + return [] unless content.present? 
+ + materials = Tess::Rdf::LearningResourceExtractor.new(content, :rdfxml).extract do |params| + @bioschemas_manager.convert_params(params) + end + + @bioschemas_manager.deduplicate(materials) + rescue StandardError => e + Rails.logger.error("#{e.class}: #{e.message}") + Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any? + @messages << 'An error occurred while extracting Bioschemas LearningResources.' + [] + end + + def build_material_from_rss_item(item) + material = build_material_from_dublin_core_data(extract_dublin_core(item)) + + material.title ||= text_value(item.title) + native_url = text_value(item.link) + material.url = native_url if native_url.present? + material.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded)) + material.keywords = merge_unique(material.keywords, extract_rss_keywords(item)) + author = item.respond_to?(:author) ? item.author : nil + material.authors = merge_unique(material.authors, [text_value(author)]) + material.contact ||= material.authors&.first + guid = item.respond_to?(:guid) ? item.guid : nil + material.doi ||= extract_dublin_core_doi([text_value(guid)]) + + item_date = parse_time(item.respond_to?(:pubDate) ? item.pubDate : nil) || parse_time(item.respond_to?(:date) ? item.date : nil) + material.date_published ||= item_date + material.date_created = prefer_precise_time(material.date_created, item_date) + material.date_modified = prefer_precise_time(material.date_modified, parse_time(item.respond_to?(:date) ? item.date : nil)) + + material + end + + def build_material_from_atom_item(item) + material = build_material_from_dublin_core_data(extract_dublin_core(item)) + + material.title ||= text_value(item.title) + native_url = extract_atom_link(item) + material.url = native_url if native_url.present? 
+ material.description ||= convert_description(text_value(item.summary) || text_value(item.content)) + material.keywords = merge_unique(material.keywords, extract_atom_keywords(item)) + material.authors = merge_unique(material.authors, extract_atom_authors(item)) + material.contact ||= material.authors&.first + material.doi ||= extract_dublin_core_doi([text_value(item.id)]) + + published = parse_time(item.respond_to?(:published) ? item.published : nil) + updated = parse_time(item.respond_to?(:updated) ? item.updated : nil) + material.date_created = prefer_precise_time(material.date_created, published) + material.date_published ||= published || updated + material.date_modified = prefer_precise_time(material.date_modified, updated) + + material + end + end +end diff --git a/lib/ingestors/rss_ingestion.rb b/lib/ingestors/rss_ingestion.rb new file mode 100644 index 000000000..300f8c8a3 --- /dev/null +++ b/lib/ingestors/rss_ingestion.rb @@ -0,0 +1,152 @@ +module Ingestors + module RssIngestion + include DublinCoreIngestion + + # Fetches and parses a feed from the URL, with optional HTML feed discovery. + # Returns [feed, parsed_content] on success, where parsed_content is the XML/Atom string used. + # Returns [nil, nil] when the URL cannot be opened or parsing/discovery fails. + def fetch_feed(url) + io = open_url(url) + return [nil, nil] if io.nil? + + content = io.read + feed, parse_error_message = parse_feed(content) + return [feed, content] unless feed.nil? + + discovered_feed_url = discover_feed_url_from_html(content, url) + if discovered_feed_url.blank? + @messages << parse_error_message + return [nil, nil] + end + + @messages << "HTML page detected, following feed link: #{discovered_feed_url}" + discovered_io = open_url(discovered_feed_url) + return [nil, nil] if discovered_io.nil? + + discovered_content = discovered_io.read + discovered_feed, discovered_parse_error_message = parse_feed(discovered_content) + if discovered_feed.blank? 
+ @messages << discovered_parse_error_message + return [nil, nil] + end + + [discovered_feed, discovered_content] + end + + def parse_feed(content) + feed = RSS::Parser.parse(content, { validate: false }) + return [feed, nil] if feed.present? + + [nil, 'parsing feed failed with: unrecognized feed content'] + rescue RSS::NotWellFormedError => e + [nil, "parsing feed failed with: #{e.message}"] + end + + def discover_feed_url_from_html(content, base_url) + doc = Nokogiri::HTML(content) + link = doc.css('link[rel]').find do |node| + rel = node['rel'].to_s.downcase + type = node['type'].to_s.downcase + rel.include?('alternate') && (type.include?('rss') || type.include?('atom')) + end + + href = link&.[]('href') + return nil if href.blank? + + URI.join(base_url, href).to_s + rescue StandardError + nil + end + + def feed_title(feed) + channel = feed.respond_to?(:channel) ? feed.channel : nil + return channel.title if channel.present? && channel.respond_to?(:title) + return text_value(feed.title) if feed.respond_to?(:title) + + 'Untitled feed' + end + + alias text_value dublin_core_text + + def parse_time(value) + value = value.content if value.respond_to?(:content) + + return value if value.is_a?(Time) || value.is_a?(Date) || value.is_a?(DateTime) + + text = text_value(value) + return nil if text.blank? + + Time.zone.parse(text) + rescue ArgumentError + nil + end + + def extract_dublin_core(item) + { + title: text_value(item.dc_title), + description: text_value(item.dc_description), + creators: Array(item.dc_creators), + contributors: Array(item.dc_contributors), + rights: Array(item.dc_rights_list), + dates: Array(item.dc_dates), + identifiers: Array(item.dc_identifiers), + subjects: Array(item.dc_subjects), + types: Array(item.dc_types), + publisher: item.dc_publisher + } + end + + def extract_rss_keywords(item) + return [] unless item.respond_to?(:categories) + + Array(item.categories).map { |c| text_value(c.respond_to?(:content) ? 
c.content : c) } + end + + def extract_atom_keywords(item) + return [] unless item.respond_to?(:categories) + + Array(item.categories).map { |c| text_value(c.respond_to?(:term) ? c.term : c) } + end + + def extract_atom_authors(item) + Array(item.authors).map { |author| text_value(author.respond_to?(:name) ? author.name : author) } + end + + def extract_atom_link(item) + item.links.map { |l| text_value(l.href) }.find(&:present?) + end + + def prefer_precise_time(existing_value, candidate_time) + return existing_value if candidate_time.blank? + return candidate_time if existing_value.blank? + + return candidate_time if existing_value.is_a?(Date) && !existing_value.is_a?(DateTime) && existing_value == candidate_time.to_date + + existing_value + end + + def merge_unique(existing_values, new_values) + normalize_dublin_core_values(Array(existing_values) + Array(new_values)) + end + + def merge_with_bioschemas_priority(bioschemas_records, rss_records) + rss_by_url = rss_records.index_by { |record| record[:url].to_s } + + merged = bioschemas_records.map do |bioschemas_record| + key = bioschemas_record[:url].to_s + rss_record = rss_by_url.delete(key) + merge_record_pair(bioschemas_record, rss_record) + end + + merged + rss_by_url.values + end + + def merge_record_pair(primary_record, secondary_record) + return primary_record if secondary_record.nil? + + secondary_record.merge(primary_record) do |_key, secondary_value, primary_value| + primary_value.present? ? 
primary_value : secondary_value + end + end + end +end From be76ff74d741fbd4397475c0b5499b55c300f2cb Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Tue, 7 Apr 2026 18:40:13 +0200 Subject: [PATCH 03/14] Add tests for RSS ingestors --- .../unit/ingestors/event_rss_ingestor_test.rb | 303 +++++++++++ .../ingestors/material_rss_ingestor_test.rb | 481 ++++++++++++++++++ 2 files changed, 784 insertions(+) create mode 100644 test/unit/ingestors/event_rss_ingestor_test.rb create mode 100644 test/unit/ingestors/material_rss_ingestor_test.rb diff --git a/test/unit/ingestors/event_rss_ingestor_test.rb b/test/unit/ingestors/event_rss_ingestor_test.rb new file mode 100644 index 000000000..30bf3f7fb --- /dev/null +++ b/test/unit/ingestors/event_rss_ingestor_test.rb @@ -0,0 +1,303 @@ +require 'test_helper' +require 'stringio' + +class EventRssIngestorTest < ActiveSupport::TestCase + setup do + @ingestor = Ingestors::EventRssIngestor.new + mock_timezone + end + + teardown do + reset_timezone + end + + test 'reads rss items from dublin core and native rss fields' do + rss_feed_xml = <<~XML + + + + RSS Event Feed + + + Native RSS event title + https://example.org/events/native + Native RSS event description + native.author@example.org (Native Event Author) + native-event-category + Sat, 01 Jun 2024 09:00:00 GMT + DC RSS event title + DC RSS event description + DC Event Creator + event-topic-a + workshop + 2024-06-01 + 2024-06-02 + https://example.org/events/dc-url + rss event publisher + + + + Fallback RSS event title + https://example.org/events/fallback + Fallback RSS event description + Fallback RSS Author + fallback-event-category + Mon, 03 Jun 2024 12:00:00 GMT + + + + XML + + read_xml(rss_feed_xml) + + assert_equal 2, @ingestor.events.count + + dc_event = @ingestor.events.first + assert_equal 'DC RSS event title', dc_event.title + assert_equal 'https://example.org/events/native', dc_event.url + assert_equal 'DC RSS event description', dc_event.description + assert_equal 'DC 
Event Creator', dc_event.organizer + assert_equal 'rss event publisher', dc_event.contact + assert_equal %w[event-topic-a native-event-category], dc_event.keywords + assert_equal ['workshop'], dc_event.event_types + assert_equal Time.utc(2024, 6, 1, 9, 0, 0), dc_event.start.utc + assert_equal Date.new(2024, 6, 2), dc_event.end.to_date + + fallback_event = @ingestor.events.second + assert_equal 'Fallback RSS event title', fallback_event.title + assert_equal 'https://example.org/events/fallback', fallback_event.url + assert_equal 'Fallback RSS event description', fallback_event.description + assert_equal 'Fallback RSS Author', fallback_event.organizer + assert_equal 'Fallback RSS Author', fallback_event.contact + assert_equal ['fallback-event-category'], fallback_event.keywords + assert_equal [], fallback_event.event_types + assert_equal Time.utc(2024, 6, 3, 12, 0, 0), fallback_event.start.utc + assert_equal Time.utc(2024, 6, 3, 12, 0, 0), fallback_event.end.utc + end + + test 'reads atom items from dublin core and native atom fields' do + atom_feed_xml = <<~XML + + + Atom Event Feed + + + Native Atom event title + + Native Atom event summary + Native Atom Author + + 2024-07-01T10:00:00Z + 2024-07-02T11:00:00Z + DC Atom event title + DC Atom event description + DC Atom Creator + atom-event-topic + seminar + 2024-07-01 + 2024-07-02 + https://example.org/atom-events/dc-url + atom event publisher + + + + Fallback Atom event title + + Fallback Atom event content + Fallback Atom Author + + 2024-07-03T10:00:00Z + 2024-07-04T11:00:00Z + + + XML + + read_xml(atom_feed_xml) + + assert_equal 2, @ingestor.events.count + + dc_event = @ingestor.events.first + assert_equal 'DC Atom event title', dc_event.title + assert_equal 'https://example.org/atom-events/native', dc_event.url + assert_equal 'DC Atom event description', dc_event.description + assert_equal 'DC Atom Creator', dc_event.organizer + assert_equal 'atom event publisher', dc_event.contact + assert_equal 
%w[atom-event-topic native-atom-event-category], dc_event.keywords + assert_equal ['seminar'], dc_event.event_types + assert_equal Time.utc(2024, 7, 1, 10, 0, 0), dc_event.start.utc + assert_equal Time.utc(2024, 7, 2, 11, 0, 0), dc_event.end.utc + + fallback_event = @ingestor.events.second + assert_equal 'Fallback Atom event title', fallback_event.title + assert_equal 'https://example.org/atom-events/fallback', fallback_event.url + assert_equal 'Fallback Atom event content', fallback_event.description + assert_equal 'Fallback Atom Author', fallback_event.organizer + assert_equal 'Fallback Atom Author', fallback_event.contact + assert_equal ['fallback-atom-event-category'], fallback_event.keywords + assert_equal [], fallback_event.event_types + assert_equal Time.utc(2024, 7, 3, 10, 0, 0), fallback_event.start.utc + assert_equal Time.utc(2024, 7, 4, 11, 0, 0), fallback_event.end.utc + end + + test 'reads bioschemas event from rss 1.0 rdf feed' do + rss_10_bioschemas_feed_xml = <<~XML + + + + RSS 1.0 Bioschemas event feed + https://example.org/rss10-bioschemas-events + desc + + + + + + + + Fallback RSS 1.0 event title + https://example.org/rss10-bioschemas/event-item + Fallback RSS 1.0 event description + + + + RSS 1.0 Bioschemas event title + + 2024-08-01 + 2024-08-02 + + + XML + + read_xml(rss_10_bioschemas_feed_xml) + + assert_equal 2, @ingestor.events.count + + event = @ingestor.events.detect { |e| e.url == 'https://example.org/rss10/bioschemas/event' } + refute_nil event + assert_equal 'RSS 1.0 Bioschemas event title', event.title + assert_equal 'https://example.org/rss10/bioschemas/event', event.url + + fallback_event = @ingestor.events.detect { |e| e.url == 'https://example.org/rss10-bioschemas/event-item' } + refute_nil fallback_event + assert_equal 'Fallback RSS 1.0 event title', fallback_event.title + end + + test 'merges rss properties into bioschemas event for same url with bioschemas priority' do + rss_10_bioschemas_merged_feed_xml = <<~XML + + + + RSS 
1.0 Bioschemas merged event feed + https://example.org/rss10-merged-events + desc + + + + + + + + + RSS 1.0 fallback event title + https://example.org/rss10/merged/event + RSS 1.0 fallback event description that should fill missing bioschemas value + RSS 1.0 Merged Event Creator + rss10-merged-event-subject + 2024-08-01 + + + + RSS 1.0 Bioschemas preferred event title + + + + XML + + read_xml(rss_10_bioschemas_merged_feed_xml) + + assert_equal 1, @ingestor.events.count + + event = @ingestor.events.first + assert_equal 'RSS 1.0 Bioschemas preferred event title', event.title + assert_equal 'https://example.org/rss10/merged/event', event.url + assert_equal 'RSS 1.0 fallback event description that should fill missing bioschemas value', event.description + assert_equal ['rss10-merged-event-subject'], event.keywords + assert_equal 'RSS 1.0 Merged Event Creator', event.organizer + assert_equal Date.new(2024, 8, 1), event.start.to_date + assert_equal Date.new(2024, 8, 1), event.end.to_date + end + + test 'reads feed from html alternate meta link' do + start_url = 'https://www.youtube.com/@event_channel' + feed_url = 'https://www.youtube.com/feeds/videos.xml?channel_id=UCevent123' + + html_with_alternate_feed_link = <<~HTML + + + + + + Channel page + + HTML + + atom_feed_xml = <<~XML + + + Minimal Atom event feed + + Alternate feed event + + Minimal content used for alternate-link test + Alternate Event Organizer + 2024-07-02T11:00:00Z + + + XML + + read_xml_map( + { + start_url => html_with_alternate_feed_link, + feed_url => atom_feed_xml + }, + start_url + ) + + assert_equal 1, @ingestor.events.count + assert_includes @ingestor.messages, "HTML page detected, following feed link: #{feed_url}" + assert_equal 'Alternate feed event', @ingestor.events.first.title + end + + test 'logs parse error for invalid feed input' do + read_xml('not valid rss or atom') + + assert_equal 1, @ingestor.messages.length + assert_match(/^parsing feed failed with: This is not well formed XML/, 
@ingestor.messages.first) + assert_empty @ingestor.events + end + + private + + def read_xml(xml, url = 'https://example.org/event-feed.xml') + @ingestor.stub(:open_url, StringIO.new(xml)) do + @ingestor.read(url) + end + end + + def read_xml_map(url_to_content, start_url) + @ingestor.stub(:open_url, lambda do |requested_url| + content = url_to_content[requested_url] + content.nil? ? nil : StringIO.new(content) + end) do + @ingestor.read(start_url) + end + end +end diff --git a/test/unit/ingestors/material_rss_ingestor_test.rb b/test/unit/ingestors/material_rss_ingestor_test.rb new file mode 100644 index 000000000..f4ed074d0 --- /dev/null +++ b/test/unit/ingestors/material_rss_ingestor_test.rb @@ -0,0 +1,481 @@ +require 'test_helper' +require 'stringio' + +class MaterialRssIngestorTest < ActiveSupport::TestCase + setup do + @ingestor = Ingestors::MaterialRssIngestor.new + mock_timezone + end + + teardown do + reset_timezone + end + + test 'reads rss items from dublin core and native rss fields' do + rss_feed_xml = <<~XML + + + + RSS material feed + + + Native RSS title + https://example.org/rss/native-link + Native RSS description + native.author@example.org (Native RSS Author) + native-category + 10.9999/native-rss-guid + Tue, 02 Jan 2024 03:04:05 GMT + DC RSS title + DC RSS description + DC Creator One + DC Creator Two + DC Contributor One + DC Contributor Two + plain rights + https://example.org/licenses/rss + 2024-01-01 + 2024-01-10 + https://example.org/rss/dc-url + 10.1234/rss-doi + dc-subject-a + dc-subject-b + dc-type-a + dc-type-b + rss publisher + + + + Plain Rights RSS title + https://example.org/rss/plain-rights + Plain rights RSS description + Plain Rights RSS Creator + plain-only-rights + not-a-date + 2024-01-11 + https://example.org/rss/plain-rights + plain-rights-subject + plain-rights-type + plain rights publisher + + + + Fallback RSS title + https://example.org/rss/fallback + Fallback RSS Author + fallback-category-a + fallback-category-b + 
10.5555/fallback-rss-guid + Wed, 03 Jan 2024 04:05:06 GMT + + + + + XML + + read_xml(rss_feed_xml) + + assert_equal 3, @ingestor.materials.count + + dc_material = @ingestor.materials.first + assert_equal 'DC RSS title', dc_material.title + assert_equal 'https://example.org/rss/native-link', dc_material.url + assert_equal 'DC RSS description', dc_material.description + assert_equal ['DC Creator One', 'DC Creator Two', 'native.author@example.org (Native RSS Author)'], dc_material.authors + assert_equal ['DC Contributor One', 'DC Contributor Two'], dc_material.contributors + assert_equal 'https://example.org/licenses/rss', dc_material.licence + assert_equal Date.new(2024, 1, 1), dc_material.date_created + assert_equal Time.utc(2024, 1, 2, 3, 4, 5), dc_material.date_published.utc + assert_equal Date.new(2024, 1, 10), dc_material.date_modified + assert_equal 'https://doi.org/10.1234/rss-doi', dc_material.doi + assert_equal %w[dc-subject-a dc-subject-b native-category], dc_material.keywords + assert_equal %w[dc-type-a dc-type-b], dc_material.resource_type + assert_equal 'rss publisher', dc_material.contact + + plain_rights_material = @ingestor.materials.second + assert_equal 'Plain Rights RSS title', plain_rights_material.title + assert_equal 'https://example.org/rss/plain-rights', plain_rights_material.url + assert_equal 'Plain rights RSS description', plain_rights_material.description + assert_equal ['Plain Rights RSS Creator'], plain_rights_material.authors + assert_equal [], plain_rights_material.contributors + assert_equal 'plain-only-rights', plain_rights_material.licence + assert_equal Date.new(2024, 1, 11), plain_rights_material.date_created + assert_nil plain_rights_material.date_modified + assert_nil plain_rights_material.doi + assert_equal ['plain-rights-subject'], plain_rights_material.keywords + assert_equal ['plain-rights-type'], plain_rights_material.resource_type + assert_equal 'plain rights publisher', plain_rights_material.contact + + fallback_material 
= @ingestor.materials.third + assert_equal 'Fallback RSS title', fallback_material.title + assert_equal 'https://example.org/rss/fallback', fallback_material.url + assert_equal 'Fallback RSS content encoded', fallback_material.description + assert_equal ['Fallback RSS Author'], fallback_material.authors + assert_equal [], fallback_material.contributors + assert_equal 'notspecified', fallback_material.licence + assert_equal Time.utc(2024, 1, 3, 4, 5, 6), fallback_material.date_created.utc + assert_equal Time.utc(2024, 1, 3, 4, 5, 6), fallback_material.date_published.utc + assert_equal Time.utc(2024, 1, 3, 4, 5, 6), fallback_material.date_modified.utc + assert_equal 'https://doi.org/10.5555/fallback-rss-guid', fallback_material.doi + assert_equal %w[fallback-category-a fallback-category-b], fallback_material.keywords + assert_equal [], fallback_material.resource_type + assert_equal 'Fallback RSS Author', fallback_material.contact + end + + test 'reads atom items from dublin core and native atom fields' do + atom_feed_xml = <<~XML + + + Atom material feed + + + Native Atom title + + Native Atom summary + Native Atom Author + + 10.9999/native-atom-id + 2024-02-02T03:04:05Z + 2024-02-03T03:04:05Z + DC Atom title + DC Atom description + DC Atom Creator One + DC Atom Creator Two + DC Atom Contributor One + plain atom rights + https://example.org/licenses/atom + 2024-02-01 + 2024-02-05 + https://example.org/atom/dc-url + https://doi.org/10.1234/atom-doi + atom-dc-subject + atom-dc-type + atom publisher + + + + Plain Rights Atom title + + Plain rights Atom description + Plain Rights Atom Creator + plain-atom-rights + invalid-date + 2024-02-11 + https://example.org/atom/plain-rights + plain-atom-subject + plain-atom-type + plain atom publisher + + + + Fallback Atom title + + Fallback Atom content + Fallback Atom Author + + + 10.5555/fallback-atom-id + 2024-03-04T05:06:07Z + 2024-03-05T06:07:08Z + + + XML + + read_xml(atom_feed_xml) + + assert_equal 3, 
@ingestor.materials.count + + dc_material = @ingestor.materials.first + assert_equal 'DC Atom title', dc_material.title + assert_equal 'https://example.org/atom/native-link', dc_material.url + assert_equal 'DC Atom description', dc_material.description + assert_equal ['DC Atom Creator One', 'DC Atom Creator Two', 'Native Atom Author'], dc_material.authors + assert_equal ['DC Atom Contributor One'], dc_material.contributors + assert_equal 'https://example.org/licenses/atom', dc_material.licence + assert_equal Date.new(2024, 2, 1), dc_material.date_created + assert_equal Time.utc(2024, 2, 2, 3, 4, 5), dc_material.date_published.utc + assert_equal Date.new(2024, 2, 5), dc_material.date_modified + assert_equal 'https://doi.org/10.1234/atom-doi', dc_material.doi + assert_equal %w[atom-dc-subject native-atom-category], dc_material.keywords + assert_equal ['atom-dc-type'], dc_material.resource_type + assert_equal 'atom publisher', dc_material.contact + + plain_rights_material = @ingestor.materials.second + assert_equal 'Plain Rights Atom title', plain_rights_material.title + assert_equal 'https://example.org/atom/plain-rights', plain_rights_material.url + assert_equal 'Plain rights Atom description', plain_rights_material.description + assert_equal ['Plain Rights Atom Creator'], plain_rights_material.authors + assert_equal [], plain_rights_material.contributors + assert_equal 'plain-atom-rights', plain_rights_material.licence + assert_equal Date.new(2024, 2, 11), plain_rights_material.date_created + assert_nil plain_rights_material.date_modified + assert_nil plain_rights_material.doi + assert_equal ['plain-atom-subject'], plain_rights_material.keywords + assert_equal ['plain-atom-type'], plain_rights_material.resource_type + assert_equal 'plain atom publisher', plain_rights_material.contact + + fallback_material = @ingestor.materials.third + assert_equal 'Fallback Atom title', fallback_material.title + assert_equal 'https://example.org/atom/fallback', 
fallback_material.url + assert_equal 'Fallback Atom content', fallback_material.description + assert_equal ['Fallback Atom Author'], fallback_material.authors + assert_equal [], fallback_material.contributors + assert_equal 'notspecified', fallback_material.licence + assert_equal Time.utc(2024, 3, 4, 5, 6, 7), fallback_material.date_created.utc + assert_equal Time.utc(2024, 3, 4, 5, 6, 7), fallback_material.date_published.utc + assert_equal Time.utc(2024, 3, 5, 6, 7, 8), fallback_material.date_modified.utc + assert_equal 'https://doi.org/10.5555/fallback-atom-id', fallback_material.doi + assert_equal %w[fallback-atom-category-a fallback-atom-category-b], fallback_material.keywords + assert_equal [], fallback_material.resource_type + assert_equal 'Fallback Atom Author', fallback_material.contact + end + + test 'logs parse error for invalid feed input' do + read_xml('not valid rss or atom') + + assert_equal 1, @ingestor.messages.length + assert_match(/^parsing feed failed with: This is not well formed XML/, @ingestor.messages.first) + assert_empty @ingestor.materials + end + + test 'reads rss 0.91 feed' do + rss_091_feed_xml = <<~XML + + + + RSS 0.91 feed + https://example.org/rss091 + desc + + RSS 0.91 title + https://example.org/rss091/item + RSS 0.91 description + + + + XML + + read_xml(rss_091_feed_xml) + + assert_equal 1, @ingestor.materials.count + + material = @ingestor.materials.first + assert_equal 'RSS 0.91 title', material.title + assert_equal 'https://example.org/rss091/item', material.url + assert_equal 'RSS 0.91 description', material.description + assert_equal [], material.keywords + assert_equal 'notspecified', material.licence + assert_nil material.doi + assert_nil material.contact + end + + test 'reads rss 1.0 feed' do + rss_10_feed_xml = <<~XML + + + + RSS 1.0 feed + https://example.org/rss10 + desc + + + + + + + + RSS 1.0 title + https://example.org/rss10/item + RSS 1.0 description + RSS 1.0 Creator + rss10-subject + 10.1111/rss10doi + 2024-04-01 
+ + + XML + + read_xml(rss_10_feed_xml) + + assert_equal 1, @ingestor.materials.count + + material = @ingestor.materials.first + assert_equal 'RSS 1.0 title', material.title + assert_equal 'https://example.org/rss10/item', material.url + assert_equal 'RSS 1.0 description', material.description + assert_equal ['RSS 1.0 Creator'], material.authors + assert_equal ['rss10-subject'], material.keywords + assert_equal 'https://doi.org/10.1111/rss10doi', material.doi + assert_equal Date.new(2024, 4, 1), material.date_created.to_date + assert_equal Date.new(2024, 4, 1), material.date_modified.to_date + end + + test 'reads bioschemas learning resource from rss 1.0 rdf feed' do + rss_10_bioschemas_feed_xml = <<~XML + + + + RSS 1.0 Bioschemas feed + https://example.org/rss10-bioschemas + desc + + + + + + + + Fallback RSS 1.0 title + https://example.org/rss10-bioschemas/item + Fallback RSS 1.0 description + + + + + + + RSS 1.0 Bioschemas title + + + + + XML + + read_xml(rss_10_bioschemas_feed_xml) + + assert_equal 2, @ingestor.materials.count + + material = @ingestor.materials.detect { |m| m.url == 'https://example.org/rss10/bioschemas/material' } + refute_nil material + assert_equal 'RSS 1.0 Bioschemas title', material.title + assert_equal 'https://example.org/rss10/bioschemas/material', material.url + assert_equal 'https://opensource.org/licenses/MIT', material.licence + + fallback_material = @ingestor.materials.detect { |m| m.url == 'https://example.org/rss10-bioschemas/item' } + refute_nil fallback_material + assert_equal 'Fallback RSS 1.0 title', fallback_material.title + end + + test 'merges rss properties into bioschemas material for same url with bioschemas priority' do + rss_10_bioschemas_merged_feed_xml = <<~XML + + + + RSS 1.0 Bioschemas merged feed + https://example.org/rss10-merged + desc + + + + + + + + + RSS 1.0 fallback title + https://example.org/rss10/merged/material + RSS 1.0 fallback description that should fill missing bioschemas value + RSS 1.0 Merged 
Creator + rss10-merged-subject + 2024-05-01 + + + + + + + RSS 1.0 Bioschemas preferred title + + + + + XML + + read_xml(rss_10_bioschemas_merged_feed_xml) + + assert_equal 1, @ingestor.materials.count + + material = @ingestor.materials.first + assert_equal 'RSS 1.0 Bioschemas preferred title', material.title + assert_equal 'https://example.org/rss10/merged/material', material.url + assert_equal 'https://opensource.org/licenses/Apache-2.0', material.licence + assert_equal 'RSS 1.0 fallback description that should fill missing bioschemas value', material.description + assert_equal ['rss10-merged-subject'], material.keywords + assert_equal ['RSS 1.0 Merged Creator'], material.authors + assert_equal Date.new(2024, 5, 1), material.date_created.to_date + assert_equal Date.new(2024, 5, 1), material.date_modified.to_date + end + + test 'reads feed from html alternate meta link' do + start_url = 'https://www.youtube.com/@example' + feed_url = 'https://www.youtube.com/feeds/videos.xml?channel_id=UC123456789' + + html_with_alternate_feed_link = <<~HTML + + + + Channel + + + Channel page + + HTML + + atom_feed_xml = <<~XML + + + Minimal Atom material feed + + Alternate feed material + + Minimal content used for alternate-link test + Alternate Feed Author + 2024-02-02T03:04:05Z + + + XML + + read_xml_map( + { + start_url => html_with_alternate_feed_link, + feed_url => atom_feed_xml + }, + start_url + ) + + assert_equal 1, @ingestor.materials.count + assert_includes @ingestor.messages, "HTML page detected, following feed link: #{feed_url}" + assert_equal 'Alternate feed material', @ingestor.materials.first.title + end + + private + + def read_xml(xml, url = 'https://example.org/feed.xml') + @ingestor.stub(:open_url, StringIO.new(xml)) do + @ingestor.read(url) + end + end + + def read_xml_map(url_to_content, start_url) + @ingestor.stub(:open_url, lambda do |requested_url| + content = url_to_content[requested_url] + content.nil? ? 
nil : StringIO.new(content) + end) do + @ingestor.read(start_url) + end + end +end From 8c2880beb347b84598f78faac06ea24d633bcad6 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Wed, 8 Apr 2026 09:19:58 +0200 Subject: [PATCH 04/14] Add ingestors to factory --- lib/ingestors/ingestor_factory.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/ingestors/ingestor_factory.rb b/lib/ingestors/ingestor_factory.rb index 9bd8169fd..598e411b1 100644 --- a/lib/ingestors/ingestor_factory.rb +++ b/lib/ingestors/ingestor_factory.rb @@ -13,6 +13,8 @@ def self.ingestors Ingestors::ZenodoIngestor, Ingestors::OaiPmhIngestor, Ingestors::GithubIngestor, + Ingestors::EventRssIngestor, + Ingestors::MaterialRssIngestor ] + taxila_ingestors + llm_ingestors + heptraining_ingestors end From 54895a20b646b45eb650a37b729d50f05f97ebf4 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Wed, 8 Apr 2026 11:32:55 +0200 Subject: [PATCH 05/14] Add support for common extensions --- lib/ingestors/material_rss_ingestor.rb | 27 ++-- lib/rss/media.rb | 24 ++++ lib/rss/media/atom.rb | 36 ++++++ .../ingestors/material_rss_ingestor_test.rb | 122 +++++++++++++++++- 4 files changed, 194 insertions(+), 15 deletions(-) create mode 100644 lib/rss/media.rb create mode 100644 lib/rss/media/atom.rb diff --git a/lib/ingestors/material_rss_ingestor.rb b/lib/ingestors/material_rss_ingestor.rb index e445c8141..500e42539 100644 --- a/lib/ingestors/material_rss_ingestor.rb +++ b/lib/ingestors/material_rss_ingestor.rb @@ -1,4 +1,6 @@ require 'rss' +require 'rss/media' +require 'rss/itunes' require 'tess_rdf_extractors' module Ingestors @@ -65,18 +67,21 @@ def build_material_from_rss_item(item) material.title ||= text_value(item.title) native_url = text_value(item.link) material.url = native_url if native_url.present? 
- material.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded)) + itunes_summary = text_value(item.itunes_summary) if item.respond_to?(:itunes_summary) + material.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded) || itunes_summary) material.keywords = merge_unique(material.keywords, extract_rss_keywords(item)) - author = item.respond_to?(:author) ? item.author : nil - material.authors = merge_unique(material.authors, [text_value(author)]) + author = item.author if item.respond_to?(:author) + itunes_author = item.itunes_author if item.respond_to?(:itunes_author) + material.authors = merge_unique(material.authors, [text_value(author)] + [text_value(itunes_author)].compact) material.contact ||= material.authors&.first - guid = item.respond_to?(:guid) ? item.guid : nil + guid = item.guid if item.respond_to?(:guid) material.doi ||= extract_dublin_core_doi([text_value(guid)]) - item_date = parse_time(item.respond_to?(:pubDate) ? item.pubDate : nil) || parse_time(item.respond_to?(:date) ? item.date : nil) + item_date = parse_time(item.pubDate) if item.respond_to?(:pubDate) + item_date ||= parse_time(item.date) if item.respond_to?(:date) material.date_published ||= item_date material.date_created = prefer_precise_time(material.date_created, item_date) - material.date_modified = prefer_precise_time(material.date_modified, parse_time(item.respond_to?(:date) ? 
item.date : nil)) + material.date_modified = prefer_precise_time(material.date_modified, parse_time(item.date)) if item.respond_to?(:date) material end @@ -84,17 +89,19 @@ def build_material_from_rss_item(item) def build_material_from_atom_item(item) material = build_material_from_dublin_core_data(extract_dublin_core(item)) - material.title ||= text_value(item.title) + media_title = text_value(item.media_group&.media_title) + material.title ||= text_value(item.title) || media_title native_url = extract_atom_link(item) material.url = native_url if native_url.present? - material.description ||= convert_description(text_value(item.summary) || text_value(item.content)) + media_group_description = text_value(item.media_group&.media_description) + material.description ||= convert_description(text_value(item.summary) || text_value(item.content) || media_group_description) material.keywords = merge_unique(material.keywords, extract_atom_keywords(item)) material.authors = merge_unique(material.authors, extract_atom_authors(item)) material.contact ||= material.authors&.first material.doi ||= extract_dublin_core_doi([text_value(item.id)]) - published = parse_time(item.respond_to?(:published) ? item.published : nil) - updated = parse_time(item.respond_to?(:updated) ? 
item.updated : nil) + published = parse_time(item.published) + updated = parse_time(item.updated) material.date_created = prefer_precise_time(material.date_created, published) material.date_published ||= published || updated material.date_modified = prefer_precise_time(material.date_modified, updated) diff --git a/lib/rss/media.rb b/lib/rss/media.rb new file mode 100644 index 000000000..324d780a6 --- /dev/null +++ b/lib/rss/media.rb @@ -0,0 +1,24 @@ +require 'rss/atom' + +module RSS + MEDIA_PREFIX = 'media' + MEDIA_URI = 'http://search.yahoo.com/mrss/' + + module MediaGroupDescriptionModel + extend BaseModel + + def self.append_features(klass) + super + return if klass.instance_of?(Module) + + klass.install_must_call_validator(MEDIA_PREFIX, MEDIA_URI) + klass.install_have_child_element('group', MEDIA_URI, '?', 'media_group') + end + end + + BaseListener.install_class_name(MEDIA_URI, 'group', 'MediaGroup') + BaseListener.install_get_text_element(MEDIA_URI, 'title', 'media_title') + BaseListener.install_get_text_element(MEDIA_URI, 'description', 'media_description') +end + +require_relative 'media/atom' diff --git a/lib/rss/media/atom.rb b/lib/rss/media/atom.rb new file mode 100644 index 000000000..797c9f484 --- /dev/null +++ b/lib/rss/media/atom.rb @@ -0,0 +1,36 @@ +module RSS + module Atom + Feed.install_ns(MEDIA_PREFIX, MEDIA_URI) + + class Feed + include MediaGroupDescriptionModel + class Entry + include MediaGroupDescriptionModel + + class MediaGroup < Element + include RSS09 + + @tag_name = 'group' + + class << self + def required_prefix + MEDIA_PREFIX + end + + def required_uri + MEDIA_URI + end + end + + install_must_call_validator(MEDIA_PREFIX, MEDIA_URI) + install_text_element('title', MEDIA_URI, '?', 'media_title') + install_text_element('description', MEDIA_URI, '?', 'media_description') + end + end + end + + class Entry + include MediaGroupDescriptionModel + end + end +end diff --git a/test/unit/ingestors/material_rss_ingestor_test.rb 
b/test/unit/ingestors/material_rss_ingestor_test.rb index f4ed074d0..a315e0aed 100644 --- a/test/unit/ingestors/material_rss_ingestor_test.rb +++ b/test/unit/ingestors/material_rss_ingestor_test.rb @@ -4,11 +4,6 @@ class MaterialRssIngestorTest < ActiveSupport::TestCase setup do @ingestor = Ingestors::MaterialRssIngestor.new - mock_timezone - end - - teardown do - reset_timezone end test 'reads rss items from dublin core and native rss fields' do @@ -462,6 +457,123 @@ class MaterialRssIngestorTest < ActiveSupport::TestCase assert_equal 'Alternate feed material', @ingestor.materials.first.title end + test 'uses native atom title and description taking precedence over media extension' do + atom_feed_xml = <<~XML + + + Atom media precedence feed + + + yt:video:abc123 + Native Atom title wins + + Native Atom summary wins + Atom Author + 2024-02-02T03:04:05Z + 2024-02-03T03:04:05Z + + Media title ignored + Media description ignored + + + + XML + + read_xml(atom_feed_xml) + + assert_equal 1, @ingestor.materials.count + material = @ingestor.materials.first + assert_equal 'Native Atom title wins', material.title + assert_equal 'Native Atom summary wins', material.description + end + + test 'uses media extension title and description for atom item when native ones are missing' do + atom_feed_xml = <<~XML + + + Atom media extension feed + + + yt:video:fallback123 + + Atom Author + 2024-02-02T03:04:05Z + 2024-02-03T03:04:05Z + + Media title used here + Media description used here + + + + XML + + read_xml(atom_feed_xml) + + assert_equal 1, @ingestor.materials.count + material = @ingestor.materials.first + assert_equal 'Media title used here', material.title + assert_equal 'Media description used here', material.description + end + + test 'parses media group description through rss media extension' do + atom_feed_xml = <<~XML + + + Media extension feed + urn:feed:test + 2024-01-01T00:00:00Z + + + urn:entry:test + Media extension title + + 2024-01-01T00:00:00Z + + Media 
extension description + + + + XML + + feed = RSS::Parser.parse(atom_feed_xml, validate: false, ignore_unknown_element: true) + item = feed.items.first + + assert item.respond_to?(:media_group) + assert_equal 'Media extension description', item.media_group.media_description + end + + test 'uses itunes extension summary for rss item when native description is missing' do + rss_feed_xml = <<~XML + + + + RSS iTunes extension feed + + RSS item with iTunes summary + https://example.org/rss/itunes-summary + RSS Author + Fri, 02 Feb 2024 03:04:05 GMT + iTunes summary used here + iTunes Author + + + + XML + + read_xml(rss_feed_xml) + + assert_equal 1, @ingestor.materials.count + material = @ingestor.materials.first + assert_equal 'RSS item with iTunes summary', material.title + assert_equal 'iTunes summary used here', material.description + assert_includes material.authors, 'RSS Author' + assert_includes material.authors, 'iTunes Author' + end + private def read_xml(xml, url = 'https://example.org/feed.xml') From 3cea73b534bffdf333212f1d32751da56d8f5eb9 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Wed, 8 Apr 2026 12:46:13 +0200 Subject: [PATCH 06/14] Fix Zeitwerk inflection problem with RSS --- config/initializers/inflections.rb | 4 ++ lib/ingestors/event_rss_ingestor.rb | 4 +- lib/ingestors/ingestor_factory.rb | 4 +- lib/ingestors/material_rss_ingestor.rb | 5 +- lib/ingestors/rss_ingestion.rb | 2 +- lib/rss/media.rb | 34 ++++++++----- lib/rss/media/atom.rb | 49 ++++++++++--------- .../unit/ingestors/event_rss_ingestor_test.rb | 4 +- .../ingestors/material_rss_ingestor_test.rb | 4 +- 9 files changed, 63 insertions(+), 47 deletions(-) diff --git a/config/initializers/inflections.rb b/config/initializers/inflections.rb index 3860f659e..a45df1401 100644 --- a/config/initializers/inflections.rb +++ b/config/initializers/inflections.rb @@ -14,3 +14,7 @@ # ActiveSupport::Inflector.inflections(:en) do |inflect| # inflect.acronym "RESTful" # end + 
+ActiveSupport::Inflector.inflections(:en) do |inflect| + inflect.acronym 'RSS' +end diff --git a/lib/ingestors/event_rss_ingestor.rb b/lib/ingestors/event_rss_ingestor.rb index f53267d4e..b25dc96e2 100644 --- a/lib/ingestors/event_rss_ingestor.rb +++ b/lib/ingestors/event_rss_ingestor.rb @@ -2,8 +2,8 @@ require 'tess_rdf_extractors' module Ingestors - class EventRssIngestor < Ingestor - include RssIngestion + class EventRSSIngestor < Ingestor + include RSSIngestion def initialize super diff --git a/lib/ingestors/ingestor_factory.rb b/lib/ingestors/ingestor_factory.rb index 598e411b1..00fa48c26 100644 --- a/lib/ingestors/ingestor_factory.rb +++ b/lib/ingestors/ingestor_factory.rb @@ -13,8 +13,8 @@ def self.ingestors Ingestors::ZenodoIngestor, Ingestors::OaiPmhIngestor, Ingestors::GithubIngestor, - Ingestors::EventRssIngestor, - Ingestors::MaterialRssIngestor + Ingestors::EventRSSIngestor, + Ingestors::MaterialRSSIngestor ] + taxila_ingestors + llm_ingestors + heptraining_ingestors end diff --git a/lib/ingestors/material_rss_ingestor.rb b/lib/ingestors/material_rss_ingestor.rb index 500e42539..44e761c17 100644 --- a/lib/ingestors/material_rss_ingestor.rb +++ b/lib/ingestors/material_rss_ingestor.rb @@ -1,11 +1,10 @@ require 'rss' require 'rss/media' -require 'rss/itunes' require 'tess_rdf_extractors' module Ingestors - class MaterialRssIngestor < Ingestor - include RssIngestion + class MaterialRSSIngestor < Ingestor + include RSSIngestion def initialize super diff --git a/lib/ingestors/rss_ingestion.rb b/lib/ingestors/rss_ingestion.rb index 300f8c8a3..e6bb557c3 100644 --- a/lib/ingestors/rss_ingestion.rb +++ b/lib/ingestors/rss_ingestion.rb @@ -1,5 +1,5 @@ module Ingestors - module RssIngestion + module RSSIngestion include DublinCoreIngestion # Fetches and parses a feed from the URL, with optional HTML feed discovery. 
diff --git a/lib/rss/media.rb b/lib/rss/media.rb index 324d780a6..bbc209079 100644 --- a/lib/rss/media.rb +++ b/lib/rss/media.rb @@ -1,24 +1,32 @@ +# Extension for the Yahoo Media RSS namespace (xmlns:media="http://search.yahoo.com/mrss/"). +# Used by feeds that carry rich media metadata, e.g. YouTube channel feeds which include +# <media:group>, <media:title>, and <media:description> elements. +# +# The extension is structured as RSS::Media (rather than a flat module inside RSS) so that +# Zeitwerk can autoload it correctly from lib/rss/media.rb. require 'rss/atom' module RSS - MEDIA_PREFIX = 'media' - MEDIA_URI = 'http://search.yahoo.com/mrss/' + module Media + MEDIA_PREFIX = 'media' + MEDIA_URI = 'http://search.yahoo.com/mrss/' - module MediaGroupDescriptionModel - extend BaseModel + module MediaGroupDescriptionModel + extend ::RSS::BaseModel - def self.append_features(klass) - super - return if klass.instance_of?(Module) + def self.append_features(klass) + super + return if klass.instance_of?(Module) - klass.install_must_call_validator(MEDIA_PREFIX, MEDIA_URI) - klass.install_have_child_element('group', MEDIA_URI, '?', 'media_group') + klass.install_must_call_validator(MEDIA_PREFIX, MEDIA_URI) + klass.install_have_child_element('group', MEDIA_URI, '?', 'media_group') + end end - end - BaseListener.install_class_name(MEDIA_URI, 'group', 'MediaGroup') - BaseListener.install_get_text_element(MEDIA_URI, 'title', 'media_title') - BaseListener.install_get_text_element(MEDIA_URI, 'description', 'media_description') + ::RSS::BaseListener.install_class_name(MEDIA_URI, 'group', 'MediaGroup') + ::RSS::BaseListener.install_get_text_element(MEDIA_URI, 'title', 'media_title') + ::RSS::BaseListener.install_get_text_element(MEDIA_URI, 'description', 'media_description') + end end require_relative 'media/atom' diff --git a/lib/rss/media/atom.rb b/lib/rss/media/atom.rb index 797c9f484..5a725da88 100644 --- a/lib/rss/media/atom.rb +++ b/lib/rss/media/atom.rb @@ -1,36 +1,41 @@ +# Patches RSS::Atom::Feed and RSS::Atom::Entry
with Media namespace support (see ../media.rb). +# Kept as RSS::Media::Atom so Zeitwerk can autoload it from lib/rss/media/atom.rb. module RSS - module Atom - Feed.install_ns(MEDIA_PREFIX, MEDIA_URI) + module Media + module Atom + ::RSS::Atom::Feed.install_ns(MEDIA_PREFIX, MEDIA_URI) - class Feed - include MediaGroupDescriptionModel - class Entry - include MediaGroupDescriptionModel + class ::RSS::Atom::Feed + include ::RSS::Media::MediaGroupDescriptionModel - class MediaGroup < Element - include RSS09 + class Entry + include ::RSS::Media::MediaGroupDescriptionModel - @tag_name = 'group' + class MediaGroup < Element + include RSS09 - class << self - def required_prefix - MEDIA_PREFIX - end + @tag_name = 'group' + + class << self + def required_prefix + ::RSS::Media::MEDIA_PREFIX + end - def required_uri - MEDIA_URI + def required_uri + ::RSS::Media::MEDIA_URI + end end - end - install_must_call_validator(MEDIA_PREFIX, MEDIA_URI) - install_text_element('title', MEDIA_URI, '?', 'media_title') - install_text_element('description', MEDIA_URI, '?', 'media_description') + install_must_call_validator(::RSS::Media::MEDIA_PREFIX, ::RSS::Media::MEDIA_URI) + install_text_element('title', ::RSS::Media::MEDIA_URI, '?', 'media_title') + install_text_element('description', ::RSS::Media::MEDIA_URI, '?', 'media_description') + end end end - end - class Entry - include MediaGroupDescriptionModel + class ::RSS::Atom::Entry + include ::RSS::Media::MediaGroupDescriptionModel + end end end end diff --git a/test/unit/ingestors/event_rss_ingestor_test.rb b/test/unit/ingestors/event_rss_ingestor_test.rb index 30bf3f7fb..058ffc1c5 100644 --- a/test/unit/ingestors/event_rss_ingestor_test.rb +++ b/test/unit/ingestors/event_rss_ingestor_test.rb @@ -1,9 +1,9 @@ require 'test_helper' require 'stringio' -class EventRssIngestorTest < ActiveSupport::TestCase +class EventRSSIngestorTest < ActiveSupport::TestCase setup do - @ingestor = Ingestors::EventRssIngestor.new + @ingestor = 
Ingestors::EventRSSIngestor.new mock_timezone end diff --git a/test/unit/ingestors/material_rss_ingestor_test.rb b/test/unit/ingestors/material_rss_ingestor_test.rb index a315e0aed..b012bf8f3 100644 --- a/test/unit/ingestors/material_rss_ingestor_test.rb +++ b/test/unit/ingestors/material_rss_ingestor_test.rb @@ -1,9 +1,9 @@ require 'test_helper' require 'stringio' -class MaterialRssIngestorTest < ActiveSupport::TestCase +class MaterialRSSIngestorTest < ActiveSupport::TestCase setup do - @ingestor = Ingestors::MaterialRssIngestor.new + @ingestor = Ingestors::MaterialRSSIngestor.new end test 'reads rss items from dublin core and native rss fields' do From 2c7c05e8a8b3b0f09e89ad5a0b25aaf016e60e48 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Wed, 8 Apr 2026 13:04:58 +0200 Subject: [PATCH 07/14] Add support for relative urls --- lib/ingestors/event_rss_ingestor.rb | 16 +++++----- lib/ingestors/material_rss_ingestor.rb | 16 +++++----- lib/ingestors/rss_ingestion.rb | 30 ++++++++++++++----- .../unit/ingestors/event_rss_ingestor_test.rb | 22 ++++++++++++++ 4 files changed, 60 insertions(+), 24 deletions(-) diff --git a/lib/ingestors/event_rss_ingestor.rb b/lib/ingestors/event_rss_ingestor.rb index b25dc96e2..f0f511714 100644 --- a/lib/ingestors/event_rss_ingestor.rb +++ b/lib/ingestors/event_rss_ingestor.rb @@ -20,22 +20,22 @@ def self.config end def read(url) - feed, content = fetch_feed(url) + feed, content, source_url = fetch_feed(url) return if feed.nil? 
if feed.is_a?(RSS::Rss) @messages << "Parsing RSS feed: #{feed_title(feed)}" - feed.items.each { |item| add_event(build_event_from_rss_item(item)) } + feed.items.each { |item| add_event(build_event_from_rss_item(item, source_url)) } elsif feed.is_a?(RSS::RDF) @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}" - rss_events = feed.items.map { |item| build_event_from_rss_item(item).to_h } + rss_events = feed.items.map { |item| build_event_from_rss_item(item, source_url).to_h } bioschemas_events = extract_rdf_bioschemas_events(content) merge_with_bioschemas_priority(bioschemas_events, rss_events).each do |event| add_event(event) end elsif feed.is_a?(RSS::Atom::Feed) @messages << "Parsing ATOM feed: #{feed_title(feed)}" - feed.items.each { |item| add_event(build_event_from_atom_item(item)) } + feed.items.each { |item| add_event(build_event_from_atom_item(item, source_url)) } else @messages << "Parsing UNKNOWN feed: #{feed_title(feed)}" @messages << 'unsupported feed format' @@ -65,11 +65,11 @@ def extract_rdf_bioschemas_events(content) [] end - def build_event_from_rss_item(item) + def build_event_from_rss_item(item, feed_url) event = build_event_from_dublin_core_data(extract_dublin_core(item)) event.title ||= text_value(item.title) - native_url = text_value(item.link) + native_url = resolve_feed_url(item.link, feed_url) event.url = native_url if native_url.present? event.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded)) event.keywords = merge_unique(event.keywords, extract_rss_keywords(item)) @@ -84,11 +84,11 @@ def build_event_from_rss_item(item) event end - def build_event_from_atom_item(item) + def build_event_from_atom_item(item, feed_url) event = build_event_from_dublin_core_data(extract_dublin_core(item)) event.title ||= text_value(item.title) - native_url = extract_atom_link(item) + native_url = resolve_feed_url(extract_atom_link(item), feed_url) event.url = native_url if native_url.present? 
event.description ||= convert_description(text_value(item.summary) || text_value(item.content)) event.keywords = merge_unique(event.keywords, extract_atom_keywords(item)) diff --git a/lib/ingestors/material_rss_ingestor.rb b/lib/ingestors/material_rss_ingestor.rb index 44e761c17..4484aca9d 100644 --- a/lib/ingestors/material_rss_ingestor.rb +++ b/lib/ingestors/material_rss_ingestor.rb @@ -21,22 +21,22 @@ def self.config end def read(url) - feed, content = fetch_feed(url) + feed, content, source_url = fetch_feed(url) return if feed.nil? if feed.is_a?(RSS::Rss) @messages << "Parsing RSS feed: #{feed_title(feed)}" - feed.items.each { |item| add_material(build_material_from_rss_item(item)) } + feed.items.each { |item| add_material(build_material_from_rss_item(item, source_url)) } elsif feed.is_a?(RSS::RDF) @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}" - rss_materials = feed.items.map { |item| build_material_from_rss_item(item).to_h } + rss_materials = feed.items.map { |item| build_material_from_rss_item(item, source_url).to_h } bioschemas_materials = extract_rdf_bioschemas_materials(content) merge_with_bioschemas_priority(bioschemas_materials, rss_materials).each do |material| add_material(material) end elsif feed.is_a?(RSS::Atom::Feed) @messages << "Parsing ATOM feed: #{feed_title(feed)}" - feed.items.each { |item| add_material(build_material_from_atom_item(item)) } + feed.items.each { |item| add_material(build_material_from_atom_item(item, source_url)) } else @messages << "Parsing UNKNOWN feed: #{feed_title(feed)}" @messages << 'unsupported feed format' @@ -60,11 +60,11 @@ def extract_rdf_bioschemas_materials(content) [] end - def build_material_from_rss_item(item) + def build_material_from_rss_item(item, feed_url) material = build_material_from_dublin_core_data(extract_dublin_core(item)) material.title ||= text_value(item.title) - native_url = text_value(item.link) + native_url = resolve_feed_url(item.link, feed_url) material.url = native_url if 
native_url.present? itunes_summary = text_value(item.itunes_summary) if item.respond_to?(:itunes_summary) material.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded) || itunes_summary) @@ -85,12 +85,12 @@ def build_material_from_rss_item(item) material end - def build_material_from_atom_item(item) + def build_material_from_atom_item(item, feed_url) material = build_material_from_dublin_core_data(extract_dublin_core(item)) media_title = text_value(item.media_group&.media_title) material.title ||= text_value(item.title) || media_title - native_url = extract_atom_link(item) + native_url = resolve_feed_url(extract_atom_link(item), feed_url) material.url = native_url if native_url.present? media_group_description = text_value(item.media_group&.media_description) material.description ||= convert_description(text_value(item.summary) || text_value(item.content) || media_group_description) diff --git a/lib/ingestors/rss_ingestion.rb b/lib/ingestors/rss_ingestion.rb index e6bb557c3..fcd3f89c6 100644 --- a/lib/ingestors/rss_ingestion.rb +++ b/lib/ingestors/rss_ingestion.rb @@ -3,34 +3,34 @@ module RSSIngestion include DublinCoreIngestion # Fetches and parses a feed from the URL, with optional HTML feed discovery. - # Returns [feed, parsed_content] on success, where parsed_content is the XML/Atom string used. - # Returns [nil, nil] when the URL cannot be opened or parsing/discovery fails. + # Returns [feed, raw_xml, source_url] on success. + # Returns [nil, nil, nil] when the URL cannot be opened or parsing/discovery fails. def fetch_feed(url) io = open_url(url) - return [nil, nil] if io.nil? + return [nil, nil, nil] if io.nil? content = io.read feed, parse_error_message = parse_feed(content) - return [feed, content] unless feed.nil? + return [feed, content, url] unless feed.nil? discovered_feed_url = discover_feed_url_from_html(content, url) if discovered_feed_url.blank? 
@messages << parse_error_message - return [nil, nil] + return [nil, nil, nil] end @messages << "HTML page detected, following feed link: #{discovered_feed_url}" discovered_io = open_url(discovered_feed_url) - return [nil, nil] if discovered_io.nil? + return [nil, nil, nil] if discovered_io.nil? discovered_content = discovered_io.read discovered_feed, discovered_parse_error_message = parse_feed(discovered_content) if discovered_feed.blank? @messages << discovered_parse_error_message - return [nil, nil] + return [nil, nil, nil] end - [discovered_feed, discovered_content] + [discovered_feed, discovered_content, discovered_feed_url] end def parse_feed(content) @@ -116,6 +116,20 @@ def extract_atom_link(item) item.links.map { |l| text_value(l.href) }.find(&:present?) end + def resolve_feed_url(candidate_url, feed_url) + candidate = text_value(candidate_url) + return nil if candidate.blank? + + URI.parse(candidate) + return candidate if URI::DEFAULT_PARSER.make_regexp(%w[http https]).match?(candidate) + + URI.join(feed_url, candidate).to_s + rescue URI::InvalidURIError + URI.join(feed_url, candidate).to_s + rescue StandardError + candidate + end + def prefer_precise_time(existing_value, candidate_time) return existing_value if candidate_time.blank? return candidate_time if existing_value.blank? 
diff --git a/test/unit/ingestors/event_rss_ingestor_test.rb b/test/unit/ingestors/event_rss_ingestor_test.rb index 058ffc1c5..3cfebb9cc 100644 --- a/test/unit/ingestors/event_rss_ingestor_test.rb +++ b/test/unit/ingestors/event_rss_ingestor_test.rb @@ -75,6 +75,28 @@ class EventRSSIngestorTest < ActiveSupport::TestCase assert_equal Time.utc(2024, 6, 3, 12, 0, 0), fallback_event.end.utc end + test 'resolves relative rss item links against the feed url' do + rss_feed_xml = <<~XML + + + + Relative link feed + + Relative URL event + /community-calls/april-2026/ + Event from a feed with relative links + Wed, 08 Apr 2026 10:00:00 GMT + + + + XML + + read_xml(rss_feed_xml, 'https://nl-rse.org/feed.xml') + + assert_equal 1, @ingestor.events.count + assert_equal 'https://nl-rse.org/community-calls/april-2026/', @ingestor.events.first.url + end + test 'reads atom items from dublin core and native atom fields' do atom_feed_xml = <<~XML From a515e46d9420de2dac7ec862ec165dc2dd97fa99 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Wed, 8 Apr 2026 13:38:04 +0200 Subject: [PATCH 08/14] Fixes from testing many RSS feeds --- lib/ingestors/rss_ingestion.rb | 3 ++- lib/rss/media/atom.rb | 15 ++++++++++++++- test/unit/ingestors/event_rss_ingestor_test.rb | 7 +++++-- test/unit/ingestors/material_rss_ingestor_test.rb | 7 +++++-- test/unit/rss_media_atom_test.rb | 12 ++++++++++++ 5 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 test/unit/rss_media_atom_test.rb diff --git a/lib/ingestors/rss_ingestion.rb b/lib/ingestors/rss_ingestion.rb index fcd3f89c6..2615cdf1a 100644 --- a/lib/ingestors/rss_ingestion.rb +++ b/lib/ingestors/rss_ingestion.rb @@ -16,10 +16,11 @@ def fetch_feed(url) discovered_feed_url = discover_feed_url_from_html(content, url) if discovered_feed_url.blank? 
@messages << parse_error_message + @messages << "Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in: #{url}" return [nil, nil, nil] end - @messages << "HTML page detected, following feed link: #{discovered_feed_url}" + @messages << "Found RSS/Atom alternate feed link during HTML discovery, following: #{discovered_feed_url}" discovered_io = open_url(discovered_feed_url) return [nil, nil, nil] if discovered_io.nil? diff --git a/lib/rss/media/atom.rb b/lib/rss/media/atom.rb index 5a725da88..08bb76633 100644 --- a/lib/rss/media/atom.rb +++ b/lib/rss/media/atom.rb @@ -3,7 +3,20 @@ module RSS module Media module Atom - ::RSS::Atom::Feed.install_ns(MEDIA_PREFIX, MEDIA_URI) + def self.install_media_namespace! + # This extension can be evaluated more than once in reloader/autoload flows. + # RSS::Element.install_ns raises on duplicate prefixes, so treat same mapping as a no-op. + ns_pool = ::RSS::Atom::Feed::NSPOOL + existing_uri = ns_pool[MEDIA_PREFIX] + + return if existing_uri == MEDIA_URI + + raise ::RSS::OverlappedPrefixError, MEDIA_PREFIX unless existing_uri.nil? + + ::RSS::Atom::Feed.install_ns(MEDIA_PREFIX, MEDIA_URI) + end + + install_media_namespace! 
class ::RSS::Atom::Feed include ::RSS::Media::MediaGroupDescriptionModel diff --git a/test/unit/ingestors/event_rss_ingestor_test.rb b/test/unit/ingestors/event_rss_ingestor_test.rb index 3cfebb9cc..16479781d 100644 --- a/test/unit/ingestors/event_rss_ingestor_test.rb +++ b/test/unit/ingestors/event_rss_ingestor_test.rb @@ -294,15 +294,18 @@ class EventRSSIngestorTest < ActiveSupport::TestCase ) assert_equal 1, @ingestor.events.count - assert_includes @ingestor.messages, "HTML page detected, following feed link: #{feed_url}" + assert_includes @ingestor.messages, + "Found RSS/Atom alternate feed link during HTML discovery, following: #{feed_url}" assert_equal 'Alternate feed event', @ingestor.events.first.title end test 'logs parse error for invalid feed input' do read_xml('not valid rss or atom') - assert_equal 1, @ingestor.messages.length + assert_equal 2, @ingestor.messages.length assert_match(/^parsing feed failed with: This is not well formed XML/, @ingestor.messages.first) + assert_match(%r{^Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in:}, + @ingestor.messages.second) assert_empty @ingestor.events end diff --git a/test/unit/ingestors/material_rss_ingestor_test.rb b/test/unit/ingestors/material_rss_ingestor_test.rb index b012bf8f3..a1623df27 100644 --- a/test/unit/ingestors/material_rss_ingestor_test.rb +++ b/test/unit/ingestors/material_rss_ingestor_test.rb @@ -231,8 +231,10 @@ class MaterialRSSIngestorTest < ActiveSupport::TestCase test 'logs parse error for invalid feed input' do read_xml('not valid rss or atom') - assert_equal 1, @ingestor.messages.length + assert_equal 2, @ingestor.messages.length assert_match(/^parsing feed failed with: This is not well formed XML/, @ingestor.messages.first) + assert_match(%r{^Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in:}, + @ingestor.messages.second) assert_empty @ingestor.materials end @@ -453,7 +455,8 @@ class MaterialRSSIngestorTest < 
ActiveSupport::TestCase ) assert_equal 1, @ingestor.materials.count - assert_includes @ingestor.messages, "HTML page detected, following feed link: #{feed_url}" + assert_includes @ingestor.messages, + "Found RSS/Atom alternate feed link during HTML discovery, following: #{feed_url}" assert_equal 'Alternate feed material', @ingestor.materials.first.title end diff --git a/test/unit/rss_media_atom_test.rb b/test/unit/rss_media_atom_test.rb new file mode 100644 index 000000000..3faf0f589 --- /dev/null +++ b/test/unit/rss_media_atom_test.rb @@ -0,0 +1,12 @@ +require 'test_helper' + +class RSSMediaAtomTest < ActiveSupport::TestCase + test 'install_media_namespace! is idempotent for the media prefix' do + assert_nothing_raised do + RSS::Media::Atom.install_media_namespace! + RSS::Media::Atom.install_media_namespace! + end + + assert_equal RSS::Media::MEDIA_URI, RSS::Atom::Feed::NSPOOL[RSS::Media::MEDIA_PREFIX] + end +end From cd91db60878d36642d664b8d3ec52a0d289746fc Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Wed, 8 Apr 2026 13:49:08 +0200 Subject: [PATCH 09/14] Remove start and end date for events based on date published in rss --- lib/ingestors/event_rss_ingestor.rb | 9 --------- test/unit/ingestors/event_rss_ingestor_test.rb | 18 +++++++++--------- .../ingestors/material_rss_ingestor_test.rb | 4 ++-- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/lib/ingestors/event_rss_ingestor.rb b/lib/ingestors/event_rss_ingestor.rb index f0f511714..d02332a08 100644 --- a/lib/ingestors/event_rss_ingestor.rb +++ b/lib/ingestors/event_rss_ingestor.rb @@ -77,10 +77,6 @@ def build_event_from_rss_item(item, feed_url) event.organizer ||= organizer event.contact ||= organizer - item_date = parse_time(item.respond_to?(:pubDate) ? item.pubDate : nil) || parse_time(item.respond_to?(:date) ? 
item.date : nil) - event.start = prefer_precise_time(event.start, item_date) - event.end = prefer_precise_time(event.end, item_date) - event end @@ -96,11 +92,6 @@ def build_event_from_atom_item(item, feed_url) event.organizer ||= organizer event.contact ||= organizer - published = parse_time(item.respond_to?(:published) ? item.published : nil) - updated = parse_time(item.respond_to?(:updated) ? item.updated : nil) - event.start = prefer_precise_time(event.start, published || updated) - event.end = prefer_precise_time(event.end, updated || published) - event end end diff --git a/test/unit/ingestors/event_rss_ingestor_test.rb b/test/unit/ingestors/event_rss_ingestor_test.rb index 16479781d..f4beb7b7b 100644 --- a/test/unit/ingestors/event_rss_ingestor_test.rb +++ b/test/unit/ingestors/event_rss_ingestor_test.rb @@ -60,7 +60,7 @@ class EventRSSIngestorTest < ActiveSupport::TestCase assert_equal 'rss event publisher', dc_event.contact assert_equal %w[event-topic-a native-event-category], dc_event.keywords assert_equal ['workshop'], dc_event.event_types - assert_equal Time.utc(2024, 6, 1, 9, 0, 0), dc_event.start.utc + assert_equal Date.new(2024, 6, 1), dc_event.start assert_equal Date.new(2024, 6, 2), dc_event.end.to_date fallback_event = @ingestor.events.second @@ -71,8 +71,8 @@ class EventRSSIngestorTest < ActiveSupport::TestCase assert_equal 'Fallback RSS Author', fallback_event.contact assert_equal ['fallback-event-category'], fallback_event.keywords assert_equal [], fallback_event.event_types - assert_equal Time.utc(2024, 6, 3, 12, 0, 0), fallback_event.start.utc - assert_equal Time.utc(2024, 6, 3, 12, 0, 0), fallback_event.end.utc + assert_nil fallback_event.start + assert_nil fallback_event.end end test 'resolves relative rss item links against the feed url' do @@ -146,8 +146,8 @@ class EventRSSIngestorTest < ActiveSupport::TestCase assert_equal 'atom event publisher', dc_event.contact assert_equal %w[atom-event-topic native-atom-event-category], 
dc_event.keywords assert_equal ['seminar'], dc_event.event_types - assert_equal Time.utc(2024, 7, 1, 10, 0, 0), dc_event.start.utc - assert_equal Time.utc(2024, 7, 2, 11, 0, 0), dc_event.end.utc + assert_equal Date.new(2024, 7, 1), dc_event.start + assert_equal Date.new(2024, 7, 2), dc_event.end fallback_event = @ingestor.events.second assert_equal 'Fallback Atom event title', fallback_event.title @@ -157,8 +157,8 @@ class EventRSSIngestorTest < ActiveSupport::TestCase assert_equal 'Fallback Atom Author', fallback_event.contact assert_equal ['fallback-atom-event-category'], fallback_event.keywords assert_equal [], fallback_event.event_types - assert_equal Time.utc(2024, 7, 3, 10, 0, 0), fallback_event.start.utc - assert_equal Time.utc(2024, 7, 4, 11, 0, 0), fallback_event.end.utc + assert_nil fallback_event.start + assert_nil fallback_event.end end test 'reads bioschemas event from rss 1.0 rdf feed' do @@ -295,7 +295,7 @@ class EventRSSIngestorTest < ActiveSupport::TestCase assert_equal 1, @ingestor.events.count assert_includes @ingestor.messages, - "Found RSS/Atom alternate feed link during HTML discovery, following: #{feed_url}" + "Found RSS/Atom alternate feed link during HTML discovery, following: #{feed_url}" assert_equal 'Alternate feed event', @ingestor.events.first.title end @@ -305,7 +305,7 @@ class EventRSSIngestorTest < ActiveSupport::TestCase assert_equal 2, @ingestor.messages.length assert_match(/^parsing feed failed with: This is not well formed XML/, @ingestor.messages.first) assert_match(%r{^Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in:}, - @ingestor.messages.second) + @ingestor.messages.second) assert_empty @ingestor.events end diff --git a/test/unit/ingestors/material_rss_ingestor_test.rb b/test/unit/ingestors/material_rss_ingestor_test.rb index a1623df27..ff0e3b7b4 100644 --- a/test/unit/ingestors/material_rss_ingestor_test.rb +++ b/test/unit/ingestors/material_rss_ingestor_test.rb @@ -234,7 +234,7 @@ class 
MaterialRSSIngestorTest < ActiveSupport::TestCase assert_equal 2, @ingestor.messages.length assert_match(/^parsing feed failed with: This is not well formed XML/, @ingestor.messages.first) assert_match(%r{^Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in:}, - @ingestor.messages.second) + @ingestor.messages.second) assert_empty @ingestor.materials end @@ -456,7 +456,7 @@ class MaterialRSSIngestorTest < ActiveSupport::TestCase assert_equal 1, @ingestor.materials.count assert_includes @ingestor.messages, - "Found RSS/Atom alternate feed link during HTML discovery, following: #{feed_url}" + "Found RSS/Atom alternate feed link during HTML discovery, following: #{feed_url}" assert_equal 'Alternate feed material', @ingestor.materials.first.title end From b8f19c6769ec74b8f0d820e103688967fd91e8b5 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Wed, 8 Apr 2026 16:23:57 +0200 Subject: [PATCH 10/14] Add feed url discovery from youtube url --- lib/ingestors/rss_ingestion.rb | 37 ++++++++++++++++--- .../unit/ingestors/event_rss_ingestor_test.rb | 11 +++++- .../ingestors/material_rss_ingestor_test.rb | 2 +- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/lib/ingestors/rss_ingestion.rb b/lib/ingestors/rss_ingestion.rb index 2615cdf1a..eea927aaf 100644 --- a/lib/ingestors/rss_ingestion.rb +++ b/lib/ingestors/rss_ingestion.rb @@ -1,5 +1,7 @@ module Ingestors module RSSIngestion + require 'cgi' + include DublinCoreIngestion # Fetches and parses a feed from the URL, with optional HTML feed discovery. @@ -13,14 +15,12 @@ def fetch_feed(url) feed, parse_error_message = parse_feed(content) return [feed, content, url] unless feed.nil? - discovered_feed_url = discover_feed_url_from_html(content, url) + discovered_feed_url = discover_feed_url(content, url) if discovered_feed_url.blank? 
@messages << parse_error_message @messages << "Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in: #{url}" return [nil, nil, nil] end - - @messages << "Found RSS/Atom alternate feed link during HTML discovery, following: #{discovered_feed_url}" discovered_io = open_url(discovered_feed_url) return [nil, nil, nil] if discovered_io.nil? @@ -43,6 +43,20 @@ def parse_feed(content) [nil, "parsing feed failed with: #{e.message}"] end + def discover_feed_url(content, base_url) + if (url = discover_feed_url_from_html(content, base_url)) + @messages << "Found RSS/Atom feed link in HTML page, following: #{url}" + return url + end + + if (url = discover_feed_url_from_youtube_playlist_url(base_url)) + @messages << "Found Atom feed link from YouTube playlist URL, following: #{url}" + return url + end + + nil + end + def discover_feed_url_from_html(content, base_url) doc = Nokogiri::HTML(content) link = doc.css('link[rel]').find do |node| @@ -52,13 +66,24 @@ def discover_feed_url_from_html(content, base_url) end href = link&.[]('href') - return nil if href.blank? - - URI.join(base_url, href).to_s + URI.join(base_url, href).to_s if href.present? rescue StandardError nil end + def discover_feed_url_from_youtube_playlist_url(base_url) + uri = URI.parse(base_url) + host = uri.host.to_s.downcase + return nil unless host == 'youtube.com' || host.end_with?('.youtube.com') + + playlist_id = CGI.parse(uri.query.to_s).fetch('list', []).first + return nil if playlist_id.blank? + + "https://www.youtube.com/feeds/videos.xml?playlist_id=#{CGI.escape(playlist_id)}" + rescue URI::InvalidURIError + nil + end + def feed_title(feed) channel = feed.respond_to?(:channel) ? feed.channel : nil return channel.title if channel.present? 
&& channel.respond_to?(:title) diff --git a/test/unit/ingestors/event_rss_ingestor_test.rb b/test/unit/ingestors/event_rss_ingestor_test.rb index f4beb7b7b..4ed4e83b0 100644 --- a/test/unit/ingestors/event_rss_ingestor_test.rb +++ b/test/unit/ingestors/event_rss_ingestor_test.rb @@ -295,10 +295,19 @@ class EventRSSIngestorTest < ActiveSupport::TestCase assert_equal 1, @ingestor.events.count assert_includes @ingestor.messages, - "Found RSS/Atom alternate feed link during HTML discovery, following: #{feed_url}" + "Found RSS/Atom feed link in HTML page, following: #{feed_url}" assert_equal 'Alternate feed event', @ingestor.events.first.title end + test 'discovers youtube playlist feed from watch url' do + html_without_feed_link = 'No feed link' + start_url = 'https://www.youtube.com/watch?v=z58CgdgFC8s&list=PLOVYPdB2NZ6M-hQfAIn6srsxfzuNp1EPd&index=1' + + discovered_url = @ingestor.send(:discover_feed_url, html_without_feed_link, start_url) + + assert_equal 'https://www.youtube.com/feeds/videos.xml?playlist_id=PLOVYPdB2NZ6M-hQfAIn6srsxfzuNp1EPd', discovered_url + end + test 'logs parse error for invalid feed input' do read_xml('not valid rss or atom') diff --git a/test/unit/ingestors/material_rss_ingestor_test.rb b/test/unit/ingestors/material_rss_ingestor_test.rb index ff0e3b7b4..4068182c5 100644 --- a/test/unit/ingestors/material_rss_ingestor_test.rb +++ b/test/unit/ingestors/material_rss_ingestor_test.rb @@ -456,7 +456,7 @@ class MaterialRSSIngestorTest < ActiveSupport::TestCase assert_equal 1, @ingestor.materials.count assert_includes @ingestor.messages, - "Found RSS/Atom alternate feed link during HTML discovery, following: #{feed_url}" + "Found RSS/Atom feed link in HTML page, following: #{feed_url}" assert_equal 'Alternate feed material', @ingestor.materials.first.title end From b2780cf705e818fce139414f3d42de03615ca5f8 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Wed, 8 Apr 2026 16:50:56 +0200 Subject: [PATCH 11/14] Fix error class that was too specific --- 
lib/ingestors/rss_ingestion.rb | 4 ++-- test/unit/ingestors/event_rss_ingestor_test.rb | 8 +++++--- test/unit/ingestors/material_rss_ingestor_test.rb | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/ingestors/rss_ingestion.rb b/lib/ingestors/rss_ingestion.rb index eea927aaf..44b8f1bd3 100644 --- a/lib/ingestors/rss_ingestion.rb +++ b/lib/ingestors/rss_ingestion.rb @@ -39,8 +39,8 @@ def parse_feed(content) return [feed, nil] if feed.present? [nil, 'parsing feed failed with: unrecognized feed content'] - rescue RSS::NotWellFormedError => e - [nil, "parsing feed failed with: #{e.message}"] + rescue RSS::Error => e + [nil, "parsing feed failed with #{e.class}: #{e.message}"] end def discover_feed_url(content, base_url) diff --git a/test/unit/ingestors/event_rss_ingestor_test.rb b/test/unit/ingestors/event_rss_ingestor_test.rb index 4ed4e83b0..766b98ff3 100644 --- a/test/unit/ingestors/event_rss_ingestor_test.rb +++ b/test/unit/ingestors/event_rss_ingestor_test.rb @@ -308,11 +308,13 @@ class EventRSSIngestorTest < ActiveSupport::TestCase assert_equal 'https://www.youtube.com/feeds/videos.xml?playlist_id=PLOVYPdB2NZ6M-hQfAIn6srsxfzuNp1EPd', discovered_url end - test 'logs parse error for invalid feed input' do - read_xml('not valid rss or atom') + test 'logs rss parser error' do + RSS::Parser.stub(:parse, proc { raise RSS::InvalidRSSError, 'simulated rss parse error' }) do + read_xml('') + end assert_equal 2, @ingestor.messages.length - assert_match(/^parsing feed failed with: This is not well formed XML/, @ingestor.messages.first) + assert_equal 'parsing feed failed with RSS::InvalidRSSError: simulated rss parse error', @ingestor.messages.first assert_match(%r{^Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in:}, @ingestor.messages.second) assert_empty @ingestor.events diff --git a/test/unit/ingestors/material_rss_ingestor_test.rb b/test/unit/ingestors/material_rss_ingestor_test.rb index 4068182c5..ccf274afc 100644 
--- a/test/unit/ingestors/material_rss_ingestor_test.rb +++ b/test/unit/ingestors/material_rss_ingestor_test.rb @@ -232,7 +232,7 @@ class MaterialRSSIngestorTest < ActiveSupport::TestCase read_xml('not valid rss or atom') assert_equal 2, @ingestor.messages.length - assert_match(/^parsing feed failed with: This is not well formed XML/, @ingestor.messages.first) + assert_match(/^parsing feed failed with RSS::NotWellFormedError: This is not well formed XML/, @ingestor.messages.first) assert_match(%r{^Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in:}, @ingestor.messages.second) assert_empty @ingestor.materials From 0f042e75b5fc5f655650f2491de34e63ecc4a8e1 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Wed, 8 Apr 2026 17:08:20 +0200 Subject: [PATCH 12/14] Fix link handling in atom feeds --- lib/ingestors/rss_ingestion.rb | 12 +++++++++++- test/unit/ingestors/event_rss_ingestor_test.rb | 3 ++- test/unit/ingestors/material_rss_ingestor_test.rb | 3 ++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/lib/ingestors/rss_ingestion.rb b/lib/ingestors/rss_ingestion.rb index 44b8f1bd3..06e6fd2f7 100644 --- a/lib/ingestors/rss_ingestion.rb +++ b/lib/ingestors/rss_ingestion.rb @@ -139,7 +139,17 @@ def extract_atom_authors(item) end def extract_atom_link(item) - item.links.map { |l| text_value(l.href) }.find(&:present?) + links = Array(item.links) + + preferred_link = links.find do |link| + href = text_value(link.href) + rel = text_value(link.respond_to?(:rel) ? link.rel : nil).to_s.downcase + + href.present? && (rel.blank? || rel == 'alternate') + end + return text_value(preferred_link.href) if preferred_link.present? + + links.map { |link| text_value(link.href) }.find(&:present?) 
end def resolve_feed_url(candidate_url, feed_url) diff --git a/test/unit/ingestors/event_rss_ingestor_test.rb b/test/unit/ingestors/event_rss_ingestor_test.rb index 766b98ff3..2939a9c53 100644 --- a/test/unit/ingestors/event_rss_ingestor_test.rb +++ b/test/unit/ingestors/event_rss_ingestor_test.rb @@ -105,7 +105,8 @@ class EventRSSIngestorTest < ActiveSupport::TestCase Native Atom event title - + + Native Atom event summary Native Atom Author diff --git a/test/unit/ingestors/material_rss_ingestor_test.rb b/test/unit/ingestors/material_rss_ingestor_test.rb index ccf274afc..97b6bc363 100644 --- a/test/unit/ingestors/material_rss_ingestor_test.rb +++ b/test/unit/ingestors/material_rss_ingestor_test.rb @@ -128,7 +128,8 @@ class MaterialRSSIngestorTest < ActiveSupport::TestCase Native Atom title - + + Native Atom summary Native Atom Author From 89e5f5374041e48964c13220e7a0019541606227 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Thu, 9 Apr 2026 10:46:41 +0200 Subject: [PATCH 13/14] Use relative import for loading the custom rss media extension Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- lib/ingestors/material_rss_ingestor.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ingestors/material_rss_ingestor.rb b/lib/ingestors/material_rss_ingestor.rb index 4484aca9d..d6c6729db 100644 --- a/lib/ingestors/material_rss_ingestor.rb +++ b/lib/ingestors/material_rss_ingestor.rb @@ -1,5 +1,5 @@ require 'rss' -require 'rss/media' +require_relative '../rss/media' require 'tess_rdf_extractors' module Ingestors From 662c450a25fc0005044d2fc15509b76cfda1f887 Mon Sep 17 00:00:00 2001 From: Martin Voigt Date: Thu, 9 Apr 2026 11:37:27 +0200 Subject: [PATCH 14/14] Add comment for dublin core to text conversion options --- lib/ingestors/dublin_core_ingestion.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/ingestors/dublin_core_ingestion.rb b/lib/ingestors/dublin_core_ingestion.rb index 39cdca7fd..e9b93add7
100644 --- a/lib/ingestors/dublin_core_ingestion.rb +++ b/lib/ingestors/dublin_core_ingestion.rb @@ -75,8 +75,8 @@ def normalize_dublin_core_values(values) # this method is also used by RSS ingestion under an alias def dublin_core_text(value) return nil if value.nil? - return value.content if value.respond_to?(:content) - return value.text if value.respond_to?(:text) && !value.is_a?(String) + return value.content if value.respond_to?(:content) # rss gem xml nodes + return value.text if value.respond_to?(:text) && !value.is_a?(String) # Nokogiri xml nodes value.to_s end