diff --git a/config/initializers/inflections.rb b/config/initializers/inflections.rb index 3860f659e..a45df1401 100644 --- a/config/initializers/inflections.rb +++ b/config/initializers/inflections.rb @@ -14,3 +14,7 @@ # ActiveSupport::Inflector.inflections(:en) do |inflect| # inflect.acronym "RESTful" # end + +ActiveSupport::Inflector.inflections(:en) do |inflect| + inflect.acronym 'RSS' +end diff --git a/lib/ingestors/dublin_core_ingestion.rb b/lib/ingestors/dublin_core_ingestion.rb new file mode 100644 index 000000000..e9b93add7 --- /dev/null +++ b/lib/ingestors/dublin_core_ingestion.rb @@ -0,0 +1,84 @@ +module Ingestors + module DublinCoreIngestion + def build_material_from_dublin_core_data(dc) + material = OpenStruct.new + + material.title = dc[:title] + material.description = convert_description(dc[:description]) + material.authors = normalize_dublin_core_values(dc[:creators]) + material.contributors = normalize_dublin_core_values(dc[:contributors]) + + rights = normalize_dublin_core_values(dc[:rights]) + material.licence = rights.find { |r| r.start_with?('http://', 'https://') } || rights.first || 'notspecified' + + parsed_dates = parse_dublin_core_dates(dc[:dates]) + material.date_created = parsed_dates.first + material.date_modified = parsed_dates.last if parsed_dates.size > 1 + + identifiers = normalize_dublin_core_values(dc[:identifiers]) + material.doi = extract_dublin_core_doi(identifiers) + material.url = identifiers.find { |id| id.start_with?('http://', 'https://') } + + material.keywords = normalize_dublin_core_values(dc[:subjects]) + material.resource_type = normalize_dublin_core_values(dc[:types]) + material.contact = dublin_core_text(dc[:publisher]) + + material + end + + def build_event_from_dublin_core_data(dc) + event = OpenStruct.new + + event.title = dc[:title] + event.description = convert_description(dc[:description]) + event.organizer = normalize_dublin_core_values(dc[:creators]).first + event.contact = dublin_core_text(dc[:publisher]) || 
event.organizer + event.keywords = normalize_dublin_core_values(dc[:subjects]) + event.event_types = normalize_dublin_core_values(dc[:types]) + + dates = parse_dublin_core_dates(dc[:dates]) + event.start = dates.first + event.end = dates.last || dates.first + + identifiers = normalize_dublin_core_values(dc[:identifiers]) + event.url = identifiers.find { |id| id.start_with?('http://', 'https://') } + + event + end + + def parse_dublin_core_dates(dates) + normalize_dublin_core_values(dates).map do |date_value| + Date.parse(date_value) + rescue StandardError + nil + end.compact + end + + def extract_dublin_core_doi(identifiers) + doi = normalize_dublin_core_values(identifiers).find do |id| + id.start_with?('10.') || id.start_with?('https://doi.org/') || id.start_with?('http://doi.org/') + end + return nil unless doi + + normalized = doi.sub(%r{https?://doi\.org/}, '') + "https://doi.org/#{normalized}" + end + + def normalize_dublin_core_values(values) + Array(values).map { |v| dublin_core_text(v) } + .map(&:to_s) + .map(&:strip) + .reject(&:blank?) + .uniq + end + + # this method is also used by RSS ingestion under an alias + def dublin_core_text(value) + return nil if value.nil? + return value.content if value.respond_to?(:content) # rss gem xml nodes + return value.text if value.respond_to?(:text) && !value.is_a?(String) # Nokogiri xml nodes + + value.to_s + end + end +end diff --git a/lib/ingestors/event_rss_ingestor.rb b/lib/ingestors/event_rss_ingestor.rb new file mode 100644 index 000000000..d02332a08 --- /dev/null +++ b/lib/ingestors/event_rss_ingestor.rb @@ -0,0 +1,98 @@ +require 'rss' +require 'tess_rdf_extractors' + +module Ingestors + class EventRSSIngestor < Ingestor + include RSSIngestion + + def initialize + super + + @bioschemas_manager = BioschemasIngestor.new + end + + def self.config + { + key: 'event_rss', + title: 'RSS / Atom Feed', + category: :events + } + end + + def read(url) + feed, content, source_url = fetch_feed(url) + return if feed.nil? 
+ + if feed.is_a?(RSS::Rss) + @messages << "Parsing RSS feed: #{feed_title(feed)}" + feed.items.each { |item| add_event(build_event_from_rss_item(item, source_url)) } + elsif feed.is_a?(RSS::RDF) + @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}" + rss_events = feed.items.map { |item| build_event_from_rss_item(item, source_url).to_h } + bioschemas_events = extract_rdf_bioschemas_events(content) + merge_with_bioschemas_priority(bioschemas_events, rss_events).each do |event| + add_event(event) + end + elsif feed.is_a?(RSS::Atom::Feed) + @messages << "Parsing ATOM feed: #{feed_title(feed)}" + feed.items.each { |item| add_event(build_event_from_atom_item(item, source_url)) } + else + @messages << "Parsing UNKNOWN feed: #{feed_title(feed)}" + @messages << 'unsupported feed format' + end + end + + private + + def extract_rdf_bioschemas_events(content) + return [] unless content.present? + + events = Tess::Rdf::EventExtractor.new(content, :rdfxml).extract do |params| + @bioschemas_manager.convert_params(params) + end + courses = Tess::Rdf::CourseExtractor.new(content, :rdfxml).extract do |params| + @bioschemas_manager.convert_params(params) + end + course_instances = Tess::Rdf::CourseInstanceExtractor.new(content, :rdfxml).extract do |params| + @bioschemas_manager.convert_params(params) + end + + @bioschemas_manager.deduplicate(events + courses + course_instances) + rescue StandardError => e + Rails.logger.error("#{e.class}: #{e.message}") + Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any? + @messages << 'An error occurred while extracting Bioschemas Events.' + [] + end + + def build_event_from_rss_item(item, feed_url) + event = build_event_from_dublin_core_data(extract_dublin_core(item)) + + event.title ||= text_value(item.title) + native_url = resolve_feed_url(item.link, feed_url) + event.url = native_url if native_url.present? 
+ event.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded)) + event.keywords = merge_unique(event.keywords, extract_rss_keywords(item)) + organizer = text_value(item.respond_to?(:author) ? item.author : nil) + event.organizer ||= organizer + event.contact ||= organizer + + event + end + + def build_event_from_atom_item(item, feed_url) + event = build_event_from_dublin_core_data(extract_dublin_core(item)) + + event.title ||= text_value(item.title) + native_url = resolve_feed_url(extract_atom_link(item), feed_url) + event.url = native_url if native_url.present? + event.description ||= convert_description(text_value(item.summary) || text_value(item.content)) + event.keywords = merge_unique(event.keywords, extract_atom_keywords(item)) + organizer = extract_atom_authors(item).first + event.organizer ||= organizer + event.contact ||= organizer + + event + end + end +end diff --git a/lib/ingestors/ingestor_factory.rb b/lib/ingestors/ingestor_factory.rb index 9bd8169fd..00fa48c26 100644 --- a/lib/ingestors/ingestor_factory.rb +++ b/lib/ingestors/ingestor_factory.rb @@ -13,6 +13,8 @@ def self.ingestors Ingestors::ZenodoIngestor, Ingestors::OaiPmhIngestor, Ingestors::GithubIngestor, + Ingestors::EventRSSIngestor, + Ingestors::MaterialRSSIngestor ] + taxila_ingestors + llm_ingestors + heptraining_ingestors end diff --git a/lib/ingestors/material_rss_ingestor.rb b/lib/ingestors/material_rss_ingestor.rb new file mode 100644 index 000000000..d6c6729db --- /dev/null +++ b/lib/ingestors/material_rss_ingestor.rb @@ -0,0 +1,111 @@ +require 'rss' +require_relative '../rss/media' +require 'tess_rdf_extractors' + +module Ingestors + class MaterialRSSIngestor < Ingestor + include RSSIngestion + + def initialize + super + + @bioschemas_manager = BioschemasIngestor.new + end + + def self.config + { + key: 'material_rss', + title: 'RSS / Atom Feed', + category: :materials + } + end + + def read(url) + feed, content, source_url = 
fetch_feed(url) + return if feed.nil? + + if feed.is_a?(RSS::Rss) + @messages << "Parsing RSS feed: #{feed_title(feed)}" + feed.items.each { |item| add_material(build_material_from_rss_item(item, source_url)) } + elsif feed.is_a?(RSS::RDF) + @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}" + rss_materials = feed.items.map { |item| build_material_from_rss_item(item, source_url).to_h } + bioschemas_materials = extract_rdf_bioschemas_materials(content) + merge_with_bioschemas_priority(bioschemas_materials, rss_materials).each do |material| + add_material(material) + end + elsif feed.is_a?(RSS::Atom::Feed) + @messages << "Parsing ATOM feed: #{feed_title(feed)}" + feed.items.each { |item| add_material(build_material_from_atom_item(item, source_url)) } + else + @messages << "Parsing UNKNOWN feed: #{feed_title(feed)}" + @messages << 'unsupported feed format' + end + end + + private + + def extract_rdf_bioschemas_materials(content) + return [] unless content.present? + + materials = Tess::Rdf::LearningResourceExtractor.new(content, :rdfxml).extract do |params| + @bioschemas_manager.convert_params(params) + end + + @bioschemas_manager.deduplicate(materials) + rescue StandardError => e + Rails.logger.error("#{e.class}: #{e.message}") + Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any? + @messages << 'An error occurred while extracting Bioschemas LearningResources.' + [] + end + + def build_material_from_rss_item(item, feed_url) + material = build_material_from_dublin_core_data(extract_dublin_core(item)) + + material.title ||= text_value(item.title) + native_url = resolve_feed_url(item.link, feed_url) + material.url = native_url if native_url.present? 
+ itunes_summary = text_value(item.itunes_summary) if item.respond_to?(:itunes_summary) + material.description ||= convert_description(text_value(item.description) || text_value(item.content_encoded) || itunes_summary) + material.keywords = merge_unique(material.keywords, extract_rss_keywords(item)) + author = item.author if item.respond_to?(:author) + itunes_author = item.itunes_author if item.respond_to?(:itunes_author) + material.authors = merge_unique(material.authors, [text_value(author)] + [text_value(itunes_author)].compact) + material.contact ||= material.authors&.first + guid = item.guid if item.respond_to?(:guid) + material.doi ||= extract_dublin_core_doi([text_value(guid)]) + + item_date = parse_time(item.pubDate) if item.respond_to?(:pubDate) + item_date ||= parse_time(item.date) if item.respond_to?(:date) + material.date_published ||= item_date + material.date_created = prefer_precise_time(material.date_created, item_date) + material.date_modified = prefer_precise_time(material.date_modified, parse_time(item.date)) if item.respond_to?(:date) + + material + end + + def build_material_from_atom_item(item, feed_url) + material = build_material_from_dublin_core_data(extract_dublin_core(item)) + + media_title = text_value(item.media_group&.media_title) + material.title ||= text_value(item.title) || media_title + native_url = resolve_feed_url(extract_atom_link(item), feed_url) + material.url = native_url if native_url.present? 
+ media_group_description = text_value(item.media_group&.media_description) + material.description ||= convert_description(text_value(item.summary) || text_value(item.content) || media_group_description) + material.keywords = merge_unique(material.keywords, extract_atom_keywords(item)) + material.authors = merge_unique(material.authors, extract_atom_authors(item)) + material.contact ||= material.authors&.first + material.doi ||= extract_dublin_core_doi([text_value(item.id)]) + + published = parse_time(item.published) + updated = parse_time(item.updated) + material.date_created = prefer_precise_time(material.date_created, published) + material.date_published ||= published || updated + material.date_modified = prefer_precise_time(material.date_modified, updated) + + material + end + end +end diff --git a/lib/ingestors/oai_pmh_ingestor.rb b/lib/ingestors/oai_pmh_ingestor.rb index a4261fe9d..9ce325e67 100644 --- a/lib/ingestors/oai_pmh_ingestor.rb +++ b/lib/ingestors/oai_pmh_ingestor.rb @@ -2,6 +2,8 @@ module Ingestors class OaiPmhIngestor < Ingestor + include DublinCoreIngestion + def self.config { key: 'oai_pmh', @@ -36,20 +38,36 @@ def ns } end + def extract_dublin_core_from_xml(xml_doc) + { + title: xml_doc.at_xpath('//dc:title', ns)&.text, + description: xml_doc.at_xpath('//dc:description', ns)&.text, + creators: xml_doc.xpath('//dc:creator', ns).map(&:text), + contributors: xml_doc.xpath('//dc:contributor', ns).map(&:text), + rights: xml_doc.xpath('//dc:rights', ns).map(&:text), + dates: xml_doc.xpath('//dc:date', ns).map(&:text), + identifiers: xml_doc.xpath('//dc:identifier', ns).map(&:text), + subjects: xml_doc.xpath('//dc:subject', ns).map(&:text), + types: xml_doc.xpath('//dc:type', ns).map(&:text), + publisher: xml_doc.at_xpath('//dc:publisher', ns)&.text + } + end + def read_oai_dublin_core(client) count = 0 client.list_records(metadata_prefix: 'oai_dc').full.each do |record| xml_string = record.metadata.to_s doc = Nokogiri::XML(xml_string) + dc = 
extract_dublin_core_from_xml(doc) - types = doc.xpath('//dc:type', ns).map(&:text) + types = normalize_dublin_core_values(dc[:types]) # this event detection heuristic captures in particular # - http://purl.org/dc/dcmitype/Event (the standard way of typing an event in dublin core) # - https://schema.org/Event if types.any? { |t| t.downcase.include? 'event' } - read_dublin_core_event(doc) + add_event(build_event_from_dublin_core_data(dc)) else - read_dublin_core_material(doc) + add_material(build_material_from_dublin_core_data(dc)) end count += 1 @@ -57,63 +75,6 @@ def read_oai_dublin_core(client) @messages << "found #{count} records" end - def read_dublin_core_material(xml_doc) - material = OpenStruct.new - material.title = xml_doc.at_xpath('//dc:title', ns)&.text - material.description = convert_description(xml_doc.at_xpath('//dc:description', ns)&.text) - material.authors = xml_doc.xpath('//dc:creator', ns).map(&:text) - material.contributors = xml_doc.xpath('//dc:contributor', ns).map(&:text) - - rights = xml_doc.xpath('//dc:rights', ns).map { |n| n.text&.strip }.reject(&:empty?) 
- material.licence = rights.find { |r| r.start_with?('http://', 'https://') } || rights.first || 'notspecified' - - dates = xml_doc.xpath('//dc:date', ns).map(&:text) - parsed_dates = dates.map do |d| - Date.parse(d) - rescue StandardError - nil - end.compact - material.date_created = parsed_dates.first - material.date_modified = parsed_dates.last if parsed_dates.size > 1 - - identifiers = xml_doc.xpath('//dc:identifier', ns).map(&:text) - doi = identifiers.find { |id| id.start_with?('10.') || id.start_with?('https://doi.org/') || id.start_with?('http://doi.org/') } - if doi - doi = doi&.sub(%r{https?://doi\.org/}, '') - material.doi = "https://doi.org/#{doi}" - end - material.url = identifiers.find { |id| id.start_with?('http://', 'https://') } - - material.keywords = xml_doc.xpath('//dc:subject', ns).map(&:text) - material.resource_type = xml_doc.xpath('//dc:type', ns).map(&:text) - material.contact = xml_doc.at_xpath('//dc:publisher', ns)&.text - - add_material material - end - - def read_dublin_core_event(xml_doc) - event = OpenStruct.new - - event.title = xml_doc.at_xpath('//dc:title', ns)&.text - event.description = convert_description(xml_doc.at_xpath('//dc:description', ns)&.text) - event.url = xml_doc.xpath('//dc:identifier', ns).map(&:text).find { |id| id.start_with?('http://', 'https://') } - event.contact = xml_doc.at_xpath('//dc:publisher', ns)&.text - event.organizer = xml_doc.at_xpath('//dc:creator', ns)&.text - event.keywords = xml_doc.xpath('//dc:subject', ns).map(&:text) - event.event_types = xml_doc.xpath('//dc:type', ns).map(&:text) - - dates = xml_doc.xpath('//dc:date', ns).map(&:text) - parsed_dates = dates.map do |d| - Date.parse(d) - rescue StandardError - nil - end.compact - event.start = parsed_dates.first - event.end = parsed_dates.last - - add_event event - end - def read_oai_rdf(client) provider_events = [] provider_materials = [] diff --git a/lib/ingestors/rss_ingestion.rb b/lib/ingestors/rss_ingestion.rb new file mode 100644 index 
000000000..06e6fd2f7 --- /dev/null +++ b/lib/ingestors/rss_ingestion.rb @@ -0,0 +1,202 @@ +module Ingestors + module RSSIngestion + require 'cgi' + + include DublinCoreIngestion + + # Fetches and parses a feed from the URL, with optional HTML feed discovery. + # Returns [feed, raw_xml, source_url] on success. + # Returns [nil, nil, nil] when the URL cannot be opened or parsing/discovery fails. + def fetch_feed(url) + io = open_url(url) + return [nil, nil, nil] if io.nil? + + content = io.read + feed, parse_error_message = parse_feed(content) + return [feed, content, url] unless feed.nil? + + discovered_feed_url = discover_feed_url(content, url) + if discovered_feed_url.blank? + @messages << parse_error_message + @messages << "Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in: #{url}" + return [nil, nil, nil] + end + discovered_io = open_url(discovered_feed_url) + return [nil, nil, nil] if discovered_io.nil? + + discovered_content = discovered_io.read + discovered_feed, discovered_parse_error_message = parse_feed(discovered_content) + if discovered_feed.blank? + @messages << discovered_parse_error_message + return [nil, nil, nil] + end + + [discovered_feed, discovered_content, discovered_feed_url] + end + + def parse_feed(content) + feed = RSS::Parser.parse(content, { validate: false }) + return [feed, nil] if feed.present? 
+ + [nil, 'parsing feed failed with: unrecognized feed content'] + rescue RSS::Error => e + [nil, "parsing feed failed with #{e.class}: #{e.message}"] + end + + def discover_feed_url(content, base_url) + if (url = discover_feed_url_from_html(content, base_url)) + @messages << "Found RSS/Atom feed link in HTML page, following: #{url}" + return url + end + + if (url = discover_feed_url_from_youtube_playlist_url(base_url)) + @messages << "Found Atom feed link from YouTube playlist URL, following: #{url}" + return url + end + + nil + end + + def discover_feed_url_from_html(content, base_url) + doc = Nokogiri::HTML(content) + link = doc.css('link[rel]').find do |node| + rel = node['rel'].to_s.downcase + type = node['type'].to_s.downcase + rel.include?('alternate') && (type.include?('rss') || type.include?('atom')) + end + + href = link&.[]('href') + URI.join(base_url, href).to_s if href.present? + rescue StandardError + nil + end + + def discover_feed_url_from_youtube_playlist_url(base_url) + uri = URI.parse(base_url) + host = uri.host.to_s.downcase + return nil unless host == 'youtube.com' || host.end_with?('.youtube.com') + + playlist_id = CGI.parse(uri.query.to_s).fetch('list', []).first + return nil if playlist_id.blank? + + "https://www.youtube.com/feeds/videos.xml?playlist_id=#{CGI.escape(playlist_id)}" + rescue URI::InvalidURIError + nil + end + + def feed_title(feed) + channel = feed.respond_to?(:channel) ? feed.channel : nil + return channel.title if channel.present? && channel.respond_to?(:title) + return text_value(feed.title) if feed.respond_to?(:title) + + 'Untitled feed' + end + + alias text_value dublin_core_text + + def parse_time(value) + value = value.content if value.respond_to?(:content) + + return value if value.is_a?(Time) || value.is_a?(Date) || value.is_a?(DateTime) + + text = text_value(value) + return nil if text.blank? 
+ + Time.zone.parse(text) + rescue ArgumentError + nil + end + + def extract_dublin_core(item) + { + title: text_value(item.dc_title), + description: text_value(item.dc_description), + creators: Array(item.dc_creators), + contributors: Array(item.dc_contributors), + rights: Array(item.dc_rights_list), + dates: Array(item.dc_dates), + identifiers: Array(item.dc_identifiers), + subjects: Array(item.dc_subjects), + types: Array(item.dc_types), + publisher: item.dc_publisher + } + end + + def extract_rss_keywords(item) + return [] unless item.respond_to?(:categories) + + Array(item.categories).map { |c| text_value(c.respond_to?(:content) ? c.content : c) } + end + + def extract_atom_keywords(item) + return [] unless item.respond_to?(:categories) + + Array(item.categories).map { |c| text_value(c.respond_to?(:term) ? c.term : c) } + end + + def extract_atom_authors(item) + Array(item.authors).map { |author| text_value(author.respond_to?(:name) ? author.name : author) } + end + + def extract_atom_link(item) + links = Array(item.links) + + preferred_link = links.find do |link| + href = text_value(link.href) + rel = text_value(link.respond_to?(:rel) ? link.rel : nil).to_s.downcase + + href.present? && (rel.blank? || rel == 'alternate') + end + return text_value(preferred_link.href) if preferred_link.present? + + links.map { |link| text_value(link.href) }.find(&:present?) + end + + def resolve_feed_url(candidate_url, feed_url) + candidate = text_value(candidate_url) + return nil if candidate.blank? + + URI.parse(candidate) + return candidate if URI::DEFAULT_PARSER.make_regexp(%w[http https]).match?(candidate) + + URI.join(feed_url, candidate).to_s + rescue URI::InvalidURIError + URI.join(feed_url, candidate).to_s + rescue StandardError + candidate + end + + def prefer_precise_time(existing_value, candidate_time) + return existing_value if candidate_time.blank? + return candidate_time if existing_value.blank? 
+ + return candidate_time if existing_value.is_a?(Date) && !existing_value.is_a?(DateTime) && existing_value == candidate_time.to_date + + existing_value + end + + def merge_unique(existing_values, new_values) + normalize_dublin_core_values(Array(existing_values) + Array(new_values)) + end + + def merge_with_bioschemas_priority(bioschemas_records, rss_records) + rss_by_url = rss_records.index_by { |record| record[:url].to_s } + + merged = bioschemas_records.map do |bioschemas_record| + key = bioschemas_record[:url].to_s + rss_record = rss_by_url.delete(key) + merge_record_pair(bioschemas_record, rss_record) + end + + merged + rss_by_url.values + end + + def merge_record_pair(primary_record, secondary_record) + return primary_record if secondary_record.nil? + + secondary_record.merge(primary_record) do |_key, secondary_value, primary_value| + primary_value.present? ? primary_value : secondary_value + end + end + end +end diff --git a/lib/rss/media.rb b/lib/rss/media.rb new file mode 100644 index 000000000..bbc209079 --- /dev/null +++ b/lib/rss/media.rb @@ -0,0 +1,32 @@ +# Extension for the Yahoo Media RSS namespace (xmlns:media="http://search.yahoo.com/mrss/"). +# Used by feeds that carry rich media metadata, e.g. YouTube channel feeds which include +# <media:group>, <media:title>, and <media:description> elements. +# +# The extension is structured as RSS::Media (rather than a flat module inside RSS) so that +# Zeitwerk can autoload it correctly from lib/rss/media.rb.
+require 'rss/atom' + +module RSS + module Media + MEDIA_PREFIX = 'media' + MEDIA_URI = 'http://search.yahoo.com/mrss/' + + module MediaGroupDescriptionModel + extend ::RSS::BaseModel + + def self.append_features(klass) + super + return if klass.instance_of?(Module) + + klass.install_must_call_validator(MEDIA_PREFIX, MEDIA_URI) + klass.install_have_child_element('group', MEDIA_URI, '?', 'media_group') + end + end + + ::RSS::BaseListener.install_class_name(MEDIA_URI, 'group', 'MediaGroup') + ::RSS::BaseListener.install_get_text_element(MEDIA_URI, 'title', 'media_title') + ::RSS::BaseListener.install_get_text_element(MEDIA_URI, 'description', 'media_description') + end +end + +require_relative 'media/atom' diff --git a/lib/rss/media/atom.rb b/lib/rss/media/atom.rb new file mode 100644 index 000000000..08bb76633 --- /dev/null +++ b/lib/rss/media/atom.rb @@ -0,0 +1,54 @@ +# Patches RSS::Atom::Feed and RSS::Atom::Entry with Media namespace support (see ../media.rb). +# Kept as RSS::Media::Atom so Zeitwerk can autoload it from lib/rss/media/atom.rb. +module RSS + module Media + module Atom + def self.install_media_namespace! + # This extension can be evaluated more than once in reloader/autoload flows. + # RSS::Element.install_ns raises on duplicate prefixes, so treat same mapping as a no-op. + ns_pool = ::RSS::Atom::Feed::NSPOOL + existing_uri = ns_pool[MEDIA_PREFIX] + + return if existing_uri == MEDIA_URI + + raise ::RSS::OverlappedPrefixError, MEDIA_PREFIX unless existing_uri.nil? + + ::RSS::Atom::Feed.install_ns(MEDIA_PREFIX, MEDIA_URI) + end + + install_media_namespace! 
+ + class ::RSS::Atom::Feed + include ::RSS::Media::MediaGroupDescriptionModel + + class Entry + include ::RSS::Media::MediaGroupDescriptionModel + + class MediaGroup < Element + include RSS09 + + @tag_name = 'group' + + class << self + def required_prefix + ::RSS::Media::MEDIA_PREFIX + end + + def required_uri + ::RSS::Media::MEDIA_URI + end + end + + install_must_call_validator(::RSS::Media::MEDIA_PREFIX, ::RSS::Media::MEDIA_URI) + install_text_element('title', ::RSS::Media::MEDIA_URI, '?', 'media_title') + install_text_element('description', ::RSS::Media::MEDIA_URI, '?', 'media_description') + end + end + end + + class ::RSS::Atom::Entry + include ::RSS::Media::MediaGroupDescriptionModel + end + end + end +end diff --git a/test/unit/ingestors/event_rss_ingestor_test.rb b/test/unit/ingestors/event_rss_ingestor_test.rb new file mode 100644 index 000000000..2939a9c53 --- /dev/null +++ b/test/unit/ingestors/event_rss_ingestor_test.rb @@ -0,0 +1,340 @@ +require 'test_helper' +require 'stringio' + +class EventRSSIngestorTest < ActiveSupport::TestCase + setup do + @ingestor = Ingestors::EventRSSIngestor.new + mock_timezone + end + + teardown do + reset_timezone + end + + test 'reads rss items from dublin core and native rss fields' do + rss_feed_xml = <<~XML + + + + RSS Event Feed + + + Native RSS event title + https://example.org/events/native + Native RSS event description + native.author@example.org (Native Event Author) + native-event-category + Sat, 01 Jun 2024 09:00:00 GMT + DC RSS event title + DC RSS event description + DC Event Creator + event-topic-a + workshop + 2024-06-01 + 2024-06-02 + https://example.org/events/dc-url + rss event publisher + + + + Fallback RSS event title + https://example.org/events/fallback + Fallback RSS event description + Fallback RSS Author + fallback-event-category + Mon, 03 Jun 2024 12:00:00 GMT + + + + XML + + read_xml(rss_feed_xml) + + assert_equal 2, @ingestor.events.count + + dc_event = @ingestor.events.first + assert_equal 
'DC RSS event title', dc_event.title + assert_equal 'https://example.org/events/native', dc_event.url + assert_equal 'DC RSS event description', dc_event.description + assert_equal 'DC Event Creator', dc_event.organizer + assert_equal 'rss event publisher', dc_event.contact + assert_equal %w[event-topic-a native-event-category], dc_event.keywords + assert_equal ['workshop'], dc_event.event_types + assert_equal Date.new(2024, 6, 1), dc_event.start + assert_equal Date.new(2024, 6, 2), dc_event.end.to_date + + fallback_event = @ingestor.events.second + assert_equal 'Fallback RSS event title', fallback_event.title + assert_equal 'https://example.org/events/fallback', fallback_event.url + assert_equal 'Fallback RSS event description', fallback_event.description + assert_equal 'Fallback RSS Author', fallback_event.organizer + assert_equal 'Fallback RSS Author', fallback_event.contact + assert_equal ['fallback-event-category'], fallback_event.keywords + assert_equal [], fallback_event.event_types + assert_nil fallback_event.start + assert_nil fallback_event.end + end + + test 'resolves relative rss item links against the feed url' do + rss_feed_xml = <<~XML + + + + Relative link feed + + Relative URL event + /community-calls/april-2026/ + Event from a feed with relative links + Wed, 08 Apr 2026 10:00:00 GMT + + + + XML + + read_xml(rss_feed_xml, 'https://nl-rse.org/feed.xml') + + assert_equal 1, @ingestor.events.count + assert_equal 'https://nl-rse.org/community-calls/april-2026/', @ingestor.events.first.url + end + + test 'reads atom items from dublin core and native atom fields' do + atom_feed_xml = <<~XML + + + Atom Event Feed + + + Native Atom event title + + + Native Atom event summary + Native Atom Author + + 2024-07-01T10:00:00Z + 2024-07-02T11:00:00Z + DC Atom event title + DC Atom event description + DC Atom Creator + atom-event-topic + seminar + 2024-07-01 + 2024-07-02 + https://example.org/atom-events/dc-url + atom event publisher + + + + Fallback Atom event 
title + + Fallback Atom event content + Fallback Atom Author + + 2024-07-03T10:00:00Z + 2024-07-04T11:00:00Z + + + XML + + read_xml(atom_feed_xml) + + assert_equal 2, @ingestor.events.count + + dc_event = @ingestor.events.first + assert_equal 'DC Atom event title', dc_event.title + assert_equal 'https://example.org/atom-events/native', dc_event.url + assert_equal 'DC Atom event description', dc_event.description + assert_equal 'DC Atom Creator', dc_event.organizer + assert_equal 'atom event publisher', dc_event.contact + assert_equal %w[atom-event-topic native-atom-event-category], dc_event.keywords + assert_equal ['seminar'], dc_event.event_types + assert_equal Date.new(2024, 7, 1), dc_event.start + assert_equal Date.new(2024, 7, 2), dc_event.end + + fallback_event = @ingestor.events.second + assert_equal 'Fallback Atom event title', fallback_event.title + assert_equal 'https://example.org/atom-events/fallback', fallback_event.url + assert_equal 'Fallback Atom event content', fallback_event.description + assert_equal 'Fallback Atom Author', fallback_event.organizer + assert_equal 'Fallback Atom Author', fallback_event.contact + assert_equal ['fallback-atom-event-category'], fallback_event.keywords + assert_equal [], fallback_event.event_types + assert_nil fallback_event.start + assert_nil fallback_event.end + end + + test 'reads bioschemas event from rss 1.0 rdf feed' do + rss_10_bioschemas_feed_xml = <<~XML + + + + RSS 1.0 Bioschemas event feed + https://example.org/rss10-bioschemas-events + desc + + + + + + + + Fallback RSS 1.0 event title + https://example.org/rss10-bioschemas/event-item + Fallback RSS 1.0 event description + + + + RSS 1.0 Bioschemas event title + + 2024-08-01 + 2024-08-02 + + + XML + + read_xml(rss_10_bioschemas_feed_xml) + + assert_equal 2, @ingestor.events.count + + event = @ingestor.events.detect { |e| e.url == 'https://example.org/rss10/bioschemas/event' } + refute_nil event + assert_equal 'RSS 1.0 Bioschemas event title', event.title + 
assert_equal 'https://example.org/rss10/bioschemas/event', event.url + + fallback_event = @ingestor.events.detect { |e| e.url == 'https://example.org/rss10-bioschemas/event-item' } + refute_nil fallback_event + assert_equal 'Fallback RSS 1.0 event title', fallback_event.title + end + + test 'merges rss properties into bioschemas event for same url with bioschemas priority' do + rss_10_bioschemas_merged_feed_xml = <<~XML + + + + RSS 1.0 Bioschemas merged event feed + https://example.org/rss10-merged-events + desc + + + + + + + + + RSS 1.0 fallback event title + https://example.org/rss10/merged/event + RSS 1.0 fallback event description that should fill missing bioschemas value + RSS 1.0 Merged Event Creator + rss10-merged-event-subject + 2024-08-01 + + + + RSS 1.0 Bioschemas preferred event title + + + + XML + + read_xml(rss_10_bioschemas_merged_feed_xml) + + assert_equal 1, @ingestor.events.count + + event = @ingestor.events.first + assert_equal 'RSS 1.0 Bioschemas preferred event title', event.title + assert_equal 'https://example.org/rss10/merged/event', event.url + assert_equal 'RSS 1.0 fallback event description that should fill missing bioschemas value', event.description + assert_equal ['rss10-merged-event-subject'], event.keywords + assert_equal 'RSS 1.0 Merged Event Creator', event.organizer + assert_equal Date.new(2024, 8, 1), event.start.to_date + assert_equal Date.new(2024, 8, 1), event.end.to_date + end + + test 'reads feed from html alternate meta link' do + start_url = 'https://www.youtube.com/@event_channel' + feed_url = 'https://www.youtube.com/feeds/videos.xml?channel_id=UCevent123' + + html_with_alternate_feed_link = <<~HTML + + + + + + Channel page + + HTML + + atom_feed_xml = <<~XML + + + Minimal Atom event feed + + Alternate feed event + + Minimal content used for alternate-link test + Alternate Event Organizer + 2024-07-02T11:00:00Z + + + XML + + read_xml_map( + { + start_url => html_with_alternate_feed_link, + feed_url => atom_feed_xml + }, + 
start_url + ) + + assert_equal 1, @ingestor.events.count + assert_includes @ingestor.messages, + "Found RSS/Atom feed link in HTML page, following: #{feed_url}" + assert_equal 'Alternate feed event', @ingestor.events.first.title + end + + test 'discovers youtube playlist feed from watch url' do + html_without_feed_link = 'No feed link' + start_url = 'https://www.youtube.com/watch?v=z58CgdgFC8s&list=PLOVYPdB2NZ6M-hQfAIn6srsxfzuNp1EPd&index=1' + + discovered_url = @ingestor.send(:discover_feed_url, html_without_feed_link, start_url) + + assert_equal 'https://www.youtube.com/feeds/videos.xml?playlist_id=PLOVYPdB2NZ6M-hQfAIn6srsxfzuNp1EPd', discovered_url + end + + test 'logs rss parser error' do + RSS::Parser.stub(:parse, proc { raise RSS::InvalidRSSError, 'simulated rss parse error' }) do + read_xml('') + end + + assert_equal 2, @ingestor.messages.length + assert_equal 'parsing feed failed with RSS::InvalidRSSError: simulated rss parse error', @ingestor.messages.first + assert_match(%r{^Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in:}, + @ingestor.messages.second) + assert_empty @ingestor.events + end + + private + + def read_xml(xml, url = 'https://example.org/event-feed.xml') + @ingestor.stub(:open_url, StringIO.new(xml)) do + @ingestor.read(url) + end + end + + def read_xml_map(url_to_content, start_url) + @ingestor.stub(:open_url, lambda do |requested_url| + content = url_to_content[requested_url] + content.nil? ? 
nil : StringIO.new(content) + end) do + @ingestor.read(start_url) + end + end +end diff --git a/test/unit/ingestors/material_rss_ingestor_test.rb b/test/unit/ingestors/material_rss_ingestor_test.rb new file mode 100644 index 000000000..97b6bc363 --- /dev/null +++ b/test/unit/ingestors/material_rss_ingestor_test.rb @@ -0,0 +1,597 @@ +require 'test_helper' +require 'stringio' + +class MaterialRSSIngestorTest < ActiveSupport::TestCase + setup do + @ingestor = Ingestors::MaterialRSSIngestor.new + end + + test 'reads rss items from dublin core and native rss fields' do + rss_feed_xml = <<~XML + + + + RSS material feed + + + Native RSS title + https://example.org/rss/native-link + Native RSS description + native.author@example.org (Native RSS Author) + native-category + 10.9999/native-rss-guid + Tue, 02 Jan 2024 03:04:05 GMT + DC RSS title + DC RSS description + DC Creator One + DC Creator Two + DC Contributor One + DC Contributor Two + plain rights + https://example.org/licenses/rss + 2024-01-01 + 2024-01-10 + https://example.org/rss/dc-url + 10.1234/rss-doi + dc-subject-a + dc-subject-b + dc-type-a + dc-type-b + rss publisher + + + + Plain Rights RSS title + https://example.org/rss/plain-rights + Plain rights RSS description + Plain Rights RSS Creator + plain-only-rights + not-a-date + 2024-01-11 + https://example.org/rss/plain-rights + plain-rights-subject + plain-rights-type + plain rights publisher + + + + Fallback RSS title + https://example.org/rss/fallback + Fallback RSS Author + fallback-category-a + fallback-category-b + 10.5555/fallback-rss-guid + Wed, 03 Jan 2024 04:05:06 GMT + + + + + XML + + read_xml(rss_feed_xml) + + assert_equal 3, @ingestor.materials.count + + dc_material = @ingestor.materials.first + assert_equal 'DC RSS title', dc_material.title + assert_equal 'https://example.org/rss/native-link', dc_material.url + assert_equal 'DC RSS description', dc_material.description + assert_equal ['DC Creator One', 'DC Creator Two', 'native.author@example.org 
(Native RSS Author)'], dc_material.authors + assert_equal ['DC Contributor One', 'DC Contributor Two'], dc_material.contributors + assert_equal 'https://example.org/licenses/rss', dc_material.licence + assert_equal Date.new(2024, 1, 1), dc_material.date_created + assert_equal Time.utc(2024, 1, 2, 3, 4, 5), dc_material.date_published.utc + assert_equal Date.new(2024, 1, 10), dc_material.date_modified + assert_equal 'https://doi.org/10.1234/rss-doi', dc_material.doi + assert_equal %w[dc-subject-a dc-subject-b native-category], dc_material.keywords + assert_equal %w[dc-type-a dc-type-b], dc_material.resource_type + assert_equal 'rss publisher', dc_material.contact + + plain_rights_material = @ingestor.materials.second + assert_equal 'Plain Rights RSS title', plain_rights_material.title + assert_equal 'https://example.org/rss/plain-rights', plain_rights_material.url + assert_equal 'Plain rights RSS description', plain_rights_material.description + assert_equal ['Plain Rights RSS Creator'], plain_rights_material.authors + assert_equal [], plain_rights_material.contributors + assert_equal 'plain-only-rights', plain_rights_material.licence + assert_equal Date.new(2024, 1, 11), plain_rights_material.date_created + assert_nil plain_rights_material.date_modified + assert_nil plain_rights_material.doi + assert_equal ['plain-rights-subject'], plain_rights_material.keywords + assert_equal ['plain-rights-type'], plain_rights_material.resource_type + assert_equal 'plain rights publisher', plain_rights_material.contact + + fallback_material = @ingestor.materials.third + assert_equal 'Fallback RSS title', fallback_material.title + assert_equal 'https://example.org/rss/fallback', fallback_material.url + assert_equal 'Fallback RSS content encoded', fallback_material.description + assert_equal ['Fallback RSS Author'], fallback_material.authors + assert_equal [], fallback_material.contributors + assert_equal 'notspecified', fallback_material.licence + assert_equal Time.utc(2024, 1, 3, 
4, 5, 6), fallback_material.date_created.utc + assert_equal Time.utc(2024, 1, 3, 4, 5, 6), fallback_material.date_published.utc + assert_equal Time.utc(2024, 1, 3, 4, 5, 6), fallback_material.date_modified.utc + assert_equal 'https://doi.org/10.5555/fallback-rss-guid', fallback_material.doi + assert_equal %w[fallback-category-a fallback-category-b], fallback_material.keywords + assert_equal [], fallback_material.resource_type + assert_equal 'Fallback RSS Author', fallback_material.contact + end + + test 'reads atom items from dublin core and native atom fields' do + atom_feed_xml = <<~XML + + + Atom material feed + + + Native Atom title + + + Native Atom summary + Native Atom Author + + 10.9999/native-atom-id + 2024-02-02T03:04:05Z + 2024-02-03T03:04:05Z + DC Atom title + DC Atom description + DC Atom Creator One + DC Atom Creator Two + DC Atom Contributor One + plain atom rights + https://example.org/licenses/atom + 2024-02-01 + 2024-02-05 + https://example.org/atom/dc-url + https://doi.org/10.1234/atom-doi + atom-dc-subject + atom-dc-type + atom publisher + + + + Plain Rights Atom title + + Plain rights Atom description + Plain Rights Atom Creator + plain-atom-rights + invalid-date + 2024-02-11 + https://example.org/atom/plain-rights + plain-atom-subject + plain-atom-type + plain atom publisher + + + + Fallback Atom title + + Fallback Atom content + Fallback Atom Author + + + 10.5555/fallback-atom-id + 2024-03-04T05:06:07Z + 2024-03-05T06:07:08Z + + + XML + + read_xml(atom_feed_xml) + + assert_equal 3, @ingestor.materials.count + + dc_material = @ingestor.materials.first + assert_equal 'DC Atom title', dc_material.title + assert_equal 'https://example.org/atom/native-link', dc_material.url + assert_equal 'DC Atom description', dc_material.description + assert_equal ['DC Atom Creator One', 'DC Atom Creator Two', 'Native Atom Author'], dc_material.authors + assert_equal ['DC Atom Contributor One'], dc_material.contributors + assert_equal 
'https://example.org/licenses/atom', dc_material.licence + assert_equal Date.new(2024, 2, 1), dc_material.date_created + assert_equal Time.utc(2024, 2, 2, 3, 4, 5), dc_material.date_published.utc + assert_equal Date.new(2024, 2, 5), dc_material.date_modified + assert_equal 'https://doi.org/10.1234/atom-doi', dc_material.doi + assert_equal %w[atom-dc-subject native-atom-category], dc_material.keywords + assert_equal ['atom-dc-type'], dc_material.resource_type + assert_equal 'atom publisher', dc_material.contact + + plain_rights_material = @ingestor.materials.second + assert_equal 'Plain Rights Atom title', plain_rights_material.title + assert_equal 'https://example.org/atom/plain-rights', plain_rights_material.url + assert_equal 'Plain rights Atom description', plain_rights_material.description + assert_equal ['Plain Rights Atom Creator'], plain_rights_material.authors + assert_equal [], plain_rights_material.contributors + assert_equal 'plain-atom-rights', plain_rights_material.licence + assert_equal Date.new(2024, 2, 11), plain_rights_material.date_created + assert_nil plain_rights_material.date_modified + assert_nil plain_rights_material.doi + assert_equal ['plain-atom-subject'], plain_rights_material.keywords + assert_equal ['plain-atom-type'], plain_rights_material.resource_type + assert_equal 'plain atom publisher', plain_rights_material.contact + + fallback_material = @ingestor.materials.third + assert_equal 'Fallback Atom title', fallback_material.title + assert_equal 'https://example.org/atom/fallback', fallback_material.url + assert_equal 'Fallback Atom content', fallback_material.description + assert_equal ['Fallback Atom Author'], fallback_material.authors + assert_equal [], fallback_material.contributors + assert_equal 'notspecified', fallback_material.licence + assert_equal Time.utc(2024, 3, 4, 5, 6, 7), fallback_material.date_created.utc + assert_equal Time.utc(2024, 3, 4, 5, 6, 7), fallback_material.date_published.utc + assert_equal Time.utc(2024, 3, 
5, 6, 7, 8), fallback_material.date_modified.utc + assert_equal 'https://doi.org/10.5555/fallback-atom-id', fallback_material.doi + assert_equal %w[fallback-atom-category-a fallback-atom-category-b], fallback_material.keywords + assert_equal [], fallback_material.resource_type + assert_equal 'Fallback Atom Author', fallback_material.contact + end + + test 'logs parse error for invalid feed input' do + read_xml('not valid rss or atom') + + assert_equal 2, @ingestor.messages.length + assert_match(/^parsing feed failed with RSS::NotWellFormedError: This is not well formed XML/, @ingestor.messages.first) + assert_match(%r{^Attempted HTML feed discovery, but no RSS/Atom alternate feed link was found in:}, + @ingestor.messages.second) + assert_empty @ingestor.materials + end + + test 'reads rss 0.91 feed' do + rss_091_feed_xml = <<~XML + + + + RSS 0.91 feed + https://example.org/rss091 + desc + + RSS 0.91 title + https://example.org/rss091/item + RSS 0.91 description + + + + XML + + read_xml(rss_091_feed_xml) + + assert_equal 1, @ingestor.materials.count + + material = @ingestor.materials.first + assert_equal 'RSS 0.91 title', material.title + assert_equal 'https://example.org/rss091/item', material.url + assert_equal 'RSS 0.91 description', material.description + assert_equal [], material.keywords + assert_equal 'notspecified', material.licence + assert_nil material.doi + assert_nil material.contact + end + + test 'reads rss 1.0 feed' do + rss_10_feed_xml = <<~XML + + + + RSS 1.0 feed + https://example.org/rss10 + desc + + + + + + + + RSS 1.0 title + https://example.org/rss10/item + RSS 1.0 description + RSS 1.0 Creator + rss10-subject + 10.1111/rss10doi + 2024-04-01 + + + XML + + read_xml(rss_10_feed_xml) + + assert_equal 1, @ingestor.materials.count + + material = @ingestor.materials.first + assert_equal 'RSS 1.0 title', material.title + assert_equal 'https://example.org/rss10/item', material.url + assert_equal 'RSS 1.0 description', material.description + 
assert_equal ['RSS 1.0 Creator'], material.authors + assert_equal ['rss10-subject'], material.keywords + assert_equal 'https://doi.org/10.1111/rss10doi', material.doi + assert_equal Date.new(2024, 4, 1), material.date_created.to_date + assert_equal Date.new(2024, 4, 1), material.date_modified.to_date + end + + test 'reads bioschemas learning resource from rss 1.0 rdf feed' do + rss_10_bioschemas_feed_xml = <<~XML + + + + RSS 1.0 Bioschemas feed + https://example.org/rss10-bioschemas + desc + + + + + + + + Fallback RSS 1.0 title + https://example.org/rss10-bioschemas/item + Fallback RSS 1.0 description + + + + + + + RSS 1.0 Bioschemas title + + + + + XML + + read_xml(rss_10_bioschemas_feed_xml) + + assert_equal 2, @ingestor.materials.count + + material = @ingestor.materials.detect { |m| m.url == 'https://example.org/rss10/bioschemas/material' } + refute_nil material + assert_equal 'RSS 1.0 Bioschemas title', material.title + assert_equal 'https://example.org/rss10/bioschemas/material', material.url + assert_equal 'https://opensource.org/licenses/MIT', material.licence + + fallback_material = @ingestor.materials.detect { |m| m.url == 'https://example.org/rss10-bioschemas/item' } + refute_nil fallback_material + assert_equal 'Fallback RSS 1.0 title', fallback_material.title + end + + test 'merges rss properties into bioschemas material for same url with bioschemas priority' do + rss_10_bioschemas_merged_feed_xml = <<~XML + + + + RSS 1.0 Bioschemas merged feed + https://example.org/rss10-merged + desc + + + + + + + + + RSS 1.0 fallback title + https://example.org/rss10/merged/material + RSS 1.0 fallback description that should fill missing bioschemas value + RSS 1.0 Merged Creator + rss10-merged-subject + 2024-05-01 + + + + + + + RSS 1.0 Bioschemas preferred title + + + + + XML + + read_xml(rss_10_bioschemas_merged_feed_xml) + + assert_equal 1, @ingestor.materials.count + + material = @ingestor.materials.first + assert_equal 'RSS 1.0 Bioschemas preferred title', 
material.title + assert_equal 'https://example.org/rss10/merged/material', material.url + assert_equal 'https://opensource.org/licenses/Apache-2.0', material.licence + assert_equal 'RSS 1.0 fallback description that should fill missing bioschemas value', material.description + assert_equal ['rss10-merged-subject'], material.keywords + assert_equal ['RSS 1.0 Merged Creator'], material.authors + assert_equal Date.new(2024, 5, 1), material.date_created.to_date + assert_equal Date.new(2024, 5, 1), material.date_modified.to_date + end + + test 'reads feed from html alternate meta link' do + start_url = 'https://www.youtube.com/@example' + feed_url = 'https://www.youtube.com/feeds/videos.xml?channel_id=UC123456789' + + html_with_alternate_feed_link = <<~HTML + + + + Channel + + + Channel page + + HTML + + atom_feed_xml = <<~XML + + + Minimal Atom material feed + + Alternate feed material + + Minimal content used for alternate-link test + Alternate Feed Author + 2024-02-02T03:04:05Z + + + XML + + read_xml_map( + { + start_url => html_with_alternate_feed_link, + feed_url => atom_feed_xml + }, + start_url + ) + + assert_equal 1, @ingestor.materials.count + assert_includes @ingestor.messages, + "Found RSS/Atom feed link in HTML page, following: #{feed_url}" + assert_equal 'Alternate feed material', @ingestor.materials.first.title + end + + test 'uses native atom title and description taking precedence over media extension' do + atom_feed_xml = <<~XML + + + Atom media precedence feed + + + yt:video:abc123 + Native Atom title wins + + Native Atom summary wins + Atom Author + 2024-02-02T03:04:05Z + 2024-02-03T03:04:05Z + + Media title ignored + Media description ignored + + + + XML + + read_xml(atom_feed_xml) + + assert_equal 1, @ingestor.materials.count + material = @ingestor.materials.first + assert_equal 'Native Atom title wins', material.title + assert_equal 'Native Atom summary wins', material.description + end + + test 'uses media extension title and description for atom 
item when native ones are missing' do + atom_feed_xml = <<~XML + + + Atom media extension feed + + + yt:video:fallback123 + + Atom Author + 2024-02-02T03:04:05Z + 2024-02-03T03:04:05Z + + Media title used here + Media description used here + + + + XML + + read_xml(atom_feed_xml) + + assert_equal 1, @ingestor.materials.count + material = @ingestor.materials.first + assert_equal 'Media title used here', material.title + assert_equal 'Media description used here', material.description + end + + test 'parses media group description through rss media extension' do + atom_feed_xml = <<~XML + + + Media extension feed + urn:feed:test + 2024-01-01T00:00:00Z + + + urn:entry:test + Media extension title + + 2024-01-01T00:00:00Z + + Media extension description + + + + XML + + feed = RSS::Parser.parse(atom_feed_xml, validate: false, ignore_unknown_element: true) + item = feed.items.first + + assert item.respond_to?(:media_group) + assert_equal 'Media extension description', item.media_group.media_description + end + + test 'uses itunes extension summary for rss item when native description is missing' do + rss_feed_xml = <<~XML + + + + RSS iTunes extension feed + + RSS item with iTunes summary + https://example.org/rss/itunes-summary + RSS Author + Fri, 02 Feb 2024 03:04:05 GMT + iTunes summary used here + iTunes Author + + + + XML + + read_xml(rss_feed_xml) + + assert_equal 1, @ingestor.materials.count + material = @ingestor.materials.first + assert_equal 'RSS item with iTunes summary', material.title + assert_equal 'iTunes summary used here', material.description + assert_includes material.authors, 'RSS Author' + assert_includes material.authors, 'iTunes Author' + end + + private + + def read_xml(xml, url = 'https://example.org/feed.xml') + @ingestor.stub(:open_url, StringIO.new(xml)) do + @ingestor.read(url) + end + end + + def read_xml_map(url_to_content, start_url) + @ingestor.stub(:open_url, lambda do |requested_url| + content = url_to_content[requested_url] + 
content.nil? ? nil : StringIO.new(content) + end) do + @ingestor.read(start_url) + end + end +end diff --git a/test/unit/rss_media_atom_test.rb b/test/unit/rss_media_atom_test.rb new file mode 100644 index 000000000..3faf0f589 --- /dev/null +++ b/test/unit/rss_media_atom_test.rb @@ -0,0 +1,12 @@ +require 'test_helper' + +class RSSMediaAtomTest < ActiveSupport::TestCase + test 'install_media_namespace! is idempotent for the media prefix' do + assert_nothing_raised do + RSS::Media::Atom.install_media_namespace! + RSS::Media::Atom.install_media_namespace! + end + + assert_equal RSS::Media::MEDIA_URI, RSS::Atom::Feed::NSPOOL[RSS::Media::MEDIA_PREFIX] + end +end