Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 17 additions & 53 deletions lib/ingestors/taxila/uhasselt_ingestor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,65 +27,29 @@ def read(url)
private

def process_uhasselt(url)
uhasselt_url = 'https://bibliotheek.uhasselt.be/nl/resources#kalender'
event_page = Nokogiri::HTML5.parse(open_url(uhasselt_url.to_s, raise: true)).css("table[summary='RDM training activities at Hasselt University']").first.css('tr')
event_page.each_with_index do |el, idx|
if el.css('td').length != 9
uhasselt_url = 'https://www.uhasselt.be/en/university-library/research/research-data-management/training-calendar-rdm'
overview_page = Nokogiri::HTML5.parse(open_url(uhasselt_url.to_s, raise: true)).css("table").first.css('tr')
overview_page.each_with_index do |el, idx|
if el.css('td').length != 4
next
end

event = OpenStruct.new
new_url = "https://www.uhasselt.be#{el.css('td').last.css('a').first.get_attribute('href').to_s}"
sleep(1) unless Rails.env.test? and File.exist?('test/vcr_cassettes/ingestors/uhasselt.yml')
event_page = Nokogiri::HTML5.parse(open_url(new_url.to_s, raise: true))

# date
datetime_text = el.css('td')[0].text.gsub("\n", '').gsub("\t", '').strip
if datetime_text.include?('(')
datetime_list = datetime_text.split('(')
date_text = datetime_list[0].strip
time_text = datetime_list[1].gsub(")", '').strip
time_list = time_text.split('-')
start_hours = time_list[0]
end_hours = time_list[1]
else
date_text = datetime_text
start_hours = 9
end_hours = 17
end
date_s = date_text.split('/')
if date_s.length == 1
next
end
start_date = Time.zone.parse("#{date_s[1]}/#{date_s[0]} #{start_hours}:00")
end_date = Time.zone.parse("#{date_s[1]}/#{date_s[0]} #{end_hours}:00")
if start_date < Time.zone.now - 2.months
start_date += 1.year
end_date += 1.year
event = OpenStruct.new
event.url = new_url.to_s
event.title = event_page.css('.uhasselt-container > .column > div > h1.heading').first.text.strip
time_strs = event_page.css('.uhasselt-container > .column > .extra-agenda-info > .info-row').first.text.strip.split('-')
event.start = DateTime.parse(time_strs[0].strip)
if time_strs.length > 1
event.end = DateTime.parse(time_strs[1].strip)
end

event.start = start_date
event.end = end_date
event.set_default_times

# location
location = el.css('td')[5].css('h5').map{ |e| e.text.strip}.join(' ')
event.venue = location

# title & description
title_el = el.css('td')[1]
url = title_el&.css('a')&.first&.get_attribute('href')&.gsub(' ', '') || uhasselt_url
if title_el&.text
title = title_el.text
elsif title_el&.css('a')&.first&.text
title = title_el&.css('a')&.first&.text
elsif title_el&.css('a')&.first&.css('#text').length
title = title_el&.css('a')&.first&.css('#text').map{ |e| e.text.strip}.join(' ')
else
next
end
# weird case where multiple types of space character where used in same title
event.title = title.gsub("\n\t\t\t", ' ').strip.chars.map{ |ch| ch.ord == 160 ? ' ' : ch }.join('')
hash = "#{event.title}#{event.start.strftime('%y%m%d')}#{event.venue}".gsub(' ', '').strip.chars.filter{ |ch| (ch.to_i(36) > 0) || (ch == '0') }.join('')
event.url = url.split('#').first + '#' + hash

event.description = event_page.css('.uhasselt-container > h2#anch-content').first.parent.css('.paragraph').first.text.strip
event.venue = event_page.css('.uhasselt-container > .column > .extra-agenda-info > .info-row').last.text.strip

event.source = 'UHasselt'
event.timezone = 'Amsterdam'

Expand Down
16 changes: 8 additions & 8 deletions test/unit/ingestors/taxila/uhasselt_ingestor_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ class UhasseltIngestorTest < ActiveSupport::TestCase
ingestor = Ingestors::Taxila::UhasseltIngestor.new

# check event doesn't
new_title = 'Data Management Plan - DMP writing session'
new_url = 'http://bibliotheek.uhasselt.be/nl/een-woordje-uitleg-over-onze-opleidingen#DataManagementPlanDMPwritingsession230606CampusDiepenbeekE140'
new_title = 'Tidy data part 1: How to structure your data | March 2026'
new_url = 'https://www.uhasselt.be/en/university-library/research/research-data-management/training-calendar-rdm/tidy-data-part-1-how-to-structure-your-data-march-2026'
refute Event.where(title: new_title, url: new_url).any?

# run task
assert_difference 'Event.count', 14 do
assert_difference 'Event.count', 10 do
freeze_time(2023) do
VCR.use_cassette("ingestors/uhasselt") do
ingestor.read(source.url)
Expand All @@ -32,9 +32,9 @@ class UhasseltIngestorTest < ActiveSupport::TestCase
end
end

assert_equal 14, ingestor.events.count
assert_equal 10, ingestor.events.count
assert ingestor.materials.empty?
assert_equal 14, ingestor.stats[:events][:added]
assert_equal 10, ingestor.stats[:events][:added]
assert_equal 0, ingestor.stats[:events][:updated]
assert_equal 0, ingestor.stats[:events][:rejected]

Expand All @@ -47,9 +47,9 @@ class UhasseltIngestorTest < ActiveSupport::TestCase
# check other fields
assert_equal 'UHasselt', event.source
assert_equal 'Amsterdam', event.timezone
assert_equal Time.zone.parse('Tue, 06 Jun 2023 13:00:00.000000000 UTC +00:00'), event.start
assert_equal Time.zone.parse('Tue, 06 Jun 2023 15:00:00.000000000 UTC +00:00'), event.end
assert_equal 'Campus Diepenbeek E140', event.venue
assert_equal Time.zone.parse('Mon, 16 Mar 2026 09:00:00.000000000 UTC +00:00'), event.start
assert_equal Time.zone.parse('Mon, 16 Mar 2026 17:00:00.000000000 UTC +00:00'), event.end
assert_equal 'campus Diepenbeek', event.venue
refute event.online?
end
end
Loading