class Object

Public Instance Methods

export_to_file(hash_data, dir) click to toggle source
# File lib/history_scraper.rb, line 63
# Serialises +hash_data+ to JSON, writes it to the path given in +dir+ and
# prints a confirmation line.
#
# @param hash_data [#to_json] data to serialise
# @param dir [String] path of the file to write (despite the name, this is
#   handed to File.write as the file path)
# @return [nil] the return value of +puts+
def export_to_file(hash_data, dir)
  payload = hash_data.to_json
  File.write(dir, payload)
  puts "Results stored in #{dir}"
end
extract_from(day, month) click to toggle source
# File lib/history_scraper.rb, line 36
# Downloads the English Wikipedia article for a calendar day and extracts its
# summary paragraph plus the Events, Births and Deaths lists.
#
# @param day [#to_s] day of month, interpolated into the article title
# @param month [#to_s] month name, interpolated into the article title
# @return [Array] +[description, events, births, deaths]+ where +description+
#   is the text of the first paragraph under +#mw-content-text+ containing
#   "<month> <day>" (nil when none matches) and the three lists are the
#   +parse_ul+ results with nil entries removed
# @raise [NoMethodError] when the page has no element with id +Events+,
#   +Births+ or +Deaths+ (the lookup yields nil and +parent+ is called on it)
def extract_from(day, month)
  url = "https://en.wikipedia.org/wiki/#{month}_#{day}"

  # URI.open instead of Kernel#open: open-uri no longer patches Kernel#open
  # for URLs on Ruby 3.0+. The block form closes the handle once parsed.
  html = URI.open(url) { |page| Nokogiri::HTML(page) }

  description = html.css('#mw-content-text p')
                    .map(&:text)
                    .find { |text| text.include?("#{month} #{day}") }

  events, births, deaths = %w[Events Births Deaths].map do |section|
    # The list is taken from the element that follows the parent of the
    # element carrying the section id.
    parse_ul(html.css("##{section}")[0].parent.next_element).compact
  end

  [description, events, births, deaths]
end
form_date(day_index, month_index) click to toggle source
# File lib/history_scraper.rb, line 31
# Turns numeric day and month indices into a day number and a month name.
#
# The pair is parsed with Date._strptime using the '%d/%m' format, so only the
# individual fields are read; no Date object is built.
#
# @param day_index [Integer] day of month
# @param month_index [Integer] month number
# @return [Array] +[day, month_name]+, the month name taken from
#   Date::MONTHNAMES
# @raise [NoMethodError] when the input does not parse (Date._strptime
#   returns nil)
def form_date(day_index, month_index)
  parsed = Date._strptime("#{day_index}/#{month_index}", '%d/%m')
  mday, mon = parsed.values_at(:mday, :mon)
  [mday, Date::MONTHNAMES[mon]]
end
parse_keywords(li) click to toggle source
# File lib/history_scraper.rb, line 59
# Collects the links found inside a list item.
#
# @param li [#css] node that answers +css('a')+ with a collection of anchors
# @return [Array<Hash>] one +{ title:, href: }+ hash per anchor, read from the
#   anchor's 'title' and 'href' attributes (nil when an attribute is absent)
def parse_keywords(li)
  anchors = li.css('a')
  anchors.map do |anchor|
    { title: anchor['title'], href: anchor['href'] }
  end
end
parse_ul(ul) click to toggle source
# File lib/history_scraper.rb, line 50
# Converts every +li+ under +ul+ into a +{ year:, data:, kw: }+ hash.
#
# An item's text is split on ' – ' (space, en dash, space): the first piece
# becomes +:year+, the remaining pieces are re-joined into +:data+, and +:kw+
# holds the item's links as returned by +parse_keywords+.
#
# @param ul [#css] node that answers +css('li')+ with the list items
# @return [Array<Hash, nil>] one entry per item; nil for items whose text
#   does not contain the separator (callers are expected to compact)
def parse_ul(ul)
  separator = ' – '

  ul.css('li').map do |item|
    year, *rest = item.text.split(separator)

    if year.nil? || rest.empty?
      nil
    else
      { year: year, data: rest.join(separator), kw: parse_keywords(item) }
    end
  end
end
scrap_year(output_dir) click to toggle source
# File lib/history_scraper.rb, line 6
# Scrapes every calendar day of the year and exports the collected data.
#
# Each valid month/day combination (February 29 included) is turned into a
# day number and month name by +form_date+, scraped with +extract_from+, and
# stored under a +:"<month>-<day>"+ symbol key. The accumulated hash is then
# passed to +export_to_file+.
#
# @param output_dir [String] destination handed through to +export_to_file+
# @return [Object] whatever +export_to_file+ returns
def scrap_year(output_dir)
  result = {}
  (1..12).each do |month_index|
    (1..31).each do |day_index|
      # Skip combinations that are not real dates (February 30, April 31, ...)
      # instead of requesting pages for them. A leap year is the reference so
      # that February 29 is kept.
      next unless Date.valid_date?(2000, month_index, day_index)

      begin
        day, month = form_date(day_index, month_index)

        puts "Scraping #{month} #{day}..."

        description, events, births, deaths = extract_from(day, month)

        result["#{month}-#{day}".to_sym] = {
          description: description, events: events, births: births, deaths: deaths
        }
      rescue NoMethodError
        # Raised when an expected page element is missing (a method ends up
        # being called on nil while extracting); the day is skipped.
        puts 'It seems this date does not have any episodes.'
      end
    end
  end

  export_to_file(result, output_dir)
end