class EnterRockstar::Scraper::Wikia

Lyrics scraper for lyrics.wikia.com.

Constants

DATA_DIR
SLEEP_BETWEEN_REQUESTS
START_HOST

Attributes

category_name[R]
output[R]
tree[R]
url[R]

Public Class Methods

new(category_name: 'heavy_metal', url: '/wiki/Category:Genre/Heavy_Metal', data_dir: 'lyrics_data') click to toggle source
# File lib/enter_rockstar/scraper/wikia.rb, line 16
# Sets up a scraper for one lyrics category.
#
# category_name - label stored in @category_name and embedded in the name of
#                 the category index file
# url           - category path stored in @url
# data_dir      - directory part of the category index path kept in @output
def initialize(category_name: 'heavy_metal', url: '/wiki/Category:Genre/Heavy_Metal', data_dir: 'lyrics_data')
  @category_name = category_name
  @url = url
  # index file lives at <data_dir>/wikia_<category_name>.json.gz
  @output = format('%s/wikia_%s.json.gz', data_dir, category_name)
  @tree = {}
end

Public Instance Methods

load_saved_json() click to toggle source
# File lib/enter_rockstar/scraper/wikia.rb, line 54
# Replaces @tree with the result of parsing, as JSON, whatever
# EnterRockstar::Utils.load_json returns for the @output path.
#
# Returns the newly loaded tree.
def load_saved_json
  raw_json = EnterRockstar::Utils.load_json(@output)
  @tree = JSON.parse(raw_json)
end
parse_all_pages(start_index: 0) click to toggle source
# File lib/enter_rockstar/scraper/wikia.rb, line 64
# Walks @tree and hands every recorded band and album page to parse_page,
# creating the target directory for each page first.
#
# start_index - zero-based position of the first @tree entry to process;
#               entries before it are skipped
def parse_all_pages(start_index: 0)
  @tree.each_with_index do |(band, pages), position|
    next unless position >= start_index

    puts "#{position}: #{band}"

    pages.each do |label, page_url|
      # the band's own page goes into the band directory itself,
      # every other entry gets a sub-directory named after its key
      segments = [DATA_DIR, @category_name, band]
      segments << label unless label == 'band_url'
      target_dir = segments.join('/')
      FileUtils.mkdir_p target_dir

      parse_page(page_url, target_dir)
    end
  end
end
parse_category(url: nil, test_limit: false) click to toggle source
# File lib/enter_rockstar/scraper/wikia.rb, line 23
# Crawls the category listing page by page and records every band and album
# link in @tree: @tree[band]['band_url'] for a band page and
# @tree[band][album] for an album page.
#
# url        - absolute URL of the first page to fetch; defaults to
#              START_HOST + @url
# test_limit - when truthy, stop after the first page
#
# Returns nil.
def parse_category(url: nil, test_limit: false)
  page_url = url || (START_HOST + @url)

  # follow the "next" link in a loop instead of recursing once per page
  while page_url
    # block form closes the downloaded body as soon as it has been parsed
    doc = URI.parse(page_url).open { |html| Nokogiri::HTML(html) }

    # get all category member links and sort them by band and album
    doc.css('li.category-page__member a').each do |category_link|
      title = category_link.attr('title')
      # links without a usable title cannot be filed under a band
      next if title.nil? || title.empty? || title.include?('Category:')

      # split on the first colon only, so an album title that itself
      # contains a colon is kept whole instead of being truncated
      band, album = title.split(':', 2)
      @tree[band] ||= {}

      if album.nil?
        @tree[band]['band_url'] = category_link.attr('href')
      else
        @tree[band][album] = category_link.attr('href')
      end
    end

    return if test_limit # test only first page scraping so it's easier

    print '.'
    # get next page if one exists and parse that
    page_url = doc.css('a.category-page__pagination-next').first&.attr('href')
  end
end
parse_page(url, dirname) click to toggle source
# File lib/enter_rockstar/scraper/wikia.rb, line 79
# Downloads one band or album page and stores the lyrics of every song
# linked from it.
#
# A page without album headlines is treated as a single album page and its
# songs are written straight into dirname. Otherwise one sub-directory per
# album headline is created under dirname.
#
# url     - page path, appended to START_HOST
# dirname - directory that receives the song files or album sub-directories
def parse_page(url, dirname)
  sleep SLEEP_BETWEEN_REQUESTS
  # block form closes the downloaded body as soon as it has been parsed
  doc = URI.parse(START_HOST + url).open { |html| Nokogiri::HTML(html) }
  album_headlines = doc.css('h2 span.mw-headline a')

  if album_headlines.empty?
    # single album page listed on the category
    # NOTE(review): MediaWiki skins usually expose mw-content-text as an id,
    # not a class - confirm this selector still matches the live markup
    save_songs_from_links(doc.css('div.mw-content-text ol li a'), dirname)
  else
    album_headlines.each do |album|
      puts album.text
      # some band pages have extra albums that are not listed in the category page for some reason
      album_dirname = [dirname, album.text].join('/')
      FileUtils.mkdir_p album_dirname

      # get song pages
      save_songs_from_links(album.parent.parent.css('+ div + ol > li a'), album_dirname)
    end
  end
end

# Fetches and saves the lyrics behind every link that carries an href, then
# ends the progress line printed while downloading.
private def save_songs_from_links(song_links, dirname)
  song_links.each do |song|
    href = song.attr('href')
    next unless href

    lyrics = parse_song(href, dirname, song.text)
    save_song("#{dirname}/#{song.text}.txt", lyrics) unless lyrics.nil?
  end
  puts
end
parse_song(url, dirname, songname) click to toggle source
# File lib/enter_rockstar/scraper/wikia.rb, line 112
# Downloads the lyrics of a single song page.
#
# url      - song page path, appended to START_HOST; links starting with
#            'http' are skipped
# dirname  - directory the song file belongs to
# songname - song title; "#{dirname}/#{songname}.txt" is the target file
#
# Returns the lyric box content as text with tags stripped and <br>
# separators turned into newlines. Returns nil when the link is absolute,
# the target file already exists, the page has no lyric box, or the first
# link in the box points at the Instrumental category.
def parse_song(url, dirname, songname)
  return if url.start_with? 'http'

  songfile = "#{dirname}/#{songname}.txt"
  # a song name may contain '/', so the directory is derived from the full path
  FileUtils.mkdir_p File.dirname(songfile)
  return if File.exist?(songfile)

  print '.'
  sleep SLEEP_BETWEEN_REQUESTS
  # block form closes the downloaded body as soon as it has been parsed
  doc = URI.parse(START_HOST + url).open { |html| Nokogiri::HTML(html) }

  lyrics = doc.css('div.lyricbox').first
  return if lyrics.nil?
  return if lyrics.css('a').first&.attr('href') == '/wiki/Category:Instrumental'

  # split/join (rather than a plain substitution) also drops trailing <br> tags
  lyrics.inner_html.split('<br>').join("\n").gsub(%r{</?[^>]*>}, '')
end
print_indexed_tree() click to toggle source
save_category() click to toggle source
# File lib/enter_rockstar/scraper/wikia.rb, line 50
# Converts @tree to JSON and passes it, together with the @output path, to
# EnterRockstar::Utils.save_file.
def save_category
  serialized_tree = @tree.to_json
  EnterRockstar::Utils.save_file(@output, serialized_tree)
end
save_song(songfile, contents) click to toggle source
# File lib/enter_rockstar/scraper/wikia.rb, line 133
# Passes the song file path and its contents on to
# EnterRockstar::Utils.save_plain.
#
# songfile - path of the file to write
# contents - text to store in it
def save_song(songfile, contents)
  storage = EnterRockstar::Utils
  storage.save_plain(songfile, contents)
end