class GildiaComicsCrawler::ComicCrawler

Constants

FIRST_BLOCK_SECTIONS
SECOND_BLOCK_SECTIONS

Public Class Methods

new(link) click to toggle source
# File lib/gildia_comics_crawler/comic_crawler.rb, line 5
# Creates a crawler bound to a single page.
#
# @param link the value stored in @link; #crawl hands it to +download+
def initialize(link)
  @link = link
end

Public Instance Methods

crawl() click to toggle source
# File lib/gildia_comics_crawler/comic_crawler.rb, line 9
# Downloads the page at @link and collects its metadata into a hash.
#
# The hash always contains :gildia_link, :gildia_sklep_link, :cover and
# :title, and is then extended with whatever first_block_elements and
# second_block_elements return. The shop link and the cover are optional:
# they are nil when the page has no matching node.
#
# @return [Hash] the collected data (also kept in @data)
# @raise [NoMethodError] when the page has no h1 content inside .widetext
def crawl
  @noko = download(@link)
  # NOTE(review): @downloader_uri is not assigned here; presumably +download+
  # sets it as a side effect - confirm.
  @data = { gildia_link: @downloader_uri }

  # Explicit nil checks instead of a blanket `rescue nil`, so that only a
  # missing node yields nil and unrelated errors are not silently swallowed.
  shop_anchor = @noko.css('#product a.p').first
  @data[:gildia_sklep_link] = shop_anchor && shop_anchor[:href]

  cover_image = @noko.css('.main-article-image[src^="http://images.gildia.pl/"]').last
  cover_src = cover_image && cover_image[:src]
  # Rewrite a trailing "200.jpg" to "600.jpg". The dot is escaped and the
  # pattern is anchored to the end of the string, so only that exact suffix
  # is replaced.
  @data[:cover] = cover_src && cover_src.sub(/200\.jpg\z/, '600.jpg')

  elements = @noko.css('.widetext').children
  # Unlike the two optional fields above, the title is not guarded: a page
  # without it raises, exactly as before.
  @data[:title] = elements.css('h1').children.first.text

  @data.merge!(first_block_elements(elements))
  @data.merge!(second_block_elements(elements))

  @data
end

Private Instance Methods

first_block_elements(elements) click to toggle source
# File lib/gildia_comics_crawler/comic_crawler.rb, line 31
# Groups the leading text/link nodes of the page body by section.
#
# Nodes are read in order up to (not including) the first <div> that has no
# class attribute. Only nodes named 'text' or 'a' are considered; their text
# is stripped, and blank strings and lone commas are dropped. A string that
# is a key of FIRST_BLOCK_SECTIONS starts a new section (the mapped value
# becomes the result key); every other string is appended to the section
# currently open.
#
# Strings that appear before any section label are ignored, so the result
# never contains a nil key.
#
# @param elements an enumerable of nodes responding to #name, #text and #[]
# @return [Hash] section key => Array of strings, in document order
def first_block_elements(elements)
  leading = elements.take_while { |el| !(el.name == 'div' && !el[:class]) }

  texts = leading.select { |el| %w[text a].include?(el.name) }
                 .map { |el| el.text.strip }
                 .reject { |text| text.empty? || text == ',' }

  current_section = nil
  texts.each_with_object({}) do |text, data|
    section = FIRST_BLOCK_SECTIONS[text]
    if section
      current_section = section
    elsif current_section
      (data[current_section] ||= []) << text
    end
  end
end
second_block_elements(elements) click to toggle source
# File lib/gildia_comics_crawler/comic_crawler.rb, line 65
# Extracts data from the second class-less <div> among +elements+.
#
# The stripped, non-empty texts of that div's children are read in order:
#   * a leading "Wydawnictwo:" followed by a value sets :publisher;
#   * a following "<digits>/<digits>" entry sets :publish_date_month and
#     :publish_date_year (both kept as strings);
#   * every remaining "label: value" line whose label is a key of
#     SECOND_BLOCK_SECTIONS is stored under the mapped key.
#
# Returns an empty hash when there is no second class-less div, and skips
# lines that carry a known label but no ":" separated value.
#
# @param elements a node set responding to #filter
# @return [Hash] the extracted attributes
def second_block_elements(elements)
  data = {}
  block = elements.filter('div:not([class])')[1]
  return data unless block

  lines = block.children.map { |node| node.text.strip }.reject(&:empty?)

  if lines.first == 'Wydawnictwo:'
    data[:publisher] = lines[1]
    lines.shift(2)
  end

  # \A and \z anchor the whole string; ^ and $ would also match a single
  # line inside multi-line text.
  if lines.first =~ %r{\A\d+/\d+\z}
    month, year = lines.shift.split('/')
    data[:publish_date_month] = month
    data[:publish_date_year] = year
  end

  lines.each do |line|
    label, value = line.split(':', 2)
    key = SECOND_BLOCK_SECTIONS[label]
    # value is nil when the line has no colon at all.
    data[key] = value.strip if key && value
  end

  data
end