class Object

Constants

BASE_URL
DEFAULTS
EXTENSION
FOLDER

# Parse a locally saved HTML page. The block form of File.open closes the
# file handle as soon as Nokogiri has finished reading from it.
page = File.open("raw_html_pages/book_45454.html") { |file| Nokogiri::HTML(file) }

puts page.class # => Nokogiri::HTML::Document
puts page

Public Instance Methods

crawl_and_save(options={}) click to toggle source
# File lib/bookshark/crawlers/author_crawler.rb, line 17
#
# Downloads every author page with an id in first_id..last_id and saves the
# pages to disk, grouped into numbered subfolders of +step+ ids each.
#
# options - Hash merged over DEFAULTS. Keys read by this method:
#           :first_id  - first id to download.
#           :last_id   - last id to download (inclusive).
#           :step      - number of ids per batch / subfolder.
#           :folder    - root folder under which subfolders are created.
#           :extension - suffix appended to every saved file name.
#           :base_url  - prefix of the url; the id and a "/" are appended.
#
# A range whose size is not a multiple of :step is still crawled up to
# :last_id: the final, shorter batch gets its own subfolder, numbered as if
# the batch were complete.
#
# Returns the upper bound of the first batch (the receiver of Integer#step).
def crawl_and_save(options={})
  options = DEFAULTS.merge(options)

  step            = options[:step]
  last_id         = options[:last_id]
  first_batch_end = options[:first_id] + step - 1

  # Extending the limit by (step - 1) makes #step yield one extra batch
  # boundary whenever ids remain after the last complete batch; when the range
  # divides evenly the boundaries are exactly first_batch_end, ..., last_id.
  first_batch_end.step(last_id + step - 1, step) do |batch_end|
    batch_start = batch_end - step + 1
    # Never go past the requested last id inside a trailing partial batch.
    upper_id    = [batch_end, last_id].min
    # Numbered from the unclamped boundary so a partial batch cannot collide
    # with the subfolder of the preceding complete batch.
    subfolder   = (batch_end / step - 1).to_s
    path        = "#{options[:folder]}/#{subfolder}/"

    # Create a new directory (does nothing if directory exists)
    FileUtils.mkdir_p path

    batch_start.upto(upper_id) do |id|
      file_to_save    = "#{path}author_#{id}#{options[:extension]}"
      url_to_download = "#{options[:base_url]}#{id}/"

      downloader = Biblionet::Core::Base.new(url_to_download)
      # Skip ids for which the downloader has no page.
      next if downloader.page.nil?

      downloader.save_page(file_to_save)
    end
  end
end