class Biblionet::Crawlers::Base

Public Class Methods

new(options = {}) click to toggle source
# File lib/bookshark/crawlers/base.rb, line 7
def initialize(options = {})
  @folder             = options[:folder]            ||= 'lib/bookshark/storage/html_base_pages'
  @base_url           = options[:base_url]          ||= 'http://www.biblionet.gr/base/'
  @page_type          = options[:page_type]         ||= 'base'
  @extension          = options[:extension]         ||= '.html'
  @save_only_content  = options[:save_only_content] ||= false
  @start              = options[:start]             ||= 1
  @finish             = options[:finish]            ||= 10000
  @step               = options[:step]              ||= 1000
end

Public Instance Methods

spider() { |url_to_download, file_to_save| ... } click to toggle source
# File lib/bookshark/crawlers/base.rb, line 18
def spider
  start  = @start  + @step - 1
  finish = @finish
  
  start.step(finish, @step) do |last|  
    first     = last - @step + 1
    subfolder = (last/@step - 1).to_s          
    slash     = (@page_type != 'bg_record') ? '/' : ''
    path      = "#{@folder}/#{subfolder}/"

    # Create a new directory (does nothing if directory exists)
    # FileUtils.mkdir_p path

    first.upto(last) do |id|
      file_to_save    = "#{path}#{@page_type}_#{id}#{@extension}"
      url_to_download = "#{@base_url}#{id}#{slash}"

      yield(url_to_download, file_to_save)
      # downloader = Biblionet::Core::Base.new(url_to_download)
      # downloader.save_page(file_to_save) unless downloader.page.nil?

    end
  end
end