class SpiderBot::Crawl

Public Class Methods

new(url, options = {}) click to toggle source

Initialize a new Spider Bot

@param url [String] the spider target website curl @param options [Hash] the spider crawl configurate options

@option options :type [Symbol] the request body format, `:html` or `:json` @option options :headers [Hash] the custom request headers @option options :path, [String] the custom request path @option options :query [Hash] the request query @option options :user_agent [String] the custom request user agent @option options :source [Boolean] @option options :data [Proc] get crawl data list in body @option options :first [Proc] get crawl data list first item @option options :last [Porc] get crawl data list last item @option options :encode [String] custom request encode

# File lib/spider_bot/crawl.rb, line 19
def initialize(url, options = {})
  parse_uri = URI.parse url
  @uri = parse_uri.scheme + "://" + parse_uri.host
  
  # don't add 443 port append to url when access https website
  if !["80", "443"].include?(parse_uri.port.to_s)
    @uri = @uri + ":" + parse_uri.port.to_s 
  end
  
  @origin_path = parse_uri.path || "/"
  
  @origin_type = options[:type] || :html
  @origin_headers = options[:headers] || {}
  @origin_query = options[:query] || {}

  @origin_user_agent = options[:user_agent] || "Mac Safari"
  @origin_source = options[:source] || false

  @origin_data = options[:data]
  @origin_first = options[:first]
  @origin_last = options[:last]

  @origin_encode = options[:encode]
  
  @page_path = @origin_path
  @page_type = @origin_type
  @page_headers = @origin_headers || {}
  @page_query = {}

  @page_data = @origin_data
  @page_first = @origin_first
  @page_last = @origin_last

  @page_start = 1
  @page_add = 1
  @page_expire = 10
  @page_sleep = 0
  
  @paginate_last = nil
  @paginate_error = 0
  @paginate_type = :html
  @paginate_path = ""
  @paginate_query = {}
  
  @connection = Http::Client.new do |http| 
    http.url= @uri
    http.user_agent= @origin_user_agent
    http.headers= @origin_headers
  end
end

Public Instance Methods

crawl_data(&block) click to toggle source

Process crawl data

@param a [block]

# File lib/spider_bot/crawl.rb, line 74
def crawl_data(&block)
  @paginate_num = @page_start
  
  catch :all do
    begin
      crawl_response = crawl_request(@origin_path, @origin_query, @origin_type, @origin_data, @origin_first, @origin_last, &block)
      return crawl_response if !block_given?
      process_response(crawl_response, &block)
    rescue Exception => e
      handle_error(e)
      crawl_data(&block)
    end
    
    @paginate_error = 0
    return if @page_query.blank? && @page_path == @origin_path
    
    crawl_paginate(&block) 
  end
end

Private Instance Methods

break_all() click to toggle source
# File lib/spider_bot/crawl.rb, line 205
def break_all
  throw :all
end
crawl_paginate(&block) click to toggle source
# File lib/spider_bot/crawl.rb, line 96
def crawl_paginate(&block)
  @page_headers.merge({"X-Requested-With" => "XMLHttpRequest"}) if @page_type.to_s == 'json'
  @connection.headers = @page_headers
  begin
    loop do
      real_page_num  = (@page_start == 0 && @page_add > 1) ? (@paginate_num / @page_add) + 1 : @paginate_num
      if defined?($expire_num)
        if $expire_num > 1
          break if real_page_num > $expire_num.to_i
        else
          break if real_page_num > 1
        end
      end
      #  break crawl_paginate current page number more than @page_expire and @page_expre
      if real_page_num > @page_expire && @page_expire != -1
        SpiderBot.logger.info "Crawl finished..."
        SpiderBot.logger.info "Finish reson: The current page more than setting paginate expire"
        break 
      end
      
      sleep(@page_sleep) if @page_sleep > 0
      
      path = @page_path.to_s % {page: @paginate_num}
      query_str = @page_query.to_s % { page: @paginate_num, last: @paginate_last, first: @paginate_first }
      query = eval(query_str)
      
      crawl_response = crawl_request(path, query, @page_type, @page_data, @page_first, @page_last, &block)
      process_response(crawl_response, &block)
    end
  rescue Exception => e
    @paginate_num += @page_add if @paginate_error == 2
    handle_error(e)
    crawl_paginate(&block)
  end
end
crawl_request(path, query, type, data, first, last, &block) click to toggle source
# File lib/spider_bot/crawl.rb, line 132
def crawl_request(path, query, type, data, first, last, &block)
  @paginate_path = path
  @paginate_query = query
  
  response = @connection.get(path, query)

  return if !response
  return if response.status != 200

  options = { encode: @origin_encode } if @origin_encode

  if @origin_source && !block_given?
    return response.body(options) 
  end

  if type.to_s == "html"
    @paginate_type = :html
    body = Nokogiri::HTML response.body(options)
  elsif type.to_s == "json"
    @paginate_type = :json
    body = MultiJson.load response.body(options)
  else
    @paginate_type = response.parser
    body = response.parsed
  end
  
  return if body.nil?
  return body if data.nil?

  body_data = data.call(body) if data
  @paginate_first = first.call(body_data, body) if first
  @paginate_last = last.call(body_data, body) if last

  return body_data 
end
get_page_url() click to toggle source
# File lib/spider_bot/crawl.rb, line 168
def get_page_url
  if !@paginate_query.blank?
    @uri + @paginate_path + "?" + @paginate_query.map{ |k,v| "#{k}=#{v}" }.join("&")
  else
    @uri + @paginate_path
  end
end
handle_error(error) click to toggle source
# File lib/spider_bot/crawl.rb, line 209
def handle_error(error)
  SpiderBot.logger.error "crawling url #{ get_page_url } has error..."
  SpiderBot.logger.error error.to_s
  
  break_all if @paginate_error == 3
  @paginate_error += 1
  
  sleep( 60 * @paginate_error )
end
option(name, params) click to toggle source
# File lib/spider_bot/crawl.rb, line 200
def option(name, params)
  raise "set paginate options has error" if %i(path query first last type data start add expire sleep).include?(name.to_s)
  eval("@page_#{name} = params")
end
paginate(&block) click to toggle source

set crawl paginate settings

@example

paginate do
  option :path, '/path'
  option :query, {page: "%{page}"}
  option :first, Proc.new{|data| data.css("#item")}
  option :last, Proc.new{|data| data.css("#item")}
  option :type, :html
  option :data, Proc.new{|body| body.css("#item")}
  option :start, 1
  option :add, 1
  option :expire, 100
  option :sleep, 100
end
# File lib/spider_bot/crawl.rb, line 196
def paginate(&block)
  block.call
end
process_response(response) { |response, paginate_num, paginate_type| ... } click to toggle source

Print error infomation with http client response blank

@param response [Object] The Faraday connection builder

# File lib/spider_bot/crawl.rb, line 223
def process_response(response, &block)
  if response.blank?
    SpiderBot.logger.info "Crawl finished..."
    SpiderBot.logger.info "Finish reson: Crawl response body is blank..." 
    break_all
  end
  SpiderBot.logger.info "crawling page for #{get_page_url}"
  yield response, @paginate_num, @paginate_type
  @paginate_num += @page_add
  @paginate_error = 0
end
set_paginate_headers(arg) click to toggle source
# File lib/spider_bot/crawl.rb, line 176
def set_paginate_headers(arg)
  @page_headers = arg || {}
end