class MicroSpider

Attributes

actions[RW]
broken_paths[R]
current_location[R]
delay[R]
excretion[R]
logger[RW]
paths[R]
recipe[RW]
selector[RW]
skip_set_entrance[RW]
timeout[RW]
visited_paths[R]

Public Class Methods

new(excretion = nil, selector: :css) click to toggle source
# File lib/micro_spider.rb, line 38
def initialize(excretion = nil, selector: :css)
  @selector         = selector
  @paths            = []
  @actions          = []
  @setted_variables = {}
  @timeout          = 120
  @status           = 'pending'
  @excretion        = excretion || SpiderCore::Excretion.new
  @logger           = Logger.new(STDOUT)
  @visited_paths    = Set.new
  @broken_paths     = []
end

Public Instance Methods

click(locator, opts = {}, &block) click to toggle source

Click the locator. This will trigger visit action and change current location. @params locator [String] the text or id of the link.

# File lib/micro_spider.rb, line 104
def click(locator, opts = {}, &block)
  actions << lambda {
    path = find_link(locator, opts)[:href] rescue nil
    raise SpiderCore::ClickPathNotFound, "#{locator} not found" if path.nil?
    if block_given?
      spider = self.spawn
      spider.entrance(path)
      spider.learn(&block)
      put(
        "click::#{path}", spider.crawl
      )
    else
      visit(path)
    end
  }
end
completed?() click to toggle source
# File lib/micro_spider.rb, line 312
def completed?
  @status == 'completed'
end
crawl(&block) click to toggle source
# File lib/micro_spider.rb, line 199
def crawl(&block)
  return excretion if completed?

  @paths.compact!
  path = nil
  loop do
    path = @paths.shift
    break if path.nil?
    break unless @visited_paths.include?(path)
  end

  if path.nil?
    complete
    return excretion
  end

  learn(@recipe) if @actions.empty?

  begin
    visit(path)
    @status = 'inprogress'
  rescue Timeout::Error => err
    @broken_paths << path
    logger.fatal("Timeout!!! execution expired when visit `#{path}`")
    logger.fatal(err)
  rescue SystemExit, Interrupt
    logger.fatal("SystemExit && Interrupt")
    @status = 'exit'
    exit!
  rescue Exception => err
    @broken_paths << path
    logger.fatal("Caught exception when visit `#{path}`")
    logger.fatal(err)
    logger.fatal(err.backtrace.join("\n"))
  else
    @visited_paths << path
    execute_actions
    #yield(@current_location) if block_given?
    @excretion = @excretion.put(path, @current_location)
  ensure
    @actions = []
    @skip_set_entrance = true
    crawl(&block)
  end

  excretion
end
create_action(name, &block) click to toggle source

Spider can create custom action when it is crawling. @param name [String] the name of action @param block [Proc] the actions

@example

spider = MicroSpider.new

spider.create_action :save do |result|
  SomeClass.save(result)
end

spider.save
# File lib/micro_spider.rb, line 269
def create_action(name, &block)
  action = proc { actions << lambda { block.call(@excretion) } }
  metaclass.send :define_method, name, &action
end
delay=(sec) click to toggle source

The seconds between each two request.

@param sec [Float]

# File lib/micro_spider.rb, line 54
def delay=(sec)
  raise ArgumentError, 'Delay sec can not be a negative number.' if sec.to_f < 0
  @delay = sec.to_f
end
entrance(*path_or_paths) click to toggle source

This will be the first path for spider to visit. If more than one entrance, the spider will crawl theme one by one. @param path_or_paths [String] one or more entrances

@example

spider = MicroSpider.new
spider.site('http://google.com')
spider.entrance('/a')
spider.entrance('/b')
# File lib/micro_spider.rb, line 172
def entrance(*path_or_paths)
  return if @skip_set_entrance
  @paths += path_or_paths
end
entrance_on(pattern, path: '/', attr: :href) click to toggle source

Sometimes the entrances are on the page. @param path [String] path to visit @param pattern [String, Regexp] links pattern

@example

spider = MicroSpider.new
spider.entrance_on('.links a')
spider.entrance_on('.links a', path: '/a')
# File lib/micro_spider.rb, line 191
def entrance_on(pattern, path: '/', attr: :href)
  return if @skip_set_entrance

  visit(path)
  entrances = scan_all(pattern).map{ |element| element[attr] }
  @paths += entrances.to_a
end
execute_actions() click to toggle source
# File lib/micro_spider.rb, line 274
def execute_actions
  actions.delete_if { |action|
    begin
      Timeout::timeout(@timeout) { action.call }
    rescue Timeout::Error => err
      logger.fatal('Timeout!!! execution expired when execute action')
      logger.fatal(err.message)
      logger.fatal(err.backtrace.inspect)
      @visited_paths.pop
      break
    rescue SpiderCore::ClickPathNotFound => err
      logger.fatal(err.message)
      logger.fatal(err.backtrace.inspect)
      @visited_paths.pop
      break
    end
  }
end
get(field) click to toggle source
# File lib/micro_spider.rb, line 320
def get(field)
  @_deep_fetch ||= excretion.extend Hashie::Extensions::DeepFind
  result = @_deep_fetch.deep_find_all(field.to_s)
  return if result.nil?
  result.length == 1 ? result.pop : result
end
learn(recipe = nil, &block) click to toggle source

Teach the spider behaviors and it will repeat to the end. @param recipe [String, Proc] the recipe be learned.

@example

spider = MicroSpider.new
spider.learn do
  entrance 'http://google.com'
end
spider.crawl

@example

spider.learn("entrance 'http://google.com'")
spider.crawl

@example

recipe = lambda {
  entrance 'http://google.com'
}
spider.learn(recipe)
spider.crawl
# File lib/micro_spider.rb, line 142
def learn(recipe = nil, &block)
  if block_given?
    instance_eval(&block)
    @recipe = block
  elsif recipe.is_a?(Proc)
    instance_eval(&recipe)
    @recipe = recipe
  elsif recipe.is_a?(String)
    instance_eval(recipe)
    @recipe = recipe
  else
    self
  end
end
metaclass() click to toggle source
# File lib/micro_spider.rb, line 316
def metaclass
  class << self; self; end
end
page() click to toggle source

The default page is Capybara.current_session. Share one page may cause difficult issue, so here i separate it.

# File lib/micro_spider.rb, line 329
def page
  @page ||= Capybara::Session.new(Capybara.mode, Capybara.app)
end
reset() click to toggle source
# File lib/micro_spider.rb, line 247
def reset
  return unless completed?
  @paths            = visited_paths.to_a
  @status           = 'pending'
  @excretion        = nil
  @visited_paths    = Set.new
  @current_location = nil
end
set(name, value) click to toggle source

Set a variable. You can use it later.

@param name [String] the variable name @param value [String] the variable value @param opts [Hash] the options. can set selector with css or xpath

@example Set a variable

spider = MicroSpider.new
spider.set :id, '645'
spider.set :table, '.tb a', selector: :css
spider.set :table, '.tb a', selector: :css do |e|
  e['src']
end
# File lib/micro_spider.rb, line 90
def set(name, value)
  @setted_variables[name.to_s] = value
end
set_on(name, pattern) { |element| ... } click to toggle source
# File lib/micro_spider.rb, line 94
def set_on(name, pattern, &block)
  actions << lambda {
    element = scan_first(pattern)
    @setted_variables[name.to_s] = block_given? ? yield(element) : handle_element(element)
  }
end
site(url) click to toggle source
# File lib/micro_spider.rb, line 157
def site(url)
  return if @site
  Capybara.app_host = @site = url
end
spawn(&block) click to toggle source

@example

spider = MicroSpider.new
kid = spider.spawn

or

kid = spider.spawn do
  ...
  ...
end
# File lib/micro_spider.rb, line 303
def spawn(&block)
  spider         = self.class.new
  spider.logger  = logger
  spider.timeout = timeout
  spider.site(@site)
  spider.learn(&block) if block_given?
  spider
end
suicide() click to toggle source

Because we don't share the page, the connect may or maynot be killd, it will eat too much mem. Make this spider instance suicide. For now, specially for `capybara-webkit`

# File lib/micro_spider.rb, line 336
def suicide
  if Capybara.mode.to_s == 'webkit'
    @page.driver.browser.instance_variable_get(:@connection).send :kill_process
  end
  @page = nil
end
visit(path) click to toggle source

Visit the path.

@param path [String] the path to visit, can be absolute path or relative path.

@example Visit a path

spider = MicroSpider.new
spider.visit('/example')
spider.visit('http://google.com')
Calls superclass method
# File lib/micro_spider.rb, line 68
def visit(path)
  raise ArgumentError, "Path can't be nil or empty" if path.nil? || path.empty?
  sleep_or_not
  logger.info "Begin to visit #{path}."
  super(path)
  @current_location = SpiderCore::Excretion['_path' => path]
  logger.info "Current location is #{path}."
end
with(pattern, path:) { |element| ... } click to toggle source
# File lib/micro_spider.rb, line 177
def with(pattern, path:, &block)
  visit(path)
  scan_all(pattern).map{ |element| yield(element) }
end

Protected Instance Methods

complete() click to toggle source
# File lib/micro_spider.rb, line 353
def complete
  @status = 'completed'
  suicide
end
sleep_or_not() click to toggle source
# File lib/micro_spider.rb, line 345
def sleep_or_not
  if delay && delay > 0
    logger.info "Nedd sleep #{delay} sec."
    sleep(delay)
    logger.info 'Wakeup'
  end
end