class Spidr::Agent

Attributes

authorized[RW]

HTTP Authentication credentials

@return [AuthStore]

cookies[R]

Cached cookies

@return [CookieJar]

default_headers[R]

HTTP Headers to use for every request

@return [Hash{String => String}]

@since 0.6.0

delay[RW]

Delay in between fetching pages

@return [Integer]

failures[R]

List of unreachable URLs

@return [Set<URI::HTTP>]

history[R]

History containing visited URLs

@return [Set<URI::HTTP>]

host_header[RW]

HTTP Host Header to use

@return [String]

host_headers[R]

HTTP Host Headers to use for specific hosts

@return [Hash{String,Regexp => String}]

levels[R]

The visited URLs and their depth within a site

@return [Hash{URI::HTTP => Integer}]

limit[R]

Maximum number of pages to visit.

@return [Integer]

max_depth[R]

Maximum depth

@return [Integer]

pending_urls[R]

Queue of URLs to visit

@return [Array<URI::HTTP>]

queue[R]

Queue of URLs to visit

@return [Array<URI::HTTP>]

referer[RW]

Referer to use

@return [String]

schemes[R]

List of acceptable URL schemes to follow

sessions[R]

The session cache

@return [SessionCache]

@since 0.6.0

strip_fragments[RW]

Specifies whether the Agent will strip URI fragments

strip_query[RW]

Specifies whether the Agent will strip URI queries

visited_urls[R]

History containing visited URLs

@return [Set<URI::HTTP>]

Public Class Methods

host(name,options={},&block) click to toggle source

Creates a new agent and spiders the given host.

@param [String] name

The host-name to spider.

@param [Hash] options

Additional options. See {Agent#initialize}.

@yield [agent]

If a block is given, it will be passed the newly created agent
before it begins spidering.

@yieldparam [Agent] agent

The newly created agent.

@see initialize

# File lib/spidr/agent.rb, line 295
# Creates an agent (passing `host: name` merged into the options) and starts
# it at the root path of the given host over HTTP.
#
# @param [String] name the host-name to spider
# @param [Hash] options additional options forwarded to `new`
# @yield [agent] forwarded to `new`
# @return the value returned by the new agent's `start_at`
def self.host(name, options = {}, &block)
  spider   = new(options.merge(host: name), &block)
  root_url = URI::HTTP.build(host: name, path: '/')

  spider.start_at(root_url)
end
new(options={}) { |self| ... } click to toggle source

Creates a new Agent object.

@param [Hash] options

Additional options

@option options [Integer] :open_timeout (Spidr.open_timeout)

Optional open timeout.

@option options [Integer] :read_timeout (Spidr.read_timeout)

Optional read timeout.

@option options [Integer] :ssl_timeout (Spidr.ssl_timeout)

Optional ssl timeout.

@option options [Integer] :continue_timeout (Spidr.continue_timeout)

Optional continue timeout.

@option options [Integer] :keep_alive_timeout (Spidr.keep_alive_timeout)

Optional keep_alive timeout.

@option options [Hash] :proxy (Spidr.proxy)

The proxy information to use.

@option :proxy [String] :host

The host the proxy is running on.

@option :proxy [Integer] :port

The port the proxy is running on.

@option :proxy [String] :user

The user to authenticate as with the proxy.

@option :proxy [String] :password

The password to authenticate with.

@option options [Hash{String => String}] :default_headers

Default headers to set for every request.

@option options [String] :host_header

The HTTP Host header to use with each request.

@option options [Hash{String,Regexp => String}] :host_headers

The HTTP Host headers to use for specific hosts.

@option options [String] :user_agent (Spidr.user_agent)

The User-Agent string to send with each request.

@option options [String] :referer

The Referer URL to send with each request.

@option options [Integer] :delay (0)

The number of seconds to pause between each request.

@option options [Set, Array] :queue

The initial queue of URLs to visit.

@option options [Set, Array] :history

The initial list of visited URLs.

@option options [Integer] :limit

The maximum number of pages to visit.

@option options [Integer] :max_depth

The maximum link depth to follow.

@option options [Boolean] :robots (Spidr.robots?)

Specifies whether `robots.txt` should be honored.

@yield [agent]

If a block is given, it will be passed the newly created agent
for further configuration.

@yieldparam [Agent] agent

The newly created agent.

@see initialize_sanitizers @see initialize_filters @see initialize_actions @see initialize_events

# File lib/spidr/agent.rb, line 177
# Sets up a new agent from the given options.
#
# Reads :host_header, :host_headers, :default_headers, :user_agent, :referer,
# :delay, :limit, :max_depth, :queue, :history and :robots from `options`;
# the whole Hash is also handed to SessionCache.new and to the
# initialize_sanitizers/filters/actions/events helpers.
#
# @param [Hash] options additional options
# @yield [agent] the newly created agent, after all setup has run
def initialize(options = {})
  @host_header  = options[:host_header]
  @host_headers = {}
  @host_headers.merge!(options[:host_headers]) if options[:host_headers]

  @default_headers = {}
  @default_headers.merge!(options[:default_headers]) if options[:default_headers]

  @user_agent = options.fetch(:user_agent, Spidr.user_agent)
  @referer    = options[:referer]

  @sessions   = SessionCache.new(options)
  @cookies    = CookieJar.new
  @authorized = AuthStore.new

  @running  = false
  @delay    = options.fetch(:delay, 0)
  @history  = Set.new
  @failures = Set.new
  @queue    = []

  @limit     = options[:limit]
  @levels    = Hash.new(0)
  @max_depth = options[:max_depth]

  # go through the writers so every entry is coerced with URI()
  self.queue   = options[:queue]   if options[:queue]
  self.history = options[:history] if options[:history]

  initialize_sanitizers(options)
  initialize_filters(options)
  initialize_actions(options)
  initialize_events(options)

  # robots.txt support is only set up when requested (or enabled globally)
  initialize_robots if options.fetch(:robots, Spidr.robots?)

  yield(self) if block_given?
end
site(url,options={},&block) click to toggle source

Creates a new agent and spiders the web-site located at the given URL.

@param [URI::HTTP, String] url

The web-site to spider.

@param [Hash] options

Additional options. See {Agent#initialize}.

@yield [agent]

If a block is given, it will be passed the newly created agent
before it begins spidering.

@yieldparam [Agent] agent

The newly created agent.

@see initialize

# File lib/spidr/agent.rb, line 270
# Creates an agent (passing the URL's host as the `host:` option) and starts
# it at the given URL.
#
# @param [URI::HTTP, String] url the web-site to spider; coerced with URI()
# @param [Hash] options additional options forwarded to `new`
# @yield [agent] forwarded to `new`
# @return the value returned by the new agent's `start_at`
def self.site(url, options = {}, &block)
  site_url = URI(url)
  spider   = new(options.merge(host: site_url.host), &block)

  spider.start_at(site_url)
end
start_at(url,options={},&block) click to toggle source

Creates a new agent and begins spidering at the given URL.

@param [URI::HTTP, String] url

The URL to start spidering at.

@param [Hash] options

Additional options. See {Agent#initialize}.

@yield [agent]

If a block is given, it will be passed the newly created agent
before it begins spidering.

@yieldparam [Agent] agent

The newly created agent.

@see initialize @see start_at

# File lib/spidr/agent.rb, line 247
# Creates an agent from the options and starts it at the given URL.
#
# @param [URI::HTTP, String] url the URL to start spidering at
# @param [Hash] options additional options forwarded to `new`
# @yield [agent] forwarded to `new`
# @return the value returned by the new agent's `start_at`
def self.start_at(url, options = {}, &block)
  new(options, &block).start_at(url)
end

Public Instance Methods

all_headers() { |headers| ... } click to toggle source

Pass the headers from every response the agent receives to a given block.

@yield [headers]

The block will be passed the headers of every response.

@yieldparam [Hash] headers

The headers from a response.
# File lib/spidr/agent/events.rb, line 68
# Registers a page callback that passes each page's headers to the given block.
#
# The yield is guarded with `block_given?` so that calling this method
# without a block no longer raises LocalJumpError once a page is visited.
#
# @yield [headers] the value of `page.headers` for every page
# @return the value returned by {#every_page}
def all_headers
  every_page do |page|
    yield page.headers if block_given?
  end
end
clear() click to toggle source

Clears the queue, history and failures of the agent.

# File lib/spidr/agent.rb, line 334
# Empties the queue, the history and the failures, in that order.
#
# @return [Agent] self
def clear
  [@queue, @history, @failures].each(&:clear)
  self
end
continue!(&block) click to toggle source

Continue spidering.

@yield [page]

If a block is given, it will be passed every page visited.

@yieldparam [Page] page

The page to be visited.
# File lib/spidr/agent/actions.rb, line 40
# Clears the paused flag and resumes spidering.
#
# @yield [page] forwarded to {#run}
# @return the value returned by {#run}
def continue!(&block)
  @paused = false
  run(&block)
end
enqueue(url,level=0) click to toggle source

Enqueues a given URL for visiting, only if it passes all of the agent's rules for visiting a given URL.

@param [URI::HTTP, String] url

The URL to enqueue for visiting.

@return [Boolean]

Specifies whether the URL was enqueued, or ignored.
# File lib/spidr/agent.rb, line 534
# Sanitizes the URL and, unless it is already queued or rejected by `visit?`,
# runs the every-url callbacks and appends it to the queue.
#
# @param [URI::HTTP, String] url the URL to enqueue
# @param [Integer] level the depth recorded for the URL in `@levels`
# @return [Boolean] true if the URL was enqueued, false if it was ignored
# @raise [Actions::Paused] re-raised when a callback pauses the agent
def enqueue(url, level = 0)
  url = sanitize_url(url)
  return false if queued?(url) || !visit?(url)

  link = url.to_s

  begin
    @every_url_blocks.each { |callback| callback.call(url) }

    @every_url_like_blocks.each do |pattern, callbacks|
      # Regexps are matched against the String form; anything else must
      # equal either the String form or the URI itself.
      matched = if Regexp === pattern
                  link =~ pattern
                else
                  (pattern == link) || (pattern == url)
                end
      next unless matched

      callbacks.each { |callback| callback.call(url) }
    end
  rescue Actions::Paused
    # pausing must propagate to the caller
    raise
  rescue Actions::SkipLink
    return false
  rescue Actions::Action
    # any other action is swallowed; the URL is still enqueued
  end

  @queue << url
  @levels[url] = level
  true
end
every_atom_doc() { |doc| ... } click to toggle source

Pass every Atom document that the agent parses to a given block.

@yield [doc]

The block will be passed every Atom document parsed.

@yieldparam [Nokogiri::XML::Document] doc

A parsed XML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html

# File lib/spidr/agent/events.rb, line 387
# Registers a page callback that yields the parsed document of every page
# for which `atom?` is truthy and `doc` is non-nil.
#
# @yield [doc] the value of `page.doc`
# @return the value returned by {#every_page}
def every_atom_doc
  every_page do |visited|
    next unless block_given? && visited.atom?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_atom_page() { |page| ... } click to toggle source

Pass every Atom feed that the agent visits to a given block.

@yield [feed]

The block will be passed every Atom feed visited.

@yieldparam [Page] feed

A visited page.
# File lib/spidr/agent/events.rb, line 451
# Registers a page callback that yields every page for which `atom?` is truthy.
#
# @yield [feed] the matching page
# @return the value returned by {#every_page}
def every_atom_page
  every_page do |visited|
    next unless block_given? && visited.atom?

    yield visited
  end
end
every_bad_request_page() { |page| ... } click to toggle source

Pass every Bad Request page that the agent visits to a given block.

@yield [page]

The block will be passed every Bad Request page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 140
# Registers a page callback that yields every page for which `bad_request?`
# is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_bad_request_page
  every_page do |visited|
    next unless block_given? && visited.bad_request?

    yield visited
  end
end
every_css_page() { |page| ... } click to toggle source

Pass every CSS page that the agent visits to a given block.

@yield [page]

The block will be passed every CSS page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 421
# Registers a page callback that yields every page for which `css?` is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_css_page
  every_page do |visited|
    next unless block_given? && visited.css?

    yield visited
  end
end
every_doc() { |doc| ... } click to toggle source

Pass every HTML or XML document that the agent parses to a given block.

@yield [doc]

The block will be passed every HTML or XML document parsed.

@yieldparam [Nokogiri::HTML::Document, Nokogiri::XML::Document] doc

A parsed HTML or XML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html @see nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html

# File lib/spidr/agent/events.rb, line 281
# Registers a page callback that yields the parsed document of every page
# whose `doc` is non-nil.
#
# @yield [doc] the value of `page.doc`
# @return the value returned by {#every_page}
def every_doc
  every_page do |visited|
    next unless block_given?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_failed_url(&block) click to toggle source

Pass each URL that could not be requested to the given block.

@yield [url]

The block will be passed every URL that could not be requested.

@yieldparam [URI::HTTP] url

A failed URL.
# File lib/spidr/agent/events.rb, line 26
# Registers a callback in the list of failed-URL callbacks.
#
# A call without a block is ignored instead of storing `nil`, which would
# fail later when the stored callbacks are invoked.
#
# @yield [url] a URL that could not be requested
# @return [Agent] self
def every_failed_url(&block)
  @every_failed_url_blocks << block if block
  self
end
every_forbidden_page() { |page| ... } click to toggle source

Pass every Forbidden page that the agent visits to a given block.

@yield [page]

The block will be passed every Forbidden page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 170
# Registers a page callback that yields every page for which `forbidden?`
# is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_forbidden_page
  every_page do |visited|
    next unless block_given? && visited.forbidden?

    yield visited
  end
end
every_html_doc() { |doc| ... } click to toggle source

Pass every HTML document that the agent parses to a given block.

@yield [doc]

The block will be passed every HTML document parsed.

@yieldparam [Nokogiri::HTML::Document] doc

A parsed HTML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html

# File lib/spidr/agent/events.rb, line 302
# Registers a page callback that yields the parsed document of every page
# for which `html?` is truthy and `doc` is non-nil.
#
# @yield [doc] the value of `page.doc`
# @return the value returned by {#every_page}
def every_html_doc
  every_page do |visited|
    next unless block_given? && visited.html?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_html_page() { |page| ... } click to toggle source

Pass every HTML page that the agent visits to a given block.

@yield [page]

The block will be passed every HTML page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 231
# Registers a page callback that yields every page for which `html?` is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_html_page
  every_page do |visited|
    next unless block_given? && visited.html?

    yield visited
  end
end
every_internal_server_error_page() { |page| ... } click to toggle source

Pass every Internal Server Error page that the agent visits to a given block.

@yield [page]

The block will be passed every Internal Server Error page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 201
# Registers a page callback that yields every page for which
# `had_internal_server_error?` is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_internal_server_error_page
  every_page do |visited|
    next unless block_given? && visited.had_internal_server_error?

    yield visited
  end
end
every_javascript_page() { |page| ... } click to toggle source

Pass every JavaScript page that the agent visits to a given block.

@yield [page]

The block will be passed every JavaScript page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 406
# Registers a page callback that yields every page for which `javascript?`
# is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_javascript_page
  every_page do |visited|
    next unless block_given? && visited.javascript?

    yield visited
  end
end
every_missing_page() { |page| ... } click to toggle source

Pass every Missing page that the agent visits to a given block.

@yield [page]

The block will be passed every Missing page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 185
# Registers a page callback that yields every page for which `missing?`
# is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_missing_page
  every_page do |visited|
    next unless block_given? && visited.missing?

    yield visited
  end
end
every_ms_word_page() { |page| ... } click to toggle source

Pass every MS Word page that the agent visits to a given block.

@yield [page]

The block will be passed every MS Word page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 466
# Registers a page callback that yields every page for which `ms_word?`
# is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_ms_word_page
  every_page do |visited|
    next unless block_given? && visited.ms_word?

    yield visited
  end
end
every_ok_page() { |page| ... } click to toggle source

Pass every OK page that the agent visits to a given block.

@yield [page]

The block will be passed every OK page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 95
# Registers a page callback that yields every page for which `ok?` is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_ok_page
  every_page do |visited|
    next unless block_given? && visited.ok?

    yield visited
  end
end
every_page(&block) click to toggle source

Pass every page that the agent visits to a given block.

@yield [page]

The block will be passed every page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 81
# Registers a callback in the list of every-page callbacks.
#
# A call without a block is ignored instead of storing `nil`, which would
# fail later when the stored callbacks are invoked.
#
# @yield [page] a visited page
# @return [Agent] self
def every_page(&block)
  @every_page_blocks << block if block
  self
end
every_pdf_page() { |page| ... } click to toggle source

Pass every PDF page that the agent visits to a given block.

@yield [page]

The block will be passed every PDF page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 481
# Registers a page callback that yields every page for which `pdf?` is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_pdf_page
  every_page do |visited|
    next unless block_given? && visited.pdf?

    yield visited
  end
end
every_redirect_page() { |page| ... } click to toggle source

Pass every Redirect page that the agent visits to a given block.

@yield [page]

The block will be passed every Redirect page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 110
# Registers a page callback that yields every page for which `redirect?`
# is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_redirect_page
  every_page do |visited|
    next unless block_given? && visited.redirect?

    yield visited
  end
end
every_rss_doc() { |doc| ... } click to toggle source

Pass every RSS document that the agent parses to a given block.

@yield [doc]

The block will be passed every RSS document parsed.

@yieldparam [Nokogiri::XML::Document] doc

A parsed XML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html

# File lib/spidr/agent/events.rb, line 366
# Registers a page callback that yields the parsed document of every page
# for which `rss?` is truthy and `doc` is non-nil.
#
# @yield [doc] the value of `page.doc`
# @return the value returned by {#every_page}
def every_rss_doc
  every_page do |visited|
    next unless block_given? && visited.rss?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_rss_page() { |page| ... } click to toggle source

Pass every RSS feed that the agent visits to a given block.

@yield [feed]

The block will be passed every RSS feed visited.

@yieldparam [Page] feed

A visited page.
# File lib/spidr/agent/events.rb, line 436
# Registers a page callback that yields every page for which `rss?` is truthy.
#
# @yield [feed] the matching page
# @return the value returned by {#every_page}
def every_rss_page
  every_page do |visited|
    next unless block_given? && visited.rss?

    yield visited
  end
end
every_timedout_page() { |page| ... } click to toggle source

Pass every Timeout page that the agent visits to a given block.

@yield [page]

The block will be passed every Timeout page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 125
# Registers a page callback that yields every page for which `timedout?`
# is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_timedout_page
  every_page do |visited|
    next unless block_given? && visited.timedout?

    yield visited
  end
end
every_txt_page() { |page| ... } click to toggle source

Pass every Plain Text page that the agent visits to a given block.

@yield [page]

The block will be passed every Plain Text page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 216
# Registers a page callback that yields every page for which `txt?` is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_txt_page
  every_page do |visited|
    next unless block_given? && visited.txt?

    yield visited
  end
end
every_unauthorized_page() { |page| ... } click to toggle source

Pass every Unauthorized page that the agent visits to a given block.

@yield [page]

The block will be passed every Unauthorized page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 155
# Registers a page callback that yields every page for which `unauthorized?`
# is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_unauthorized_page
  every_page do |visited|
    next unless block_given? && visited.unauthorized?

    yield visited
  end
end
every_url(&block) click to toggle source

Pass each URL from each page visited to the given block.

@yield [url]

The block will be passed every URL from every page visited.

@yieldparam [URI::HTTP] url

Each URL from each page visited.
# File lib/spidr/agent/events.rb, line 12
# Registers a callback in the list of every-URL callbacks.
#
# A call without a block is ignored instead of storing `nil`, which would
# fail later when the stored callbacks are invoked.
#
# @yield [url] a URL
# @return [Agent] self
def every_url(&block)
  @every_url_blocks << block if block
  self
end
every_url_like(pattern,&block) click to toggle source

Pass every URL that the agent visits, and matches a given pattern, to a given block.

@param [Regexp, String] pattern

The pattern to match URLs with.

@yield [url]

The block will be passed every URL that matches the given pattern.

@yieldparam [URI::HTTP] url

A matching URL.

@since 0.3.2

# File lib/spidr/agent/events.rb, line 46
# Registers a callback under the given pattern in the URL-pattern callbacks.
#
# A call without a block is ignored instead of storing `nil`, which would
# fail later when the stored callbacks are invoked.
#
# @param [Regexp, String] pattern the pattern to match URLs with
# @yield [url] a matching URL
# @return [Agent] self
def every_url_like(pattern, &block)
  @every_url_like_blocks[pattern] << block if block
  self
end
every_xml_doc() { |doc| ... } click to toggle source

Pass every XML document that the agent parses to a given block.

@yield [doc]

The block will be passed every XML document parsed.

@yieldparam [Nokogiri::XML::Document] doc

A parsed XML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html

# File lib/spidr/agent/events.rb, line 323
# Registers a page callback that yields the parsed document of every page
# for which `xml?` is truthy and `doc` is non-nil.
#
# @yield [doc] the value of `page.doc`
# @return the value returned by {#every_page}
def every_xml_doc
  every_page do |visited|
    next unless block_given? && visited.xml?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_xml_page() { |page| ... } click to toggle source

Pass every XML page that the agent visits to a given block.

@yield [page]

The block will be passed every XML page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 246
# Registers a page callback that yields every page for which `xml?` is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_xml_page
  every_page do |visited|
    next unless block_given? && visited.xml?

    yield visited
  end
end
every_xsl_doc() { |doc| ... } click to toggle source

Pass every XML Stylesheet (XSL) that the agent parses to a given block.

@yield [doc]

The block will be passed every XML Stylesheet (XSL) parsed.

@yieldparam [Nokogiri::XML::Document] doc

A parsed XML document.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html

# File lib/spidr/agent/events.rb, line 345
# Registers a page callback that yields the parsed document of every page
# for which `xsl?` is truthy and `doc` is non-nil.
#
# @yield [doc] the value of `page.doc`
# @return the value returned by {#every_page}
def every_xsl_doc
  every_page do |visited|
    next unless block_given? && visited.xsl?

    parsed = visited.doc
    yield parsed if parsed
  end
end
every_xsl_page() { |page| ... } click to toggle source

Pass every XML Stylesheet (XSL) page that the agent visits to a given block.

@yield [page]

The block will be passed every XML Stylesheet (XSL) page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 262
# Registers a page callback that yields every page for which `xsl?` is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_xsl_page
  every_page do |visited|
    next unless block_given? && visited.xsl?

    yield visited
  end
end
every_zip_page() { |page| ... } click to toggle source

Pass every ZIP page that the agent visits to a given block.

@yield [page]

The block will be passed every ZIP page visited.

@yieldparam [Page] page

A visited page.
# File lib/spidr/agent/events.rb, line 496
# Registers a page callback that yields every page for which `zip?` is truthy.
#
# @yield [page] the matching page
# @return the value returned by {#every_page}
def every_zip_page
  every_page do |visited|
    next unless block_given? && visited.zip?

    yield visited
  end
end
failed?(url) click to toggle source

Determines whether a given URL could not be visited.

@param [URI::HTTP, String] url

The URL to check for failures.

@return [Boolean]

Specifies whether the given URL was unable to be visited.
# File lib/spidr/agent.rb, line 483
# Checks whether the given URL, coerced with URI(), is in the failures set.
#
# @param [URI::HTTP, String] url the URL to look up
# @return [Boolean] whether the URL is recorded as failed
def failed?(url)
  candidate = URI(url)
  @failures.include?(candidate)
end
failures=(new_failures) click to toggle source

Sets the list of failed URLs.

@param [#each] new_failures

The new list of failed URLs.

@return [Set<URI::HTTP>]

The list of failed URLs.

@example

agent.failures = ['http://localhost/']
# File lib/spidr/agent.rb, line 464
# Replaces the contents of the failures collection with the given URLs,
# coercing each one with URI().
#
# @param [#each] new_failures the new list of failed URLs
# @return the failures collection itself
def failures=(new_failures)
  @failures.clear
  new_failures.each { |failed_url| @failures << URI(failed_url) }
  @failures
end
get_page(url) { |new_page| ... } click to toggle source

Requests and creates a new Page object from a given URL.

@param [URI::HTTP] url

The URL to request.

@yield [page]

If a block is given, it will be passed the page that represents the
response.

@yieldparam [Page] page

The page for the response.

@return [Page, nil]

The page for the response, or `nil` if the request failed.
# File lib/spidr/agent.rb, line 586
# Issues a GET for the URL inside `prepare_request` and wraps the response
# in a new Page.
#
# @param [URI::HTTP, String] url the URL to request; coerced with URI()
# @yield [page] the page built from the response, if a block is given
# @return [Page] the new page when the `prepare_request` block runs;
#   otherwise whatever `prepare_request` itself returns
def get_page(url)
  target = URI(url)

  prepare_request(target) do |session, path, headers|
    response = session.get(path, headers)
    fetched  = Page.new(target, response)

    # save any new cookies
    @cookies.from_page(fetched)

    yield fetched if block_given?
    return fetched
  end
end
history=(new_history) click to toggle source

Sets the history of URLs that were previously visited.

@param [#each] new_history

A list of URLs to populate the history with.

@return [Set<URI::HTTP>]

The history of the agent.

@example

agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
# File lib/spidr/agent.rb, line 407
# Replaces the contents of the history collection with the given URLs,
# coercing each one with URI().
#
# @param [#each] new_history the URLs to populate the history with
# @return the history collection itself
def history=(new_history)
  @history.clear
  new_history.each { |visited_url| @history << URI(visited_url) }
  @history
end
ignore_exts() click to toggle source

Specifies the patterns that match URI path extensions to not visit.

@return [Array<String, Regexp, Proc>]

The URI path extension patterns to not visit.
# File lib/spidr/agent/filters.rb, line 328
# Returns the reject list of the extension rules (`@ext_rules.reject`).
#
# The returned object is appended to by {#ignore_exts_like}.
def ignore_exts
  @ext_rules.reject
end
ignore_exts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_exts}.

@param [String, Regexp] pattern

The pattern to match URI path extensions with.

@yield [ext]

If a block is given, it will be used to filter URI path extensions.

@yieldparam [String] ext

A URI path extension to reject or accept.
# File lib/spidr/agent/filters.rb, line 344
# Appends a rule to {#ignore_exts}: the pattern when one is given,
# otherwise the block. Does nothing when neither is supplied.
#
# @param [String, Regexp] pattern the pattern to match URI path extensions with
# @yield [ext] used as the rule when no pattern is given
# @return [Agent] self
def ignore_exts_like(pattern = nil, &block)
  rule = pattern || block
  ignore_exts << rule if rule

  self
end
ignore_hosts() click to toggle source

Specifies the patterns that match host-names to not visit.

@return [Array<String, Regexp, Proc>]

The host-name patterns to not visit.
# File lib/spidr/agent/filters.rb, line 60
# Returns the reject list of the host rules (`@host_rules.reject`).
#
# The returned object is appended to by {#ignore_hosts_like}.
def ignore_hosts
  @host_rules.reject
end
ignore_hosts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_hosts}.

@param [String, Regexp] pattern

The pattern to match host-names with.

@yield [host]

If a block is given, it will be used to filter host-names.

@yieldparam [String] host

A host-name to reject or accept.
# File lib/spidr/agent/filters.rb, line 76
# Appends a rule to {#ignore_hosts}: the pattern when one is given,
# otherwise the block. Does nothing when neither is supplied.
#
# @param [String, Regexp] pattern the pattern to match host-names with
# @yield [host] used as the rule when no pattern is given
# @return [Agent] self
def ignore_hosts_like(pattern = nil, &block)
  rule = pattern || block
  ignore_hosts << rule if rule

  self
end
ignore_ports() click to toggle source

Specifies the patterns that match ports to not visit.

@return [Array<Integer, Regexp, Proc>]

The port patterns to not visit.
# File lib/spidr/agent/filters.rb, line 124
# Returns the reject list of the port rules (`@port_rules.reject`).
#
# The returned object is appended to by {#ignore_ports_like}.
def ignore_ports
  @port_rules.reject
end
ignore_ports_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_ports}.

@param [Integer, Regexp] pattern

The pattern to match ports with.

@yield [port]

If a block is given, it will be used to filter ports.

@yieldparam [Integer] port

A port to reject or accept.
# File lib/spidr/agent/filters.rb, line 140
# Appends a rule to {#ignore_ports}: the pattern when one is given,
# otherwise the block. Does nothing when neither is supplied.
#
# @param [Integer, Regexp] pattern the pattern to match ports with
# @yield [port] used as the rule when no pattern is given
# @return [Agent] self
def ignore_ports_like(pattern = nil, &block)
  rule = pattern || block
  ignore_ports << rule if rule

  self
end
ignore_urls() click to toggle source

Specifies the patterns that match URLs to not visit.

@return [Array<String, Regexp, Proc>]

The URL patterns to not visit.

@since 0.2.4

# File lib/spidr/agent/filters.rb, line 262
# Returns the reject list of the URL rules (`@url_rules.reject`).
#
# The returned object is appended to by {#ignore_urls_like}.
def ignore_urls
  @url_rules.reject
end
ignore_urls_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_urls}.

@param [String, Regexp] pattern

The pattern to match URLs with.

@yield [url]

If a block is given, it will be used to filter URLs.

@yieldparam [URI::HTTP, URI::HTTPS] url

A URL to reject or accept.

@since 0.2.4

# File lib/spidr/agent/filters.rb, line 280
# Appends a rule to {#ignore_urls}: the pattern when one is given,
# otherwise the block. Does nothing when neither is supplied.
#
# @param [String, Regexp] pattern the pattern to match URLs with
# @yield [url] used as the rule when no pattern is given
# @return [Agent] self
def ignore_urls_like(pattern = nil, &block)
  rule = pattern || block
  ignore_urls << rule if rule

  self
end
initialize_robots() click to toggle source

Initializes the robots filter.

# File lib/spidr/agent/robots.rb, line 11
# Creates the Robots instance for this agent's user agent.
#
# @raise [ArgumentError] if no top-level `Robots` constant is defined
# @return the new Robots instance, also stored in `@robots`
def initialize_robots
  if Object.const_defined?(:Robots)
    @robots = Robots.new(@user_agent)
  else
    raise(ArgumentError,":robots option given but unable to require 'robots' gem")
  end
end
pause!() click to toggle source

Pauses the agent, causing spidering to temporarily stop.

@raise [Paused]

Indicates to the agent, that it should pause spidering.
# File lib/spidr/agent/actions.rb, line 61
# Marks the agent as paused and raises Actions::Paused.
#
# @raise [Actions::Paused] always, after the paused flag has been set
def pause!
  @paused = true
  raise(Actions::Paused)
end
pause=(state) click to toggle source

Sets the pause state of the agent.

@param [Boolean] state

The new pause state of the agent.
# File lib/spidr/agent/actions.rb, line 51
# Stores the given value as the paused flag.
#
# @param [Boolean] state the new pause state
def pause=(state)
  @paused = state
end
paused?() click to toggle source

Determines whether the agent is paused.

@return [Boolean]

Specifies whether the agent is paused.
# File lib/spidr/agent/actions.rb, line 72
# Reports whether the paused flag is exactly `true`; an unset or merely
# truthy flag counts as not paused.
#
# @return [Boolean]
def paused?
  @paused == true
end
post_page(url,post_data='') { |new_page| ... } click to toggle source

Posts supplied form data and creates a new Page object from a given URL.

@param [URI::HTTP] url

The URL to request.

@param [String] post_data

Form option data.

@yield [page]

If a block is given, it will be passed the page that represents the
response.

@yieldparam [Page] page

The page for the response.

@return [Page, nil]

The page for the response, or `nil` if the request failed.

@since 0.2.2

# File lib/spidr/agent.rb, line 621
# Issues a POST with the given data inside `prepare_request` and wraps the
# response in a new Page.
#
# @param [URI::HTTP, String] url the URL to request; coerced with URI()
# @param [String] post_data the form data to send
# @yield [page] the page built from the response, if a block is given
# @return [Page] the new page when the `prepare_request` block runs;
#   otherwise whatever `prepare_request` itself returns
def post_page(url, post_data = '')
  target = URI(url)

  prepare_request(target) do |session, path, headers|
    response = session.post(path, post_data, headers)
    fetched  = Page.new(target, response)

    # save any new cookies
    @cookies.from_page(fetched)

    yield fetched if block_given?
    return fetched
  end
end
proxy() click to toggle source

The proxy information the agent uses.

@return [Proxy]

The proxy information.

@see SessionCache#proxy

@since 0.2.2

# File lib/spidr/agent.rb, line 310
# Returns the proxy held by the session cache (`@sessions.proxy`).
def proxy
  @sessions.proxy
end
proxy=(new_proxy) click to toggle source

Sets the proxy information that the agent uses.

@param [Proxy] new_proxy

The new proxy information.

@return [Hash]

The new proxy information.

@see SessionCache#proxy=

@since 0.2.2

# File lib/spidr/agent.rb, line 327
# Assigns the given proxy on the session cache (`@sessions.proxy=`).
#
# @param new_proxy the new proxy information
def proxy=(new_proxy)
  @sessions.proxy = new_proxy
end
queue=(new_queue) click to toggle source

Sets the queue of URLs to visit.

@param [#each] new_queue

The new list of URLs to visit.

@return [Array<URI::HTTP>]

The list of URLs to visit.

@example

agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
# File lib/spidr/agent.rb, line 501
# Replaces the contents of the queue with the given URLs, coercing each
# one with URI().
#
# @param [#each] new_queue the new list of URLs to visit
# @return the queue itself
def queue=(new_queue)
  @queue.clear
  new_queue.each { |pending_url| @queue << URI(pending_url) }
  @queue
end
queued?(url) click to toggle source

Determines whether a given URL has been enqueued.

@param [URI::HTTP] url

The URL to search for in the queue.

@return [Boolean]

Specifies whether the given URL has been queued for visiting.
# File lib/spidr/agent.rb, line 520
# Checks whether the queue contains the given URL.
#
# The argument is compared as given (no URI() coercion happens here).
#
# @param [URI::HTTP] url the URL to search for in the queue
# @return [Boolean]
def queued?(url)
  @queue.include?(url)
end
robot_allowed?(url) click to toggle source

Determines whether a URL is allowed by the robot policy.

@param [URI::HTTP, String] url

The URL to check.

@return [Boolean]

Specifies whether a URL is allowed by the robot policy.
# File lib/spidr/agent/robots.rb, line 28
# Asks the robots filter whether the URL may be visited; every URL is
# allowed when no robots filter has been set up.
#
# @param [URI::HTTP, String] url the URL to check
# @return `true` without a robots filter, otherwise the result of
#   `@robots.allowed?(url)`
def robot_allowed?(url)
  return true unless @robots

  @robots.allowed?(url)
end
run(&block) click to toggle source

Start spidering until the queue becomes empty or the agent is paused.

@yield [page]

If a block is given, it will be passed every page visited.

@yieldparam [Page] page

A page which has been visited.
# File lib/spidr/agent.rb, line 368
# Visits dequeued URLs until the queue is empty, the agent is paused, or
# the limit is reached.
#
# When a visit raises Actions::Paused the method returns immediately,
# leaving the running flag set and the session cache untouched. Any other
# Actions::Action is ignored and the loop continues.
#
# @yield [page] forwarded to `visit_page`
# @return [Agent] self
def run(&block)
  @running = true

  while !@queue.empty? && !paused? && !limit_reached?
    begin
      visit_page(dequeue, &block)
    rescue Actions::Paused
      return self
    rescue Actions::Action
      # other actions only abort the current visit
    end
  end

  @running = false
  @sessions.clear
  self
end
running?() click to toggle source

Determines if the agent is running.

@return [Boolean]

Specifies whether the agent is running or stopped.
# File lib/spidr/agent.rb, line 391
# Reports whether the running flag is exactly `true`.
#
# @return [Boolean]
def running?
  @running == true
end
sanitize_url(url) click to toggle source

Sanitizes a URL based on filtering options.

@param [URI::HTTP, URI::HTTPS, String] url

The URL to be sanitized

@return [URI::HTTP, URI::HTTPS]

The new sanitized URL.

@since 0.2.2

# File lib/spidr/agent/sanitizers.rb, line 23
# Coerces the URL with URI() and strips its fragment and/or query according
# to the `@strip_fragments` and `@strip_query` settings.
#
# A URI passed in by the caller is never modified: when stripping applies,
# a copy is sanitized and returned instead.
#
# @param [URI::HTTP, URI::HTTPS, String] url the URL to be sanitized
# @return [URI::HTTP, URI::HTTPS] the sanitized URL
def sanitize_url(url)
  sanitized = URI(url)

  # URI() hands back the very same object when it is given a URI, so work
  # on a copy before mutating anything.
  if sanitized.equal?(url) && (@strip_fragments || @strip_query)
    sanitized = sanitized.dup
  end

  sanitized.fragment = nil if @strip_fragments
  sanitized.query    = nil if @strip_query

  sanitized
end
schemes=(new_schemes) click to toggle source

Sets the list of acceptable URL schemes to visit.

@param [Array] new_schemes

The new schemes to visit.

@example

agent.schemes = ['http']
# File lib/spidr/agent/filters.rb, line 18
# Stores the given schemes, each converted with `to_s`.
#
# @param [Array] new_schemes the new schemes to visit
# @return [Array<String>] the stored list
def schemes=(new_schemes)
  @schemes = new_schemes.map { |scheme| scheme.to_s }
end
skip_page!() click to toggle source

Causes the agent to skip the page being visited.

@raise [SkipPage]

Indicates to the agent, that the current page should be skipped.
# File lib/spidr/agent/actions.rb, line 93
# Raises Actions::SkipPage.
#
# @raise [Actions::SkipPage] always
def skip_page!
  raise(Actions::SkipPage)
end
start_at(url,&block) click to toggle source

Start spidering at a given URL.

@param [URI::HTTP, String] url

The URL to start spidering at.

@yield [page]

If a block is given, it will be passed every page visited.

@yieldparam [Page] page

A page which has been visited.
# File lib/spidr/agent.rb, line 353
# Enqueues the given URL and then runs the agent.
#
# @param [URI::HTTP, String] url the URL to start spidering at
# @yield [page] forwarded to {#run}
# @return the value returned by {#run}
def start_at(url, &block)
  enqueue(url)
  run(&block)
end
to_hash() click to toggle source

Converts the agent into a Hash.

@return [Hash]

The agent represented as a Hash containing the `history` and
the `queue` of the agent.
# File lib/spidr/agent.rb, line 695
# Builds a Hash exposing the agent's history and queue.
#
# The values are the agent's own collections, not copies.
#
# @return [Hash] a Hash with the keys `:history` and `:queue`
def to_hash
  snapshot = {}
  snapshot[:history] = @history
  snapshot[:queue]   = @queue
  snapshot
end
urls_like(pattern,&block) click to toggle source

@see every_url_like

# File lib/spidr/agent/events.rb, line 54
# Delegates to {#every_url_like} with the same pattern and block.
#
# @param pattern the pattern handed to {#every_url_like}
# @return the value returned by {#every_url_like}
def urls_like(pattern,&block)
  every_url_like(pattern,&block)
end
visit_exts() click to toggle source

Specifies the patterns that match the URI path extensions to visit.

@return [Array<String, Regexp, Proc>]

The URI path extensions patterns to visit.
# File lib/spidr/agent/filters.rb, line 296
# Returns the accept list of the extension rules (`@ext_rules.accept`).
#
# The returned object is appended to by {#visit_exts_like}.
def visit_exts
  @ext_rules.accept
end
visit_exts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_exts}.

@param [String, Regexp] pattern

The pattern to match URI path extensions with.

@yield [ext]

If a block is given, it will be used to filter URI path extensions.

@yieldparam [String] ext

A URI path extension to accept or reject.
# File lib/spidr/agent/filters.rb, line 312
# Appends a pattern, or failing that a block, to {#visit_exts}.
#
# @param [String, Regexp] pattern
#   The pattern to append. When given, the block is ignored.
#
# @yield [ext]
#   Appended as the rule only when no pattern is given.
#
# @return [self]
def visit_exts_like(pattern=nil,&block)
  rule = pattern || block
  visit_exts << rule if rule

  self
end
visit_hosts() click to toggle source

Specifies the patterns that match host-names to visit.

@return [Array<String, Regexp, Proc>]

The host-name patterns to visit.
# File lib/spidr/agent/filters.rb, line 28
# Returns the accept list of the host-name rules.
#
# @return
#   The object returned by `@host_rules.accept`.
def visit_hosts
  @host_rules.accept
end
visit_hosts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_hosts}.

@param [String, Regexp] pattern

The pattern to match host-names with.

@yield [host]

If a block is given, it will be used to filter host-names.

@yieldparam [String] host

A host-name to accept or reject.
# File lib/spidr/agent/filters.rb, line 44
# Appends a pattern, or failing that a block, to {#visit_hosts}.
#
# @param [String, Regexp] pattern
#   The pattern to append. When given, the block is ignored.
#
# @yield [host]
#   Appended as the rule only when no pattern is given.
#
# @return [self]
def visit_hosts_like(pattern=nil,&block)
  rule = pattern || block
  visit_hosts << rule if rule

  self
end
visit_page(url) { |page| ... } click to toggle source

Visits a given URL, and enqueues the links recovered from the URL to be visited later.

@param [URI::HTTP, String] url

The URL to visit.

@yield [page]

If a block is given, it will be passed the page which was visited.

@yieldparam [Page] page

The page which was visited.

@return [Page, nil]

The page that was visited. If `nil` is returned, either the request
for the page failed, or the page was skipped.
# File lib/spidr/agent.rb, line 652
# Visits a URL: requests the page, runs the registered page and link
# callbacks, and enqueues the URLs found on the page.
#
# @param [URI::HTTP, String] url
#   The URL to visit. It is passed through `sanitize_url` first.
#
# @yield [page]
#   If a block is given, it is called with the page after every block in
#   `@every_page_blocks` has been called.
#
# @return
#   `nil` when a callback raises `Actions::SkipPage`; otherwise whatever
#   `get_page` returns (presumably the page, or `nil` when the request
#   failed -- confirm against `get_page`).
def visit_page(url)
  url = sanitize_url(url)

  get_page(url) do |page|
    # the page is recorded in the history before any callback runs
    @history << page.url

    begin
      @every_page_blocks.each { |page_block| page_block.call(page) }

      yield page if block_given?
    rescue Actions::Paused => action
      # a pause must reach the caller, so it is re-raised untouched
      raise(action)
    rescue Actions::SkipPage
      # returns from visit_page itself (not just this block), so none of
      # the page's links are processed or enqueued
      return nil
    rescue Actions::Action
      # any other action raised by a callback is deliberately ignored;
      # NOTE(review): the clause order implies Paused and SkipPage are
      # subclasses of Action -- confirm
    end

    page.each_url do |next_url|
      begin
        @every_link_blocks.each do |link_block|
          link_block.call(page.url,next_url)
        end
      rescue Actions::Paused => action
        raise(action)
      rescue Actions::SkipLink
        # skip this link only; the remaining links are still processed
        next
      rescue Actions::Action
        # other actions are ignored and the link is still considered
      end

      # only enqueue while the depth recorded for the visited URL is
      # below the maximum depth (no maximum when @max_depth is nil)
      if (@max_depth.nil? || @max_depth > @levels[url])
        enqueue(next_url,@levels[url] + 1)
      end
    end
  end
end
visit_ports() click to toggle source

Specifies the patterns that match the ports to visit.

@return [Array<Integer, Regexp, Proc>]

The port patterns to visit.
# File lib/spidr/agent/filters.rb, line 92
# Returns the accept list of the port rules.
#
# @return
#   The object returned by `@port_rules.accept`.
def visit_ports
  @port_rules.accept
end
visit_ports_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_ports}.

@param [Integer, Regexp] pattern

The pattern to match ports with.

@yield [port]

If a block is given, it will be used to filter ports.

@yieldparam [Integer] port

A port to accept or reject.
# File lib/spidr/agent/filters.rb, line 108
# Appends a pattern, or failing that a block, to {#visit_ports}.
#
# @param [Integer, Regexp] pattern
#   The pattern to append. When given, the block is ignored.
#
# @yield [port]
#   Appended as the rule only when no pattern is given.
#
# @return [self]
def visit_ports_like(pattern=nil,&block)
  rule = pattern || block
  visit_ports << rule if rule

  self
end
visit_urls() click to toggle source

Specifies the patterns that match the URLs to visit.

@return [Array<String, Regexp, Proc>]

The link patterns to visit.

@since 0.2.4

# File lib/spidr/agent/filters.rb, line 226
# Returns the accept list of the URL rules.
#
# @return
#   The object returned by `@url_rules.accept`.
def visit_urls
  @url_rules.accept
end
visit_urls_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_urls}

@param [String, Regexp] pattern

The pattern to match URLs with.

@yield [url]

If a block is given, it will be used to filter URLs.

@yieldparam [URI::HTTP, URI::HTTPS] url

A URL to accept or reject.

@since 0.2.4

# File lib/spidr/agent/filters.rb, line 244
# Appends a pattern, or failing that a block, to {#visit_urls}.
#
# @param [String, Regexp] pattern
#   The pattern to append. When given, the block is ignored.
#
# @yield [url]
#   Appended as the rule only when no pattern is given.
#
# @return [self]
def visit_urls_like(pattern=nil,&block)
  rule = pattern || block
  visit_urls << rule if rule

  self
end
visited?(url) click to toggle source

Determines whether a URL was visited or not.

@param [URI::HTTP, String] url

The URL to search for.

@return [Boolean]

Specifies whether a URL was visited.
# File lib/spidr/agent.rb, line 448
# Checks whether a URL is present in the history.
#
# @param [URI::HTTP, String] url
#   The URL to look up; it is converted with `URI()` before the lookup.
#
# @return [Boolean]
#   The result of `@history.include?` for the converted URL.
def visited?(url)
  uri = URI(url)

  @history.include?(uri)
end
visited_hosts() click to toggle source

Specifies all hosts that were visited.

@return [Array<String>]

The hosts which have been visited.
# File lib/spidr/agent.rb, line 435
# Collects the distinct hosts of all visited URLs.
#
# @return [Array<String>]
#   The `host` of every entry in `visited_urls`, with duplicates removed.
def visited_hosts
  hosts = visited_urls.map { |url| url.host }

  hosts.uniq
end

Protected Instance Methods

dequeue() click to toggle source

Dequeues a URL that will later be visited.

@return [URI::HTTP]

The URL that was at the front of the queue.
# File lib/spidr/agent.rb, line 798
# Removes and returns the entry at the front of the queue.
#
# @return
#   The first element of `@queue`, or `nil` when the queue is an empty
#   Array.
def dequeue
  @queue.shift
end
failed(url) click to toggle source

Adds a given URL to the failures list.

@param [URI::HTTP] url

The URL to add to the failures list.
# File lib/spidr/agent.rb, line 839
# Records a URL as failed and notifies the failed-URL callbacks.
#
# @param [URI::HTTP] url
#   The URL appended to `@failures` and passed to every block in
#   `@every_failed_url_blocks`.
#
# @return [true]
def failed(url)
  @failures << url

  @every_failed_url_blocks.each do |callback|
    callback.call(url)
  end

  true
end
initialize_actions(options={}) click to toggle source
# File lib/spidr/agent/actions.rb, line 99
# Resets the action state: the agent starts out not paused.
#
# @param [Hash] options
#   Accepted but not read by this method.
def initialize_actions(options={})
  @paused = false
end
initialize_events(options={}) click to toggle source
# File lib/spidr/agent/events.rb, line 523
# Creates the empty callback registries used by the event methods.
#
# @param [Hash] options
#   Accepted but not read by this method.
def initialize_events(options={})
  # pattern => callbacks; a missing pattern gets its own fresh Array
  @every_url_like_blocks   = Hash.new { |registry,pattern| registry[pattern] = [] }
  @every_url_blocks        = []
  @every_failed_url_blocks = []
  @every_page_blocks       = []
  @every_link_blocks       = []
end
initialize_filters(options={}) click to toggle source

Initializes filtering rules.

@param [Hash] options

Additional options.

@option options [Array] :schemes (['http', 'https'])

The list of acceptable URI schemes to visit.
The `https` scheme will be ignored if `net/https` cannot be loaded.

@option options [String] :host

The host-name to visit.

@option options [Array<String, Regexp, Proc>] :hosts

The patterns which match the host-names to visit.

@option options [Array<String, Regexp, Proc>] :ignore_hosts

The patterns which match the host-names to not visit.

@option options [Array<Integer, Regexp, Proc>] :ports

The patterns which match the ports to visit.

@option options [Array<Integer, Regexp, Proc>] :ignore_ports

The patterns which match the ports to not visit.

@option options [Array<String, Regexp, Proc>] :links

The patterns which match the links to visit.

@option options [Array<String, Regexp, Proc>] :ignore_links

The patterns which match the links to not visit.

@option options [Array<String, Regexp, Proc>] :urls

The patterns which match the URLs to visit.

@option options [Array<String, Regexp, Proc>] :ignore_urls

The patterns which match the URLs to not visit.

@option options [Array<String, Regexp, Proc>] :exts

The patterns which match the URI path extensions to visit.

@option options [Array<String, Regexp, Proc>] :ignore_exts

The patterns which match the URI path extensions to not visit.
# File lib/spidr/agent/filters.rb, line 399
# Sets up the accepted schemes and the accept/reject rule sets.
#
# @param [Hash] options
#   Reads `:schemes`, `:host` and, for each rule set, an accept key and
#   an `ignore_` reject key (`:hosts`, `:ports`, `:links`, `:urls`,
#   `:exts`).
#
# @raise [Gem::LoadError]
#   Re-raised as-is when loading `net/https` fails with this error.
def initialize_filters(options={})
  @schemes = []

  if (custom_schemes = options[:schemes])
    self.schemes = custom_schemes
  else
    @schemes << 'http'

    # https is only accepted when net/https can be loaded
    begin
      require 'net/https'

      @schemes << 'https'
    rescue Gem::LoadError
      raise
    rescue ::LoadError
      warn "Warning: cannot load 'net/https', https support disabled"
    end
  end

  # builds one rule set from its accept and reject option keys
  rules_from = lambda do |accept_key,reject_key|
    Rules.new(accept: options[accept_key], reject: options[reject_key])
  end

  @host_rules = rules_from.call(:hosts, :ignore_hosts)
  @port_rules = rules_from.call(:ports, :ignore_ports)
  @link_rules = rules_from.call(:links, :ignore_links)
  @url_rules  = rules_from.call(:urls, :ignore_urls)
  @ext_rules  = rules_from.call(:exts, :ignore_exts)

  # a single :host is added on top of the :hosts patterns
  visit_hosts_like(options[:host]) if options[:host]
end
initialize_sanitizers(options={}) click to toggle source

Initializes the Sanitizer rules.

@param [Hash] options

Additional options.

@option options [Boolean] :strip_fragments (true)

Specifies whether or not to strip the fragment component from URLs.

@option options [Boolean] :strip_query (false)

Specifies whether or not to strip the query component from URLs.

@since 0.2.2

# File lib/spidr/agent/sanitizers.rb, line 48
# Reads the sanitizer switches from the options.
#
# @param [Hash] options
#   `:strip_fragments` defaults to `true` and `:strip_query` to `false`
#   when the key is absent; a key that is present is used as-is.
def initialize_sanitizers(options={})
  @strip_fragments = options.fetch(:strip_fragments) { true }
  @strip_query     = options.fetch(:strip_query) { false }
end
limit_reached?() click to toggle source

Determines if the maximum limit has been reached.

@return [Boolean]

@since 0.6.0

# File lib/spidr/agent.rb, line 809
# Determines if the maximum number of visited pages has been reached.
#
# @return [Boolean]
#   `false` when no limit is set (previously `nil` leaked out here);
#   otherwise whether the history holds at least `@limit` entries.
def limit_reached?
  return false unless @limit

  @history.length >= @limit
end
prepare_request(url) { |sessions, path, headers| ... } click to toggle source

Normalizes the request path and grabs a session to handle page get and post requests.

@param [URI::HTTP] url

The URL to request.

@yield [request]

A block whose purpose is to make a page request.

@yieldparam [Net::HTTP] session

An HTTP session object.

@yieldparam [String] path

The normalized request path (`/` when the URL path is empty), with the URL query appended when present.

@yieldparam [Hash] headers

A Hash of request header options.

@since 0.2.2

# File lib/spidr/agent.rb, line 761
def prepare_request(url,&block)
  path = unless url.path.empty?
           url.path
         else
           '/'
         end

  # append the URL query to the path
  path += "?#{url.query}" if url.query

  headers = prepare_request_headers(url)

  begin
    sleep(@delay) if @delay > 0

    yield @sessions[url], path, headers
  rescue SystemCallError,
         Timeout::Error,
         SocketError,
         IOError,
         OpenSSL::SSL::SSLError,
         Net::HTTPBadResponse,
         Zlib::Error

    @sessions.kill!(url)

    failed(url)
    return nil
  end
end
prepare_request_headers(url) click to toggle source

Prepares request headers for the given URL.

@param [URI::HTTP] url

The URL to prepare the request headers for.

@return [Hash{String => String}]

The prepared headers.

@since 0.6.0

# File lib/spidr/agent.rb, line 712
# Assembles the request headers for a URL.
#
# Starts from a copy of `@default_headers`, then layers on the Host,
# User-Agent, Referer, Authorization and Cookie headers where a value is
# available.
#
# @param [URI::HTTP] url
#   The URL the headers are prepared for.
#
# @return [Hash{String => String}]
#   The prepared headers; `@default_headers` itself is left untouched.
def prepare_request_headers(url)
  headers = @default_headers.dup

  # the first host pattern matching the URL's host decides the Host header
  host_rule = @host_headers.find { |name,_header| url.host.match(name) }
  headers['Host'] = host_rule.last if host_rule

  # the global Host header is only a fallback
  headers['Host']     ||= @host_header if @host_header
  headers['User-Agent'] = @user_agent if @user_agent
  headers['Referer']    = @referer if @referer

  authorization = @authorized.for_url(url)
  headers['Authorization'] = "Basic #{authorization}" if authorization

  header_cookies = @cookies.for_host(url.host)
  headers['Cookie'] = header_cookies if header_cookies

  headers
end
visit?(url) click to toggle source

Determines if a given URL should be visited.

@param [URI::HTTP] url

The URL in question.

@return [Boolean]

Specifies whether the given URL should be visited.
# File lib/spidr/agent.rb, line 822
# Decides whether a URL should be visited.
#
# A URL that was already visited is rejected outright. Otherwise every
# filter must accept it, checked in this order: scheme, host, port, link,
# URL, path extension and finally `robot_allowed?`.
#
# @param [URI::HTTP] url
#   The URL in question.
#
# @return [Boolean]
#   `false` for visited URLs; otherwise the value of the first filter
#   that rejects, or of `robot_allowed?` when all filters accept.
def visit?(url)
  return false if visited?(url)

  visit_scheme?(url.scheme) &&
    visit_host?(url.host) &&
    visit_port?(url.port) &&
    visit_link?(url.to_s) &&
    visit_url?(url) &&
    visit_ext?(url.path) &&
    robot_allowed?(url.to_s)
end
visit_ext?(path) click to toggle source

Determines if a given URI path extension should be visited.

@param [String] path

The path that contains the extension.

@return [Boolean]

Specifies whether the given URI path extension should be visited.
# File lib/spidr/agent/filters.rb, line 524
# Asks the extension rules whether the extension of a path is acceptable.
#
# @param [String] path
#   The path that contains the extension.
#
# @return [Boolean]
#   The answer of `@ext_rules.accept?`.
def visit_ext?(path)
  # extension without its leading dot; nil when the path has none
  extension = File.extname(path)[1..-1]

  @ext_rules.accept?(extension)
end
visit_host?(host) click to toggle source

Determines if a given host-name should be visited.

@param [String] host

The host-name.

@return [Boolean]

Specifies whether the given host-name should be visited.
# File lib/spidr/agent/filters.rb, line 470
# Asks the host rules whether a host-name is acceptable.
#
# @param [String] host
#   The host-name.
#
# @return [Boolean]
#   The answer of `@host_rules.accept?`.
def visit_host?(host)
  @host_rules.accept?(host)
end
visit_port?(port) click to toggle source

Determines if a given port should be visited.

@param [Integer] port

The port number.

@return [Boolean]

Specifies whether the given port should be visited.
# File lib/spidr/agent/filters.rb, line 483
# Asks the port rules whether a port is acceptable.
#
# @param [Integer] port
#   The port number.
#
# @return [Boolean]
#   The answer of `@port_rules.accept?`.
def visit_port?(port)
  @port_rules.accept?(port)
end
visit_scheme?(scheme) click to toggle source

Determines if a given URI scheme should be visited.

@param [String] scheme

The URI scheme.

@return [Boolean]

Specifies whether the given scheme should be visited.
# File lib/spidr/agent/filters.rb, line 453
# Checks a URI scheme against the list of acceptable schemes.
#
# @param [String] scheme
#   The URI scheme; a missing scheme is always accepted.
#
# @return [Boolean]
#   `true` when no scheme is given, otherwise whether `@schemes`
#   includes it.
def visit_scheme?(scheme)
  return true unless scheme

  @schemes.include?(scheme)
end
visit_url?(link) click to toggle source

Determines if a given URL should be visited.

@param [URI::HTTP, URI::HTTPS] link

The URL.

@return [Boolean]

Specifies whether the given URL should be visited.

@since 0.2.4

# File lib/spidr/agent/filters.rb, line 511
# Asks the URL rules whether a URL is acceptable.
#
# @param [URI::HTTP, URI::HTTPS] link
#   The URL.
#
# @return [Boolean]
#   The answer of `@url_rules.accept?`.
def visit_url?(link)
  @url_rules.accept?(link)
end