module Spidr::Filters

The {Filters} module adds methods to {Agent} for controlling which URLs the agent will visit.

Attributes

schemes[R]

List of acceptable URL schemes to follow

Public Instance Methods

ignore_exts() click to toggle source

Specifies the patterns that match URI path extensions to not visit.

@return [Array<String, Regexp, Proc>]

The URI path extension patterns to not visit.
# File lib/spidr/filters.rb, line 331
# Exposes the reject list of the URI path extension rules (`@ext_rules`,
# built in `initialize_filters` from the `:exts` / `:ignore_exts` options).
#
# @return [Array<String, Regexp, Proc>]
#   The URI path extension patterns to not visit.
def ignore_exts
  @ext_rules.reject
end
ignore_exts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_exts}.

@param [String, Regexp] pattern

The pattern to match URI path extensions with.

@yield [ext]

If a block is given, it will be used to filter URI path extensions.

@yieldparam [String] ext

A URI path extension to reject or accept.
# File lib/spidr/filters.rb, line 347
# Registers one more rule in {#ignore_exts}.
#
# @param [String, Regexp] pattern
#   The pattern to match URI path extensions with.
#
# @yield [ext]
#   Used as the rule only when no pattern is given.
#
# @yieldparam [String] ext
#   A URI path extension to reject or accept.
#
# @return [self]
#   The receiver, so calls can be chained.
def ignore_exts_like(pattern=nil,&block)
  # An explicit pattern takes precedence over the block; with neither,
  # nothing is added.
  rule = pattern || block
  ignore_exts << rule if rule

  self
end
ignore_hosts() click to toggle source

Specifies the patterns that match host-names to not visit.

@return [Array<String, Regexp, Proc>]

The host-name patterns to not visit.
# File lib/spidr/filters.rb, line 63
# Exposes the reject list of the host-name rules (`@host_rules`, built in
# `initialize_filters` from the `:hosts` / `:ignore_hosts` options).
#
# @return [Array<String, Regexp, Proc>]
#   The host-name patterns to not visit.
def ignore_hosts
  @host_rules.reject
end
ignore_hosts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_hosts}.

@param [String, Regexp] pattern

The pattern to match host-names with.

@yield [host]

If a block is given, it will be used to filter host-names.

@yieldparam [String] host

A host-name to reject or accept.
# File lib/spidr/filters.rb, line 79
# Registers one more rule in {#ignore_hosts}.
#
# @param [String, Regexp] pattern
#   The pattern to match host-names with.
#
# @yield [host]
#   Used as the rule only when no pattern is given.
#
# @yieldparam [String] host
#   A host-name to reject or accept.
#
# @return [self]
#   The receiver, so calls can be chained.
def ignore_hosts_like(pattern=nil,&block)
  # An explicit pattern takes precedence over the block; with neither,
  # nothing is added.
  rule = pattern || block
  ignore_hosts << rule if rule

  self
end
ignore_ports() click to toggle source

Specifies the patterns that match ports to not visit.

@return [Array<Integer, Regexp, Proc>]

The port patterns to not visit.
# File lib/spidr/filters.rb, line 127
# Exposes the reject list of the port rules (`@port_rules`, built in
# `initialize_filters` from the `:ports` / `:ignore_ports` options).
#
# @return [Array<Integer, Regexp, Proc>]
#   The port patterns to not visit.
def ignore_ports
  @port_rules.reject
end
ignore_ports_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_ports}.

@param [Integer, Regexp] pattern

The pattern to match ports with.

@yield [port]

If a block is given, it will be used to filter ports.

@yieldparam [Integer] port

A port to reject or accept.
# File lib/spidr/filters.rb, line 143
# Registers one more rule in {#ignore_ports}.
#
# @param [Integer, Regexp] pattern
#   The pattern to match ports with.
#
# @yield [port]
#   Used as the rule only when no pattern is given.
#
# @yieldparam [Integer] port
#   A port to reject or accept.
#
# @return [self]
#   The receiver, so calls can be chained.
def ignore_ports_like(pattern=nil,&block)
  # An explicit pattern takes precedence over the block; with neither,
  # nothing is added.
  rule = pattern || block
  ignore_ports << rule if rule

  self
end
ignore_urls() click to toggle source

Specifies the patterns that match URLs to not visit.

@return [Array<String, Regexp, Proc>]

The URL patterns to not visit.

@since 0.2.4

# File lib/spidr/filters.rb, line 265
# Exposes the reject list of the URL rules (`@url_rules`, built in
# `initialize_filters` from the `:urls` / `:ignore_urls` options).
#
# @return [Array<String, Regexp, Proc>]
#   The URL patterns to not visit.
#
# @since 0.2.4
def ignore_urls
  @url_rules.reject
end
ignore_urls_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#ignore_urls}.

@param [String, Regexp] pattern

The pattern to match URLs with.

@yield [url]

If a block is given, it will be used to filter URLs.

@yieldparam [URI::HTTP, URI::HTTPS] url

A URL to reject or accept.

@since 0.2.4

# File lib/spidr/filters.rb, line 283
# Registers one more rule in {#ignore_urls}.
#
# @param [String, Regexp] pattern
#   The pattern to match URLs with.
#
# @yield [url]
#   Used as the rule only when no pattern is given.
#
# @yieldparam [URI::HTTP, URI::HTTPS] url
#   A URL to reject or accept.
#
# @return [self]
#   The receiver, so calls can be chained.
#
# @since 0.2.4
def ignore_urls_like(pattern=nil,&block)
  # An explicit pattern takes precedence over the block; with neither,
  # nothing is added.
  rule = pattern || block
  ignore_urls << rule if rule

  self
end
schemes=(new_schemes) click to toggle source

Sets the list of acceptable URL schemes to visit.

@param [Array] new_schemes

The new schemes to visit.

@example

agent.schemes = ['http']
# File lib/spidr/filters.rb, line 21
# Replaces the list of acceptable URL schemes.
#
# @param [Array] new_schemes
#   The new schemes to visit. Every element is converted with `to_s`, so
#   Symbols and Strings may be mixed.
#
# @example
#   agent.schemes = ['http']
def schemes=(new_schemes)
  @schemes = new_schemes.map(&:to_s)
end
visit_exts() click to toggle source

Specifies the patterns that match the URI path extensions to visit.

@return [Array<String, Regexp, Proc>]

The URI path extension patterns to visit.
# File lib/spidr/filters.rb, line 299
# Exposes the accept list of the URI path extension rules (`@ext_rules`,
# built in `initialize_filters` from the `:exts` / `:ignore_exts` options).
#
# @return [Array<String, Regexp, Proc>]
#   The URI path extension patterns to visit.
def visit_exts
  @ext_rules.accept
end
visit_exts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_exts}.

@param [String, Regexp] pattern

The pattern to match URI path extensions with.

@yield [ext]

If a block is given, it will be used to filter URI path extensions.

@yieldparam [String] ext

A URI path extension to accept or reject.
# File lib/spidr/filters.rb, line 315
# Registers one more rule in {#visit_exts}.
#
# @param [String, Regexp] pattern
#   The pattern to match URI path extensions with.
#
# @yield [ext]
#   Used as the rule only when no pattern is given.
#
# @yieldparam [String] ext
#   A URI path extension to accept or reject.
#
# @return [self]
#   The receiver, so calls can be chained.
def visit_exts_like(pattern=nil,&block)
  # An explicit pattern takes precedence over the block; with neither,
  # nothing is added.
  rule = pattern || block
  visit_exts << rule if rule

  self
end
visit_hosts() click to toggle source

Specifies the patterns that match host-names to visit.

@return [Array<String, Regexp, Proc>]

The host-name patterns to visit.
# File lib/spidr/filters.rb, line 31
# Exposes the accept list of the host-name rules (`@host_rules`, built in
# `initialize_filters` from the `:hosts` / `:ignore_hosts` options).
#
# @return [Array<String, Regexp, Proc>]
#   The host-name patterns to visit.
def visit_hosts
  @host_rules.accept
end
visit_hosts_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_hosts}.

@param [String, Regexp] pattern

The pattern to match host-names with.

@yield [host]

If a block is given, it will be used to filter host-names.

@yieldparam [String] host

A host-name to accept or reject.
# File lib/spidr/filters.rb, line 47
# Registers one more rule in {#visit_hosts}.
#
# @param [String, Regexp] pattern
#   The pattern to match host-names with.
#
# @yield [host]
#   Used as the rule only when no pattern is given.
#
# @yieldparam [String] host
#   A host-name to accept or reject.
#
# @return [self]
#   The receiver, so calls can be chained.
def visit_hosts_like(pattern=nil,&block)
  # An explicit pattern takes precedence over the block; with neither,
  # nothing is added.
  rule = pattern || block
  visit_hosts << rule if rule

  self
end
visit_ports() click to toggle source

Specifies the patterns that match the ports to visit.

@return [Array<Integer, Regexp, Proc>]

The port patterns to visit.
# File lib/spidr/filters.rb, line 95
# Exposes the accept list of the port rules (`@port_rules`, built in
# `initialize_filters` from the `:ports` / `:ignore_ports` options).
#
# @return [Array<Integer, Regexp, Proc>]
#   The port patterns to visit.
def visit_ports
  @port_rules.accept
end
visit_ports_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_ports}.

@param [Integer, Regexp] pattern

The pattern to match ports with.

@yield [port]

If a block is given, it will be used to filter ports.

@yieldparam [Integer] port

A port to accept or reject.
# File lib/spidr/filters.rb, line 111
# Registers one more rule in {#visit_ports}.
#
# @param [Integer, Regexp] pattern
#   The pattern to match ports with.
#
# @yield [port]
#   Used as the rule only when no pattern is given.
#
# @yieldparam [Integer] port
#   A port to accept or reject.
#
# @return [self]
#   The receiver, so calls can be chained.
def visit_ports_like(pattern=nil,&block)
  # An explicit pattern takes precedence over the block; with neither,
  # nothing is added.
  rule = pattern || block
  visit_ports << rule if rule

  self
end
visit_urls() click to toggle source

Specifies the patterns that match the URLs to visit.

@return [Array<String, Regexp, Proc>]

The URL patterns to visit.

@since 0.2.4

# File lib/spidr/filters.rb, line 229
# Exposes the accept list of the URL rules (`@url_rules`, built in
# `initialize_filters` from the `:urls` / `:ignore_urls` options).
#
# @return [Array<String, Regexp, Proc>]
#   The URL patterns to visit.
#
# @since 0.2.4
def visit_urls
  @url_rules.accept
end
visit_urls_like(pattern=nil,&block) click to toggle source

Adds a given pattern to the {#visit_urls}.

@param [String, Regexp] pattern

The pattern to match URLs with.

@yield [url]

If a block is given, it will be used to filter URLs.

@yieldparam [URI::HTTP, URI::HTTPS] url

A URL to accept or reject.

@since 0.2.4

# File lib/spidr/filters.rb, line 247
# Registers one more rule in {#visit_urls}.
#
# @param [String, Regexp] pattern
#   The pattern to match URLs with.
#
# @yield [url]
#   Used as the rule only when no pattern is given.
#
# @yieldparam [URI::HTTP, URI::HTTPS] url
#   A URL to accept or reject.
#
# @return [self]
#   The receiver, so calls can be chained.
#
# @since 0.2.4
def visit_urls_like(pattern=nil,&block)
  # An explicit pattern takes precedence over the block; with neither,
  # nothing is added.
  rule = pattern || block
  visit_urls << rule if rule

  self
end

Protected Instance Methods

initialize_filters(options={}) click to toggle source

Initializes filtering rules.

@param [Hash] options

Additional options.

@option options [Array] :schemes (['http', 'https'])

The list of acceptable URI schemes to visit.
The `https` scheme will be ignored if `net/https` cannot be loaded.

@option options [String] :host

The host-name to visit.

@option options [Array<String, Regexp, Proc>] :hosts

The patterns which match the host-names to visit.

@option options [Array<String, Regexp, Proc>] :ignore_hosts

The patterns which match the host-names to not visit.

@option options [Array<Integer, Regexp, Proc>] :ports

The patterns which match the ports to visit.

@option options [Array<Integer, Regexp, Proc>] :ignore_ports

The patterns which match the ports to not visit.

@option options [Array<String, Regexp, Proc>] :links

The patterns which match the links to visit.

@option options [Array<String, Regexp, Proc>] :ignore_links

The patterns which match the links to not visit.

@option options [Array<String, Regexp, Proc>] :urls

The patterns which match the URLs to visit.

@option options [Array<String, Regexp, Proc>] :ignore_urls

The patterns which match the URLs to not visit.

@option options [Array<String, Regexp, Proc>] :exts

The patterns which match the URI path extensions to visit.

@option options [Array<String, Regexp, Proc>] :ignore_exts

The patterns which match the URI path extensions to not visit.
# File lib/spidr/filters.rb, line 402
# Initializes the scheme list and every rule set used for filtering.
#
# @param [Hash] options
#   Additional options.
#
# @option options [Array] :schemes (['http', 'https'])
#   The list of acceptable URI schemes to visit. Elements are converted
#   with `to_s`, matching what `schemes=` does. When omitted, `https` is
#   only included if `net/https` can be loaded.
#
# @option options [String] :host
#   The host-name to visit (added through `visit_hosts_like`).
#
# @option options [Array<String, Regexp, Proc>] :hosts
#   The patterns which match the host-names to visit.
#
# @option options [Array<String, Regexp, Proc>] :ignore_hosts
#   The patterns which match the host-names to not visit.
#
# @option options [Array<Integer, Regexp, Proc>] :ports
#   The patterns which match the ports to visit.
#
# @option options [Array<Integer, Regexp, Proc>] :ignore_ports
#   The patterns which match the ports to not visit.
#
# @option options [Array<String, Regexp, Proc>] :links
#   The patterns which match the links to visit.
#
# @option options [Array<String, Regexp, Proc>] :ignore_links
#   The patterns which match the links to not visit.
#
# @option options [Array<String, Regexp, Proc>] :urls
#   The patterns which match the URLs to visit.
#
# @option options [Array<String, Regexp, Proc>] :ignore_urls
#   The patterns which match the URLs to not visit.
#
# @option options [Array<String, Regexp, Proc>] :exts
#   The patterns which match the URI path extensions to visit.
#
# @option options [Array<String, Regexp, Proc>] :ignore_exts
#   The patterns which match the URI path extensions to not visit.
#
# NOTE(review): `:queue` and `:history` are also consumed below through
# `self.queue=` / `self.history=`. Those writers are not defined in the
# visible part of this module -- presumably the including Agent provides
# them; confirm.
def initialize_filters(options={})
  if options[:schemes]
    # Stringify the schemes so that Symbols such as :http still match the
    # String scheme tested by `visit_scheme?`.
    @schemes = options[:schemes].map { |scheme| scheme.to_s }
  else
    @schemes = ['http']

    begin
      require 'net/https'

      @schemes << 'https'
    rescue Gem::LoadError
      # Gem::LoadError is a LoadError subclass; rescue it first so it is
      # re-raised instead of being downgraded to the warning below.
      raise
    rescue ::LoadError
      warn "Warning: cannot load 'net/https', https support disabled"
    end
  end

  @host_rules = Rules.new(
    :accept => options[:hosts],
    :reject => options[:ignore_hosts]
  )
  @port_rules = Rules.new(
    :accept => options[:ports],
    :reject => options[:ignore_ports]
  )
  @link_rules = Rules.new(
    :accept => options[:links],
    :reject => options[:ignore_links]
  )
  @url_rules = Rules.new(
    :accept => options[:urls],
    :reject => options[:ignore_urls]
  )
  @ext_rules = Rules.new(
    :accept => options[:exts],
    :reject => options[:ignore_exts]
  )

  # Must run after @host_rules exists: visit_hosts_like appends to it.
  visit_hosts_like(options[:host]) if options[:host]

  self.queue = options[:queue] if options[:queue]
  self.history = options[:history] if options[:history]
end
visit_ext?(path) click to toggle source

Determines if a given URI path extension should be visited.

@param [String] path

The path that contains the extension.

@return [Boolean]

Specifies whether the given URI path extension should be visited.
# File lib/spidr/filters.rb, line 535
# Decides whether the extension of the given URI path passes the
# extension rules.
#
# @param [String] path
#   The path that contains the extension.
#
# @return [Boolean]
#   Specifies whether the given URI path extension should be visited.
def visit_ext?(path)
  # File.extname keeps the leading dot ("/a.html" => ".html"); drop it.
  # A path without an extension yields "" and the slice then gives nil.
  extension = File.extname(path)[1..-1]

  @ext_rules.accept?(extension)
end
visit_host?(host) click to toggle source

Determines if a given host-name should be visited.

@param [String] host

The host-name.

@return [Boolean]

Specifies whether the given host-name should be visited.
# File lib/spidr/filters.rb, line 481
# Asks the host-name rules (`@host_rules`) whether the given host-name is
# acceptable; the result of `accept?` is returned unchanged.
#
# @param [String] host
#   The host-name.
#
# @return [Boolean]
#   Specifies whether the given host-name should be visited.
def visit_host?(host)
  @host_rules.accept?(host)
end
visit_port?(port) click to toggle source

Determines if a given port should be visited.

@param [Integer] port

The port number.

@return [Boolean]

Specifies whether the given port should be visited.
# File lib/spidr/filters.rb, line 494
# Asks the port rules (`@port_rules`) whether the given port is
# acceptable; the result of `accept?` is returned unchanged.
#
# @param [Integer] port
#   The port number.
#
# @return [Boolean]
#   Specifies whether the given port should be visited.
def visit_port?(port)
  @port_rules.accept?(port)
end
visit_scheme?(scheme) click to toggle source

Determines if a given URI scheme should be visited.

@param [String] scheme

The URI scheme.

@return [Boolean]

Specifies whether the given scheme should be visited.
# File lib/spidr/filters.rb, line 464
# Decides whether a URI scheme is in the list of acceptable schemes.
#
# @param [String] scheme
#   The URI scheme.
#
# @return [Boolean]
#   Specifies whether the given scheme should be visited. A missing
#   (nil) scheme is always accepted.
def visit_scheme?(scheme)
  # No scheme at all: nothing to check against the list.
  return true unless scheme

  @schemes.include?(scheme)
end
visit_url?(link) click to toggle source

Determines if a given URL should be visited.

@param [URI::HTTP, URI::HTTPS] link

The URL.

@return [Boolean]

Specifies whether the given URL should be visited.

@since 0.2.4

# File lib/spidr/filters.rb, line 522
# Asks the URL rules (`@url_rules`) whether the given URL is acceptable;
# the result of `accept?` is returned unchanged.
#
# @param [URI::HTTP, URI::HTTPS] link
#   The URL.
#
# @return [Boolean]
#   Specifies whether the given URL should be visited.
#
# @since 0.2.4
def visit_url?(link)
  @url_rules.accept?(link)
end