class Recluse::Profile

A profile is an atomic unit of rules for link checking.

Attributes

blacklist[RW]

Array of URL patterns to exclude from checking. Optional. Defaults to empty array.

email[RW]

Used in the user-agent to identify who is running the crawler. This is so that if there's a problem with your spidering, you will be contacted and not the author of Recluse. Required.

internal_only[RW]

Don't check external URLs. Optional. Defaults to false.

name[RW]

Identifier of the profile. Make sure that it is filename friendly. Required.

redirect[RW]

When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to false.

results[RW]

Hash of resulting +HashTree+s.

roots[RW]

Array of URLs to start spidering. Required.

scheme_squash[RW]

HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to false.

tasks[RW]

Hash of the tasks that have been run, keyed by test key.

whitelist[RW]

Array of exceptions to the blacklist. Optional. Defaults to empty array.

Public Class Methods

load(profile) click to toggle source

Loads profile by name.

# File lib/recluse/profile.rb, line 151
# Builds a Profile from the saved "<profile>.yaml" entry of the '.recluse'
# user configuration.
#
# Settings that are absent or nil are left out, so roots fall back to [],
# email falls back to '' and the optional settings fall back to the keyword
# defaults of Profile.new.
#
# Raises ProfileError when no saved profile with that name exists.
def self.load(profile)
  filename = "#{profile}.yaml"
  store = UserConfig.new '.recluse'
  raise ProfileError, "Profile '#{profile}' doesn't exist" unless store.exist?(filename)

  settings = store[filename]
  # A setting only counts when its key exists and its value is not nil.
  present = ->(key) { settings.key?(key) && !settings[key].nil? }

  keywords = %i[blacklist whitelist internal_only scheme_squash redirect].each_with_object({}) do |option, acc|
    acc[option] = settings[option.to_s] if present.call(option.to_s)
  end

  Profile.new(
    profile,
    present.call('roots') ? settings['roots'] : [],
    present.call('email') ? settings['email'] : '',
    **keywords
  )
end
new( name, roots, email, blacklist: [], whitelist: [], internal_only: false, scheme_squash: false, redirect: false ) click to toggle source

Create a profile.

# File lib/recluse/profile.rb, line 63
# Create a profile.
#
# name::          identifier of the profile
# roots::         starting URLs; entries that are not already Link objects
#                 are wrapped as Link.new(root, :root)
# email::         contact address stored on the profile
# blacklist::     stored as given (default [])
# whitelist::     stored as given (default [])
# internal_only:: stored as given (default false)
# scheme_squash:: stored as given (default false)
# redirect::      stored as given (default false)
#
# Raises ProfileError when roots is empty.
def initialize(
    name,
    roots,
    email,
    blacklist: [],
    whitelist: [],
    internal_only: false,
    scheme_squash: false,
    redirect: false
)
  raise ProfileError, 'Profile needs roots for starting point' if roots.empty?

  @name, @email = name, email
  # Only exact Link instances are reused; everything else becomes a root link.
  @roots = roots.map { |root| root.instance_of?(Link) ? root : Link.new(root, :root) }
  @blacklist, @whitelist = blacklist, whitelist
  @internal_only, @scheme_squash, @redirect = internal_only, scheme_squash, redirect
  # Nothing has been run yet.
  @tasks = {}
  @results = {}
end

Public Instance Methods

==(other) click to toggle source

Test if profiles share the same configuration options.

# File lib/recluse/profile.rb, line 140
# Test if profiles share the same configuration options.
#
# Returns false when +other+ is not of exactly the same class. Otherwise every
# instance variable of self must match the one on +other+:
# * @results is ignored;
# * @roots match when their string forms (to_s of each entry) are equal,
#   falling back to a plain == comparison otherwise;
# * everything else (including @tasks) is compared with ==.
def ==(other)
  return false if other.class != self.class

  instance_variables.all? do |ivar|
    # instance_variables yields Symbols, so compare against symbol literals;
    # comparing with the String '@roots' is always false.
    next true if ivar == :@results

    mine = instance_variable_get(ivar)
    theirs = other.instance_variable_get(ivar)
    next true if ivar == :@roots && mine.map(&:to_s) == theirs.map(&:to_s)

    mine == theirs
  end
end
create_agent() click to toggle source

Create a Mechanize agent.

# File lib/recluse/profile.rb, line 94
# Create a Mechanize agent configured from this profile.
#
# Redirect following is taken from @redirect, and the user agent string
# carries the Recluse version, the Recluse URL and @email.
#
# Returns the configured Mechanize instance.
def create_agent
  Mechanize.new do |agent|
    # The SSL/TLS version is deliberately not pinned: forcing 'TLSv1' limits
    # the agent to TLS 1.0, which is deprecated and refused by many servers
    # and OpenSSL builds. Leaving it unset lets the best common version be
    # negotiated.
    # NOTE(review): certificate verification is disabled, presumably so links
    # on hosts with invalid certificates can still be checked - confirm.
    agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
    agent.max_history = nil
    agent.follow_meta_refresh = true
    agent.keep_alive = false
    agent.redirect_ok = @redirect
    agent.user_agent = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
  end
end
save() click to toggle source

Saves profile to ~/.recluse/NAME.yaml.

# File lib/recluse/profile.rb, line 123
# Writes this profile's settings into the "<name>.yaml" entry of the
# '.recluse' user configuration and saves that entry.
#
# Roots are stored in their string form. Returns the result of the entry's
# own save call.
def save
  config_file = UserConfig.new('.recluse')["#{@name}.yaml"]

  # Written in this order: name, roots, email, then the optional settings.
  {
    'name' => @name,
    'roots' => @roots.map(&:to_s),
    'email' => @email,
    'blacklist' => @blacklist,
    'whitelist' => @whitelist,
    'internal_only' => @internal_only,
    'scheme_squash' => @scheme_squash,
    'redirect' => @redirect
  }.each { |field, value| config_file[field] = value }

  config_file.save
end
test(key, options = {}) click to toggle source

Runs the test identified by key and returns its results.

# File lib/recluse/profile.rb, line 108
# Runs the task registered under +key+ against this profile.
#
# key::     task identifier, looked up through Recluse::Tasks.get
# options:: hash handed to the task, with :results set to this key's tree
#
# A results tree is created for +key+ unless one of exactly the class
# Recluse::HashTree is already stored. The task is kept in @tasks[key].
#
# Returns the results tree stored under +key+.
def test(key, options = {})
  tree_ready = @results.key?(key) && @results[key].instance_of?(Recluse::HashTree)
  unless tree_ready
    @results[key] = Recluse::HashTree.new do |url1, url2|
      longer, shorter = url2.length > url1.length ? [url2, url1] : [url1, url2]
      # Same URL, or the longer one is the shorter one plus a single trailing slash
      longer == shorter || (
        longer.length == (shorter.length + 1) &&
        longer[-1] == '/' &&
        shorter[-1] != '/' &&
        longer[0...-1] == shorter
      )
    end
  end

  task_options = options.merge(results: @results[key])
  task = Recluse::Tasks.get(key).new(self, task_options)
  @tasks[key] = task
  task.run
  @results[key]
end