class Sledgehammer::CrawlWorker

Constants

DEFAULT_OPTIONS
MAIL_REGEX
URL_REGEX

Public Instance Methods

after_queue(urls) click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 21
def after_queue(urls)
  # Hook invoked after the queue for +urls+ has been run.
  # Intentionally empty — overload in the application to add behavior.
  # stub
end
before_queue(urls) click to toggle source

Callbacks to overload in application

# File lib/sledgehammer/workers/crawl_worker.rb, line 10
def before_queue(urls)
  # Hook invoked before any of +urls+ is queued.
  # Intentionally empty — overload in the application to add behavior.
  # stub
end
on_complete(response) click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 25
# Callback fired when a queued request finishes: records the page,
# harvests emails and links from it, then marks it completed.
# Pages already marked completed are skipped entirely.
def on_complete(response)
  page = find_or_create_page!(response.request.url)
  return if page.completed?

  parse_emails(response, page)
  parse_urls(response)
  page.update_attributes completed: true
end
on_queue(url) click to toggle source

Prevents an element from being added to the queue if this returns false

# File lib/sledgehammer/workers/crawl_worker.rb, line 17
def on_queue(url)
  # Default implementation accepts every URL. Overload in the
  # application; returning false keeps +url+ out of the queue
  # (see #queue).
  true
end
perform(urls, opts = {}) click to toggle source

There shouldn't be any need to overload the methods below

# File lib/sledgehammer/workers/crawl_worker.rb, line 38
# Worker entry point. Merges +opts+ over DEFAULT_OPTIONS into
# @options, then queues and crawls every URL in +urls+, wrapped by
# the before_queue/after_queue hooks. Bails out when the configured
# depth limit has been reached.
def perform(urls, opts = {})
  @options = HashWithIndifferentAccess.new(DEFAULT_OPTIONS).merge!(opts)

  # Depth limit reached — nothing more to crawl from this job.
  return if @options[:depth] == @options[:depth_limit]

  before_queue(urls)
  urls.each do |site|
    queue(site)
  end
  run_queue
  after_queue(urls)
end
queue(url) click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 50
# Adds +url+ to the shared Typhoeus hydra, unless the on_queue
# callback or URL validation rejects it. Completed responses are
# routed to #on_complete.
def queue(url)
  return unless on_queue(url) && valid_url?(url)

  req = Typhoeus::Request.new(url)
  req.on_complete do |response|
    on_complete(response)
  end

  Typhoeus::Hydra.hydra.queue(req)
end
run_queue() click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 59
def run_queue
  # Executes every request queued on the shared Typhoeus hydra.
  Typhoeus::Hydra.hydra.run
end

Protected Instance Methods

find_or_create_page!(request_url) click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 64
# Looks up the Page record for +request_url+, creating it (and its
# parent Website, keyed by hostname) on first sight. A page seen
# before at a smaller recorded depth is marked incomplete so it gets
# crawled again. Returns the Page.
def find_or_create_page!(request_url)
  page = Sledgehammer::Page.find_by(url: request_url)

  if page.blank?
    host    = URI.parse(request_url).host
    website = Sledgehammer::Website.find_or_create_by(hostname: host)
    page    = Sledgehammer::Page.create!(url: request_url,
                                         depth: @options[:depth],
                                         website: website)
  elsif page.depth < @options[:depth]
    # Previously stored at a lower depth — reopen it for this crawl.
    page.update_attributes completed: false
  end

  page
end
parse_emails(response, page) click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 77
# Scans the response body for email addresses and persists each one
# as a Contact, linked to +page+ via a PageContact join record.
def parse_emails(response, page)
  addresses = response.body.scan(MAIL_REGEX)
  addresses.each do |address|
    contact = Sledgehammer::Contact.find_or_create_by(email: address)
    Sledgehammer::PageContact.find_or_create_by(page: page, contact: contact)
  end
end
parse_urls(response) click to toggle source

TODO: remove the `url == '/'` check because we don't always start at the root page

# File lib/sledgehammer/workers/crawl_worker.rb, line 86
# Extracts links from the response body and pushes a follow-up crawl
# job for them onto Sidekiq at depth + 1, unless that would exceed
# the depth limit or no usable links were found.
def parse_urls(response)
  request_url = response.request.url
  # Ensure an absolute base so URI.join below can resolve relative paths.
  request_url = "http://#{request_url}" unless request_url.match /^http/

  url_list = response.body.scan(URL_REGEX).flatten.map do |url|
    if url == request_url || !valid_url?(url)
      nil # drop self-links and unparseable URLs; compact removes them below
    elsif url.starts_with?('/')
      # Root-relative path — resolve it against the current page's URL.
      URI.join(request_url, url).to_s
    else
      url
    end
  end.compact

  # Work on a copy so @options of the currently running job is untouched.
  opts         = @options.dup
  opts[:depth] += 1

  unless opts[:depth] >= opts[:depth_limit] || url_list.empty?
    Sidekiq::Client.push('queue' => opts[:queue],
      'class' => self.class,
      'args' => [url_list, opts])
  end
end
valid_url?(url) click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 110
# Returns true when +url+ can be parsed by URI, false otherwise.
#
# Rewritten from `!!URI.parse(url) rescue false`: URI.parse never
# returns nil or false, so the double negation was dead weight, and
# the inline rescue modifier hid the control flow. The explicit
# rescue keeps the original contract (never raises, returns false on
# any parse failure). NOTE(review): URI.parse also accepts relative
# and empty strings, so this check is deliberately permissive.
def valid_url?(url)
  URI.parse(url)
  true
rescue StandardError
  false
end