class Sledgehammer::CrawlWorker
Constants
- DEFAULT_OPTIONS
- MAIL_REGEX
- URL_REGEX
Public Instance Methods
after_queue(urls)
click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 21
# Hook invoked after all queued requests have been run.
# Override in the application to post-process the crawled +urls+.
def after_queue(urls)
  # no-op by default
end
before_queue(urls)
click to toggle source
Callbacks to override in the application
# File lib/sledgehammer/workers/crawl_worker.rb, line 10
# Hook invoked before any of +urls+ are queued for crawling.
# Override in the application to prepare state for the crawl.
def before_queue(urls)
  # no-op by default
end
on_complete(response)
click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 25
# Typhoeus completion callback: persists the page for the requested URL,
# extracts emails and links from the response body, then marks the page
# as completed.
#
# response - a Typhoeus::Response for a finished request.
def on_complete(response)
  page = self.find_or_create_page!(response.request.url)
  # Guard clause: skip pages that were already fully crawled.
  return if page.completed?

  self.parse_emails(response, page)
  self.parse_urls(response)
  # +update+ replaces the deprecated +update_attributes+ (removed in Rails 6.1).
  page.update(completed: true)
end
on_queue(url)
click to toggle source
Prevents an element from being added to the queue when this returns false
# File lib/sledgehammer/workers/crawl_worker.rb, line 17
# Filter hook: a URL is enqueued only when this returns a truthy value.
# The default implementation accepts every URL.
def on_queue(url)
  true
end
perform(urls, opts = {})
click to toggle source
There shouldn't be any need to overload methods below
# File lib/sledgehammer/workers/crawl_worker.rb, line 38
# Sidekiq entry point: crawls each of +urls+ unless the configured depth
# limit has already been reached.
#
# urls - Array of URL strings to crawl.
# opts - Hash of options merged over DEFAULT_OPTIONS.
def perform(urls, opts = {})
  @options = HashWithIndifferentAccess.new(DEFAULT_OPTIONS).merge(opts)
  return if @options[:depth] == @options[:depth_limit]

  before_queue(urls)
  urls.each { |url| queue(url) }
  run_queue
  after_queue(urls)
end
queue(url)
click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 50
# Builds a Typhoeus request for +url+ and adds it to the shared hydra,
# unless the on_queue hook rejects it or the URL cannot be parsed.
def queue(url)
  return unless self.on_queue(url) && valid_url?(url)

  req = Typhoeus::Request.new(url)
  req.on_complete { |response| self.on_complete(response) }
  Typhoeus::Hydra.hydra.queue(req)
end
run_queue()
click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 59
# Runs every request currently queued on the shared hydra (blocks until
# all requests have completed).
def run_queue
  hydra = Typhoeus::Hydra.hydra
  hydra.run
end
Protected Instance Methods
find_or_create_page!(request_url)
click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 64
# Looks up the Page record for +request_url+, creating it (together with
# its Website, keyed by hostname) when missing.
#
# Returns a Sledgehammer::Page.
def find_or_create_page!(request_url)
  page = Sledgehammer::Page.find_by(url: request_url)

  if page.blank?
    hostname = URI.parse(request_url).host
    website = Sledgehammer::Website.find_or_create_by(hostname: hostname)
    page = Sledgehammer::Page.create!(url: request_url, depth: @options[:depth], website: website)
  elsif page.depth < @options[:depth]
    # NOTE(review): re-opens a page stored at a smaller depth than the
    # current crawl depth -- comparison direction assumed intentional; verify.
    # +update+ replaces the deprecated +update_attributes+ (removed in Rails 6.1).
    page.update(completed: false)
  end

  page
end
parse_emails(response, page)
click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 77
# Scans the response body for email addresses and links each one to
# +page+ through Contact and PageContact records.
def parse_emails(response, page)
  response.body.scan(MAIL_REGEX).each do |address|
    contact = Sledgehammer::Contact.find_or_create_by(email: address)
    Sledgehammer::PageContact.find_or_create_by(page: page, contact: contact)
  end
end
parse_urls(response)
click to toggle source
TODO: remove the url == '/' check because we do not always start at the root page
# File lib/sledgehammer/workers/crawl_worker.rb, line 86
# Extracts links from the response body and pushes them back onto the
# Sidekiq queue with an incremented depth, unless the depth limit has
# been reached or no usable links were found.
#
# TODO: remove the request-URL self-reference check because we do not
# always start at the root page.
def parse_urls(response)
  request_url = response.request.url
  # start_with? checks the whole-string prefix; the previous /^http/
  # regex used a line anchor, which also matches after an embedded
  # newline and so could wrongly skip the scheme fix-up.
  request_url = "http://#{request_url}" unless request_url.start_with?('http')

  url_list = response.body.scan(URL_REGEX).flatten.map do |url|
    if url == request_url || !valid_url?(url)
      nil
    elsif url.starts_with?('/')
      # Resolve relative links against the page that was fetched.
      URI.join(request_url, url).to_s
    else
      url
    end
  end.compact

  opts = @options.dup
  opts[:depth] += 1

  unless opts[:depth] >= opts[:depth_limit] || url_list.empty?
    Sidekiq::Client.push('queue' => opts[:queue], 'class' => self.class, 'args' => [url_list, opts])
  end
end
valid_url?(url)
click to toggle source
# File lib/sledgehammer/workers/crawl_worker.rb, line 110
# True when +url+ can be parsed by URI, false otherwise.
def valid_url?(url)
  URI.parse(url)
  true
rescue StandardError
  false
end