class EmailCrawler::PageLinks
Constants
- MAX_LINKS
- MAX_RETRIES
- SLEEP_TIME
Public Class Methods
for(url, max_links: MAX_LINKS, logger: Logger.new("/dev/null"))
  # File lib/email_crawler/page_links.rb, line 23
  def self.for(url, max_links: MAX_LINKS, logger: Logger.new("/dev/null"))
    new(url, logger).fetch_links(max_links)
  end
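A minimal usage sketch for the ::for entry point. The URL, the max_links value of 50, and the logger destination are placeholders, and the top-level require path "email_crawler" is assumed from the gem's file layout rather than taken from its documentation.

  require "logger"
  require "email_crawler"   # assumed top-level require path for the gem

  # Collect up to 50 same-host links starting from the given page,
  # logging crawl progress to standard error.
  links = EmailCrawler::PageLinks.for("https://example.com",
                                      max_links: 50,
                                      logger: Logger.new($stderr))
  links.each { |link| puts link }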
new(url, logger = Logger.new("/dev/null"))
  # File lib/email_crawler/page_links.rb, line 9
  def initialize(url, logger = Logger.new("/dev/null"))
    @url = url
    uri = begin
            URI(url)
          rescue; end
    scheme_and_host = if uri && uri.host
                        "#{uri.scheme}://#{uri.host}"
                      else
                        url[%r(\A(https?://([^/]+))), 1]
                      end
    @domain = Regexp.new("#{scheme_and_host}/", true)
    @logger = logger
  end
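To illustrate how the constructor derives the host-scoped pattern stored in @domain, here is a small sketch of the same steps outside the class; the example URLs are placeholders, not values from the gem.

  require "uri"

  uri = URI("https://example.com/about")
  scheme_and_host = "#{uri.scheme}://#{uri.host}"    # => "https://example.com"
  domain = Regexp.new("#{scheme_and_host}/", true)   # true => case-insensitive
  domain.match?("https://example.com/contact")       # => true  (same host)
  domain.match?("https://other.org/contact")         # => false (different host)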
Public Instance Methods
fetch_links(max_links = MAX_LINKS)
  # File lib/email_crawler/page_links.rb, line 27
  def fetch_links(max_links = MAX_LINKS)
    # Crawl outward from @url: `queue` holds links still to visit,
    # `links` holds every same-host link recorded so far.
    queue, links = Set.new([@url]), Set.new([@url])
    retries = 0

    until queue.empty?
      current_link = queue.first
      @logger.info "current_link: #{current_link}"

      begin
        page = get(current_link)
      rescue Net::HTTP::Persistent::Error => err
        # Persistent-connection failure: reset the per-thread agent and
        # retry up to MAX_RETRIES times before abandoning the crawl.
        @logger.warn err.inspect
        page = nil

        if retries < MAX_RETRIES
          retries += 1
          @logger.debug "Retry ##{retries}"
          agent.shutdown
          Thread.current[:agent] = nil
          sleep(SLEEP_TIME)
          retry
        else
          @logger.error "Giving up grabbing link for '#{@url}' after #{retries} retries"
          break
        end
      rescue => err
        @logger.warn err.inspect
      else
        retries = 0
      end

      if page
        # Keep only links whose href matches the same-host pattern built in
        # #initialize, skipping any we have already recorded.
        new_links = page.links_with(href: @domain).map(&:href)
        new_links.reject! { |link| links.include?(link) }
        @logger.debug "found: #{new_links.length} new link(s)"
        new_links.each { |link| queue << link }
        links << current_link

        if links.length == max_links
          break
        else
          sleep(SLEEP_TIME)
        end
      end

      queue.delete(current_link)
    end

    links.to_a
  end
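A minimal instance-level usage sketch, equivalent to what ::for does internally; the URL, the logger, and the limit of 25 are placeholder assumptions, and "email_crawler" is assumed to be the gem's top-level require path.

  require "logger"
  require "email_crawler"   # assumed top-level require path for the gem

  crawler = EmailCrawler::PageLinks.new("https://example.com", Logger.new($stdout))
  same_host_links = crawler.fetch_links(25)   # Array of up to 25 same-host URLs
  same_host_links.each { |link| puts link }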