class EmailCrawler::EmailScanner
Constants
- EMAIL_REGEXP
- UTF_8
Public Class Methods
new(logger = Logger.new("/dev/null"))
click to toggle source
# File lib/email_crawler/email_scanner.rb, line 8
# Builds a scanner that reports its progress through +logger+.
#
# logger:: object stored in @logger; #scan calls +info+ and +warn+ on it.
#          When omitted, a Logger constructed with a nil log device is used,
#          which discards every message without opening a file handle or
#          depending on a platform-specific path such as "/dev/null".
def initialize(logger = Logger.new(nil))
  @logger = logger
end
Public Instance Methods
scan(links)
click to toggle source
# File lib/email_crawler/email_scanner.rb, line 12
# Fetches each link and collects the e-mail addresses found in its body.
#
# links:: enumerable of link identifiers; each one is passed to +get+
#         (defined outside this excerpt) and used as a key in the result.
#
# Returns a Hash mapping each link to a Set of the EMAIL_REGEXP matches
# found in that link's body. Links whose fetch raised, whose body was
# nil/false, or whose body produced no matches are left out of the Hash.
def scan(links)
  links.each_with_object({}) do |link, h|
    @logger.info "searching for emails on '#{link}'.."

    # A failed fetch is logged and the link skipped; one bad link must not
    # abort the rest of the crawl.
    html = begin
      get(link).body
    rescue => err
      @logger.warn err.inspect
      nil
    end
    next unless html

    emails = begin
      html.scan(EMAIL_REGEXP)
    rescue ArgumentError => err
      # Typically raised for bodies containing invalid byte sequences.
      # Sanitise a *copy* so the response body is never mutated in place
      # (and a frozen body cannot raise), then try exactly once more.
      @logger.warn err.inspect
      sanitized = html.encode(UTF_8, UTF_8, invalid: :replace, undef: :replace, replace: "")
      begin
        sanitized.scan(EMAIL_REGEXP)
      rescue ArgumentError
        # Still unscannable after sanitising: treat the page as having no e-mails.
        []
      end
    end

    h[link] = Set.new(emails) unless emails.empty?
  end
end