class EmailCrawler::EmailScanner

Constants

EMAIL_REGEXP
UTF_8

Public Class Methods

new(logger = Logger.new("/dev/null"))
# File lib/email_crawler/email_scanner.rb, line 8
# Creates a scanner that reports its progress through +logger+.
#
# @param logger [Logger] receiver of the scanner's log calls. When omitted,
#   a Logger writing to the platform's null device is used, so messages are
#   discarded. File::NULL is "/dev/null" on POSIX systems and the equivalent
#   device name elsewhere, which keeps the default usable on every platform.
def initialize(logger = Logger.new(File::NULL))
  @logger = logger
end

Public Instance Methods

scan(links)
# File lib/email_crawler/email_scanner.rb, line 12
# Collects the e-mail addresses found on each of the given links.
#
# Every link is fetched with +get+ (defined outside this method) and the
# response body is matched against EMAIL_REGEXP.
#
# @param links [Enumerable] the links to visit; each one is passed to +get+
#   and interpolated into the log message.
# @return [Hash] maps a link to the Set of matches found in its body. Links
#   whose fetch raised, whose body was nil/false, or whose body produced no
#   matches are left out.
def scan(links)
  found = {}

  links.each do |link|
    @logger.info "searching for emails on '#{link}'.."

    # A failed fetch is logged and the link is skipped rather than aborting
    # the whole run.
    page =
      begin
        get(link).body
      rescue => fetch_error
        @logger.warn fetch_error.inspect
        nil
      end
    next unless page

    sanitized = false
    begin
      matches = page.scan(EMAIL_REGEXP)
    rescue ArgumentError => scan_error
      if sanitized
        # Still failing after the clean-up pass: treat the page as having
        # no addresses.
        matches = []
      else
        @logger.warn scan_error.inspect
        # Presumably raised for invalid byte sequences: re-encode the body in
        # place, dropping whatever cannot be represented, then retry once.
        page.encode!(UTF_8, UTF_8, invalid: :replace, undef: :replace, replace: "")
        sanitized = true
        retry
      end
    end

    found[link] = Set.new(matches) unless matches.empty?
  end

  found
end