class Scrapah::Extract

Public Class Methods

emails(content) click to toggle source

TODO: make the pattern case-insensitive and drop one of the two duplicated letter ranges (a-z / A-Z)?

# File lib/scrapah/extract.rb, line 12
# Extracts every distinct e-mail address found in +content+.
#
# The top-level domain may be 2 to 63 letters long (63 being the maximum
# length of a DNS label), so addresses under long TLDs such as ".museum"
# or ".technology" are recognised instead of being silently skipped.
#
# content - the text to search; passed unchanged to ::regex, which
#           stringifies non-String input before scanning.
#
# Returns the result of ::regex for the e-mail pattern: the unique
# addresses, in order of first appearance.
def self.emails(content)
        # local-part @ domain . tld -- wrapped in one capture group so that
        # String#scan yields the whole address.
        r = /\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63})\b/
        regex(content, r)
end
ips(content) click to toggle source
# File lib/scrapah/extract.rb, line 17
# Collects the distinct dotted-quad IPv4 addresses that appear in +content+.
#
# content - the text to search; handed straight to ::regex.
#
# Returns whatever ::regex produces for the IPv4 pattern.
def self.ips(content)
        # Intentionally minimal matcher: four dot-separated octets, each
        # restricted to 0-255 by the alternation, captured as one group.
        regex(content, /\b((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))\b/)
end
proxies(content) click to toggle source
# File lib/scrapah/extract.rb, line 23
# Collects the distinct "address:port" proxy entries that appear in +content+.
#
# content - the text to search; handed straight to ::regex.
#
# Returns whatever ::regex produces for the proxy pattern.
def self.proxies(content)
        # Format: IPv4 address, a colon, then a port of one to five digits
        # (the digits are not range-checked), e.g. 10.0.0.1:8080.
        regex(content, /\b((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\:[0-9]{1,5})\b/)
end
regex(content, regex) click to toggle source
# File lib/scrapah/extract.rb, line 29
# Scans +content+ with +regex+ and returns the distinct matches.
#
# content - a String, or any other object (a Nokogiri node, for example);
#           non-String input that responds to #to_s is stringified first.
# regex   - the Regexp given to String#scan.
#
# Returns a flat Array of the unique scan results, in order of first
# appearance. Results from patterns with several capture groups are
# flattened into the same Array.
def self.regex(content, regex)
        # Only convert when the object is not already a String.
        needs_conversion = content.respond_to?(:to_s) && !content.is_a?(String)
        text = needs_conversion ? content.to_s : content

        text.scan(regex).flatten.uniq
end