class Scrapah::Extract
Public Class Methods
emails(content)
click to toggle source
TODO: make the pattern case-insensitive and drop one of the duplicated character ranges (a-z / A-Z)?
# File lib/scrapah/extract.rb, line 12
# Extracts e-mail addresses from the given content.
#
# @param content [#to_s] text (or any object the regex helper can stringify)
#   to search
# @return [Array<String>] the addresses found, as returned by the regex helper
def self.emails(content)
  # The top-level domain may be 2 to 63 letters long (63 is the DNS label
  # limit); the previous 2..4 cap dropped addresses such as user@site.museum.
  email_pattern = /\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63})\b/
  regex(content, email_pattern)
end
ips(content)
click to toggle source
# File lib/scrapah/extract.rb, line 17
# Extracts dotted-quad IPv4 addresses from the given content.
#
# @param content [Object] text to search; passed straight to the regex helper
# @return whatever the regex helper returns for the IPv4 pattern
def self.ips(content)
  # Deliberately simple IPv4 matcher: four octets, each limited to 0-255.
  ipv4_pattern = /\b((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))\b/
  regex(content, ipv4_pattern)
end
proxies(content)
click to toggle source
# File lib/scrapah/extract.rb, line 23
# Extracts proxy endpoints written as "IPv4-address:port" from the content.
#
# @param content [Object] text to search; passed straight to the regex helper
# @return whatever the regex helper returns for the address:port pattern
def self.proxies(content)
  # Format: an IPv4 address, a colon, then a port of one to five digits.
  proxy_pattern = /\b((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\:[0-9]{1,5})\b/
  regex(content, proxy_pattern)
end
regex(content, regex)
click to toggle source
# File lib/scrapah/extract.rb, line 29
# Scans the content for every match of the pattern and returns the unique hits.
#
# @param content [#to_s] text to search; non-String input (the original note
#   mentions Nokogiri objects) is converted with to_s before scanning
# @param regex [Regexp, String] pattern handed to String#scan
# @return [Array<String>] de-duplicated matches in order of first appearance,
#   with capture groups flattened into a single list
def self.regex(content, regex)
  # String#scan returns nested arrays when the pattern has capture groups and
  # nil for groups that did not take part in a match, so flatten the nesting
  # and drop the nils before de-duplicating.
  content.to_s.scan(regex).flatten.compact.uniq
end