module NewsCrawler::URLHelper
Contains various methods for processing URLs.
Public Instance Methods
get_url_path(url)
click to toggle source
Split a URL into 3 parts: scheme, domain, path. @param [ String ] url the URL to split @return [ Hash ] contains the parts under the keys :scheme, :domain and :path
# File lib/news_crawler/url_helper.rb, line 45
#
# Split a URL into three parts: scheme, domain and path.
#
# The scheme is recognised case-insensitively ("HTTP://" as well as
# "http://") and is returned exactly as written in the input. The domain is
# everything between the optional scheme and the first '/', so it still
# contains a port or user-info if the URL has one. The path is the rest of
# the string starting at that '/' (query string and fragment included).
#
# @param [ String, nil ] url URL to split; nil is treated like an empty string
# @return [ Hash ] the parts under :scheme, :domain and :path; a part that
#   is absent from the URL is nil
def get_url_path(url)
  # Every group is optional, so this pattern matches any string (possibly
  # with an empty match) and `md` is never nil.
  pattern = %r{\A(?:(?<scheme>https?)://)?(?<domain>[^/]+)?(?<path>/.*)?}i
  md = pattern.match(url.to_s)
  { :scheme => md[:scheme], :domain => md[:domain], :path => md[:path] }
end
same_domain?(url1, url2)
click to toggle source
Produce true if the 2 URLs belong to the same domain, or if either URL starts with ‘/’. @param [ String ] url1 Url 1 @param [ String ] url2 Url 2 @return [ Boolean ] true if both URLs belong to the same domain
# File lib/news_crawler/url_helper.rb, line 29
#
# Tell whether two URLs belong to the same domain.
#
# Domains are compared label by label starting from the right-most label
# (the TLD). When one domain has fewer labels than the other, only the
# labels both have are compared, so "news.example.com" and "example.com"
# count as the same domain. The comparison ignores letter case.
#
# A URL that starts with '/' is treated as relative and always yields true.
# A URL in which no domain part can be found contributes no labels and
# therefore also compares as the same domain.
#
# @param [ String ] url1 Url 1
# @param [ String ] url2 Url 2
# @return [ Boolean ] true if both URLs belong to the same domain
def same_domain?(url1, url2)
  return true if url1[0] == '/' || url2[0] == '/'

  labels1, labels2 = [url1, url2].map do |url|
    # to_s guards against a missing domain (nil); downcase because host
    # names are case-insensitive.
    get_url_path(url)[:domain].to_s.downcase.split('.').reverse
  end

  # zip pads with nil when labels2 is shorter and drops the surplus when it
  # is longer; a nil partner means "nothing left to compare" on that side.
  labels1.zip(labels2).all? { |own, other| other.nil? || own == other }
end