module NewsCrawler::URLHelper

Contains various methods for processing URLs.

Public Instance Methods

get_url_path(url) click to toggle source

Split a URL into 3 parts: scheme, domain and path. @param [ String ] url @return [ Hash ] containing the parts

# File lib/news_crawler/url_helper.rb, line 45
# Split a URL into three parts: scheme, domain and path.
#
# Only the http and https schemes are recognised. The scheme is matched
# case-insensitively (URI schemes are case-insensitive) and returned in
# lower case; domain and path are returned exactly as written. Any part
# that is absent from the URL is nil in the result.
#
# @param [ String ] url
# @return [ Hash ] with keys :scheme, :domain and :path
def get_url_path(url)
  # Every group is optional, so the pattern always matches at the start of
  # the string; \A just makes that explicit.
  pattern = %r{\A(?:(?<scheme>https?)://)?(?<domain>[^/]+)?(?<path>/.*)?}i
  md = pattern.match(url)
  scheme = md[:scheme]
  { :scheme => scheme && scheme.downcase,
    :domain => md[:domain],
    :path => md[:path] }
end
same_domain?(url1, url2) click to toggle source

Produce true if the 2 URLs belong to the same domain, or if either URL starts with '/'. @param [ String ] url1 Url 1 @param [ String ] url2 Url 2 @return [ Boolean ] true if both URLs belong to the same domain

# File lib/news_crawler/url_helper.rb, line 29
# Produce true if two URLs belong to the same domain, or if either URL
# starts with '/'.
#
# Domains are compared label by label, starting from the top-level label
# and ignoring letter case (host names are case-insensitive). Only the
# labels both domains have are compared, so "news.example.com" and
# "example.com" count as the same domain.
#
# A URL from which get_url_path extracts no domain (for example an empty
# string) is treated like a relative URL: it has no labels to disagree
# with, so the result is true.
#
# @param [ String ] url1 Url 1
# @param [ String ] url2 Url 2
# @return [ Boolean ] true if both url belong to same domain
def same_domain?(url1, url2)
  return true if url1.start_with?('/') || url2.start_with?('/')

  labels1, labels2 = [url1, url2].map do |url|
    # to_s guards against a nil domain; reverse puts the top-level label first.
    get_url_path(url)[:domain].to_s.downcase.split('.').reverse
  end

  # zip pads with nil when labels2 is shorter and drops the surplus when it
  # is longer, so only the labels present on both sides are compared.
  labels1.zip(labels2).all? { |a, b| b.nil? || a == b }
end