class SiteMapper::CrawlUrl

Crawl URL formatter.

Constants

TOO_MANY_REQUEST_MSG

Error message used when too many requests have been made.

Attributes

resolved_base_url[R]

Public Class Methods

new(base_url) click to toggle source

Initialize CrawlUrl. @param [String] base_url @example Initialize CrawlUrl with example.com as base_url

CrawlUrl.new('example.com')
# File lib/site_mapper/crawl_url.rb, line 13
# Set up the formatter for a site.
#
# Stores whatever Request.resolve_url returns for +base_url+ and the
# hostname parsed from that value; the hostname is later used to decide
# whether a link belongs to the same site.
#
# @param [String] base_url base URL of the site to crawl
def initialize(base_url)
  resolved = Request.resolve_url(base_url)
  @resolved_base_url = resolved
  @base_hostname = URI.parse(resolved).hostname
end

Public Instance Methods

absolute_url_from(page_url, current_url) click to toggle source

Given a link found on a page, constructs its absolute URL, provided the link is a valid, eligible URL on the same host as @resolved_base_url; otherwise returns nil. @param [String] page_url url found on page @param [String] current_url current page url @return [String, nil] absolute URL of the resource, or nil if the link is rejected @example Construct absolute URL for '/path', example.com

cu = CrawlUrl.new('example.com')
cu.absolute_url_from('/path', 'http://example.com/some/path')
# => "http://example.com/path"
# File lib/site_mapper/crawl_url.rb, line 27
# Resolve a link found on a page into an absolute URL on the crawled host.
#
# @param [String] page_url link target found on the page
# @param [String] current_url URL of the page the link was found on
# @return [String, nil] the absolute URL, or nil when the link is not
#   eligible, cannot be joined with +current_url+, or resolves to a
#   hostname other than the base hostname
def absolute_url_from(page_url, current_url)
  return unless eligible_url?(page_url)

  joined =
    begin
      URI.join(current_url, page_url)
    rescue StandardError
      # Any failure to combine the two URLs means the link is skipped.
      return
    end

  joined.to_s if joined.hostname == @base_hostname
end

Private Instance Methods

eligible_url?(href) click to toggle source
# File lib/site_mapper/crawl_url.rb, line 36
# Decide whether a link found on a page should be crawled.
#
# Scheme prefixes and file extensions are compared case-insensitively, so
# 'JavaScript:void(0)' and '/report.PDF' are rejected just like their
# lowercase forms. The substring checks remain case-sensitive.
#
# @param [String, nil] href link target found on a page
# @return [Boolean] false when +href+ is nil or empty, starts with one of
#   the excluded prefixes, contains '/email-protection#', or ends with one
#   of the excluded file extensions; true otherwise
# @raise [RuntimeError] with TOO_MANY_REQUEST_MSG when +href+ contains
#   '/sorry/IndexRedirect?' (assuming that constant is a message String)
def eligible_url?(href)
  return false if href.nil? || href.empty?

  dont_start   = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
  dont_include = %w(/email-protection#)
  err_include  = %w(/sorry/IndexRedirect?)
  dont_end     = %w(.zip .rar .json .pdf .exe .dmg .pkg .dpkg .bat)

  # The error check runs first so it is raised even for links that would
  # otherwise simply be skipped.
  raise TOO_MANY_REQUEST_MSG if err_include.any? { |pattern| href.include?(pattern) }

  # Schemes and file extensions are not case-sensitive in practice, so
  # match them against a lowercased copy of the link.
  normalized = href.downcase
  return false if normalized.start_with?(*dont_start)
  return false if dont_include.any? { |pattern| href.include?(pattern) }

  !normalized.end_with?(*dont_end)
end