module UrlCommon

Constants

VERSION

Public Class Methods

agent() click to toggle source
# File lib/url_common.rb, line 68
# Builds a brand-new Mechanize agent with its default configuration.
#
# Returns the freshly constructed Mechanize instance; nothing is cached, so
# every call hands back a different object.
def self.agent
  Mechanize.new
end
check_for_404(url, elixir_style = false) click to toggle source

TODO needs tests

# File lib/url_common.rb, line 197
# Probes +url+ with an HTTP HEAD request and reports whether it was reachable.
#
# url          - the URL to probe.
# elixir_style - when truthy, a successful probe returns the pair
#                [:ok, url] instead of an OpenStruct.
#
# Returns on success: OpenStruct(url:, status: 200), or [:ok, url] when
# elixir_style is truthy.
# Returns on failure: OpenStruct(url:, error:, status: 404). This shape is
# used for every StandardError raised by the request, whatever the message,
# and also when elixir_style is truthy.
def self.check_for_404(url, elixir_style = false)
  agent = Mechanize.new

  begin
    # Only the fact that HEAD did not raise matters; the response is unused.
    agent.head(url)
  rescue StandardError => e
    return OpenStruct.new(:url => url, :error => e, :status => 404)
  end

  return :ok, url if elixir_style

  # Any falsy flag (false or nil) selects the OpenStruct form.
  OpenStruct.new(:url => url, :status => 200)
end
check_for_amazon_referrer(url, referrer_code) click to toggle source

tested https://www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0

returns nil if link isn't amazon at all
returns true if link is amazon and has referrer code
returns false if link is amazon and doesn't have referrer code
# File lib/url_common.rb, line 98
# Reports whether an Amazon link carries the given referrer (affiliate) code.
#
# url           - the link to inspect.
# referrer_code - the code that should appear somewhere in the link.
#
# Returns nil when the link has no hostname or its host (ignoring one leading
# "www.") does not start with the label "amazon".
# Returns true when the link is an Amazon link containing referrer_code.
# Returns false when the link is an Amazon link without referrer_code.
def self.check_for_amazon_referrer(url, referrer_code)
  hostname = Fuzzyurl.from_string(url).hostname
  return nil if hostname.nil?

  # The dot is escaped so only a literal "www." prefix is stripped.
  base_domain = hostname.sub(/\Awww\./, '')
  return nil unless base_domain.split('.').first == 'amazon'

  # Literal substring test: the code is data, not a pattern, so characters
  # such as "." or "+" in it must not be treated as regex metacharacters.
  url.include?(referrer_code.to_s)
end
create_mechanize_page_from_html(url, html) click to toggle source

TODO needs tests

# File lib/url_common.rb, line 263
# Wraps an HTML string in a Mechanize::Page.
#
# url  - the address to record as the page's uri (parsed with URI.parse).
# html - the markup used as the page body.
#
# The page is built with a text/html content-type header and a new Mechanize
# agent, then its uri is set to the parsed url. Returns the page.
def self.create_mechanize_page_from_html(url, html)
  headers = {'content-type'=>'text/html'}

  Mechanize::Page.new(nil, headers, html, nil, Mechanize.new).tap do |page|
    page.uri = URI.parse(url)
  end
end
fix_relative_url(base_url, partial_url) click to toggle source
# File lib/url_common.rb, line 232
# Turns a site-relative link into an absolute URL on the same site as base_url.
#
# base_url    - an absolute URL; only its scheme, host and port are used.
# partial_url - the link to fix. Anything already starting with http:// or
#               https:// is returned untouched.
#
# The partial is always attached at the site root (the path of base_url is
# ignored); a leading "/" is added when the partial lacks one so the host and
# path never run together. A non-default port on base_url is preserved.
#
# Returns the absolute URL as a String.
# Raises ArgumentError when base_url has no scheme or host.
def self.fix_relative_url(base_url, partial_url)
  partial = partial_url.to_s
  return partial_url if partial.match?(%r{\Ahttps?://}i)

  base = URI.parse(base_url)
  if base.scheme.nil? || base.host.nil?
    raise ArgumentError, "base_url must be an absolute URL: #{base_url.inspect}"
  end

  authority = base.host
  authority = "#{authority}:#{base.port}" if base.port && base.port != base.default_port
  path = partial.start_with?('/') ? partial : "/#{partial}"

  "#{base.scheme}://#{authority}#{path}"
end
get_base_domain(url) click to toggle source
# File lib/url_common.rb, line 42
# Extracts the host of +url+ with a single leading "www." removed.
#
# url - an absolute URL understood by URI.parse.
#
# Only the literal prefix "www." is stripped; hosts such as
# "www2.example.com" or "wwwhatever.com" are returned unchanged.
#
# Returns the host as a String.
def self.get_base_domain(url)
  URI.parse(url).host.sub(/\Awww\./, '')
end
get_page(url, return_html = false, user_agent = nil) click to toggle source

TODO needs tests

# File lib/url_common.rb, line 132
# Fetches +url+ with a Mechanize agent.
#
# url         - the address to GET.
# return_html - when truthy the second element of the result is the page
#               body, otherwise it is the page object returned by Mechanize.
# user_agent  - User-Agent string to send; when nil a desktop Chrome string
#               is used.
#
# Returns [:ok, page_or_body] on success, or [:error, exception] when the
# GET raises a StandardError.
def self.get_page(url, return_html = false, user_agent = nil)
  agent = Mechanize.new do |mech|
    mech.user_agent =
      if user_agent.nil?
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
      else
        user_agent
      end
  end

  # The callback always answers true, so certificate verification problems
  # are logged (when the status is non-zero) but never abort the request.
  agent.verify_callback = Proc.new do |_ok, x509|
    status = x509.error
    msg = x509.error_string
    logger.warn "server certificate verify: status: #{status}, msg: #{msg}" if status != 0
    true
  end

  begin
    fetched = agent.get(url)
    payload = return_html ? fetched.body : fetched
    return :ok, payload
  rescue StandardError => e
    return :error, e
  end
end
has_own_domain?(url) click to toggle source
TODO needs tests

def self.check_for_jekyll_subdomain?(url)

# File lib/url_common.rb, line 117
# Reports whether +url+ appears to live on its own domain rather than on a
# shared hosting subdomain.
#
# url - the address to inspect.
#
# Returns false when the url contains ".github.io", ".blogspot.com" or
# ".wordpress.com"; true for everything else.
def self.has_own_domain?(url)
  shared_hosts = [/\.github\.io/, /\.blogspot\.com/, /\.wordpress\.com/]
  shared_hosts.none? { |pattern| url =~ pattern }
end
is_valid?(url) click to toggle source

UrlCommon.is_valid?(“fuzzyblog.io/blog/”) UrlCommon.is_valid?(“fuzzyblog.io/blog/”)

# File lib/url_common.rb, line 11
# Checks that +url+ parses into something with both a protocol and a hostname.
#
# url - the candidate URL string.
#
# Returns true when Fuzzyurl finds a hostname and a protocol in the string.
# Returns false when either part is missing, or when parsing raises a
# StandardError (for example because url is not a String).
def self.is_valid?(url)
  parsed = Fuzzyurl.from_string(url)
  !(parsed.hostname.nil? || parsed.protocol.nil?)
rescue StandardError
  false
end
join(base, rest, debug = false) click to toggle source
# File lib/url_common.rb, line 47
# Resolves +rest+ against +base+ using URI.join.
#
# base  - the base URL.
# rest  - the reference to resolve against it.
# debug - accepted for compatibility; it is not used.
#
# Returns the resolved URL as a String.
def self.join(base, rest, debug = false)
  resolved = URI.join(base, rest)
  resolved.to_s
end
mpage_is_html?(page) click to toggle source

# Variant of get_page that also writes the page body into Rails.cache.
#
# url         - the address to GET.
# return_html - when truthy the body is stored under UrlCommon.sha_it(url)
#               for one hour (if not already present) and returned;
#               otherwise the page object is returned and nothing is cached.
#
# NOTE(review): the GET always runs before the cache is consulted and the
# returned value is always the freshly fetched body, so the cache never
# saves a request here.
#
# Returns [:ok, page_or_body] on success, or [:error, exception] when a
# StandardError is raised while fetching or caching.
def self.get_page_caching_attempt(url, return_html = false)
  agent = Mechanize.new do |mech|
    mech.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:46.0) Gecko/20100101 Firefox/46.0"
  end

  # Always answering true means certificate problems are logged, not fatal.
  agent.verify_callback = Proc.new do |_ok, x509|
    status = x509.error
    msg = x509.error_string
    logger.warn "server certificate verify: status: #{status}, msg: #{msg}" if status != 0
    true
  end

  begin
    fetched = agent.get(url)
    return :ok, fetched unless return_html

    Rails.cache.fetch(UrlCommon.sha_it(url), :expires_in => 1.hour) { fetched.body }
    return :ok, fetched.body
  rescue StandardError => e
    return :error, e
  end
end

# File lib/url_common.rb, line 191
# Decides whether a fetched page object is an HTML page by checking that it
# responds to #title.
#
# page - the object to test.
#
# Returns true when page responds to :title, false otherwise.
def self.mpage_is_html?(page)
  page.respond_to?(:title) ? true : false
end
parse_country_from_itunes_url(url) click to toggle source
# File lib/url_common.rb, line 33
# Pulls the two-character country segment out of an itunes.apple.com URL.
#
# url - the iTunes URL to inspect.
#
# Returns the two characters found between "itunes.apple.com/" and the next
# "/", or 'us' when the URL does not have that shape.
def self.parse_country_from_itunes_url(url)
  found = /https?:\/\/itunes\.apple\.com\/(..)\//.match(url)
  found ? found[1] : 'us'
end
parse_fid_from_itunes_url(url) click to toggle source

UrlCommon.parse_fid_from_itunes_url(“itunes.apple.com/us/app/imovie/id408981434?mt=12”)

# File lib/url_common.rb, line 24
# Extracts the numeric id that follows "/id" in an iTunes URL.
#
# url - the iTunes URL to inspect.
#
# Returns the digits as a String, or nil when no "/id<digits>" segment exists.
def self.parse_fid_from_itunes_url(url)
  found = /\/id([0-9]+)/.match(url)
  found ? found[1] : nil
end
strip_a_tag(a_tag) click to toggle source
# File lib/url_common.rb, line 72
# Reduces an opening anchor tag such as
#   <a href="https://www.keyingredient.com/recipes/12194051/egg-salad-best-ever-creamy/">
# to the address inside it.
#
# a_tag - the tag text.
#
# Removes the first `<a href=` plus its opening quote, then the first
# closing quote followed by `>`. Returns the remaining String.
def self.strip_a_tag(a_tag)
  without_prefix = a_tag.sub(/<a href=[\"']/,'')
  without_prefix.sub(/[\"']>/,'')
end
url_base(url, base_domain=nil) click to toggle source

Returns a URL without the leading www. Example: UrlCommon.url_base(“www.udemy.com/the-build-a-saas-app-with-flask-course/”) returns “udemy.com/the-build-a-saas-app-with-flask-course/”

# File lib/url_common.rb, line 83
# Builds a scheme-less form of +url+: base domain, then path, then query.
#
# url         - an absolute URL understood by URI.parse.
# base_domain - the domain to put in front; when nil it is derived from url
#               with get_base_domain.
#
# Returns the combined String, cut to at most 255 characters.
def self.url_base(url, base_domain=nil)
  base_domain = get_base_domain(url) if base_domain.nil?

  uri = URI.parse(url)
  query_suffix = uri.query ? "?#{uri.query}" : ""

  "#{base_domain}#{uri.path}#{query_suffix}"[0..254]
end
url_no_www(url) click to toggle source
# File lib/url_common.rb, line 51
# Returns +url+ reduced to host (without a leading "www."), path and query.
#
# url - the URL to reduce; it is handed to Fuzzyurl.new.
#
# A missing path contributes nothing, whether or not a query is present.
# The query, when present, is appended after a "?".
#
# Returns the reduced String.
def self.url_no_www(url)
  parts = Fuzzyurl.new(url)

  # Interpolation turns a nil path into "" so host + query never hits nil.
  bare = "#{parts.hostname.sub(/\Awww\./, '')}#{parts.path}"

  parts.query ? "#{bare}?#{parts.query}" : bare
end
validate_with_merge_fragment(url, merge_fragment) click to toggle source

status, url = UrlCommon.validate_with_merge_fragment(“nickjj/orats”, “www.github.com/”)

# File lib/url_common.rb, line 240
# Validates +url+, and when it is only a fragment tries again after
# prefixing it with +merge_fragment+.
#
# url            - a full URL or a fragment such as "nickjj/orats".
# merge_fragment - the prefix joined in front of a fragment (File.join).
#
# A candidate counts as good when is_valid? accepts it and check_for_404
# reports status 200 for it.
#
# Returns [true, url] when the url is good as given.
# Returns [false, url] when the url starts with "http" but is not good;
# no merge is attempted in that case.
# Otherwise returns [true, merged] or [false, merged] depending on whether
# the merged url is good.
def self.validate_with_merge_fragment(url, merge_fragment)
  # check_for_404 always returns an object, so its status must be inspected;
  # using the object itself as a boolean would accept every url.
  live = ->(candidate) { is_valid?(candidate) && check_for_404(candidate).status == 200 }

  return true, url if live.call(url)

  # Already looks absolute yet failed: there is nothing sensible to merge.
  return false, url if url =~ /^http/

  merged = File.join(merge_fragment, url)
  return live.call(merged), merged
end