module UrlScrubber
Constants
- VERSION
Public Class Methods
find_identity_from_url(url)
click to toggle source
# File lib/url_scrubber.rb, line 124
# Returns the last "/"-separated segment of the scrubbed form of +url+.
# Returns nil when +url+ is not present or when scrubbing yields nothing.
def self.find_identity_from_url(url)
  return nil unless url.present?

  scrubbed = UrlScrubber.scrub(url)
  return nil unless scrubbed

  scrubbed.split("/").last
end
find_linkedin_identity_from_url(url)
click to toggle source
# File lib/url_scrubber.rb, line 131
# Extracts the LinkedIn identifier (company slug, profile slug, first segment
# of a /pub/ path, group slug or numeric group id) from +url+.
#
# Returns nil when +url+ is nil, when it cannot be scrubbed, or when it is
# not one of the recognized LinkedIn URL shapes.
def self.find_linkedin_identity_from_url(url)
  return nil if url.nil?

  scrubbed_url = scrub(url)
  # scrub returns nil for input without an http(s) URL; the group branches
  # below used to call include? on that nil and raise NoMethodError.
  return nil unless scrubbed_url

  if linkedin_company_url?(scrubbed_url) || scrubbed_url.include?('http://linkedin.com/in/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('http://linkedin.com/pub/')
    # /pub/<name>/<a>/<b>/<c>: the identity is the first path segment.
    first_segment = scrubbed_url.partition('linkedin.com/pub/')[2].split('/').first
    first_segment && drop_url_ampersand!(first_segment)
  elsif scrubbed_url.include?('linkedin.com/groups/')
    scrubbed_url.split("/").last
  elsif scrubbed_url.include?('linkedin.com/groups?gid=')
    drop_url_ampersand!(scrubbed_url.partition('linkedin.com/groups?gid=')[2])
  end
end
ideal_form?(url)
click to toggle source
# File lib/url_scrubber.rb, line 70
# Reports whether +url+, once scrubbed, is already in the canonical
# ("ideal") profile/group form for the service it belongs to.
#
# Returns false when the URL cannot be scrubbed. URLs of services without a
# specific rule are always considered ideal.
#
# Patterns are anchored with \A and \z so a value containing an embedded
# newline cannot pass by matching only one of its lines.
def self.ideal_form?(url)
  url = scrub(url)
  return false unless url

  matches_any = lambda { |*patterns| patterns.any? { |pattern| url.match(pattern) } }

  case service_of(url)
  when :vkontakte
    matches_any.call(%r{\Ahttp://vk\.com/[\w_]+\z})
  when :weibo
    matches_any.call(%r{\Ahttp://weibo\.com/[\w_-]+\z})
  when :youtube
    matches_any.call(%r{\Ahttp://youtube\.com/[\w_-]+\z})
  when :twitter
    matches_any.call(%r{\Ahttp://twitter\.com/[\w_]+\z})
  when :facebook
    # The "?" of profile.php?id= must be escaped; unescaped it made the
    # preceding "p" optional and real profile URLs never matched.
    matches_any.call(
      %r{\Ahttp://facebook\.com/(profile\.php\?id=\d+|[\w_\.-]+)\z},
      %r{\Ahttp://facebook\.com/groups/[\w_\.-]+\z}
    )
  when :linkedin
    matches_any.call(
      %r{\Ahttp://linkedin\.com/pub/[\w-]+/[\w]+/[\w]+/[\w]+\z},
      %r{\Ahttp://linkedin\.com/in/[\w_-]+\z},
      %r{\Ahttp://linkedin\.com/(company/[\w_-]+|profile/view\?id=\d+)\z},
      %r{\Ahttp://linkedin\.com/(groups\?gid=[0-9]+)\z},
      %r{\Ahttp://linkedin\.com/(groups/[\w_-]+)\z}
    )
  when :google
    matches_any.call(
      %r{\Ahttp://plus\.google\.com/(\+[\w_-]+|\d+)\z},
      %r{\Ahttp://plus\.google\.com/communities/\d+\z}
    )
  when :slideshare
    matches_any.call(%r{\Ahttp://slideshare\.net/[\w_-]+\z})
  when :flickr
    matches_any.call(
      %r{\Ahttp://flickr\.com/[\w_\@-]+\z},
      %r{\Ahttp://flickr\.com/groups/[\w_\@\.-]+\z}
    )
  when :pinterest
    matches_any.call(%r{\Ahttp://pinterest\.com/[\w_-]+\z})
  when :yelp
    matches_any.call(%r{\Ahttp://yelp\.com/[\w_-]+\z})
  when :vimeo
    # A single path segment ending in digits is rejected; group pages are accepted.
    (matches_any.call(%r{\Ahttp://vimeo\.com/[\w_-]+\z}) && !url.match(%r{/\d+\z})) ||
      matches_any.call(%r{\Ahttp://vimeo\.com/groups/[\w_\.-]+\z})
  when :instagram
    matches_any.call(%r{\Ahttp://instagram\.com/[\w_]+\z})
  when :tumblr
    matches_any.call(%r{\Ahttp://[\w_]+\.tumblr\.com\z}) && !url.index("://www.")
  else
    true
  end
end
linkedin_company_url?(url)
click to toggle source
# File lib/url_scrubber.rb, line 110
# True when the scrubbed form of +url+ contains the LinkedIn company prefix
# "http://linkedin.com/company/"; false when scrubbing yields nothing.
def self.linkedin_company_url?(url)
  scrubbed = scrub(url)
  scrubbed ? scrubbed.include?('http://linkedin.com/company/') : false
end
linkedin_personal_url?(url)
click to toggle source
# File lib/url_scrubber.rb, line 117
# True when the scrubbed form of +url+ contains one of the LinkedIn personal
# profile prefixes (/in/ or /pub/); false when scrubbing yields nothing.
def self.linkedin_personal_url?(url)
  scrubbed = scrub(url)
  return false unless scrubbed

  ['http://linkedin.com/in/', 'http://linkedin.com/pub/'].any? do |prefix|
    scrubbed.include?(prefix)
  end
end
maps_to_public_url(url)
click to toggle source
# File lib/url_scrubber.rb, line 163
# Maps a "business." management URL to its public counterpart:
#
#   http://business.facebook.com/...  ->  http://facebook.com/...
#   http://business.google.com/...    ->  http://plus.google.com/...
#
# Returns the rewritten, scrubbed URL, or nil when +url+ is nil, cannot be
# scrubbed or parsed, has no host, or is not one of the two business hosts.
def self.maps_to_public_url(url)
  return nil if url.nil?

  scrubbed = scrub(url)
  # scrub yields nil for non-http(s) input; calling sub on it used to raise.
  return nil if scrubbed.nil?

  # URI.escape was removed in Ruby 3.0; DEFAULT_PARSER.escape is the
  # equivalent call that works on both older and newer Rubies.
  parsed = URI.parse(URI::DEFAULT_PARSER.escape(url))
  return nil unless parsed.host

  host = Domainatrix.parse(parsed.host)
  return nil unless host.subdomain == "business"

  case host.domain
  when "facebook"
    scrubbed.sub("http://business.facebook.com", "http://facebook.com")
  when "google"
    scrubbed.sub("http://business.google.com", "http://plus.google.com")
  end
rescue URI::InvalidURIError
  # URI.parse raises rather than returning nil, so the original
  # "or return nil" guard could never fire.
  nil
end
scrub(url)
click to toggle source
# File lib/url_scrubber.rb, line 9
# Normalizes a user-supplied URL to canonical form: the first http(s) URL in
# the text is extracted, the scheme is forced to "http", trailing slashes,
# semicolons and commas are removed, the domain is lower-cased, known
# subdomains and HTML tags are stripped, service-specific rules are applied
# and the anchor is dropped.
#
# Blank input, app:// URLs and TikTok business-manager URLs are returned
# untouched. Returns nil when no http(s) URL is found. The caller's string is
# never modified.
def self.scrub(url)
  return url if url.blank?
  return url if %r{^app://}.match(url) # Do not scrub app-only URLs
  # Don't scrub TikTok business manager URLs; quick fix until a different
  # solution is implemented, e.g.
  # https://business.tiktok.com/manage/overview?org_id=6974497704617492482
  # (dots are escaped so only the literal host names match).
  return url if %r{^https?://(www\.)?business\.tiktok\.com/manage/}.match(url)

  m = url.match(/(htt?ps?:\/\/\S+)/i)
  return nil unless m

  # The capture is a fresh string, so the in-place edits below cannot touch
  # the argument and no explicit clone is needed.
  url = m[1]

  # Repair the "htp" typo BEFORE collapsing https -> http. In the opposite
  # order "htps://" was turned into "https://" and never normalized.
  url.sub!(/^htp/i, 'http')
  url.sub!(/^https/i, 'http')
  url.sub!(/\/+$/, '')
  url.sub!(/;+$/, '')
  url.sub!('#!/', '')
  url.sub!('%27', '\'')

  url = downcase_domain(url)
  remove_subdomain!(url)
  remove_html_tags!(url)

  # The special-case methods decide if and when to drop the query string.
  url = drop_anchor!(special_cases(url))

  url.sub!(/,+$/, "")    # remove one or more trailing commas
  url.gsub!(/\/+$/, '')  # remove any trailing slashes left in the result
  url
end
service_of(url)
click to toggle source
# File lib/url_scrubber.rb, line 37
# Identifies which supported service +url+ belongs to, returning a symbol
# such as :facebook or :twitter. The registered domain is checked first, then
# the full host (for Google+). Returns :other when nothing matches or when
# the parsed URL has no host, in which case a debug message is logged.
def self.service_of(url)
  url_parts = Domainatrix.parse(url)

  unless url_parts.host.present?
    Rails.logger.debug "No Domain Match"
    return :other
  end

  services_by_domain = {
    'facebook'   => :facebook,
    'fb'         => :facebook,
    'flickr'     => :flickr,
    'instagram'  => :instagram,
    'linkedin'   => :linkedin,
    'pinterest'  => :pinterest,
    'slideshare' => :slideshare,
    'tumblr'     => :tumblr,
    'twitter'    => :twitter,
    'vimeo'      => :vimeo,
    'vk'         => :vkontakte,
    'weibo'      => :weibo,
    'yelp'       => :yelp,
    'youtube'    => :youtube
  }
  service = services_by_domain[url_parts.domain]
  return service if service

  case url_parts.host
  when /\bplus\.google\.com$/ then :google
  else :other
  end
end
valid_url?(url)
click to toggle source
Requirements:
-
must have http/https scheme
-
no “@” anywhere in the passed-in URL string
-
must parse as a valid URI (the implementation below checks this with the standard library's URI.parse, not Addressable::URI)
# File lib/url_scrubber.rb, line 154
# Reports whether +url+ is acceptable: it must parse as a URI, use the http
# or https scheme, and contain no "@" anywhere in the string.
#
# Returns false (instead of raising) for nil and for unparsable input.
def self.valid_url?(url)
  return false if url.nil?

  # URI.escape was removed in Ruby 3.0; DEFAULT_PARSER.escape is the
  # equivalent call that works on both older and newer Rubies. URI.parse
  # raises on failure, so no nil check on its result is needed.
  parsed = URI.parse(URI::DEFAULT_PARSER.escape(url))
  %w(http https).include?(parsed.scheme) && !url.include?("@")
rescue URI::InvalidURIError
  false
end
Private Class Methods
check_for_facebook_redirection(uri_str, limit = 5)
click to toggle source
# File lib/url_scrubber.rb, line 438
# Requests +uri_str+ (forced to https://www.) and follows HTTP redirects, up
# to +limit+ hops. Intended for Facebook URLs only.
#
# Returns a two-element array [url, response]:
# * the final URL and its response when no further redirect occurs;
# * the current URL with a synthetic 404 when the redirect target matches a
#   known failure page, 401 when it is a login page, 400 on timeout and 404
#   on any other request error;
# * the current URL with a CustomError when the URL cannot be parsed.
#
# Raises RuntimeError ('Too many HTTP redirects') when +limit+ reaches 0.
def self.check_for_facebook_redirection(uri_str, limit = 5)
  # pages that require user logins
  login_patterns = [%r{^.*/login[^/]*$}]
  # pages that give 200 codes but actually indicate a not found
  failure_patterns = [%r{linkedin\.com/home\?report%2Efailure}i]

  raise 'Too many HTTP redirects' if limit == 0

  uri_str_new = uri_str.sub('http://', 'https://')
  uri_str_new = uri_str_new.sub('https://', 'https://www.') unless uri_str_new.include?("https://www.")

  begin
    # URI.escape was removed in Ruby 3.0; DEFAULT_PARSER.escape is equivalent.
    url = URI.parse(URI::DEFAULT_PARSER.escape(uri_str_new))
  rescue URI::InvalidURIError => e
    return [uri_str_new, CustomError.new(786, "Invalid URI #{uri_str_new} : #{e.message}")]
  end

  http = Net::HTTP.new(url.host, url.port)
  http.open_timeout = 7  # only wait up to 7 seconds for the connection to be established
  http.read_timeout = 10 # and up to 10 seconds for a response
  if url.port == 443
    http.use_ssl = true
    # NOTE(review): certificate verification is disabled here, as in the
    # original code; confirm this is intentional.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  else
    http.use_ssl = false
  end

  request = Net::HTTP::Get.new(url.request_uri, { 'User-Agent' => USER_AGENT })
  begin
    response = http.request(request)
  rescue Timeout::Error
    return [uri_str_new, Net::HTTPClientError.new('1.1', '400', 'Unreachable')]
  rescue StandardError
    # StandardError rather than Exception, so interrupts and system exits
    # are no longer swallowed and reported as a 404.
    return [uri_str_new, Net::HTTPClientError.new('1.1', '404', 'Not Found')]
  end

  return [uri_str_new, response] unless response.is_a?(Net::HTTPRedirection)

  location = response['location']
  # A redirect without a Location header cannot be followed.
  return [uri_str_new, response] if location.nil? || location.empty?

  if location[0, 4] == "http"
    if failure_patterns.any? { |pattern| location.match(pattern) }
      # got redirected to a page indicating failure, so act like it's a 404
      return [uri_str_new, Net::HTTPClientError.new('1.1', '404', 'Not Found')]
    end
    # The original tested an unassigned local (redirected_url) here, which
    # raised NameError; the redirect target is what has to be checked.
    if login_patterns.any? { |pattern| location.match(pattern) }
      # got redirected to a login page: report 401 against the previous url
      return [uri_str_new, Net::HTTPClientError.new('1.1', '401', 'Inaccessible')]
    end
    check_for_facebook_redirection(location, limit - 1)
  else
    # relative redirect: resolve it against the current host
    check_for_facebook_redirection("http://#{url.host}#{location}", limit - 1)
  end
end
downcase_domain(url)
click to toggle source
# File lib/url_scrubber.rb, line 183
# Lower-cases the "http://host" portion of +url+ and leaves the rest as is.
# Anything before the matched "http://host" is discarded. Returns +url+
# itself when no "http://host" portion is found.
def self.downcase_domain(url)
  host_part = %r{http://[^/]+}i.match(url)
  return url unless host_part

  "#{host_part[0].downcase}#{host_part.post_match}"
end
drop_anchor!(url)
click to toggle source
# File lib/url_scrubber.rb, line 242
# Removes everything from the first "#" onward, modifying +url+ in place.
# Returns the same string object.
def self.drop_anchor!(url)
  url.tap { |target| target.sub!(/#.*$/, '') }
end
drop_url_ampersand!(url)
click to toggle source
# File lib/url_scrubber.rb, line 230
# Removes everything from the first "&" onward, modifying +url+ in place.
# Returns the same string object.
def self.drop_url_ampersand!(url)
  url.tap { |target| target.sub!(/\&.*$/, '') }
end
drop_url_query!(url)
click to toggle source
# File lib/url_scrubber.rb, line 236
# Removes everything from the first "?" onward, modifying +url+ in place.
# Returns the same string object.
def self.drop_url_query!(url)
  url.tap { |target| target.sub!(/\?.*$/, '') }
end
remove_subdomain!(url)
click to toggle source
# File lib/url_scrubber.rb, line 213
# Strips well-known presentation subdomains (www, m, mobile, touch, each
# optionally followed by digits, plus mbasic.facebook.com) from the start of
# +url+ and forces the scheme to "http". The rules run in order, each at most
# once, and +url+ is modified in place. Returns the same string object.
def self.remove_subdomain!(url)
  rules = [
    [%r{^https?://www?w?\d*\.}i, 'http://'],
    [%r{^https?://m\d*\.}i, 'http://'],
    [%r{^https?://mobile\d*\.}i, 'http://'],
    [%r{^https?://touch\d*\.}i, 'http://'],
    [%r{^https?://mbasic\.facebook\.com}i, 'http://facebook.com']
  ]
  rules.each { |pattern, replacement| url.sub!(pattern, replacement) }
  url
end
sc_facebook(url)
click to toggle source
TODO This needs to be rewritten to be independent of the Facebook domain and public suffix used: e.g. facebook.com vs fb.com vs. fb.me
# File lib/url_scrubber.rb, line 294
# Facebook-specific normalization of +url+. Tries a series of URL shapes in a
# fixed order and reduces the URL to a base profile/page/group form; for
# profile.php?id= URLs it calls check_for_facebook_redirection, which
# performs an HTTP request. Returns the resulting URL string.
#
# Note that the branches below are NOT tried in regex-number order: regex3
# (the most general pattern) is deliberately checked after regex4-6.
# The locals uname, uid and http_response are assigned in several branches
# but are not read again within this method.
def self.sc_facebook(url)
  # Strip a "_rdr=..." parameter, whether it is followed by "&" or is a
  # trailing "&_rdr=...".
  url = url.gsub(/(_rdr=.+&)|(&_rdr=.+$)/,"")

  # regex1: optional groups/pages/pg prefix, a name, then "/" or "-" and a numeric id.
  regex1 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(((?<group>groups?)|pages?|pg)\/)*(?<uname>.*)[\/-](?<uid>[0-9]+))($|\/|\/(about|timeline|info|app_)?)/i
  # regex2: profile.php?id=<numeric id>
  regex2 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/profile.php\?id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
  # regex3: optional groups/pages/pg prefix followed by a name without "?" or "/".
  regex3 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/(((?<group>groups?)|pages?|pg)\/)*(?<uname>[^\?\/]*))($|\/$|\/(about|timeline|info|app_.*)?)/i
  # regex4: home.php?#!/<name>
  regex4 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/)(?<php>home.php\?([#!]+\/)*)(?<uname>.*)/i
  # regex5: a bare numeric id directly after the host.
  regex5 = /^(?<url>(https?:\/\/)((business|www)\.)?facebook\.com\/(?<uid>[0-9]+))($|\/|\/.*|&.*)/i
  # regex6: home/accounts?business_id=<numeric id>
  regex6 = /^(?<url>(https?:\/\/)((www|business)\.)?facebook\.com\/home\/accounts\?business_id=(?<uid>[0-9]+))($|\/|\/.*|&.*)/i

  # If the user gives us a path to a Post, "http://facebook.com/LoansByJanet/posts/1691075027771418"
  # then drop the post part, "/posts/1691075027771418" to get the base url, "http://facebook.com/LoansByJanet/"
  if mdata = /^(?<base_url>.+)\/posts\/(?<postid>[0-9]+).*$/.match(url)
    url = mdata[:base_url]
  end

  if url.match("/media/albums") || url.match("/media/set")
    # Album/set URLs: keep everything up to the first "&".
    url = url.match('\&') ? url.split('&',2)[0] : url
  elsif mdata = url.match(regex1)
    # "http://facebook.com/pages/Command-Canada/1434248516885065/timeline"
    url = mdata[:url]
    uname = mdata[:uname]
    uid = mdata[:uid]
  elsif mdata = url.match(regex2)
    # "https://www.facebook.com/profile.php?id=100009574328879"
    url, http_response = check_for_facebook_redirection(mdata[:url])
    uid = mdata[:uid]
  elsif mdata = url.match(regex4)
    # "http://facebook.com/home.php?#!/person.name"
    url = mdata[:url] + mdata[:uname]
    url = drop_url_query!(url)
  elsif mdata = url.match(regex5)
    # "https://www.facebook.com/100009574328879"
    url = "http://facebook.com/" + mdata[:uid]
    uid = mdata[:uid]
  elsif mdata = url.match(regex6)
    # "http://business.facebook.com/home/accounts?business_id=1145724702268347"
    url = mdata[:url]
    uid = mdata[:uid]
  elsif mdata = url.match(regex3)
    # "http://facebook.com/TonyMollHomeLoans/timeline"
    # "http://facebook.com/pg/TonyMollHomeLoans/timeline"
    # "https://www.facebook.com/groups/practicewithclaritygroup"
    # Only rebuild the URL when the captured name is not itself one of the
    # prefix keywords.
    if ["group", "groups", "page", "pages", "pg"].exclude?(mdata[:uname])
      url = (mdata[:group] ? "http://facebook.com/groups/" : "http://facebook.com/") + mdata[:uname]
      uname = mdata[:uname]
    end
    url = drop_url_query!(url)
  elsif url.include?("facebook.com/profile.php?id=")
    # puts "profile.php"
    # these were being truncated, they do redirect, but typically a 301 response is generated
    # so the url is returned unchanged. Better than truncation.
    url, http_response = check_for_facebook_redirection(url)
  else
    # puts "else"
    url = drop_url_query!(url)
  end

  # Due to the redirection check, "https" and "www." can be re-introduced
  url = url.sub(%r{^https?://www.}i, 'http://')
  url = url.sub(/\?_rdr.*/, '')
  url
end
sc_flickr(url)
click to toggle source
# File lib/url_scrubber.rb, line 400
# Flickr-specific normalization.
#
# * Group URLs with sub-pages are trimmed to "http://flickr.com/groups/<name>";
#   a group URL without sub-pages is returned unchanged.
# * Otherwise the first path segment after an optional "photos/" or "people/"
#   prefix is used to build "http://flickr.com/<segment>".
# * +url+ is returned unchanged when neither rule applies.
def self.sc_flickr(url)
  _before, marker, remainder = url.partition('flickr.com/groups/')
  unless marker.empty? || remainder.empty?
    group_name, slash, _rest = remainder.partition('/')
    # a slash means there is sub page stuff to trim off
    return slash.empty? ? url : "http://flickr.com/groups/#{group_name}"
  end

  owner = url.match(%r{flickr\.com/(photos/|people/)?([^/]+)})
  owner ? "http://flickr.com/#{owner[2]}" : url
end
sc_generic(url)
click to toggle source
# File lib/url_scrubber.rb, line 432
# Default normalization for services without a dedicated rule: hands +url+ to
# drop_url_query! and returns +url+.
def self.sc_generic(url)
  url.tap { |target| drop_url_query!(target) }
end
sc_google_plus(url)
click to toggle source
# File lib/url_scrubber.rb, line 384
# Google+ normalization. First removes, in place and once each, the path
# fragments "u/0/b/", "u/0/", "b/" (directly after "com/") and "/photos",
# "/of", "/albums". Then reduces the URL to
# "http://plus.google.com/<id>" or "http://plus.google.com/communities/<id>".
# Returns the (already modified) +url+ when it does not have that shape.
def self.sc_google_plus(url)
  [
    ['com/u/0/b/', 'com/'],
    ['com/u/0/', 'com/'],
    ['com/b/', 'com/'],
    ['/photos', ''],
    ['/of', ''],
    ['/albums', '']
  ].each { |fragment, replacement| url.sub!(fragment, replacement) }

  if url.include?('plus.google.com/communities/')
    base = "http://plus.google.com/communities/"
    found = url.match(%r{^http://plus\.google\.com/communities/([^/]+)})
  else
    base = "http://plus.google.com/"
    found = url.match(%r{^http://plus\.google\.com/([^/]+)})
  end

  found ? "#{base}#{found[1]}" : url
end
sc_linkedin(url)
click to toggle source
TODO This needs to be rewritten to be independent of the LinkedIn domain and public suffix used: e.g. linkedin.com vs lnkd.in vs linkedin.ca
# File lib/url_scrubber.rb, line 361
# LinkedIn normalization. Rewrites "/companies/" to "/company/" in place,
# then:
# * company, /in/, /pub/ and /groups/ URLs lose their query string;
# * "groups?gid=" URLs lose everything from the first "&";
# * "groups?home=&gid=" and "groups?homeNewMember=&gid=" URLs are rebuilt as
#   "http://linkedin.com/groups?gid=<id>".
# Any other URL is returned as is.
def self.sc_linkedin(url)
  url.sub!('linkedin.com/companies/', 'linkedin.com/company/')

  if url.match(%r{com/(?:company|in|pub)/}) || url.include?('linkedin.com/groups/')
    drop_url_query!(url)
  elsif url.include?('linkedin.com/groups?gid=')
    drop_url_ampersand!(url)
  else
    home_prefix = [
      'linkedin.com/groups?home=&gid=',
      'linkedin.com/groups?homeNewMember=&gid='
    ].find { |candidate| url.include?(candidate) }
    if home_prefix
      group_id = drop_url_ampersand!(url.partition(home_prefix)[2])
      url = "http://linkedin.com/groups?gid=" + group_id
    end
  end

  url
end
sc_pinterest(url)
click to toggle source
# File lib/url_scrubber.rb, line 420
# Pinterest normalization: hands +url+ to drop_url_query! and returns +url+.
def self.sc_pinterest(url)
  url.tap { |target| drop_url_query!(target) }
end
sc_twitter(url)
click to toggle source
# File lib/url_scrubber.rb, line 274
# Twitter normalization, reducing +url+ to the account's profile URL:
# * "twitter.com/@name" becomes "twitter.com/name" (in place);
# * a tweet URL ("/<name>/status/<id>" or the legacy "/<name>/statuses/<id>")
#   becomes "http://twitter.com/<name>";
# * a search for "@name" becomes "http://twitter.com/<name>";
# * finally the query string is dropped.
def self.sc_twitter(url)
  url.sub!('twitter.com/@', 'twitter.com/')

  # Accept both the current singular "/status/" path and the legacy
  # "/statuses/" path; previously only the legacy form was reduced.
  status_match = url.match(%r{(twitter\.com/[^/]+)/status(?:es)?/\d+})
  url = "http://#{status_match[1]}" if status_match

  search_match = url.match(%r{twitter\.com/search(?:/realtime)?(?:/|\?q=)(?:@|%40)(\S*)$})
  url = "http://twitter.com/#{search_match[1]}" if search_match

  url = drop_url_query!(url)
  url
end
sc_vimeo(url)
click to toggle source
# File lib/url_scrubber.rb, line 259
# Vimeo normalization: a group URL with sub-pages is trimmed to
# "http://vimeo.com/groups/<name>"; every other URL is returned unchanged.
def self.sc_vimeo(url)
  _before, marker, remainder = url.partition('vimeo.com/groups/')
  return url if marker.empty? || remainder.empty?

  group_name, slash, _rest = remainder.partition('/')
  # a slash means there is sub page stuff to trim off
  slash.empty? ? url : "http://vimeo.com/groups/#{group_name}"
end
sc_yelp(url)
click to toggle source
# File lib/url_scrubber.rb, line 426
# Yelp normalization: hands +url+ to drop_url_query! and returns +url+.
def self.sc_yelp(url)
  url.tap { |target| drop_url_query!(target) }
end
sc_youtube(url)
click to toggle source
# File lib/url_scrubber.rb, line 249
# YouTube normalization: rewrites "youtube.com/profile?user=<name>" to
# "youtube.com/<name>" in place, hands +url+ to drop_url_query!, and returns
# +url+.
#
# The "/user/" form of the URL is intentionally left alone, because YouTube
# lets users have their own URL that is not a separate channel with its own
# customUrl.
def self.sc_youtube(url)
  url.tap do |target|
    target.sub!('youtube.com/profile?user=', 'youtube.com/')
    drop_url_query!(target)
  end
end
special_cases(url)
click to toggle source
# File lib/url_scrubber.rb, line 193
# Dispatches +url+ to the normalizer of the service it belongs to and returns
# that normalizer's result. For services without a dedicated normalizer,
# sc_generic is applied and +url+ itself is returned.
def self.special_cases(url)
  normalizers = {
    youtube:   :sc_youtube,
    twitter:   :sc_twitter,
    facebook:  :sc_facebook,
    linkedin:  :sc_linkedin,
    google:    :sc_google_plus,
    flickr:    :sc_flickr,
    pinterest: :sc_pinterest,
    vimeo:     :sc_vimeo,
    yelp:      :sc_yelp
  }

  normalizer = normalizers[service_of(url)]
  return send(normalizer, url) if normalizer

  sc_generic(url)
  url
end