class ContentLinkParser
ContentLinkParser
Extracts links from HTML content and assigns them to a hash based on the location where each link was found. The hash contents can be configured in options, but the defaults are sensible. Links can also be returned regardless of where they were found, and can be filtered by scheme.
Public Class Methods
Parses the content and absolutizes the URLs based on url. Options can be set up to determine which links are extracted.
# File lib/content_link_parser.rb, line 8
#
# Parses +content+ as HTML and prepares the tag groups used to extract links.
#
# url     - stored in @url; all_links uses it as the base for absolutizing.
# content - HTML source handed to Nokogiri::HTML.
# options - Hash of settings (shallow-copied, the caller's hash is not
#           modified):
#           :ignore_default_tags - when truthy, start with no tag groups.
#           :additional_tags     - Hash of extra tag groups merged over the
#                                  defaults (a group with the same name
#                                  replaces the default group).
#
# Each tag group is an Array of [css_selector, extractor] pairs, where the
# extractor is an attribute name or a callable taking (array, tag).
# Any :tags entry passed in +options+ is overwritten.
def initialize(url, content, options = {})
  @options = {}.merge(options)
  @url = url
  @doc = Nokogiri::HTML(content)

  # Look the <base href> element up once; keep its href only when it is
  # not blank, otherwise fall back to an empty base URL.
  base_tag = @doc.at("base[href]")
  base_href = base_tag ? base_tag.attr("href").to_s : ''
  @base_url = base_href.strip.empty? ? '' : base_href

  # Collects every url(...) reference inside an inline stylesheet; the
  # back-reference \1 makes the closing quote match the opening one.
  css_urls = lambda do |array, tag|
    tag.content.scan(/url\((['"]?)(.*?)\1\)/) do |match|
      array << Addressable::URI.parse(match[1]).to_s
    end
  end

  @options[:tags] =
    if @options[:ignore_default_tags]
      {}
    else
      {
        links: [
          ["a[href]", "href"],
          ["frame[src]", "src"],
          ["meta[@http-equiv=\"refresh\"]", "content"],
          ["link[href]:not([rel])", "href"],
          ["area[href]", "href"]
        ],
        images: [["img[src]", "src"]],
        related: [["link[rel]", "href"]],
        scripts: [["script[src]", "src"]],
        styles: [
          ["link[rel='stylesheet'][href]", "href"],
          ["style[@type^='text/css']", css_urls]
        ]
      }
    end

  additional_tags = @options[:additional_tags]
  @options[:tags].merge!(additional_tags) unless additional_tags.nil?
end
Public Instance Methods
Returns an array of all absolutized links; specify :valid_schemes in options to limit the result to certain schemes. Also filters out links with repeating folders (i.e. if the crawler got into a link-loop situation).
# File lib/content_link_parser.rb, line 44
#
# Returns every link from link_data as an absolutized String, with
# duplicates removed.
#
# options - Hash; :valid_schemes lists the schemes to keep, as Symbols or
#           Strings. Defaults to [:http, :https]. The hash passed in is
#           not modified.
#
# Each link is joined against @base_url and then against @url through
# UriHelper.join_no_fragment; links for which that returns nil are dropped.
# Links that look like crawler loops are also removed:
#   * the same path segment twice in a row     (/a/a/)
#   * a pair of segments that shows up again   (a/b/.../a/b)
def all_links(options = {})
  # Compare schemes as strings so no Symbol has to be created from crawled
  # text and string schemes supplied by the caller match as well.
  valid_schemes = Array(options.fetch(:valid_schemes, [:http, :https])).map(&:to_s)

  links = link_data.values.flatten.uniq
  links = links.map { |link| UriHelper.join_no_fragment(@url, UriHelper.join_no_fragment(@base_url, link)) }
               .compact
               .map(&:to_s)
               .uniq # relative and absolute forms of one URL collapse here

  links = links.reject { |link| link =~ /\/([^\/]+?)\/\1\// }
  links = links.reject { |link| link =~ /([^\/]+?)\/([^\/]+?)\/.*?\1\/\2/ }

  # An empty link has no scheme part at all; to_s keeps that case from
  # raising and simply filters the link out.
  links.select { |link| valid_schemes.include?(link.split(':', 2).first.to_s) }
end
Returns a hash with arrays of links
# File lib/content_link_parser.rb, line 35
#
# Returns a Hash with one entry per configured tag group (the keys of
# @options[:tags]); each key is the group name as a Symbol and each value
# is whatever calling that group name on this object returns.
#
# The group name is dispatched with +send+ rather than evaluated as Ruby
# source, so a group whose name matches a local variable here (for
# example +data+ or +key+) or is not a plain identifier cannot be
# misinterpreted as code.
def link_data
  @options[:tags].keys.each_with_object({}) do |key, data|
    group = key.to_sym
    data[group] = send(group)
  end
end
Returns the type of links as a method rather than using the hash e.g. 'content_link_parser.images'
# File lib/content_link_parser.rb, line 58
#
# Exposes every configured tag group as a reader method, e.g.
# 'content_link_parser.images'.
#
# m - Symbol name of the method that was called.
#
# When +m+ is a key of @options[:tags], runs find_matches for each
# [selector, attribute] pair of that group and returns the collected links
# without duplicates. Any other name is passed to super together with its
# arguments and block, so unknown methods raise the usual NoMethodError.
#
# Raises ArgumentError when a tag-group reader is called with arguments.
def method_missing(m, *args, &block)
  # @options is nil-checked so a lookup on a not-yet-initialised object
  # falls through to super instead of failing inside this method.
  tags = @options && @options[:tags]
  return super unless tags && tags.key?(m)

  unless args.empty?
    raise ArgumentError, "wrong number of arguments (given #{args.size}, expected 0)"
  end

  links = []
  tags[m].each do |selector, attribute|
    find_matches(links, selector, attribute)
  end
  links.uniq
end

# Keeps respond_to? in agreement with method_missing: true for every key of
# @options[:tags], otherwise whatever super reports.
def respond_to_missing?(m, include_private = false)
  tags = @options && @options[:tags]
  (tags && tags.key?(m)) || super
end
Private Instance Methods
Processes the content to find links based on options
# File lib/content_link_parser.rb, line 72 def find_matches(array, selector, attribute) if attribute.kind_of? String or attribute.kind_of? Symbol @doc.css(selector).each do |tag| begin array << Addressable::URI.parse(tag[attribute]).to_s rescue end end elsif attribute.instance_of? Regexp @doc.css(selector).each do |tag| begin tag.content.scan(attribute) {|match| array << Addressable::URI.parse(match[0]).to_s} rescue end end elsif attribute.instance_of? Proc @doc.css(selector).each do |tag| begin attribute.call(array, tag) rescue end end end end