class SitemapParser
Public Class Methods
new(url, opts = {})
click to toggle source
# File lib/sitemap-parser.rb, line 9
# Stores the sitemap location and the effective option set.
#
# The caller's +opts+ are merged over the defaults
# (followlocation: true, recurse: false, url_regex: nil), so any key
# present in +opts+ overrides the default and extra keys are kept.
#
# @param url [String] location of the sitemap, stored in @url
# @param opts [Hash] option overrides, merged into @options
def initialize(url, opts = {})
  defaults = { followlocation: true, recurse: false, url_regex: nil }
  @url = url
  @options = defaults.merge(opts)
end
Public Instance Methods
raw_sitemap()
click to toggle source
# File lib/sitemap-parser.rb, line 14
# Returns the raw (unparsed) sitemap body and caches it in @raw_sitemap.
#
# * When @url starts with "http" (case-insensitive) the sitemap is requested
#   through Typhoeus, using @options without the parser-only keys :recurse
#   and :url_regex; the response is passed to inflate_body_if_needed.
# * Otherwise, when @url names an existing file whose path ends in
#   "/sitemap.xml" or "\sitemap.xml" (case-insensitive), the file is read.
# * In every other case the result is nil (and nothing is cached).
#
# @return [String, nil] the sitemap body, or nil when @url matches neither case
# @raise [RuntimeError] when the HTTP response reports failure
def raw_sitemap
  @raw_sitemap ||=
    if /\Ahttp/i.match?(@url)
      # :recurse and :url_regex configure this parser, not the HTTP request.
      request_options = @options.reject { |key, _value| %i[recurse url_regex].include?(key) }

      # Work with the response returned by #run. Returning from inside an
      # on_complete callback is a non-local return that leaves this method
      # before the ||= assignment happens, so the download was never cached
      # and every call hit the network again.
      response = Typhoeus::Request.new(@url, request_options).run
      raise "HTTP request to #{@url} failed" unless response.success?

      inflate_body_if_needed(response)
    elsif File.exist?(@url) && @url.match?(%r{[\\/]sitemap\.xml\Z}i)
      File.read(@url)
    end
end
sitemap()
click to toggle source
# File lib/sitemap-parser.rb, line 31
# Returns the result of Nokogiri::XML(raw_sitemap), built on first use and
# cached in @sitemap for later calls.
def sitemap
  return @sitemap if @sitemap

  @sitemap = Nokogiri::XML(raw_sitemap)
end
to_a()
click to toggle source
# File lib/sitemap-parser.rb, line 53
# Collects the +content+ of the 'loc' node of every entry returned by #urls.
#
# @return [Array] one +content+ value per entry, in the order #urls gives them
# @raise [RuntimeError] 'Malformed sitemap, url without loc' whenever a
#   NoMethodError escapes while the entries are gathered or read (presumably
#   an entry whose at('loc') lookup came back nil)
def to_a
  locations = urls.map do |entry|
    loc_node = entry.at('loc')
    loc_node.content
  end
  locations
rescue NoMethodError
  raise 'Malformed sitemap, url without loc'
end
urls()
click to toggle source
# File lib/sitemap-parser.rb, line 35
# Returns the 'url' nodes described by the sitemap.
#
# * Document with a 'urlset' node: its 'url' nodes, passed through
#   filter_sitemap_urls.
# * Document with a 'sitemapindex' node: an empty array unless
#   @options[:recurse] is set. With recursion on, the index's 'sitemap' nodes
#   are passed through filter_sitemap_urls, a new parser is created for each
#   remaining 'loc' with only `recurse: false` as options, and the results of
#   their #urls calls are flattened into one array.
#
# @raise [RuntimeError] 'Malformed sitemap, no urlset' when the document has
#   neither node
def urls
  document = sitemap
  urlset = document.at('urlset')
  return filter_sitemap_urls(urlset.search('url')) if urlset

  index = document.at('sitemapindex')
  raise 'Malformed sitemap, no urlset' unless index
  return [] unless @options[:recurse]

  child_entries = filter_sitemap_urls(index.search('sitemap'))
  nested = child_entries.map do |entry|
    child_location = entry.at('loc').content
    # Children are parsed with recursion off, so this descends one level only.
    self.class.new(child_location, recurse: false).urls
  end
  nested.flatten
end
Private Instance Methods
filter_sitemap_urls(urls)
click to toggle source
# File lib/sitemap-parser.rb, line 61
# Keeps only the entries whose 'loc' content matches @options[:url_regex].
#
# When no :url_regex is configured the collection is returned untouched
# (same object). Otherwise each entry's 'loc' content is stripped of
# surrounding whitespace before it is tested against the pattern.
#
# @param urls [Enumerable] entries responding to at('loc')
# @return [Enumerable, Array] +urls+ itself, or the selected entries
def filter_sitemap_urls(urls)
  pattern = @options[:url_regex]
  return urls if pattern.nil?

  # match? is enough here: only the yes/no answer is used, so there is no
  # need to build MatchData or touch the regexp globals as =~ does.
  urls.select { |url| url.at('loc').content.strip.match?(pattern) }
end
inflate_body_if_needed(response)
click to toggle source
# File lib/sitemap-parser.rb, line 67
# Returns the response body, gunzipping it when the response is gzip data.
#
# * No headers at all: the body is returned as is.
# * Content-Type containing application/gzip or application/x-gzip: the body
#   is always gunzipped.
# * Content-Type containing application/octet-stream: this type is generic
#   binary, so the body is gunzipped only when it starts with the gzip magic
#   bytes; otherwise it is returned unchanged instead of failing in Zlib.
# * Any other (or missing) Content-Type: the body is returned as is.
#
# @param response [#headers, #body] the HTTP response
# @return [String] the (possibly decompressed) body
# @raise [Zlib::GzipFile::Error] when a gzip Content-Type carries a body that
#   is not valid gzip data
def inflate_body_if_needed(response)
  headers = response.headers
  return response.body unless headers

  case headers['Content-Type']
  when %r{application/gzip}, %r{application/x-gzip}
    Zlib.gunzip(response.body)
  when %r{application/octet-stream}
    body = response.body
    gzip_magic?(body) ? Zlib.gunzip(body) : body
  else
    response.body
  end
end

# True when +body+ begins with the two-byte gzip signature (0x1f 0x8b).
# getbyte is used so the check does not depend on the string's encoding.
def gzip_magic?(body)
  return false unless body.respond_to?(:getbyte)

  body.getbyte(0) == 0x1f && body.getbyte(1) == 0x8b
end