class DaimonSkycrawlers::SitemapParser
Parser for sitemap.xml
Based on github.com/benbalter/sitemap-parser. See also www.sitemaps.org/.
```ruby
urls = ["https://example.com/sitemap.xml"]
sitemap_parser = DaimonSkycrawlers::SitemapParser.new(urls)
sitemap_urls = sitemap_parser.parse
```
Public Class Methods
new(urls)
click to toggle source
@param urls [Array] List of sitemap.xml URLs
# File lib/daimon_skycrawlers/sitemap_parser.rb, line 29
#
# Create a parser for the given sitemap locations.
#
# @param urls [Array<String>, String, nil] sitemap.xml locations. A single
#   value is wrapped in an Array and nil becomes an empty list, so the
#   stored @urls can always be iterated.
def initialize(urls)
  @urls = Array(urls)
end
Public Instance Methods
parse()
click to toggle source
Fetch and parse sitemap.xml
@return [Array] URLs in sitemap.xml
# File lib/daimon_skycrawlers/sitemap_parser.rb, line 38
#
# Fetch and parse every sitemap listed in @urls.
#
# Entries whose URI scheme starts with "http" are queued on a Typhoeus
# hydra (max_concurrency: 1, following redirects) and handed to
# #on_complete when they finish. Any other entry is treated as a local
# path: it is read and passed to #extract_urls if the file exists, and
# silently skipped otherwise.
#
# @return [Array] URLs collected from all sitemaps
def parse
  hydra = Typhoeus::Hydra.new(max_concurrency: 1)
  collected = []

  @urls.each do |location|
    scheme = URI(location).scheme
    if scheme && scheme.start_with?("http")
      request = Typhoeus::Request.new(location, followlocation: true)
      request.on_complete { |response| collected.concat(on_complete(response)) }
      hydra.queue(request)
    elsif File.exist?(location)
      collected.concat(extract_urls(File.read(location)))
    end
  end

  # Always run once, then keep running while requests remain queued.
  hydra.run
  hydra.run until hydra.queued_requests.empty?

  collected
end
Private Instance Methods
compressed?(response)
click to toggle source
# File lib/daimon_skycrawlers/sitemap_parser.rb, line 98
#
# Decide whether a response body needs to be inflated.
#
# The Content-Encoding header is checked first (case-insensitively). If it
# does not name a known compression scheme, the first two bytes of the body
# are compared with the gzip magic number.
#
# @param response [#headers, #body] HTTP response
# @return [Boolean] true when the body looks compressed
def compressed?(response)
  encoding = response.headers["Content-Encoding"]
  return true if encoding && ["deflate", "gzip", "x-gzip"].include?(encoding.downcase)

  gzip_magic = "\x1F\x8B".b
  response.body[0, 2].b == gzip_magic
end
extract_urls(body)
click to toggle source
# File lib/daimon_skycrawlers/sitemap_parser.rb, line 70
#
# Extract URLs from the XML text of a sitemap.
#
# A <sitemapindex> document is followed recursively: each child sitemap
# location is parsed by a new SitemapParser. A <urlset> document yields its
# <loc> values directly.
#
# @param body [String] raw sitemap XML
# @return [Array<String>] URLs found in the sitemap
# @raise [Error] when the document has neither <sitemapindex> nor <urlset>
def extract_urls(body)
  sitemap = Nokogiri::XML(body)
  if sitemap.at("sitemapindex")
    SitemapParser.new(loc_values(sitemap.search("sitemap"))).parse
  elsif sitemap.at("urlset")
    loc_values(sitemap.search("url"))
  else
    raise Error, "Malformed sitemap.xml no <sitemapindex> or <urlset>"
  end
end

# Collect the text of the <loc> child of each node.
#
# Surrounding whitespace is stripped because pretty-printed sitemaps often
# put the URL on its own line. Nodes with no <loc>, or with a blank one,
# are skipped.
def loc_values(nodes)
  nodes.each_with_object([]) do |node, values|
    loc = node.at("loc")
    next unless loc

    value = loc.content.strip
    values << value unless value.empty?
  end
end
inflate_response(response)
click to toggle source
# File lib/daimon_skycrawlers/sitemap_parser.rb, line 87
#
# Return the response body, decompressing it first when #compressed?
# reports that it is compressed.
#
# @param response [#headers, #body] HTTP response
# @return [String] the (possibly inflated) body
def inflate_response(response)
  return response.body unless compressed?(response)

  # Original note: we cannot inflate compressed data from an NTFS
  # filesystem (NT); decoding this way can avoid errors.
  # Adding 32 to the window bits asks zlib to auto-detect a zlib or gzip
  # header, so both formats are accepted.
  stream = Zlib::Inflate.new(Zlib::MAX_WBITS + 32)
  begin
    stream.inflate(response.body)
  ensure
    # Release the zlib stream now instead of waiting for GC.
    stream.close
  end
end
on_complete(response)
click to toggle source
# File lib/daimon_skycrawlers/sitemap_parser.rb, line 64
#
# Handle a finished HTTP request for a sitemap: inflate the body if
# needed and extract the URLs it lists.
#
# @param response [#success?, #effective_url, #code] HTTP response
# @return [Array] URLs extracted from the fetched sitemap
# @raise [Error] when the response is not successful
def on_complete(response)
  unless response.success?
    raise Error, "HTTP request to #{response.effective_url} failed. status: #{response.code}"
  end

  extract_urls(inflate_response(response))
end