class SitemapGenerator
Public Class Methods
new()
click to toggle source
# File lib/sitemap/commands/sitemap.rb, line 13
# Public: Set up a new generator. The only work done here is emitting a
# debug message through the generator's logger.
def initialize
  log.debug('Initialising generator')
end
Public Instance Methods
create_index(link, base_uri, filters, transformers, link_index = nil, depth = -1)
click to toggle source
Public: Create the index recursively.
link - The URI to build the index from recursively.
base_uri - The base URI (host) used to restrict which links are indexed.
filters - An array of Filters to be applied before indexing.
transformers - An array of Transformers to be applied before indexing.
link_index - Any existing index to start the build from.
depth - The depth of recursion: 1 for no recursion, -1 for infinite, > 1 for a specific depth.
Returns an index containing URIs as keys and an object representing the page.
# File lib/sitemap/commands/sitemap.rb, line 50
# Public: Create the index recursively, starting at one page and following
# the anchors found on each indexed page.
#
# link         - The URI of the page to index first.
# base_uri     - The base URI handed to the filters and used to make the
#                discovered hrefs absolute.
# filters      - An array of Filters, passed to Filters::Util.apply_filters.
# transformers - An array of Transformers, passed to
#                Transformers::Util.apply_transformers.
# link_index   - An existing index Hash to extend; a new Hash is created
#                when nil.
# depth        - How many levels of pages to index. 1 (or 0) indexes only
#                `link`, n > 1 indexes n levels, any negative value (the
#                default -1) recurses without limit.
#
# Returns the index Hash, keyed by URI String with { 'title' => ... } values,
# or nil when link or base_uri is nil.
def create_index(link, base_uri, filters, transformers, link_index = nil, depth = -1)
  if link_index.nil?
    log.debug('Creating new Index')
    link_index = {}
  end

  return if link.nil? || base_uri.nil?

  # The filters decide whether this page is eligible at all. They are given
  # the index as well, so presumably this is also where already-indexed pages
  # are rejected -- confirm against Filters::Util.
  return link_index if Filters::Util.apply_filters([link], link_index, base_uri, filters).empty?

  log.debug("Indexing document #{link} with base #{base_uri}, depth #{depth} and filters #{filters}")

  doc = get_document(link)
  return link_index if doc.nil?

  log.debug("New document found at #{link}, exploring links")

  # Set page title and add to index
  link_index[link.to_s] = { 'title' => doc.title }
  log.info("Adding link to index: #{link}")

  # Only positive depths count down; a negative depth means "unlimited" and is
  # passed on untouched. Previously the depth was decremented unconditionally
  # and compared with -1, so depth 1 still crawled one level of links.
  remaining = depth > 0 ? depth - 1 : depth
  return link_index if remaining.zero?

  # Find all links on the page
  links = doc.css('a').map { |anchor| anchor.attributes['href'].to_s }

  # NOTE(review): the return values of both helpers are discarded, so this
  # relies on them modifying `links` in place -- confirm against their
  # implementations.
  Transformers::Util.apply_transformers(links, transformers)
  Filters::Util.apply_filters(links, link_index, base_uri, filters)

  links.each do |href|
    target = Filters::Util.remove_fragment_from_uri(href)
    next unless target && !target.empty?

    create_index(Filters::Util.create_absolute_uri(target, base_uri),
                 base_uri, filters, transformers, link_index, remaining)
  end

  link_index
end
fetch(uri, domain = nil, limit = 10)
click to toggle source
Public: Fetch a document from the Internet.
# File lib/sitemap/commands/sitemap.rb, line 108
# Public: Fetch a document from the Internet, following redirects that stay
# on the same host.
#
# uri    - The URI to request; passed through Filters::Util.make_URI first.
# domain - The URI whose host every redirect target must match. Defaults to
#          `uri` when nil.
# limit  - How many requests may still be made; each followed redirect
#          consumes one (default 10).
#
# Returns the response body for a successful response. Returns nil for a
# redirect to a different host and for any other kind of response.
# Raises ArgumentError when limit is 0.
def fetch(uri, domain = nil, limit = 10)
  uri = Filters::Util.make_URI(uri)
  domain = Filters::Util.make_URI(domain.nil? ? uri : domain)

  raise ArgumentError, 'too many HTTP redirects' if limit == 0

  response = Net::HTTP.get_response(uri)
  case response
  when Net::HTTPSuccess
    response.body
  when Net::HTTPRedirection
    location = Filters::Util.create_absolute_uri(response['location'], uri)
    log.warn("Redirecting #{uri} to new location: #{location}")

    # Check new location belongs to current domain
    if location.host == domain.host
      # The result of the followed redirect is the result of this call; it
      # used to be discarded by a trailing nil. The original restriction
      # (`domain`) is carried along rather than the intermediate URI.
      fetch(location, domain, limit - 1)
    else
      log.warn("Redirecting from #{uri} to #{location} rejected due to cross-domain restrictions")
      nil
    end
  else
    nil
  end
end
generate(uri, output_file, filters, transformers, format = 'csv', depth = -1)
click to toggle source
Create the Sitemap
# File lib/sitemap/commands/sitemap.rb, line 159
# Public: Create the Sitemap by building the index from `uri` and writing it
# out in the requested format.
#
# uri          - The URI to start from; also used as the base URI for
#                create_index.
# output_file  - Path handed to write_index_to_file (only used for 'csv').
# filters      - An array of Filters, forwarded to create_index.
# transformers - An array of Transformers, forwarded to create_index.
# format       - 'csv' (default) or 'json'. Any other value prints a message
#                and exits with status 1.
# depth        - Depth of recursion, forwarded to create_index (default -1).
#
# Returns whatever the selected writer returns.
def generate(uri, output_file, filters, transformers, format = 'csv', depth = -1)
  log.debug("Generating sitemap from #{uri} to #{format} (output file? #{output_file}). Depth of recursion: #{depth}")

  # Reject an unsupported format before crawling, so a typo in the format
  # does not cost a complete crawl before the error is reported.
  unless %w[csv json].include?(format)
    puts "Please specify a valid output format, you gave #{format} Options are ['csv', 'json']"
    exit(1)
  end

  index = create_index(uri, uri, filters, transformers, nil, depth)

  if format == 'json'
    write_index_to_json(index)
  else
    write_index_to_file(index, output_file)
  end
end
get_document(uri)
click to toggle source
Public: Fetch a document
# File lib/sitemap/commands/sitemap.rb, line 142
# Public: Fetch a document and parse it as HTML.
#
# uri - The location of the document; converted with to_s before fetching.
#
# Returns the parsed document when the parser produced exactly a
# Nokogiri::HTML::Document, otherwise nil. Any StandardError raised while
# fetching or parsing is logged and also results in nil.
def get_document(uri)
  log.debug("Fetching document at #{uri}")

  begin
    parsed = Nokogiri::HTML(fetch(uri.to_s))
    parsed.instance_of?(Nokogiri::HTML::Document) ? parsed : nil
  rescue StandardError => e
    log.error("Error reading document #{uri}: #{e.message}")
    nil
  end
end
write_index_to_file(index, output_file)
click to toggle source
Public: Write a Sitemap index to a CSV file.
# File lib/sitemap/commands/sitemap.rb, line 27
# Public: Write a Sitemap index to a CSV file.
#
# index       - The index Hash; each key is written as the URI column and
#               each value's 'title' entry as the Title column.
# output_file - Path of the CSV file to write (opened in 'wb' mode).
#
# Returns the index.
def write_index_to_file(index, output_file)
  # Block form so the file is flushed and closed when writing finishes, even
  # if a row raises; the handle was previously left open.
  CSV.open(output_file, 'wb') do |csv|
    csv << ['URI', 'Title']

    # Flush Sitemap to CSV
    index.each { |uri, page| csv << [uri, page['title']] }
  end
end
write_index_to_json(index)
click to toggle source
Public: Output the index to JSON
# File lib/sitemap/commands/sitemap.rb, line 20
# Public: Output the index to JSON on standard output.
#
# index - The index to serialise with JSON.generate.
#
# Returns nil (the result of puts).
def write_index_to_json(index)
  serialised = JSON.generate(index)
  puts serialised
end