class WaybackArchiver::URLCollector

Retrieve URLs from different sources.

Public Class Methods

crawl(url, hosts: [], limit: WaybackArchiver.max_limit) { |page_url| ... } click to toggle source

Retrieve URLs by crawling. @return [Array<String>] of URLs found during crawl. @param [String] url domain to crawl URLs from. @param [Array<String, Regexp>] hosts to crawl. @param [Integer] limit maximum number of pages to visit (-1 for no limit). @example Crawl URLs defined on example.com

URLCollector.crawl('http://example.com')

@example Crawl URLs defined on example.com and limit the number of visited pages to 100

URLCollector.crawl('http://example.com', limit: 100)

@example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages

URLCollector.crawl('http://example.com', limit: -1)

@example Crawl multiple hosts

URLCollector.crawl(
  'http://example.com',
  hosts: [
    'example.com',
    /host[\d]+\.example\.com/
  ]
)
# File lib/wayback_archiver/url_collector.rb, line 37
# Retrieve URLs by crawling a site with Spidr.
# @return [Array<String>] all page URLs visited during the crawl.
# @param [String] url domain to crawl URLs from.
# @param [Array<String, Regexp>] hosts additional hosts to crawl.
# @param [Integer] limit maximum number of pages to visit (-1 means no limit).
# @yield [page_url] each found URL, as a String, if a block is given.
def self.crawl(url, hosts: [], limit: WaybackArchiver.max_limit)
  start_url = Request.build_uri(url).to_s
  spidr_options = {
    robots: WaybackArchiver.respect_robots_txt,
    hosts: hosts,
    user_agent: WaybackArchiver.user_agent
  }
  # A limit of -1 means "unlimited": simply omit the option in that case.
  spidr_options[:limit] = limit if limit != -1

  found_urls = []
  Spidr.site(start_url, spidr_options) do |spider|
    spider.every_page do |page|
      current_url = page.url.to_s
      found_urls << current_url
      WaybackArchiver.logger.debug "Found: #{current_url}"
      yield(current_url) if block_given?
    end
  end
  found_urls
end
sitemap(url) click to toggle source

Retrieve URLs from Sitemap. @return [Array<String>] of URLs defined in Sitemap. @param [String] url domain to retrieve Sitemap from. @example Get URLs defined in Sitemap for google.com

URLCollector.sitemap('https://google.com/sitemap.xml')
# File lib/wayback_archiver/url_collector.rb, line 15
# Retrieve URLs listed in a Sitemap.
# @return [Array<String>] of URLs defined in the Sitemap.
# @param [String] url location of the Sitemap to fetch.
def self.sitemap(url)
  sitemap_uri = Request.build_uri(url)
  Sitemapper.urls(url: sitemap_uri)
end