class Sitetap::Scraper
Public Class Methods
new(url)
click to toggle source
# File lib/sitetap/scraper.rb, line 6
# Stores the target URL with surrounding whitespace and a single trailing
# slash removed.
#
# @param url [String] address of the site to scrape
def initialize(url)
  # \z anchors to the end of the whole string; `$` would also match right
  # before an embedded newline and strip a slash in the middle of the value.
  @url = url.strip.sub(%r{/\z}, '')
end
scrape!(url)
click to toggle source
# File lib/sitetap/scraper.rb, line 10
# Convenience entry point: builds a scraper for +url+, runs #scrape! on it,
# and returns the scraper instance.
def self.scrape!(url)
  Sitetap::Scraper.new(url).tap(&:scrape!)
end
Public Instance Methods
dir()
click to toggle source
# File lib/sitetap/scraper.rb, line 22
# Public reader for the scrape's root directory; simply returns #root.
def dir
  root
end
scrape!()
click to toggle source
# File lib/sitetap/scraper.rb, line 16
# Runs the scrape: makes sure the output directory is there, then invokes
# wget. Returns the scraper itself so calls can be chained.
def scrape!
  tap do
    verify_dir
    wget
  end
end
Private Instance Methods
domain()
click to toggle source
# File lib/sitetap/scraper.rb, line 28
# The URL with its leading http:// or https:// scheme removed (memoized in
# @domain). Whatever follows the scheme is returned untouched.
#
# @return [String]
def domain
  # Anchored with \A so only the scheme prefix is dropped; an unanchored gsub
  # would also delete "http://" occurring later, e.g. inside a query string.
  # The /i flag accepts upper-case schemes such as "HTTP://".
  @domain ||= @url.sub(%r{\Ahttps?://}i, '')
end
html_dir()
click to toggle source
# File lib/sitetap/scraper.rb, line 36
# Path of the "html" subdirectory underneath #root.
def html_dir
  root + '/html'
end
root()
click to toggle source
# File lib/sitetap/scraper.rb, line 32
# Directory for this scrape: the current working directory joined with
# #domain by a "/". Computed once and cached in @root.
def root
  @root ||= [Dir.pwd, domain].join('/')
end
verify_dir()
click to toggle source
# File lib/sitetap/scraper.rb, line 40
# Ensures the HTML output directory (#html_dir) exists, creating it together
# with any missing parent directories.
def verify_dir
  # Dir.exists? was deprecated and then removed in Ruby 3.2; Dir.exist? is the
  # supported spelling and behaves the same.
  FileUtils.mkdir_p(html_dir) unless Dir.exist?(html_dir)
end
wget()
click to toggle source
# File lib/sitetap/scraper.rb, line 58
# Mirrors @url into #html_dir by running wget with #wget_options, restricted
# to #domain via --domains.
#
# @return [Boolean, nil] the result of Kernel#system for the wget process:
#   true on a zero exit status, false on a non-zero one, nil if wget could
#   not be executed. (The previous shell string always ended with
#   `cd ../../`, which hid wget's own exit status.)
def wget
  # One argv token per element, e.g. '-e robots=off' -> '-e', 'robots=off'.
  options = wget_options.flat_map(&:split)

  # add '-o', "#{log_dir}/scrape.log" to the arguments to store logfile

  # The argument-list form of system bypasses the shell, so nothing in the
  # URL, domain or directory name is interpreted as shell syntax. :chdir runs
  # wget inside html_dir and replaces the old `cd ...; ...; cd ../../` wrapper.
  system('wget', *options, '--domains', domain, @url, chdir: html_dir)
end
wget_options()
click to toggle source
# File lib/sitetap/scraper.rb, line 46
# Command-line options handed to wget, one argv token per element so the list
# can be used both joined with spaces and as separate process arguments.
# ('-e robots=off' is therefore stored as '-e' followed by 'robots=off'.)
#
# @return [Array<String>]
def wget_options
  %w[
    --recursive
    --page-requisites
    --html-extension
    --convert-links
    --restrict-file-names=windows
    --span-hosts
    -e robots=off
  ]
end