class Sitetap::Scraper

Public Class Methods

new(url) click to toggle source
# File lib/sitetap/scraper.rb, line 6
# Builds a scraper for +url+.
#
# Surrounding whitespace is stripped and a slash that sits at the end of
# a line is dropped; the result is stored in @url.
#
# url - String address of the site to scrape.
def initialize(url)
  trimmed = url.strip
  @url = trimmed.gsub(%r{/$}, '')
end
scrape!(url) click to toggle source
# File lib/sitetap/scraper.rb, line 10
# Convenience entry point: builds a Sitetap::Scraper for +url+, calls
# its #scrape! and hands the scraper back.
#
# url - String address of the site to scrape.
#
# Returns the Sitetap::Scraper instance that was created.
def self.scrape!(url)
  Sitetap::Scraper.new(url).tap(&:scrape!)
end

Public Instance Methods

dir() click to toggle source
# File lib/sitetap/scraper.rb, line 22
# Public accessor for the value of the private #root helper.
#
# Returns whatever #root returns.
def dir
  root
end
scrape!() click to toggle source
# File lib/sitetap/scraper.rb, line 16
# Runs the scrape: calls #verify_dir first, then #wget.
#
# Returns self so calls can be chained.
def scrape!
  tap do
    verify_dir
    wget
  end
end

Private Instance Methods

domain() click to toggle source
# File lib/sitetap/scraper.rb, line 28
# The URL with its leading scheme removed (memoized in @domain).
#
# Only an "http://" or "https://" prefix at the very start of @url is
# stripped, matched case-insensitively. Anything after the prefix is kept
# as-is, so a scheme-like substring later in the URL (for example inside
# a query string) is left untouched.
#
# Returns a String.
def domain
  @domain ||= @url.sub(%r{\Ahttps?://}i, '')
end
html_dir() click to toggle source
# File lib/sitetap/scraper.rb, line 36
# Path of the "html" entry directly beneath #root.
#
# Returns a String.
def html_dir
  [root, 'html'].join('/')
end
root() click to toggle source
# File lib/sitetap/scraper.rb, line 32
# Path formed from the current working directory and #domain, joined by
# a slash. Computed on first use and cached in @root, so later changes
# of the working directory do not affect it.
#
# Returns a String.
def root
  @root ||= [Dir.pwd, domain].join('/')
end
verify_dir() click to toggle source
# File lib/sitetap/scraper.rb, line 40
# Makes sure the directory named by #html_dir exists, creating it and
# any missing parents.
#
# FileUtils.mkdir_p does nothing when the directory is already present,
# so no separate existence check is needed. The previous check used
# Dir.exists?, which is no longer available on Ruby 3.2 and later.
def verify_dir
  FileUtils.mkdir_p(html_dir)
end
wget() click to toggle source
# File lib/sitetap/scraper.rb, line 58
# Runs wget against @url with #html_dir as its working directory.
#
# The command is passed to Kernel#system as an argument list rather than
# a single shell string, so the URL, domain and directory are never
# interpreted by a shell (no injection, and paths containing spaces
# work). The working directory is set with the +chdir+ option, so wget
# is not started at all if the directory cannot be entered.
#
# Returns what Kernel#system returns: true when wget exits with status
# 0, false for a non-zero exit, nil when the command could not be run.
def wget
  # Entries such as '-e robots=off' hold two words; split them so each
  # word becomes its own argument, as the shell would have done.
  argv = wget_options.flat_map(&:split) + ['--domains', domain, @url]
  system('wget', *argv, chdir: html_dir)
  # add '-o', "#{log_dir}/scrape.log" to argv to store logfile
end
wget_options() click to toggle source
# File lib/sitetap/scraper.rb, line 46
# Fixed command-line options handed to wget on every run.
#
# Returns a new Array of Strings on each call.
def wget_options
  flags = %w[
    --recursive
    --page-requisites
    --html-extension
    --convert-links
    --restrict-file-names=windows
    --span-hosts
  ]
  # Kept as one entry because it contains a space.
  flags << '-e robots=off'
end