class Scrapers::RubyTapas::Scraper

Scraper provides the methods to download, extract and build a collection of DPD cart subscription episodes from the RubyTapas RSS feed.

Attributes

debug[RW]
destination[RW]
dpdcart[R]
dry_run[RW]
episode_number[RW]
episodes[R]
netrc_reader[RW]
pw[RW]
subscription[RW]
user[RW]

Public Class Methods

new(episode_number, options) click to toggle source

episode_number is the RubyTapas episode number (note! not the post id!) of the episode to download. If the episode number is the symbol :all, then all episodes will be retrieved. Note that if any of the episodes have been previously retrieved to the destination, i.e., the associated directory already exists, that episode download will be skipped.

options contains the options from the CLI. The “subscription” option is required (initialization fails without it); the remaining options include:

  • “user”: the username of the RubyTapas account

  • “pw”: the password of the RubyTapas account

  • “destination”: the root destination of the episode downloads

# File lib/scrapers/rubytapas/scraper.rb, line 29
# Sets up the scraper from the CLI options and loads the episode list.
#
# episode_number - the episode to work on, or :all for every episode.
# options        - string-keyed options. "subscription" is looked up with
#                  fetch and has no default, so a missing value fails here;
#                  "destination" falls back to the current working directory;
#                  "user", "pw", "dry_run" and "debug" are read as-is.
def initialize(episode_number, options)
  # Deliberately no default: a missing subscription must fail right away.
  self.subscription = options.fetch("subscription")
  self.episode_number = episode_number
  self.user = options["user"]
  self.pw = options["pw"]
  self.destination = options.fetch("destination", Dir.pwd)
  self.dry_run = options["dry_run"]
  self.debug = options["debug"]

  cart_options = { dry_run: dry_run, debug: debug }
  @dpdcart = Scrapers::RubyTapas::DpdCart.new(user, pw, subscription, cart_options)
  @episodes ||= fetch_episodes
end

Public Instance Methods

find_by_episode(episode_number) click to toggle source

Retrieves the episode whose number equals episode_number, or nil when no episode matches.

# File lib/scrapers/rubytapas/scraper.rb, line 75
# Looks up a single episode by its episode number.
#
# Returns the first entry of #episodes whose #number equals episode_number,
# or nil when none matches.
def find_by_episode(episode_number)
  episodes.find do |candidate|
    candidate.number == episode_number
  end
end
list!() click to toggle source

Print a list of episodes

# File lib/scrapers/rubytapas/scraper.rb, line 66
# Writes one formatted line per episode to the pager supplied by with_pager.
def list!
  with_pager do |out|
    episodes.each { |entry| out.puts(format_episode(entry)) }
  end
end
scrape!() click to toggle source

Perform the scraping operation

# File lib/scrapers/rubytapas/scraper.rb, line 43
# Downloads the requested episode, or every episode in "all" mode.
#
# Single-episode mode: raises a RuntimeError when no episode matches
# episode_number, otherwise downloads that one episode.
#
# "All" mode: walks #episodes in order, pausing after each download unless
# dry_run is set. An Errno::EEXIST raised while handling an episode is
# reported with a warning and that episode is skipped.
def scrape!
  unless all_episodes?
    episode = find_by_episode(episode_number)
    raise "Unknown episode for #{episode_number}" if episode.nil?
    return download(episode)
  end

  episodes.each do |episode|
    begin
      download(episode)
      friendly_pause unless dry_run
    rescue Errno::EEXIST
      # Treated as "already fetched": warn and move on to the next episode.
      warn "Episode previously downloaded. Skipping."
    end
  end
end

Private Instance Methods

all_episodes?() click to toggle source
# File lib/scrapers/rubytapas/scraper.rb, line 81
# True when episode_number, converted to a string and lower-cased, reads
# "all" (so :all, "all" and "ALL" all qualify).
def all_episodes?
  requested = episode_number.to_s.downcase
  requested == "all"
end
download(episode) click to toggle source
# File lib/scrapers/rubytapas/scraper.rb, line 93
# Downloads every file attached to an episode.
#
# The target directory is created from the episode's slug first; each entry
# of episode.file_list is then fetched by its href into that directory.
def download(episode)
  target = make_download_directory(episode.slug)
  episode.file_list.each { |attachment| download_file(target, attachment.href) }
end
download_file(dir, url) click to toggle source
# File lib/scrapers/rubytapas/scraper.rb, line 100
# Fetches one URL through dpdcart and stores it in dir.
#
# dpdcart.download! supplies the file name and body; the body is written in
# binary mode under that name unless dry_run is set. Progress messages go to
# stderr when debug is set (the "saved" message is printed regardless of
# dry_run).
def download_file(dir, url)
  warn "fetching #{url}" if debug
  name, body = dpdcart.download!(url)
  unless dry_run
    target = File.join(dir,name)
    File.binwrite(target, body)
  end
  warn "saved #{name} to #{dir}" if debug
end
fetch_episodes() click to toggle source

Builds a collection of all the episodes listed in the feed

# File lib/scrapers/rubytapas/scraper.rb, line 86
# Parses the feed returned by dpdcart.feed! as XML and wraps every <item>
# element in an Episode. Returns the resulting list.
def fetch_episodes
  document = Nokogiri::XML.parse(dpdcart.feed!)
  items = document.xpath("//item")
  items.map { |item| Episode.new(item) }
end
format_episode(episode) click to toggle source
# File lib/scrapers/rubytapas/scraper.rb, line 132
# Renders one tab-separated listing line for an episode: number, title and
# publication date (formatted like 2014-Jan-05), each left-aligned and padded
# to widths 5, 40 and 15.
def format_episode(episode)
  number = episode.number
  title = episode.title
  published = episode.pub_date.strftime("%Y-%b-%d")
  format("%-5s\t%-40s\t%-15s", number, title, published)
end
friendly_pause(delay=5) click to toggle source
# File lib/scrapers/rubytapas/scraper.rb, line 118
# Waits delay seconds between downloads, printing a dot to stdout after each
# one-second sleep so the wait is visible.
def friendly_pause(delay=5)
  puts
  print "Sleeping #{delay} seconds"
  delay.times do
    sleep 1
    print "."
  end
  puts "\n"
end
make_download_directory(slug) click to toggle source
# File lib/scrapers/rubytapas/scraper.rb, line 108
# Creates the directory for an episode under the resolved destination.
#
# Returns the path of the new directory, or the placeholder string
# "no dir for dry run" when dry_run is set (nothing is created then).
# FileUtils.mkdir is not given any "exists ok" handling, so an already
# existing directory raises Errno::EEXIST.
def make_download_directory(slug)
  dir = File.join(File.realpath(destination), slug)
  warn "Downloading to #{dir}" if debug
  return "no dir for dry run" if dry_run

  created = FileUtils.mkdir(dir)
  created.first
end
with_pager() { |pager| ... } click to toggle source
# File lib/scrapers/rubytapas/scraper.rb, line 125
# Runs the block with a writable pipe connected to the `more` pager.
#
# The pipe is always closed when the block finishes, including when it
# raises, so the pager process is not left waiting on an open pipe.
#
# Yields the writable IO of the pager process.
# Raises RuntimeError when called without a block.
# Returns nil.
def with_pager(&block)
  raise "Must be called with block" unless block_given?
  # IO.popen replaces Kernel#open("|more"), which is deprecated since Ruby 3.3.
  # Its block form closes the pipe on every exit path.
  IO.popen("more", "w", &block)
  nil
end