class Scrapers::ManningBooks::Scraper
Attributes
delay_time[RW]
destination[RW]
dry_run[RW]
pw[RW]
user[RW]
Public Class Methods
new(options={})
click to toggle source
# File lib/scrapers/manning_books.rb, line 14
# Sets up the scraper from a string-keyed options hash.
#
# Recognised keys: "user", "pw", "delay", "destination" and "dry_run".
# Credentials missing from +options+ are taken from the netrc entry named
# by NETRC_MANNING_ENTRY; the remaining keys default to DELAY_TIME, the
# current directory (".") and +false+ respectively.
def initialize(options={})
  netrc = ::Scrapers::NetrcReader.new(NETRC_MANNING_ENTRY)

  # Explicit options win; the netrc values are only fallbacks.
  @user        = options.fetch("user", netrc.user)
  @pw          = options.fetch("pw", netrc.pw)

  @delay_time  = options.fetch("delay", DELAY_TIME)
  @destination = options.fetch("destination", ".")
  @dry_run     = options.fetch("dry_run", false)
end
Public Instance Methods
build_book_list(page)
click to toggle source
# File lib/scrapers/manning_books.rb, line 53
# Extracts the list of purchased books from a parsed dashboard page.
#
# page - a document responding to +search+ with a CSS selector; every
#        ".book" element it yields is turned into one entry.
#
# Returns an Array of Hashes of the form
#   { title: String, downloads: { format_symbol => href } }
# where the format symbol is the last word of the lower-cased link label
# (only links whose label contains "download" are kept).
#
# A book without a ".book_downloads" container yields an empty downloads
# hash, and links without any child node are skipped, instead of raising
# NoMethodError on nil.
def build_book_list(page)
  page.search('.book').map do |book|
    title = book.at('[data-type=title]').children.first.text

    # Not every book entry is guaranteed to carry a downloads section.
    container = book.at('.book_downloads')
    links = container ? container.search('a') : []

    downloads = links.map do |link|
      label_node = link.children.first
      next unless label_node # empty anchor: nothing to classify

      label = label_node.text.downcase
      next unless label.include?("download")

      [label.split(" ").last.to_sym, link.attr(:href)]
    end.compact.to_h

    { title: title, downloads: downloads }
  end
end
download_books(agent, books)
click to toggle source
# File lib/scrapers/manning_books.rb, line 67
# Downloads the pdf, epub and kindle editions of every book.
#
# agent - object whose +get(href)+ fetches a download and returns an
#         object responding to +save!+ and +filename+.
# books - Array of Hashes with :title and :downloads (format => href),
#         as produced by build_book_list.
#
# Returns a Hash mapping each title to a Hash of saved filename => href.
#
# In dry-run mode nothing is fetched, so no filename is known: entries are
# keyed by the format name instead ("pdf", "epub", "kindle"), the progress
# line reports the download as skipped, and the per-book delay is omitted
# because no request was made. Previously the dry-run path reported the
# filename of whatever page the agent was currently on, collapsing all
# formats of a book into a single entry.
def download_books(agent, books)
  books.map do |book|
    puts "Retrieving #{book[:title]}"

    downloads = book[:downloads].map do |type, href|
      next unless %i[pdf epub kindle].include?(type)

      print " downloading #{type} ..."
      if dry_run
        puts "skipped (dry run)"
        [type.to_s, href]
      else
        file = agent.get(href)
        file.save!
        puts "saved #{file.filename}"
        [file.filename, href]
      end
    end.compact.to_h

    # Only pause between books when requests were actually issued.
    wait_a_bit delay_time unless dry_run
    [book[:title], downloads]
  end.to_h
end
login(agent) { |agent| ... }
click to toggle source
# File lib/scrapers/manning_books.rb, line 38
# Makes sure the agent is on the dashboard, signing in if necessary, then
# yields the agent to the caller's block.
#
# agent - object responding to +get+ and +current_page+; the current page
#         must respond to +uri+ and, when a login is needed, to +form+.
#
# Raises RuntimeError when no block is given, or when the agent is still
# not on DASHBOARD_URL after the credentials were submitted.
#
# The "already on the dashboard?" check compares the page URI as a string,
# the same way the post-login check does. Comparing the URI object itself
# never matched, so the login form was driven even for a live session.
def login(agent, &block)
  raise "Must provide a block to execute after logged in to site" unless block_given?

  agent.get DASHBOARD_URL
  unless agent.current_page.uri.to_s == DASHBOARD_URL
    # We were sent somewhere else: fill in and submit the login form.
    form = agent.current_page.form
    form.field_with(:type => 'email').value = user
    form.field_with(:type => 'password').value = pw
    form.submit
    sleep 2
    raise "could not log in" unless agent.current_page.uri.to_s == DASHBOARD_URL
  end

  yield agent
end
scrape()
click to toggle source
# File lib/scrapers/manning_books.rb, line 23
# Runs the whole scrape from inside the destination directory: starts a
# Mechanize session, logs in, builds the book list from the page the
# session is on, and downloads the books.
#
# Returns whatever download_books returned, or nil if the login block was
# never executed.
def scrape
  @results = nil

  Dir.chdir(destination) do
    Mechanize.start do |agent|
      login(agent) do |session|
        catalogue = build_book_list(session.current_page)
        @results = download_books(session, catalogue)
      end
    end
  end

  @results
end
wait_a_bit(delay)
click to toggle source
# File lib/scrapers/manning_books.rb, line 83
# Pauses for +delay+ seconds while animating a one-character spinner on the
# current terminal line.
#
# delay - number of seconds to wait (Integer or Float).
#
# A delay of zero or less returns immediately without printing anything;
# the counter used to be checked only after the first one-second sleep, so
# such delays still blocked for a second. A fractional delay sleeps only
# the remaining fraction on its last tick instead of a whole second.
def wait_a_bit(delay)
  remaining = delay
  return unless remaining > 0

  puts "delaying for #{delay} second(s)"
  %w[- * | +].cycle do |frame|
    print "\r#{frame}"
    tick = [1, remaining].min # never sleep past the requested total
    sleep tick
    remaining -= tick
    break unless remaining > 0
  end
  print "\r"
end