class Zeitungen::Downloader

Public Class Methods

new(url, zeitungen, client, passwords, verbose: false) click to toggle source
# File lib/zeitungen/downloader.rb, line 3
def initialize(url, zeitungen, client, passwords, verbose: false)
  @url       = url
  @zeitungen = zeitungen
  @client    = client
  @passwords = passwords
  @verbose   = verbose
end

Public Instance Methods

run(date=Date.today, options={}) click to toggle source
# File lib/zeitungen/downloader.rb, line 11
def run(date=Date.today, options={})
  options = {thread_count: 5}.merge(options)
  @date_string = date.strftime("%Y-%m-%d")
  @client.date_string = @date_string

  page = IndexPage.new(@url, @passwords)
  zeitungen_links = page.links(date)
  puts "zeitungen_links.size: #{zeitungen_links.size}" if @verbose
  # puts zeitungen_links.inspect
  queue = enqueue(zeitungen_links)

  if queue.size==0
    puts "No zeitungen to download"
  else
    puts "Start downloading #{queue.size} zeitungen"
  end
  
  download(queue, options[:thread_count])
end

Private Instance Methods

download(queue, thread_count) click to toggle source
# File lib/zeitungen/downloader.rb, line 59
def download(queue, thread_count)
  threads = thread_count.times.map do
    Thread.new do    
      while !queue.empty? && z = queue.pop
        begin
          url = z.uri
          file = Tempfile.new('zeitungen')


          res = HTTP.get(url)
          i = 0
          puts "Response status: #{res.status.to_s}" if @verbose
          while res.status.to_s=="302 Found" and i<5 # max 5 redirect
            url = res.headers.get("Location")
            url = url.first if url.is_a? Array
            puts "Redirect URL: #{url}" if @verbose
            res = HTTP.get(url)
            puts "Response status: #{res.status.to_s}" if @verbose
            i += 1
          end
          puts "downloading #{z.final_name} (#{url})..."
          file.write(res.to_s)


          # res = HTTP.get()
          # Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme=='https') do |http|
          #   request = Net::HTTP::Get.new(uri)
          #   puts "downloading #{z.final_name} (#{z.uri})..."
          #   response = http.request request
          #   file.write(response.body)
          # end

          filename = filename_w_date(z.final_name)
          z.upload ? @client.mv_file_in_public_dest(filename, file) : @client.mv_file_in_private_dest(filename, file)
          file.close
          file.unlink

        rescue Exception => e
          puts "Error downloading a newpaper!"
          puts e.inspect
        end
      end
    end
  end
  threads.each(&:join)
  puts "Main thread finish here!"
end
enqueue(zeitungen_links) click to toggle source
# File lib/zeitungen/downloader.rb, line 37
def enqueue(zeitungen_links)
  queue = Queue.new
  @zeitungen.each do |z|
    if link = zeitungen_links.find{|l| z.regexp.match l.text } # se c'รจ un link per il zeitungen corrente
      uri = link.uri
      puts "uri: #{uri}" if @verbose
      if uri.host=="t.umblr.com"
        h = Hash[uri.query.split("&").map{|e| e.split("=")}]
        u = URI(URI.unescape(h["z"])+"\?directDownload\=true")
        puts "u: #{u}" if @verbose
        z.uri = u
      else
        z.uri = uri+"\?directDownload\=true"   # zeitungen.uri+"\?directDownload\=true"
      end
      filename = filename_w_date(z.final_name) # "Corriere della Sera - 2015-12-23.pdf"

      queue << z if !(@client.exist_in_public_dest?(filename) || @client.exist_in_private_dest?(filename))
    end
  end
  queue
end
filename_w_date(filename, date_string=nil) click to toggle source
# File lib/zeitungen/downloader.rb, line 33
def filename_w_date(filename, date_string=nil)
  "#{filename} - #{date_string || @date_string}.pdf"
end