class ScrapedPageArchive

Constants

VERSION

Attributes

storage[R]

Public Class Methods

new(storage) click to toggle source
# File lib/scraped_page_archive.rb, line 23
# Creates an archive that keeps its pages in the supplied storage object.
#
# @param storage the backend object; it is held in @storage and read back
#   through the `storage` attribute
def initialize(storage)
  @storage = storage
end
record(*args, &block) click to toggle source
# File lib/scraped_page_archive.rb, line 17
# Convenience entry point: builds an archive on top of a fresh GitStorage
# and forwards every argument, plus the block, to the instance-level #record.
#
# @return whatever the instance-level #record returns
def self.record(*args, &block)
  archive = new(GitStorage.new)
  archive.record(*args, &block)
end

Public Instance Methods

filename_from_url(url) click to toggle source
# File lib/scraped_page_archive.rb, line 53
# Maps a URL to the relative path under which its archived copy is kept:
# the URL's host as the directory, and the SHA1 hex digest of the complete
# URL string as the file's base name (no extension).
#
# @param url [String] the URL of the scraped page
# @return [String] "<host>/<sha1-of-url>"
def filename_from_url(url)
  host = URI.parse(url).host
  digest = Digest::SHA1.hexdigest(url)
  File.join(host, digest)
end
open_from_archive(url) click to toggle source
# File lib/scraped_page_archive.rb, line 41
# Loads the archived copy of a URL from storage.
#
# Inside storage.chdir, looks for "<name>.yml" (metadata) and "<name>.html"
# (body), where <name> comes from #filename_from_url, and hands both to
# #response_from.
#
# @param url the page URL; converted with #to_s before the lookup
# @return the value produced by #response_from
# @raise [Error] when either the metadata or the body could not be loaded
def open_from_archive(url)
  storage.chdir do
    base = filename_from_url(url.to_s)
    meta_path = "#{base}.yml"
    body_path = "#{base}.html"

    meta = File.exist?(meta_path) ? YAML.load_file(meta_path) : nil
    response_body = File.exist?(body_path) ? File.read(body_path) : nil

    # Both halves are required; a lone .yml or .html is treated as missing.
    raise Error, "No archived copy of #{url} found." unless meta && response_body

    response_from(meta, response_body)
  end
end
record() { || ... } click to toggle source
# File lib/scraped_page_archive.rb, line 27
# Runs the given block and stores the result of the session via storage.
#
# When storage.github_repo_url is nil, a warning is printed and the block is
# simply yielded to, with no VCR involvement and no call to storage.save.
# Otherwise the VCR archive persister is pointed at storage.path, the block
# runs inside VCR.use_cassette, and storage.save is called afterwards.
#
# @yield the scraping code to run
# @return the block's value (early path) or the value of VCR.use_cassette
def record(&block)
  if storage.github_repo_url.nil?
    # Adjacent literals are concatenated verbatim, so each piece must carry
    # its own trailing space to keep the words of the message separated.
    warn "The 'scraped_page_archive' gem wants to store the scraped pages in a git repo, " \
      'but it cannot determine which git repo it should use.  See ' \
      'https://github.com/everypolitician/scraped_page_archive#usage for details of how ' \
      "to specify the repo.\n\n"
    return yield
  end
  VCR::Archive::Persister.storage_location = storage.path
  ret = VCR.use_cassette('', &block)
  storage.save
  ret
end
response_from(meta, response_body) click to toggle source
# File lib/scraped_page_archive.rb, line 57
# Rebuilds an open-uri style response from archived metadata and a body.
#
# The body is wrapped in a StringIO that is initialised with OpenURI::Meta;
# each archived header is added with its values joined by ", ", the status is
# set to the stringified values of meta['response']['status'], and base_uri is
# parsed from meta['request']['uri'].
#
# @param meta [Hash] archived metadata with 'request' and 'response' entries
# @param response_body [String] the archived page body
# @return [StringIO] the body, carrying the archived headers, status and base_uri
def response_from(meta, response_body)
  io = StringIO.new(response_body)
  OpenURI::Meta.init(io)

  archived = meta['response']
  archived['headers'].each do |name, values|
    io.meta_add_field(name, values.join(', '))
  end
  io.status = archived['status'].values.map(&:to_s)
  io.base_uri = URI.parse(meta['request']['uri'])

  io
end