class Webcache::DiskCache

Public Instance Methods

cached?( url ) click to toggle source
# File lib/webget/webcache.rb, line 78
def cached?( url )
  ## a url counts as cached when its body file is present
  ##   below the cache root (see url_to_path for the url => path mapping)
  File.exist?( "#{Webcache.root}/#{url_to_path( url )}" )
end
Also aliased as: exist?
exist?( url )
Alias for: cached?
read( url ) click to toggle source
# File lib/webget/webcache.rb, line 85
def read( url )
  ## returns the cached body for url as text
  ##   note: the file is always read with utf-8 as the (external) encoding
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  File.read( cache_file, mode: 'r:utf-8' )
end
read_csv( url ) click to toggle source
# File lib/webget/webcache.rb, line 97
def read_csv( url )
  ## reads the cached body for url (as utf-8 text)
  ##   and returns whatever CsvHash.parse makes of it
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  CsvHash.parse( File.read( cache_file, mode: 'r:utf-8' ) )
end
read_json( url ) click to toggle source
# File lib/webget/webcache.rb, line 90
def read_json( url )
  ## reads the cached body for url (as utf-8 text)
  ##   and returns the parsed json data
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  JSON.parse( File.read( cache_file, mode: 'r:utf-8' ) )
end
record( url, response, path: nil, encoding: 'UTF-8', format: 'html' ) click to toggle source

add more save / put / etc. aliases - why? why not?

rename to record_html - why? why not?
# File lib/webget/webcache.rb, line 107
## Saves a response to the disk cache: the body goes to the path derived
##  from url (or from the custom path) and all response headers go to a
##  sidecar file with the same name plus ".meta.txt".
##
##  url      - request url; mapped to a file path via url_to_path
##  response - must respond to text( encoding: ), json and headers
##  path     - optional custom (file)path passed on to url_to_path
##  encoding - passed on to response.text (not used for format 'json')
##  format   - 'json' (pretty printed), 'csv' (newlines normalized to "\n");
##             everything else (html, txt) is written as is
##
##  returns the result of response.headers.each
def record( url, response,
            path: nil,
            encoding: 'UTF-8',
            format: 'html' )

  body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
  meta_path = "#{body_path}.meta.txt"

  ## note: build the complete body text BEFORE touching the disk;
  ##   otherwise a failing response.json / response.text leaves an empty
  ##   (truncated) body file behind that cached? reports as a cache hit
  ## todo/check: verify content-type - why? why not?
  ## note - for now response.text always assume (converted) to utf8!!!!!!!!!
  text = case format
         when 'json'
           JSON.pretty_generate( response.json )
         when 'csv'
           ## fix: newlines - always use "unix" style - why? why not?
           ## fix:  use :newline => :universal option? translates to universal "\n"
           response.text( encoding: encoding ).gsub( "\r\n", "\n" )
         else   ## html or txt
           response.text( encoding: encoding )
         end

  ## make sure path exists
  FileUtils.mkdir_p( File.dirname( body_path ) )

  puts "[cache] saving #{body_path}..."

  File.open( body_path, 'w:utf-8' ) {|f| f.write( text ) }

  File.open( meta_path, 'w:utf-8' ) do |f|
    ## todo/check:
    ##  do headers also need to converted (like text) if encoding is NOT utf-8 ???
    response.headers.each do |key, value|  # one "key: value" line per header
      f.write( "#{key}: #{value}\n" )
    end
  end
end
url_to_id( str ) click to toggle source

note: use file path as id for DiskCache (is different for DbCache/SqlCache?)

use file:// instead of disk:// - why? why not?
# File lib/webget/webcache.rb, line 150
def url_to_id( str )
  ## the id is the cache file path (relative to the cache root)
  ##   with a disk:// prefix
  "disk://#{url_to_path( str )}"
end
url_to_path( str, path: nil ) click to toggle source

helpers

# File lib/webget/webcache.rb, line 154
def url_to_path( str, path: nil )
  ## maps a url to a cache file path of the form <host>/<request path>
  ##   str  - the url
  ##   path - optional "custom" (file)path used instead of the
  ##           request path from the url
  parsed = URI.parse( str )

  ## note: scheme (e.g. http/https) and port (e.g. 80, 8080, etc.)
  ##         are ignored for now;
  ##       the host always gets downcased (internet domain is case insensitive)
  host = parsed.host.downcase

  ## request_uri is "/this/is/everything?query=params"
  ##   - the leading slash gets cut off
  rel = path || parsed.request_uri[1..-1]

  on_host = ->( domain ) { host.include?( domain ) }

  rel = if on_host.( 'weltfussball.de' ) || on_host.( 'worldfootball.net' )
          ### special "prettify" rule for weltfussball
          ##   /eng-league-one-2019-2020/  => /eng-league-one-2019-2020.html
          unless rel.end_with?( '/' )
            puts "ERROR: expected request_uri for >#{host}< ending with '/'; got: >#{rel}<"
            exit 1
          end
          "#{rel.chomp( '/' )}.html"
        elsif on_host.( 'tipp3.at' )
          ## shorten - cut off (first) .jsp extension, then
          ##   change ? to -I-
          ##   change = to ~
          ##   Example:
          ##   sportwetten/classicresults.jsp?oddsetProgramID=888
          ##     =>
          ##   sportwetten/classicresults-I-oddsetProgramID~888
          flat = rel.sub( '.jsp', '' )
                    .gsub( /[?=]/, '?' => '-I-', '=' => '~' )
          "#{flat}.html"
        elsif on_host.( 'fbref.com' )
          ## shorten - cut off (first) en/ and auto-add html extension
          "#{rel.sub( 'en/', '' )}.html"
        elsif on_host.( 'football-data.co.uk' )
          ## shorten - cut off (first) mmz4281/ and (first) new/
          rel.sub( 'mmz4281/', '' ).sub( 'new/', '' )
        elsif on_host.( 'football-data.org' )
          ## shorten - cut off (first) v2/, then
          ## flatten - make a file path - for auto-save
          ##   change ? to -I-
          ##   change / to ~~
          ##   change = to ~
          flat = rel.sub( 'v2/', '' )
                    .gsub( %r{[?/=]}, '?' => '-I-', '/' => '~~', '=' => '~' )
          "#{flat}.json"
        elsif on_host.( 'api.cryptokitties.co' )
          ## for now always auto-add .json extensions e.g.
          ##     kitties/1   => kitties/1.json
          ##     cattributes => cattributes.json
          "#{rel}.json"
        else
          ## no special rule
          rel
        end

  "#{host}/#{rel}"
end