class Webcache::DiskCache
Public Instance Methods
cached?( url )
click to toggle source
# File lib/webget/webcache.rb, line 78
##
# Checks if a body file for the url is present in the disk cache.
#
# The file location is "<Webcache.root>/<url_to_path( url )>".
#
# @param url [String] the url to look up
# @return [Boolean] true if the body file exists on disk
def cached?( url )
  File.exist?( "#{Webcache.root}/#{url_to_path( url )}" )
end
Also aliased as: exist?
read( url )
click to toggle source
# File lib/webget/webcache.rb, line 85
##
# Reads the cached body file for the url and returns its content as text.
#
# The file is opened read-only with utf-8 as (external) encoding.
# Note: no check is made that the file exists; a missing file
# surfaces as the error raised by the file read itself.
#
# @param url [String] the url to look up
# @return [String] the content of the cached body file
def read( url )
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  File.read( cache_file, mode: 'r:utf-8' )
end
read_csv( url )
click to toggle source
# File lib/webget/webcache.rb, line 97
##
# Reads the cached body file for the url (as utf-8 text, see read)
# and parses the text with CsvHash.parse.
#
# @param url [String] the url to look up
# @return whatever CsvHash.parse returns for the text
#   (presumably an array of row hashes - verify against CsvHash)
def read_csv( url )
  CsvHash.parse( read( url ) )
end
read_json( url )
click to toggle source
# File lib/webget/webcache.rb, line 90
##
# Reads the cached body file for the url (as utf-8 text, see read)
# and parses the text with JSON.parse.
#
# @param url [String] the url to look up
# @return the parsed json data (as returned by JSON.parse)
def read_json( url )
  JSON.parse( read( url ) )
end
record( url, response, path: nil, encoding: 'UTF-8', format: 'html' )
click to toggle source
add more save / put / etc. aliases - why? why not?
rename to record_html - why? why not?
# File lib/webget/webcache.rb, line 107
##
# Saves a response to the disk cache. Writes two files:
#
# - the body          => "<Webcache.root>/<url_to_path( url, path: path )>"
# - the headers       => "<body file>.meta.txt"  (one "key: value" line per header)
#
# The body gets built BEFORE the body file gets opened; if response.json
# or response.text raises, no empty / truncated body file is left behind
# (that would otherwise get reported as a cache hit by cached?).
#
# @param url      [String] the url of the response
# @param response an object responding to json, text( encoding: ) and headers
# @param path     [String, nil] optional custom (file) path, passed on to url_to_path
# @param encoding [String] passed on to response.text
# @param format   [String] 'json' => pretty printed json;
#                          'csv'  => text with "\r\n" newlines changed to "\n";
#                          anything else (html, txt) => text as is
# @return the value of response.headers.each (that is, the last expression)
def record( url, response, path: nil, encoding: 'UTF-8', format: 'html' )
  body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
  meta_path = "#{body_path}.meta.txt"

  ## make sure path exists
  FileUtils.mkdir_p( File.dirname( body_path ) )

  puts "[cache] saving #{body_path}..."

  ## todo/check: verify content-type - why? why not?
  ## note - for now response.text always assume (converted) to utf8!!!!!!!!!
  body = case format
         when 'json'
           JSON.pretty_generate( response.json )
         when 'csv'
           ## fix: newlines - always use "unix" style - why? why not?
           ## fix: use :newline => :universal option? translates to universal "\n"
           response.text( encoding: encoding ).gsub( "\r\n", "\n" )
         else   ## html or txt
           response.text( encoding: encoding )
         end

  File.open( body_path, 'w:utf-8' ) {|f| f.write( body ) }

  File.open( meta_path, 'w:utf-8' ) do |f|
    ## todo/check:
    ##  do headers also need to converted (like text) if encoding is NOT utf-8 ???
    response.headers.each do |key, value|    # iterate all response headers
      f.write( "#{key}: #{value}\n" )
    end
  end
end
url_to_id( str )
click to toggle source
note: use file path as id for DiskCache
(is different for DbCache/SqlCache?)
use file:// instead of disk:// - why? why not?
# File lib/webget/webcache.rb, line 150
##
# Builds the cache id for a url.
#
# For the disk cache the id is the (relative) file path
# from url_to_path prefixed with "disk://".
#
# @param str [String] the url
# @return [String] the id, e.g. "disk://<host>/<path>"
def url_to_id( str )
  rel_path = url_to_path( str )
  "disk://#{rel_path}"
end
url_to_path( str, path: nil )
click to toggle source
helpers
# File lib/webget/webcache.rb, line 154
##
# Maps a url to a (relative) file path for cache storage
# in the form "<host>/<request path>".
#
# - the scheme (e.g. http/https) and port (e.g. 80, 8080, etc.) get ignored for now
# - the host always gets downcased (internet domain is case insensitive)
# - some hosts have special "prettify" rules (see below)
#
# @param str  [String] the (absolute) url
# @param path [String, nil] optional "custom" (file) path used instead of
#   the request path of the url; note - the host rules get applied to it too
# @return [String] the relative file path
# @raise [ArgumentError] if the url has no host (e.g. a relative url)
#
# Note: for weltfussball.de / worldfootball.net a request path
#   NOT ending with '/' prints an error and exits the process (exit 1).
def url_to_path( str, path: nil )
  uri = URI.parse( str )

  ## guard - without a host there is no cache directory
  ##   (was a NoMethodError on nil before)
  host = uri.host
  raise ArgumentError, "expected url with host; got: >#{str}<"   if host.nil?

  host_dir = host.downcase

  ## use "custom" (file)path for cache storage if passed in;
  ##   otherwise use "/this/is/everything?query=params"
  ##   with the leading slash cut off
  req_path = path || uri.request_uri[1..-1]

  if host_dir.include?( 'weltfussball.de' ) ||
     host_dir.include?( 'worldfootball.net' )
    ### special "prettify" rule for weltfussball
    ##   /eng-league-one-2019-2020/  => /eng-league-one-2019-2020.html
    unless req_path.end_with?( '/' )
      puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
      exit 1
    end
    req_path = "#{req_path.chomp( '/' )}.html"
  elsif host_dir.include?( 'tipp3.at' )
    ## shorten - cut off .jsp extension and
    ##   change ? to -I-
    ##   change = to ~
    ## Example:
    ##   sportwetten/classicresults.jsp?oddsetProgramID=888
    ##     =>
    ##   sportwetten/classicresults-I-oddsetProgramID~888.html
    req_path = req_path.sub( '.jsp', '' )
                       .gsub( '?', '-I-' )
                       .gsub( '=', '~' )
    req_path = "#{req_path}.html"
  elsif host_dir.include?( 'fbref.com' )
    ## shorten - cut off en/ and auto-add html extension
    ##  note: sub cuts off the FIRST en/ anywhere in the path (not only leading)
    req_path = "#{req_path.sub( 'en/', '' )}.html"
  elsif host_dir.include?( 'football-data.co.uk' )
    ## shorten - cut off mmz4281/ and new/
    req_path = req_path.sub( 'mmz4281/', '' )
                       .sub( 'new/', '' )
  elsif host_dir.include?( 'football-data.org' )
    ## shorten - cut off v2/ and
    ## flatten - make a file path - for auto-save
    ##   change ? to -I-
    ##   change / to ~~
    ##   change = to ~
    req_path = req_path.sub( 'v2/', '' )
                       .gsub( '?', '-I-' )
                       .gsub( '/', '~~' )
                       .gsub( '=', '~' )
    req_path = "#{req_path}.json"
  elsif host_dir.include?( 'api.cryptokitties.co' )
    ## for now always auto-add .json extensions e.g.
    ##   kitties/1   => kitties/1.json
    ##   cattributes => cattributes.json
    req_path = "#{req_path}.json"
  end
  ## else - no special rule

  "#{host_dir}/#{req_path}"
end