class Biblionet::Extractors::Base
Attributes
biblionet_id[R]
filepath[R]
page[R]
url[R]
Public Class Methods
decode_text(encoded_text)
click to toggle source
# File lib/bookshark/extractors/base.rb, line 147
# Runs +encoded_text+ through the +decode+ method of a freshly created
# HTMLEntities coder and returns the result.
#
# ==== Attributes
#
# * +encoded_text+ - The text handed to the coder.
def self.decode_text(encoded_text)
  HTMLEntities.new.decode(encoded_text)
end
new(uri=nil)
click to toggle source
Public Instance Methods
decode_text(encoded_text)
click to toggle source
filepath=(filepath)
click to toggle source
load_page(uri=nil)
click to toggle source
Loads a page from the web or from local file storage depending on passed argument.
Attributes
- uri - It can be a URL (starting with http/https) or a path/to/file.ext on local storage.
# File lib/bookshark/extractors/base.rb, line 41
# Loads a page from the web or from local file storage, depending on the
# passed argument.
#
# ==== Attributes
#
# * +uri+ - A url (starting with http/https) or a path/to/file.ext on local
#   storage. When it is +nil+ nothing is loaded and +nil+ is returned.
#
# Returns whatever load_page_from_url or load_page_from_file returns.
def load_page(uri=nil)
  return if uri.nil?

  # URI.regexp is obsolete; the default parser's make_regexp builds the same
  # absolute-URI pattern. \A and \z force the whole string to be a url.
  http_url = /\A#{URI::DEFAULT_PARSER.make_regexp(%w[http https])}\z/

  if uri.match?(http_url)
    load_page_from_url(uri)
  else
    load_page_from_file(uri)
  end
end
load_page_from_file(filepath)
click to toggle source
Reads a page from the local file system.
Attributes
- filepath - The path to the target file which will be read.
# File lib/bookshark/extractors/base.rb, line 96
# Reads a page from the local file system.
#
# Stores the path in +@filepath+, the last number found in the path in
# +@biblionet_id+ and the file contents in +@page+.
#
# ==== Attributes
#
# * +filepath+ - The path to target file which will be read.
#
# Returns the file contents, or +nil+ when anything goes wrong (the error is
# printed with +puts+ instead of being raised).
def load_page_from_file(filepath)
  @filepath = filepath
  @biblionet_id = filepath[/\d+(?!.*\d+)/] unless filepath.nil? # last run of digits in the path

  # File.read closes the handle itself and only ever touches the file system;
  # Kernel#open left the handle open and treats "|cmd" paths as commands.
  @page = File.read(filepath)
rescue StandardError => e
  puts e
end
load_page_from_url(url)
click to toggle source
Downloads a page from the web.
Attributes
- url - The URL of the webpage to download.
# File lib/bookshark/extractors/base.rb, line 55
# Downloads a page from the web.
#
# Stores the url in +@url+, the last number found in the url in
# +@biblionet_id+ and the downloaded body in +@page+, with every run of
# whitespace collapsed to a single space. Failures are printed with +pp+
# instead of being raised.
#
# ==== Attributes
#
# * +url+ - The url of webpage to download.
# * +max_retries+ - How many times an unexpected error is retried before
#   giving up (default 5). Pass Float::INFINITY to retry forever.
# * +retry_delay+ - Seconds to wait before each retry (default 120).
#
# Returns the page on success; +nil+ once the retries are used up.
def load_page_from_url(url, max_retries: 5, retry_delay: 120)
  retries = 0 # lives outside the begin block so that `retry` does not reset it

  begin
    @url = url
    @biblionet_id = url[/\d+(?!.*\d+)/] unless url.nil? # id is expected to be the last number.

    pp "Downloading page: #{url}"

    # A response announcing a body of 1024 bytes or less is treated as empty.
    reject_tiny_page = lambda do |content_length|
      raise EmptyPageError.new(url, content_length) unless content_length.nil? || content_length > 1024
    end

    # Kernel#open stopped opening urls in Ruby 3.0; URI.open is the open-uri
    # entry point.
    URI.open(url, :content_length_proc => reject_tiny_page) do |f|
      @page = f.read.gsub(/\s+/, " ")
    end
  rescue Errno::ENOENT => e
    pp "Page: #{url} NOT FOUND."
    pp e
  rescue EmptyPageError => e
    pp "Page: #{url} is EMPTY."
    pp e
    @page = nil
  rescue OpenURI::HTTPError => e
    pp e
    pp e.io.status
  rescue StandardError => e
    # Bounded: an error that can never succeed (nil or malformed url) must
    # not keep the process sleeping and retrying forever.
    if retries < max_retries
      retries += 1
      pp "Generic error #{e.class}. Will wait for #{retry_delay} seconds and then try again."
      pp e
      sleep(retry_delay)
      retry
    end

    pp "Generic error #{e.class}. Giving up after #{retries} retries."
    pp e
    nil
  end
end
present?(value)
click to toggle source
# File lib/bookshark/extractors/base.rb, line 153
# Tells whether +value+ is neither +nil+ nor empty.
#
# ==== Attributes
#
# * +value+ - The object to check. When it is not +nil+ it must respond to
#   +empty?+.
#
# Returns +true+ or +false+.
def present?(value)
  !value.nil? && !value.empty?
end
save_page(path)
click to toggle source
url=(url)
click to toggle source
Attr writer method. Sets the url instance variable and loads the new page (updating the page instance variable) by calling load_page_from_url.
Attributes
- url - The new value of the url instance variable.
# File lib/bookshark/extractors/base.rb, line 112
# Attr writer method. Hands the new url straight to load_page_from_url.
#
# ==== Attributes
#
# * +url+ - The new value of url instance var.
def url=(url)
  load_page_from_url(url)
end