class Biblionet::Extractors::Base

Attributes

biblionet_id[R]
filepath[R]
page[R]
url[R]

Public Class Methods

decode_text(encoded_text) click to toggle source
# File lib/bookshark/extractors/base.rb, line 147
# Decodes text with escaped html entities and returns the decoded text.
#
# Params:
# +encoded_text+:: the text which contains encoded entities
def self.decode_text(encoded_text)
  HTMLEntities.new.decode(encoded_text)
end
new(uri=nil) click to toggle source

Initializes the Base class. Without arguments nothing happens. Otherwise loads a page by url or file.

Attributes

  • uri - It can be a url or a path/to/file.ext on local storage.

# File lib/bookshark/extractors/base.rb, line 31
# Creates the extractor and hands +uri+ straight to load_page.
#
# ==== Attributes
#
# * +uri+ - A url or a path/to/file.ext on local storage. Defaults to nil.
def initialize(uri = nil)
  load_page(uri)
end

Public Instance Methods

decode_text(encoded_text) click to toggle source

Decodes text with escaped html entities and returns the decoded text.

Params:

encoded_text

the text which contains encoded entities

# File lib/bookshark/extractors/base.rb, line 143
# Instance-level convenience wrapper: forwards +encoded_text+ to the
# class-level decode_text of the receiver's class and returns its result.
#
# Params:
# +encoded_text+:: the text which contains encoded entities
def decode_text(encoded_text)
  decoder_owner = self.class
  decoder_owner.decode_text(encoded_text)
end
filepath=(filepath) click to toggle source

Attribute writer method. Updates the filepath and page instance variables by loading a new page through load_page_from_file.

Attributes

  • filepath - The path to target file which will be read.

# File lib/bookshark/extractors/base.rb, line 122
# Writer for filepath: delegates to load_page_from_file, which records the
# new path and reads the page stored there.
#
# ==== Attributes
#
# * +filepath+ - The path to target file which will be read.
def filepath=(filepath)
  load_page_from_file(filepath)
end
load_page(uri=nil) click to toggle source

Loads a page from the web or from local file storage depending on passed argument.

Attributes

  • uri - It can be a url(starting with http/https) or a path/to/file.ext on local storage.

# File lib/bookshark/extractors/base.rb, line 41
# Loads a page from the web or from local file storage, depending on +uri+.
#
# ==== Attributes
#
# * +uri+ - A url (starting with http/https) or a path/to/file.ext on local
#   storage. When nil, nothing is loaded.
#
# Returns the result of the loader that was chosen, or nil when +uri+ is nil.
def load_page(uri=nil)
  return if uri.nil?

  # URI::regexp is obsolete and warns in verbose mode; it is a thin wrapper
  # around the parser's make_regexp, so the pattern built here is the same.
  web_address = /\A#{URI::DEFAULT_PARSER.make_regexp(['http', 'https'])}\z/

  if uri.match(web_address)
    load_page_from_url(uri)
  else
    load_page_from_file(uri)
  end
end
load_page_from_file(filepath) click to toggle source

Reads a page from the local file system.

Attributes

  • filepath - The path to target file which will be read.

# File lib/bookshark/extractors/base.rb, line 96
# Reads a page from the local file system into @page.
#
# Also records the path in @filepath and extracts @biblionet_id as the last
# run of digits found in the path.
#
# ==== Attributes
#
# * +filepath+ - The path to target file which will be read.
#
# Returns the file contents on success. Any StandardError is printed and
# swallowed (best effort), in which case nil is returned and @page keeps
# whatever it held before.
def load_page_from_file(filepath)
  @filepath = filepath
  @biblionet_id = filepath[/\d+(?!.*\d+)/] unless filepath.nil?
  # File.read opens, reads and closes in one step. The previous
  # open(filepath).read never closed the handle, and Kernel#open gives special
  # treatment to strings such as "|cmd" that a plain file read must not have.
  @page = File.read(filepath)
rescue StandardError => e
  puts e
end
load_page_from_url(url) click to toggle source

Downloads a page from the web.

Attributes

  • url - The url of webpage to download.

# File lib/bookshark/extractors/base.rb, line 55
# Downloads a page from the web and stores it in @page with every run of
# whitespace collapsed to a single space.
#
# Also records the url in @url and extracts @biblionet_id as the last run of
# digits found in the url.
#
# ==== Attributes
#
# * +url+ - The url of webpage to download.
# * +max_retries+ - How many times an unexpected error is retried before
#   giving up. Defaults to 5.
# * +retry_wait+ - Seconds to sleep before each retry. Defaults to 120.
#
# Errors are printed rather than raised. An EmptyPageError additionally
# resets @page to nil.
def load_page_from_url(url, max_retries = 5, retry_wait = 120)
  failures = 0

  begin
    @url = url
    @biblionet_id = url[/\d+(?!.*\d+)/] unless url.nil? # id is expected to be the last number.

    # Abort the download when the announced content length is 1024 or less.
    # A missing content length is let through.
    reject_tiny_page = lambda do |content_length|
      raise EmptyPageError.new(url, content_length) if content_length && content_length <= 1024
    end

    pp "Downloading page: #{url}"
    # NOTE(review): Kernel#open only fetches urls through open-uri on Ruby < 3.0.
    # Switch to URI.open once older Rubies no longer need to be supported.
    open(url, :content_length_proc => reject_tiny_page) do |f|
      @page = f.read.gsub(/\s+/, " ")
    end
  rescue Errno::ENOENT => e
    pp "Page: #{url} NOT FOUND."
    pp e
  rescue EmptyPageError => e
    pp "Page: #{url} is EMPTY."
    pp e
    @page = nil
  rescue OpenURI::HTTPError => e
    pp e
    pp e.io.status
  rescue StandardError => e
    failures += 1
    if failures <= max_retries
      pp "Generic error #{e.class}. Will wait for #{retry_wait} seconds and then try again."
      pp e
      sleep(retry_wait)
      retry
    end

    # The retry loop used to be unbounded, so a permanently failing url (or a
    # nil one) blocked the caller forever. Give up once the budget is spent.
    pp "Generic error #{e.class}. Giving up on #{url} after #{failures} attempts."
    pp e
  end
end
present?(value) click to toggle source
# File lib/bookshark/extractors/base.rb, line 153
# Tells whether +value+ holds something: true when it is neither nil nor
# empty, false otherwise.
#
# ==== Attributes
#
# * +value+ - nil, or any object that responds to empty?.
def present?(value)
  !(value.nil? || value.empty?)
end
save_page(path) click to toggle source

Saves page to file.

Attributes

  • path - The path to file(including filename) where content will be saved.

# File lib/bookshark/extractors/base.rb, line 132
# Saves the currently loaded page (@page) to a file by calling save_to, then
# prints a notice naming the destination.
#
# ==== Attributes
#
# * +path+ - The path to file (including filename) where content will be saved.
def save_page(path)
  save_to(path, @page)
  pp("Saving page: #{path}")
end
url=(url) click to toggle source

Attribute writer method. Updates the url and page instance variables by loading a new page through load_page_from_url.

Attributes

  • url - The new value of url instance var.

# File lib/bookshark/extractors/base.rb, line 112
# Writer for url: delegates to load_page_from_url, which records the new url
# and downloads the page behind it.
#
# ==== Attributes
#
# * +url+ - The new value of url instance var.
def url=(url)
  load_page_from_url(url)
end