class Biblionet::Extractors::BookExtractor

Attributes

book[R]

Public Class Methods

new(uri=nil) click to toggle source
Calls superclass method Biblionet::Extractors::Base::new
# File lib/bookshark/extractors/book_extractor.rb, line 14
# Builds the extractor and, when possible, extracts the book immediately.
#
# uri - forwarded unchanged to the superclass initializer (default: nil).
#
# Extraction is skipped when no uri was given or when @page is still nil
# after the superclass initializer has run.
def initialize(uri=nil)
  super(uri)
  return if uri.nil? || @page.nil?

  extract_book
end

Public Instance Methods

extract_book(biblionet_id=@biblionet_id, book_page=@page) click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 117
# Extracts the data of a single book into a hash, stores it in @book and
# returns it.
#
# The data comes from two sources: a BookDataExtractor built from +book_page+
# and a BibliographicalBookExtractor loaded from a uri that is derived from
# @url or @filepath.
#
# biblionet_id - id of the book; used in log messages, as the :b_id entry and
#                to build the uri of the bibliographical record.
# book_page    - page handed to BookDataExtractor.new.
#
# Returns the book hash, or nil when BookDataExtractor produced no nodeset.
# Problems with single fields (no image, no author, no details, no
# bibliographical record) are printed and logged; extraction then continues
# with the affected entries left empty.
def extract_book(biblionet_id=@biblionet_id, book_page=@page)
  log = Logger.new(STDOUT)

  page = BookDataExtractor.new(book_page)

  # End extraction if BookDataExtractor couldn't create a nodeset.
  return nil if page.nodeset.nil?

  book_hash = {}

  begin
    img = page.image
    raise NoImageError.new(biblionet_id) if img.nil?
  rescue NoImageError => e
    # A missing image only produces a warning; :image simply stays nil.
    pp e
    log.warn(e.message)
  rescue StandardError => e
    pp err_msg = "Error #{e} at book: #{biblionet_id}"
    log.error(err_msg)
  end

  book_hash[:title]    = page.title
  book_hash[:subtitle] = page.subtitle
  book_hash[:image]    = img

  # The author list gets its own entry, so it is taken out of the contributors.
  contributors = proccess_contributors(page.contributors)
  author       = contributors.delete(:author)

  # If author is empty, maybe it's a collective work.
  if author.nil? || author.empty?
    if page.collective_work?
      # Greek for "Collective work".
      author = ['Συλλογικό έργο']
    else
      pp err_msg = "No author has been found at book: #{biblionet_id}"
      log.warn(err_msg)
      author = []
    end
  end

  book_hash[:author]       = author
  book_hash[:contributors] = contributors
  book_hash[:publisher]    = page.publisher

  details = page.details
  if details.nil?
    pp err_msg = "No details at book: #{biblionet_id}"
    log.error(err_msg)
    # proccess_details iterates over its argument, so a missing details list
    # must not be passed on; an empty hash leaves all detail fields nil.
    details_hash = {}
  else
    details_hash = proccess_details(details)
  end

  isbn = details_hash[:isbn]
  book_hash[:isbn] = isbn

  # Prefer an explicit ISBN-13; otherwise reuse the plain ISBN when it has
  # 13 characters once the dashes are removed.
  book_hash[:isbn_13] =
    if !details_hash[:isbn_13].nil?
      details_hash[:isbn_13]
    elsif present?(isbn) && (isbn.strip.gsub('-', '').length == 13)
      isbn
    end

  book_hash[:award]       = page.awards
  book_hash[:description] = page.description

  ddcs = page.ddcs.map do |ddc_node|
    # Extract from href the ddc id used by biblionet. --- DDC url http://biblionet.gr/index/id ---
    ddc_biblionet_id = ddc_node[:href].split(/\//).last

    # Extract DDC id and DDC text, then attach the biblionet id.
    proccess_ddc(ddc_node.text).merge!(b_id: ddc_biblionet_id)
  end

  book_hash[:category] = ddcs
  book_hash[:b_id]     = biblionet_id

  # Location of the bibliographical record: remote when this extractor has a
  # url, next to the local file when it has a filepath, nil otherwise.
  uri =
    if @url
      "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{biblionet_id}"
    elsif @filepath
      "#{File.dirname(@filepath)}/bg_record_#{biblionet_id}.html"
    end

  bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
  bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)

  # NOTE(review): assumes load_and_extract_book can return nil (for example
  # when uri is nil) -- confirm against BibliographicalBookExtractor. In that
  # case keep the publisher found on the book page and leave the rest nil
  # instead of failing on nil[...].
  if bibliographical_details.nil?
    pp err_msg = "No bibliographical details at book: #{biblionet_id}"
    log.warn(err_msg)
    bibliographical_details = { publisher: book_hash[:publisher] }
  end

  # Fields copied verbatim from the bibliographical record. :publisher
  # deliberately replaces the value taken from the book page above.
  %i[
    publisher publication format original_language original_title
    price availability last_update series
  ].each do |key|
    book_hash[key] = bibliographical_details[key]
  end

  book_hash[:physical_description] = {
    pages:      details_hash[:pages],
    size:       bibliographical_details[:physical_size],
    cover_type: bibliographical_details[:cover_type]
  }

  @book = book_hash
end
load_and_extract_book(uri=nil) click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 19
# Loads the page behind +uri+ via load_page and extracts the book from it.
#
# uri - location passed to load_page (default: nil).
#
# Returns the result of extract_book, or nil when no uri was given or @page
# is nil after load_page has run.
def load_and_extract_book(uri=nil)
  load_page(uri)
  return nil if uri.nil? || @page.nil?

  extract_book
end
proccess_contributors(raw_contributors) click to toggle source

Converts the parsed contributors list to a hash. The list must have been processed into the following form: job1: contributor1, contributor2 job2: contributor3. The returned hash has the form: {job1 => [“contributor1”, “contributor2”], job2 => [“contributor3”]}. Contributors that appear before any job label are stored under the :author key.

# File lib/bookshark/extractors/book_extractor.rb, line 28
# Groups a flat list of contributors by job.
#
# raw_contributors - enumerable whose String entries ending in ":" act as job
#                    labels; every other entry is a contributor belonging to
#                    the most recent label. May be nil or empty.
#
# Entries that appear before any label are stored under the :author symbol.
# Labels become String keys with the trailing ":" removed. A label that is
# not followed by a contributor adds no key, and a label that occurs again
# later replaces the list collected for it earlier.
#
# Returns a Hash mapping each job to an Array of its contributors.
def proccess_contributors(raw_contributors)
  grouped = {}
  return grouped if raw_contributors.nil? || raw_contributors.empty?

  current_job = :author
  members     = []

  raw_contributors.each do |entry|
    if entry.is_a?(String) && entry.end_with?(':')
      # New job label: drop the trailing colon and start a fresh member list.
      current_job = entry[0..-2]
      members     = []
    else
      members << entry
      grouped[current_job] = members
    end
  end

  grouped
end
proccess_ddc(ddc, extract_parents = false) click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 96
# Splits a DDC label into its classification id and its plain-text name.
#
# ddc             - String such as "[DDC: 889.09300] Some category (300)".
# extract_parents - accepted but not used by this method.
#
# Returns a Hash { ddc: id, name: text }. The id is an empty String when the
# label contains no "[DDC: ...]" part.
def proccess_ddc(ddc, extract_parents = false)
  # Matches a bracketed id of the form "[DDC: 889.09300]".
  bracketed_id = /(\[DDC\:\s\d*(?:[\.|\s]\d*)*\])/

  # Matches "[...]" and "(...)" groups together with the surrounding whitespace.
  bracket_groups = /\s*(\[.*\]|\(.*\))\s*/

  # Keep only what is left of the bracketed id after stripping the brackets,
  # the "DDC:" prefix and spaces.
  code = ddc.scan(bracketed_id).join.gsub(/[\[\]DDC: ]/, '')

  # Whatever remains after removing every bracket group is the name.
  label = ddc.gsub(bracket_groups, '').strip

  { ddc: code, name: label }
end
proccess_details(details) click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 45
# Classifies the raw detail strings of a book and collects them in a hash.
#
# details - enumerable of detail strings; each one is passed through
#           decode_text before it is classified. May be nil.
#
# Possible keys of the result: :publication_year, :pages, :isbn_13, :isbn,
# :status, :price and :awards (an Array). A detail that matches none of the
# known forms is reported by printing a NoIdeaWhatThisIsError and skipped.
#
# Returns the details Hash; it is empty when +details+ is nil or empty.
def proccess_details(details)
  details_hash = {}

  # Callers may hand over nil when a page has no details section.
  return details_hash if details.nil?

  # Detail starts with four digits or ends with ", " followed by four digits.
  date_regex   = /(^(\d{4})|(, \d{4})$)/
  # One or more words wrapped in square brackets, e.g. "[Some status]".
  status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/

  details.each do |raw_detail|
    detail = decode_text(raw_detail)

    if detail =~ date_regex
      details_hash[:publication_year] = detail
    elsif detail.end_with?('σελ.')
      # Page count: keep the digits only.
      details_hash[:pages] = detail.gsub(/[^\d]/, '')
    elsif detail.start_with?('ISBN-13')
      # Must be tested before the plain 'ISBN' prefix below.
      details_hash[:isbn_13] = detail.gsub(/ISBN-13 /, '').gsub('&Chi', 'X').gsub("\u0081]", '-')
    elsif detail.start_with?('ISBN')
      details_hash[:isbn] = detail.gsub(/ISBN /, '').gsub('&Chi', 'X').gsub("\u0081]", '-')
    elsif detail =~ status_regex
      details_hash[:status] = detail.gsub(/\[|\]/, '')
    elsif detail.start_with?('Τιμή')
      # Price: keep digits and the comma.
      details_hash[:price] = detail.gsub(/[^\d,]/, '')
    elsif detail.start_with?('<img src="/images/award.jpg" border="0" title="Βραβείο">')
      details_hash[:awards] ||= []
      details_hash[:awards] << Sanitize.clean(detail).strip
    elsif detail.start_with?('ISMN') # Special typo case
      details_hash[:isbn] = detail.gsub(/ISMN /, '')
    else
      # Unknown detail: report it and carry on with the next one.
      pp NoIdeaWhatThisIsError.new(@biblionet_id, detail)
    end
  end

  details_hash
end