class Biblionet::Extractors::BookExtractor
Attributes
book[R]
Public Class Methods
new(uri=nil)
click to toggle source
Calls superclass method
Biblionet::Extractors::Base::new
# File lib/bookshark/extractors/book_extractor.rb, line 14
#
# Hands +uri+ to the superclass constructor and, when both a uri was given
# and the superclass left a page in @page, immediately extracts the book.
def initialize(uri=nil)
  super(uri)

  # Nothing to extract without a uri or without a loaded page.
  return if uri.nil? || @page.nil?

  extract_book
end
Public Instance Methods
extract_book(biblionet_id=@biblionet_id, book_page=@page)
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 117
#
# Builds the book hash for +biblionet_id+ out of +book_page+, stores it in
# @book and returns it.
#
# Returns nil (and leaves @book untouched) when BookDataExtractor could not
# build a nodeset from +book_page+.
#
# Data comes from two places: the book page itself (through
# BookDataExtractor) and the bibliographical record (through
# BibliographicalBookExtractor), whose publisher value replaces the one read
# from the book page.
def extract_book(biblionet_id=@biblionet_id, book_page=@page)
  # Alternative sink: Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+'))
  log = Logger.new(STDOUT)

  page = BookDataExtractor.new(book_page)

  # End extraction if BookDataExtractor couldn't create a nodeset.
  return nil if page.nodeset.nil?

  book_hash = {}

  # A missing image is only reported; extraction carries on with img == nil.
  begin
    img = page.image
    raise NoImageError.new(biblionet_id) if img.nil?
  rescue NoImageError => e
    pp e
    log.warn(e.message)
  rescue StandardError => e
    pp err_msg = "Error #{e} at book: #{biblionet_id}"
    log.error(err_msg)
  end

  book_hash[:title]    = page.title
  book_hash[:subtitle] = page.subtitle
  book_hash[:image]    = img

  contributors = proccess_contributors(page.contributors)
  # Authors get their own top-level key, so take them out of the contributors.
  author = contributors.delete(:author)

  # If author is empty, maybe it is a collective work.
  if author.nil? || author.empty?
    if page.collective_work?
      author = ['Συλλογικό έργο']
    else
      pp err_msg = "No author has been found at book: #{biblionet_id}"
      log.warn(err_msg)
      author = []
    end
  end

  book_hash[:author]       = author
  book_hash[:contributors] = contributors
  book_hash[:publisher]    = page.publisher

  details = page.details
  if details.nil?
    pp err_msg = "No details at book: #{biblionet_id}"
    log.error(err_msg)
    # Keep going with no details instead of crashing on nil.
    details_hash = {}
  else
    details_hash = proccess_details(details)
  end

  isbn = details_hash[:isbn]
  book_hash[:isbn] = isbn
  # Prefer an explicit ISBN-13; otherwise reuse the plain ISBN when it is
  # 13 characters long once surrounding whitespace and dashes are dropped.
  book_hash[:isbn_13] =
    if !details_hash[:isbn_13].nil?
      details_hash[:isbn_13]
    elsif present?(isbn) && isbn.strip.gsub('-', '').length == 13
      isbn
    end

  book_hash[:award]       = page.awards
  book_hash[:description] = page.description

  ddcs = page.ddcs.map do |ddc_node|
    # Extract from href the ddc id used by biblionet.
    # --- DDC url: http://biblionet.gr/index/id ---
    ddc_biblionet_id = ddc_node[:href].split(/\//).last

    # Extract the DDC id and DDC text, then attach the biblionet id.
    proccess_ddc(ddc_node.text).merge!(b_id: ddc_biblionet_id)
  end

  book_hash[:category] = ddcs
  book_hash[:b_id]     = biblionet_id

  # Locate the bibliographical record next to wherever the book came from.
  uri = nil
  if @url
    uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{biblionet_id}"
  elsif @filepath
    uri = File.join(File.dirname(@filepath), "bg_record_#{biblionet_id}.html")
  end

  bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
  bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)

  if bibliographical_details.nil?
    # NOTE(review): presumably happens when uri is nil or the record could not
    # be loaded -- confirm against BibliographicalBookExtractor. The publisher
    # read from the book page is kept in that case.
    pp err_msg = "No bibliographical details at book: #{biblionet_id}"
    log.error(err_msg)
    bibliographical_details = {}
  else
    book_hash[:publisher] = bibliographical_details[:publisher]
  end

  [:publication, :format, :original_language, :original_title,
   :price, :availability, :last_update, :series].each do |key|
    book_hash[key] = bibliographical_details[key]
  end

  book_hash[:physical_description] = {
    pages:      details_hash[:pages],
    size:       bibliographical_details[:physical_size],
    cover_type: bibliographical_details[:cover_type]
  }

  @book = book_hash
end
load_and_extract_book(uri=nil)
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 19
#
# Loads the page at +uri+ and extracts the book from it.
#
# load_page is always called, even for a nil +uri+. Returns whatever
# extract_book returns, or nil when no uri was given or no page ended up
# in @page.
def load_and_extract_book(uri=nil)
  load_page(uri)

  return nil if uri.nil? || @page.nil?

  extract_book
end
proccess_contributors(raw_contributors)
click to toggle source
Converts the parsed contributors into a hash. The input must already have been processed into the following form: job1: contributor1, contributor2 job2: contributor3. The returned hash has the form: {job1 => [“contributor1”,“contributor2”],job2 => [“contributor3”]}
# File lib/bookshark/extractors/book_extractor.rb, line 28
#
# Groups a flat list of contributors by job.
#
# +raw_contributors+ is a sequence in which a String ending in ":" names the
# job for the entries that follow it, e.g.
#   ["job1:", "contributor1", "contributor2", "job2:", "contributor3"]
# Entries that appear before any job label are filed under the Symbol
# :author; labelled jobs are keyed by the label String without its ":".
#
# Returns a Hash of job => Array of contributors. A job label with no
# entries after it produces no key. When the same label occurs more than
# once, its contributors are accumulated rather than replaced.
# Returns an empty Hash for nil or empty input.
def proccess_contributors(raw_contributors)
  contributors = {}
  return contributors if raw_contributors.nil? || raw_contributors.empty?

  job = :author

  raw_contributors.each do |cb|
    if cb.is_a?(String) && cb.end_with?(':')
      # Job label: strip the trailing colon and switch the current job.
      job = cb[0..-2]
    else
      (contributors[job] ||= []) << cb
    end
  end

  contributors
end
proccess_ddc(ddc, extract_parents = false)
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 96
#
# Splits a DDC label into its id and its name.
#
# +ddc+ is text such as
#   [DDC: 889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
# +extract_parents+ is accepted for compatibility but is not used.
#
# Returns { ddc: id, name: text }, where id is what is left of the
# "[DDC: ...]" part after removing the brackets, the "DDC:" prefix and
# spaces, and text is the label without any [...] group and without any
# parenthesised numeric counter. Parenthesised text that is not numeric is
# kept as part of the name.
def proccess_ddc(ddc, extract_parents = false)
  # Matches the whole "[DDC: digits]" group (dots and spaces allowed inside).
  id_re = /(\[DDC\:\s\d*(?:[\.|\s]\d*)*\])/

  # Matches one [...] group or one (digits) counter at a time, together with
  # the whitespace around it. Deliberately not greedy across groups, so text
  # sitting between two groups is never swallowed.
  non_text_re = /\s*(?:\[[^\]]*\]|\(\s*\d[\d.,\s]*\))\s*/

  # Keep only the id itself from "[DDC: id]".
  ddc_id = ddc.scan(id_re).join.gsub(/[\[\]DDC: ]/, '')

  # Replace each removed group with a single space so neighbouring words do
  # not get glued together, then trim the ends.
  ddc_text = ddc.gsub(non_text_re, ' ').strip

  { ddc: ddc_id, name: ddc_text }
end
proccess_details(details)
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 45
#
# Classifies each raw detail string and collects the recognised values.
#
# +details+ is a collection of detail strings; every entry is passed through
# decode_text before being inspected. nil is treated as "no details".
#
# Returns a Hash that may contain :pages, :publication_year, :isbn_13, :isbn,
# :status, :price and :awards (an Array). A detail that matches none of the
# known shapes is reported with pp as a NoIdeaWhatThisIsError and skipped.
def proccess_details(details)
  details_hash = {}
  return details_hash if details.nil?

  date_regex   = /(^(\d{4})|(, \d{4})$)/
  status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/

  details.each do |raw_detail|
    detail = decode_text(raw_detail)

    if detail.end_with?('σελ.')
      # Must be tested before the date: a page count of 1000 or more also
      # starts with four digits and would otherwise be taken for a year.
      details_hash[:pages] = detail.gsub(/[^\d]/, '')
    elsif detail =~ date_regex
      details_hash[:publication_year] = detail
    elsif detail.start_with?('ISBN-13')
      # Tested before plain 'ISBN', which is a prefix of 'ISBN-13'.
      details_hash[:isbn_13] = detail.gsub(/ISBN-13 /, '').gsub('&Chi', 'X').gsub("\u0081]", '-')
    elsif detail.start_with?('ISBN')
      details_hash[:isbn] = detail.gsub(/ISBN /, '').gsub('&Chi', 'X').gsub("\u0081]", '-')
    elsif detail =~ status_regex
      details_hash[:status] = detail.gsub(/\[|\]/, '')
    elsif detail.start_with?('Τιμή')
      # Keep only digits and the decimal comma.
      details_hash[:price] = detail.gsub(/[^\d,]/, '')
    elsif detail.start_with?('<img src="/images/award.jpg" border="0" title="Βραβείο">')
      (details_hash[:awards] ||= []) << Sanitize.clean(detail).strip
    elsif detail.start_with?('ISMN') # Special typo case
      details_hash[:isbn] = detail.gsub(/ISMN /, '')
    else
      # Report the unknown detail and move on to the next one.
      pp NoIdeaWhatThisIsError.new(@biblionet_id, detail)
    end
  end

  details_hash
end