class Biblionet::Extractors::BookDataExtractor
Attributes
nodeset[R]
Public Class Methods
new(document)
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 251
# Builds the extractor from the raw HTML of a book page.
#
# Only the markup between the "CONTENT START" and "CONTENT END" comment
# markers is handed to the HTML parser. When the markers cannot be found the
# whole document is printed to standard output and @nodeset is left nil.
#
# @param document raw HTML of the page (matched against a regexp, so
#   presumably a String -- confirm with callers)
def initialize(document)
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
  found = content_re.match(document)

  if found.nil?
    # Without the markers there is nothing to parse.
    puts document
    @nodeset = nil
  else
    @nodeset = Nokogiri::HTML(found[0])
  end
end
Public Instance Methods
awards()
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 373
# Collects the awards linked from the book page.
#
# Every <a class="booklink"> whose href contains "page=showaward" yields one
# hash. The year is built from the digits of the node that directly follows
# the link.
#
# @return [Array<Hash>] one hash per award link with :name (link text) and
#   :year (digits as a String; empty when the link has no following sibling)
def awards
  award_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]")

  award_links.map do |link|
    # A link that is the last node of its parent has no sibling to read the
    # year from; report an empty year instead of raising NoMethodError.
    trailing = link.next_sibling
    year = trailing.nil? ? '' : trailing.text.strip.gsub(/[^\d]/, '')
    { name: link.text, year: year }
  end
end
collective_work?()
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 356
# Tells whether the text of the element that contains the book title
# (h1.book_title) includes the marker 'Συλλογικό έργο'.
#
# @return [Boolean]
def collective_work?
  title_container = @nodeset.at_css('h1.book_title').parent
  title_container.text.include?('Συλλογικό έργο')
end
contributors()
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 302
# Lists the contributors linked from the book page, in document order.
#
# Every <a class="booklink"> whose href contains "/author/" yields a hash with
# :name (link text) and :b_id (third "/"-separated segment of the href). When
# the text right before a link ends with ':' (presumably a role label --
# confirm against real pages) that text is added just before the hash.
#
# @return [Array<String, Hash>] labels and contributor hashes, interleaved
def contributors
  author_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]")

  author_links.each_with_object([]) do |link, collected|
    # A link that is the first child of its parent has no preceding node;
    # treat that as "no label" instead of raising NoMethodError.
    preceding = link.previous
    label = preceding.nil? ? '' : preceding.text.strip

    # Separators such as ',' never end with ':', so this single check is
    # equivalent to the former "unless ',' or not ending with ':'" condition.
    collected << label if label.end_with?(':')
    collected << { name: link.text, b_id: link[:href].split('/')[2] }
  end

  # Alternative way based on intersecting sets
  # set_A = "//a[@class='booklink' and @href[contains(.,'/com/') ]][1]/preceding::text()"
  # set_B = "//a[@class='booklink' and @href[not(contains(.,'/com/')) ]][1]/following::text()"
  # others = book.xpath("#{set_A}[count(.|#{set_B}) = count(#{set_B})]").map do |other|
  #   text = other.inner_text.strip
  #   other = text == "," ? nil : text
  # end.compact
end
ddcs()
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 352
# Selects every <a class="subjectlink"> whose href contains "/index/".
#
# @return the node set produced by the XPath query (nodes, not parsed values)
def ddcs
  subject_link_query = "//a[@class='subjectlink' and @href[contains(.,'/index/') ]]"
  @nodeset.xpath(subject_link_query)
end
description()
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 341
# Extracts the description from the last <p> element of the page.
#
# The paragraph's inner HTML is sanitized so that only <br> elements remain.
# A result without at least three consecutive word characters is treated as
# "no description".
#
# @return [String, nil] the sanitized description, or nil when it holds no
#   run of three or more word characters
def description
  raw_html = @nodeset.css('p').last.inner_html
  cleaned = Sanitize.clean(raw_html, elements: ['br'])
  cleaned =~ /\p{Word}{3,}/ ? cleaned : nil
end
details()
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 323
# Splits the book details block into a list of trimmed, non-empty fragments.
#
# The first `.book_details` element is used. When it yields no fragments the
# second one is tried. The former fallback tested `details.nil?`, which could
# never be true because `reject` always returns an array, so it never ran.
#
# @return [Array<String>] the fragments; empty when no `.book_details`
#   element exists or none of the first two yields any text
def details
  # Certain commas are turned into <br> so that one split handles both.
  # NOTE(review): the match includes the non-digit character next to the
  # comma, so that character is replaced as well; the lookahead looks meant to
  # skip commas inside square brackets -- confirm before changing the pattern.
  separator = /(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/

  @nodeset.css('.book_details').first(2).each do |node|
    fragments = node.inner_html
                    .gsub(separator, '<br>')
                    .split('<br>')
                    .map(&:strip)
                    .reject(&:empty?)
    return fragments unless fragments.empty?
  end

  []
end
image()
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 267
# Builds the absolute URL of the book cover image.
#
# The cover is the first <img> in the document whose src contains "/covers/".
# The query is absolute, so it runs once; the former version re-ran the same
# document-wide query for every <img> on the page.
#
# @return [String, nil] BASE_URL followed by the image's src, or nil when no
#   cover image is present
def image
  cover = @nodeset.at_xpath("//img[@src[contains(.,'/covers/')]][1]")
  cover.nil? ? nil : BASE_URL + cover[:src]
end
publisher()
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 293
# Describes the publisher linked from the book page.
#
# Publisher links are <a class="booklink"> elements whose href contains
# "/com/". When several match, the last one in the document is used.
#
# @return [Hash] :name (link text) and :b_id (third "/"-separated segment of
#   the href); an empty hash when no publisher link exists
def publisher
  publisher_link = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]]").last
  return {} if publisher_link.nil?

  {
    name: publisher_link.text,
    b_id: publisher_link[:href].split("/")[2]
  }
end
subtitle()
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 283
# Extracts the subtitle that follows the book title.
#
# A subtitle is recognised when the element after <h1 class="book_title"> is
# a <br> and the node after that <br> is not another <br>; that node's
# stripped text is the subtitle. With several matching headings the last one
# wins.
#
# @return [String, nil] the subtitle, or nil when the page has none
def subtitle
  found = nil

  @nodeset.xpath("//h1[@class='book_title']").each do |heading|
    # Either neighbour may be missing (heading or <br> at the end of its
    # parent); skip such headings instead of raising NoMethodError.
    line_break = heading.next_element
    next if line_break.nil? || line_break.name != 'br'

    candidate = line_break.next
    next if candidate.nil? || candidate.name == 'br'

    found = candidate.text.strip
  end

  found
end
title()
click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 279
# Returns the text of the h1.book_title element(s) of the page.
#
# @return [String] the title text (empty when no such heading exists)
def title
  title_nodes = @nodeset.css('h1.book_title')
  title_nodes.text
end