class Biblionet::Extractors::BookDataExtractor

Attributes

nodeset[R]

Public Class Methods

new(document) click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 251
def initialize(document)
  # No need to operate on whole page. Just on part containing the book.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
  if (content_re.match(document)).nil?
    puts document
  end
  content = content_re.match(document)[0] unless (content_re.match(document)).nil?

  # If content is nil, there is something wrong with the html, so return nil
  if content.nil?
    @nodeset = nil
  else
    @nodeset = Nokogiri::HTML(content)
  end
end

Public Instance Methods

awards() click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 373
# Collects the awards listed for the book.
#
# Each award comes from a "booklink" anchor whose href contains
# "page=showaward". The year is made of the digits found in the node that
# immediately follows the anchor.
#
# @return [Array<Hash>] one hash per award with :name and :year; :year is ''
#   when the anchor has no following node or that node holds no digits
def awards
  @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]").map do |item|
    following = item.next_sibling
    # The anchor may be the last node of its parent; treat that as "no year".
    year_text = following.nil? ? '' : following.text
    { name: item.text, year: year_text.strip.gsub(/[^\d]/, '') }
  end
end
collective_work?() click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 356
# Tells whether the book is marked as a collective work.
#
# Checks the text of the element that contains the h1.book_title heading for
# the Greek marker 'Συλλογικό έργο' ("collective work").
#
# @return [Boolean]
def collective_work?
  title_container = @nodeset.at_css('h1.book_title').parent
  title_container.text.include?('Συλλογικό έργο')
end
contributors() click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 302
# Lists the people linked from the book, together with role labels.
#
# Walks every "booklink" anchor whose href contains "/author/". When the text
# right before an anchor ends with ':' it is pushed as a plain String ahead
# of the person it introduces. Every anchor then contributes a hash.
#
# @return [Array<String, Hash>] label strings interleaved with hashes holding
#   :name (anchor text) and :b_id (third "/"-separated segment of the href)
def contributors
  contributors = []
  @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
    preceding = item.previous
    # The anchor may be the first node of its parent, so there is no label.
    pre_text = preceding.nil? ? '' : preceding.text.strip
    # Only text ending with ':' is a label; separators such as ',' are skipped.
    contributors << pre_text if pre_text.end_with?(':')
    contributors << { name: item.text, b_id: item[:href].split('/')[2] }
  end
  # Alternative way based on intersecting sets (unused sketch, kept for reference)
  # set_A = "//a[@class='booklink' and @href[contains(.,'/com/') ]][1]/preceding::text()"
  # set_B = "//a[@class='booklink' and @href[not(contains(.,'/com/')) ]][1]/following::text()"

  # others = book.xpath("#{set_A}[count(.|#{set_B}) = count(#{set_B})]").map do |other|
  #           text = other.inner_text.strip
  #           other = text == "," ? nil : text
  #         end.compact
  contributors
end
ddcs() click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 352
# Selects the "subjectlink" anchors whose href contains "/index/".
# NOTE(review): presumably these are the book's DDC subject entries - confirm.
#
# @return the result of @nodeset.xpath for that query (the matching anchors)
def ddcs
  subject_links = "//a[@class='subjectlink' and @href[contains(.,'/index/') ]]"
  @nodeset.xpath(subject_links)
end
description() click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 341
# Extracts the book description from the last <p> element.
#
# The paragraph's inner HTML is passed through Sanitize.clean with only <br>
# elements allowed.
#
# @return [String, nil] the cleaned description, or nil when there is no <p>
#   element or the text has no run of three or more word characters
def description
  last_paragraph = @nodeset.css('p').last
  # A page without any <p> has nothing to extract.
  return nil if last_paragraph.nil?

  desc = Sanitize.clean(last_paragraph.inner_html, elements: ['br'])

  # Text without at least three consecutive word characters counts as empty.
  if (desc =~ /\p{Word}{3,}/).nil?
    nil
  else
    desc
  end
end
details() click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 323
# Splits the book details into a list of trimmed, non-empty entries.
#
# The first ".book_details" element is parsed; when it yields no entries the
# second one is tried. The inner HTML is cut at "<br>" and at the commas
# selected by the separator pattern below.
#
# The previous fallback tested for nil, which the parsing chain can never
# produce, so the second element was never consulted.
#
# NOTE(review): the `\D,` and `,\D` alternatives replace the neighbouring
# non-digit character together with the comma - confirm this is intended.
#
# @return [Array<String>] the entries, or [] when nothing could be extracted
def details
  separator = /(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/

  @nodeset.css('.book_details').first(2).each do |node|
    entries = node.inner_html
                  .gsub(separator, '<br>')
                  .split('<br>')
                  .map(&:strip)
                  .reject(&:empty?)
    return entries unless entries.empty?
  end

  []
end
has_contributors_but_no_authors?() click to toggle source

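Special case: the book has contributors but no authors. Returns true when the text found between the book title heading and the first "/author/" link ends with a colon.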

# File lib/bookshark/extractors/book_extractor.rb, line 361
# Detects books that list contributors but no plain authors.
#
# Intersects the text nodes following the h1.book_title heading with the text
# nodes preceding the "/author/" booklink selected by the second query, then
# inspects the stripped text of that intersection.
#
# @return [Boolean] true when that text is non-empty and ends with ':'
def has_contributors_but_no_authors?
  after_title = @nodeset.xpath("//h1[@class='book_title']/following::text()")
  before_author = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]][1]/preceding::text()")
  between = (after_title & before_author).text.strip

  !between.empty? && between.end_with?(':')
end
image() click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 267
# Builds the absolute URL of the book cover image.
#
# Takes the first <img> whose src contains "/covers/" and prefixes its src
# with BASE_URL. The query is absolute, so it is run once against the
# document rather than once per image.
#
# @return [String, nil] the cover URL, or nil when no such image exists
def image
  cover = @nodeset.xpath("//img[@src[contains(.,'/covers/')]][1]").first
  cover.nil? ? nil : BASE_URL + cover[:src]
end
publisher() click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 293
# Reads the publisher from the "booklink" anchors whose href contains "/com/".
#
# Every matching anchor overwrites the same keys, so the last one wins.
#
# @return [Hash] :name (anchor text) and :b_id (third "/"-separated segment
#   of the href); an empty hash when no anchor matches
def publisher
  company_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]]")
  company_links.each_with_object({}) do |link, info|
    info[:name] = link.text
    info[:b_id] = link[:href].split("/")[2]
  end
end
subtitle() click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 283
# Extracts the subtitle that follows the book title.
#
# A subtitle is the node right after a <br> element that itself directly
# follows an h1.book_title heading, unless that node is another <br>. When
# several headings qualify, the last one wins.
#
# @return [String, nil] the stripped subtitle text, or nil when none is found
def subtitle
  subtitle = nil
  @nodeset.xpath("//h1[@class='book_title']").each do |item|
    line_break = item.next_element
    # The heading may have no following element at all.
    next if line_break.nil? || line_break.name != 'br'

    candidate = line_break.next
    # Nothing after the <br>, or a second <br>, means there is no subtitle.
    next if candidate.nil? || candidate.name == 'br'

    subtitle = candidate.text.strip
  end
  subtitle
end
title() click to toggle source
# File lib/bookshark/extractors/book_extractor.rb, line 279
# Returns the text of the h1.book_title heading(s).
#
# @return [String] text of everything matched by the 'h1.book_title' selector
def title
  headings = @nodeset.css('h1.book_title')
  headings.text
end