class Biblionet::Extractors::AuthorDataExtractor

Attributes

nodeset[R]

Public Class Methods

new(document)
# File lib/bookshark/extractors/author_extractor.rb, line 86
# Builds the extractor from the raw HTML of an author page.
#
# Only the markup between the "CONTENT START" and "CONTENT END" comment
# markers is handed to Nokogiri, so the rest of the page is never parsed.
#
# @param document [String] HTML source of the page
def initialize(document)
  # Greedy, multi-line pattern: spans from the first start marker to the
  # last end marker.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m

  # Scan the page once and reuse the result instead of re-running the
  # pattern for every check.
  match = content_re.match(document)

  # Diagnostic: dump the page to stdout when the markers are missing.
  puts document if match.nil?

  # Without markers `content` stays nil and is passed to Nokogiri as such.
  content = match[0] unless match.nil?

  @nodeset = Nokogiri::HTML(content)
end

Public Instance Methods

awards()
# File lib/bookshark/extractors/author_extractor.rb, line 111
# Collects the awards listed on the author page.
#
# Every link with class "booklink" whose href contains "page=showaward"
# yields one entry. The year is made of the digits found in the text of the
# node that follows the link.
#
# @return [Array<Hash>] one hash per award with :name (link text) and :year
#   (String of digits; empty when no digits or no following node exist)
def awards
  award_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]")

  award_links.map do |link|
    # The link can be the last node of its parent, in which case there is
    # no sibling to read a year from.
    sibling = link.next_sibling
    year = sibling ? sibling.text.gsub(/\D/, '') : ''

    { name: link.text, year: year }
  end
end
bio()
# File lib/bookshark/extractors/author_extractor.rb, line 101
# Returns the biography text: the text of whatever the
# '//p[align="justify"]' selector matches in the parsed content.
def bio
  justified_paragraphs = @nodeset.css('//p[align="justify"]')
  justified_paragraphs.text
end
fullname()
# File lib/bookshark/extractors/author_extractor.rb, line 97
# Returns the author's full name, read from the text of the
# 'h1.page_title' heading in the parsed content.
def fullname
  title_heading = @nodeset.css('h1.page_title')
  title_heading.text
end
image()
# File lib/bookshark/extractors/author_extractor.rb, line 105
# Returns the URL of the author's picture, or nil when there is none.
#
# Looks for <img> elements whose src contains '/persons/' and prefixes the
# src of the first hit with BASE_URL.
def image
  portraits = @nodeset.xpath("//img[@src[contains(.,'/persons/')]][1]")
  return nil if portraits.nil? || portraits.empty?

  BASE_URL + portraits.first['src']
end