class Biblionet::Extractors::AuthorDataExtractor
Attributes
nodeset[R]
Public Class Methods
new(document)
click to toggle source
# File lib/bookshark/extractors/author_extractor.rb, line 86
# Builds the extractor from the raw HTML of an author page.
#
# Only the slice between the "CONTENT START" and "CONTENT END" HTML comment
# markers (inclusive) is handed to Nokogiri. When the markers are not found,
# the raw document is printed to stdout and Nokogiri::HTML receives nil.
#
# @param document [String] raw HTML of the page
def initialize(document)
  # No need to operate on whole page. Just on part containing the content.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m

  # Match once and reuse the result instead of re-scanning the page.
  match = content_re.match(document)

  puts document if match.nil?

  content = match && match[0]
  @nodeset = Nokogiri::HTML(content)
end
Public Instance Methods
awards()
click to toggle source
# File lib/bookshark/extractors/author_extractor.rb, line 111
# Collects the awards linked from the parsed content.
#
# Every <a class="booklink"> whose href contains "page=showaward" yields one
# entry. The year is made of the digits found in the text of the node that
# immediately follows the link; it is an empty string when the link has no
# following sibling or that sibling contains no digits.
#
# @return [Array<Hash>] entries of the form { name: String, year: String }
def awards
  award_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]")

  award_links.map do |link|
    # A link that is the last child of its parent has no next sibling.
    sibling = link.next_sibling
    year = sibling ? sibling.text.gsub(/\D/, '') : ''

    { name: link.text, year: year }
  end
end
bio()
click to toggle source
# File lib/bookshark/extractors/author_extractor.rb, line 101
# Returns the text of the nodes selected by the 'p[align="justify"]' query
# on the parsed content.
#
# @return [String] text of the selected nodes
def bio
  justified_paragraphs = @nodeset.css('//p[align="justify"]')
  justified_paragraphs.text
end
fullname()
click to toggle source
# File lib/bookshark/extractors/author_extractor.rb, line 97
# Returns the text of the <h1 class="page_title"> heading(s) found in the
# parsed content.
#
# @return [String] text of the selected heading nodes
def fullname
  title_nodes = @nodeset.css('h1.page_title')
  title_nodes.text
end
image()
click to toggle source
# File lib/bookshark/extractors/author_extractor.rb, line 105
# Builds the URL of the image found in the parsed content.
#
# Looks for <img> elements whose src contains "/persons/" and prefixes the
# src of the first match with BASE_URL (a constant defined outside this
# method).
#
# @return [String, nil] BASE_URL + src, or nil when no such image exists
def image
  img_node = @nodeset.xpath("//img[@src[contains(.,'/persons/')]][1]")
  return nil if img_node.nil? || img_node.empty?

  BASE_URL + img_node.first['src']
end