class Biblionet::Extractors::PublisherDataExtractor
Attributes
nodeset[R]
Public Class Methods
new(document)
click to toggle source
# File lib/bookshark/extractors/publisher_extractor.rb, line 48
# Builds the extractor from the raw HTML of a publisher page.
#
# Only the markup between the `<!-- CONTENT START -->` and
# `<!-- CONTENT END -->` markers is parsed, so there is no need to operate on
# the whole page. When the markers are missing the HTML is considered broken
# and +@nodeset+ is left as +nil+.
#
# @param document [String, nil] raw HTML of the page; +nil+ is treated like a
#   document without content markers
def initialize(document)
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m

  # Match once and reuse the result instead of re-running the regex.
  content = content_re.match(document)

  if content
    @nodeset = Nokogiri::HTML(content[0])
  else
    # Report the problem on stderr rather than dumping the whole page to stdout.
    warn 'PublisherDataExtractor: content markers not found in document'
    @nodeset = nil
  end
end
Public Instance Methods
bookstores()
click to toggle source
# File lib/bookshark/extractors/publisher_extractor.rb, line 107
# Extracts the publisher's bookstores from the justified paragraph of the page.
#
# The paragraph's inner HTML is split on `<br><br>` into groups and every
# group on `<br>` into items. A group or item ending in ":" names the
# bookstore that the following items belong to. Every other item is
# classified, in this order, as fax, telephone, address line (matching the
# `regex_tk` pattern), e-mail, website, or - when nothing matches - a plain
# address line.
#
# @return [Hash{String => Hash}] bookstore name => details, where +:address+
#   and +:telephone+ are Arrays of Strings and +:fax+, +:email+ and
#   +:website+ are Strings. Bookstores without any detail are removed.
def bookstores
  regex_tel   = /\d{3,5} \d{5,7}/
  regex_tk    = /\d{3} \d{2}/ # presumably a postal code ("T.K.") - TODO confirm
  regex_email = /([\w+\-].?)+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+/i
  regex_url   = /((http(?:s)?\:\/\/)?[a-zA-Z0-9\-]+(?:\.[a-zA-Z0-9\-]+)*\.[a-zA-Z]{2,6}(?:\/?|(?:\/[\w\-]+)*)(?:\/?|\/\w+\.[a-zA-Z]{2,4}(?:\?[\w]+\=[\w\-]+)?)?(?:\&[\w]+\=[\w\-]+)*)/ix
  # Everything that is neither a digit nor a space is dropped from phone and
  # fax numbers. (The previous class `[^\d{3} \d{2}]` also kept literal braces.)
  regex_not_phone = /[^\d ]/

  bookstores_hash = Hash.new { |h, k| h[k] = {} }
  address_array = []
  tel_array = []

  # Default key in case there is none.
  key = 'Βιβλιοπωλείο'

  groups = @nodeset.css('//p[align="justify"]').inner_html
                   .split('<br><br>').map(&:strip).reject(&:empty?)

  groups.each do |item_group|
    # A group consisting only of a "Name:" heading starts a new bookstore.
    if item_group.end_with?(':')
      key = item_group[0..-2]
      address_array = []
      tel_array = []
      next
    end

    # A new group without a heading while the current key already holds data
    # gets a derived key so it does not overwrite the previous bookstore.
    # NOTE(review): when the key already ends in a digit the next number is
    # appended ("X 2" -> "X 2 3") rather than replaced - confirm this is intended.
    if bookstores_hash[key].any?
      last_digit = key[-1].to_i
      key += last_digit > 0 ? " #{last_digit + 1}" : ' 2'
      address_array = []
      tel_array = []
    end

    item_group.split('<br>').each do |item|
      if item.end_with?(':')
        key = item[0..-2]
        address_array = []
        tel_array = []
      elsif item.start_with?('Fax', 'fax') && item =~ regex_tel
        bookstores_hash[key][:fax] = item.gsub(regex_not_phone, '').strip
      elsif item =~ regex_tel
        tel_array << item.gsub(regex_not_phone, '').strip
        bookstores_hash[key][:telephone] = tel_array
      elsif item =~ regex_tk
        address_array << item.gsub(/,$/, '').strip
        bookstores_hash[key][:address] = address_array
      elsif item =~ regex_email
        bookstores_hash[key][:email] = regex_email.match(item)[0]
      elsif item =~ regex_url
        bookstores_hash[key][:website] = item[regex_url, 1]
      else
        address_array << item.gsub(/,$/, '').strip
        bookstores_hash[key][:address] = address_array
      end
    end
  end

  # Reading a key through the default block creates an empty entry; drop those.
  bookstores_hash.delete_if { |_name, details| details.empty? }
  bookstores_hash
end
headquarters()
click to toggle source
# File lib/bookshark/extractors/publisher_extractor.rb, line 72
# Reads the publisher's headquarters details from the rows of the first
# `book_details` table.
#
# Each row contributes a label (first child, trailing ":" removed) and a
# value (second child). A row whose label does not end in ":" continues the
# previous label, and its value is appended to that label's list. Labels are
# then translated to the same symbol keys that #bookstores uses; a label that
# is not in the mapping ends up under the +nil+ key.
#
# @return [Hash] +:address+ is always an Array (`[nil]` when no address was
#   found), +:telephone+ is an Array when present, +:website+ becomes an
#   Array when it holds a comma-separated list; other values are Strings
#   unless several rows shared the label.
def headquarters
  label_map = {
    "Διεύθυνση" => :address,
    "Τηλ" => :telephone,
    "FAX" => :fax,
    "E-mail" => :email,
    "Web site" => :website
  }

  collected = {}
  bucket = []
  previous_label = nil

  @nodeset.xpath("//table[@class='book_details'][1]//tr").each do |row|
    raw_label = row.children[0].text.strip
    label = raw_label.end_with?(':') ? raw_label[0..-2] : previous_label
    raw_value = row.children[1].text.strip
    row_blank = raw_label.empty? && raw_value.empty?

    unless row_blank
      if label == previous_label
        # Continuation of the previous label: turn the stored value into a list.
        bucket << collected[label] unless collected[label].is_a?(Array)
        bucket << raw_value.gsub(/,$/, '').strip unless raw_value.empty?
        collected[label] = bucket
      else
        bucket = []
        collected[label] = raw_value.gsub(/,$/, '').strip
      end
    end

    previous_label = label
  end

  # Change keys. Use the same as in bookstores.
  details = collected.each_with_object({}) do |(label, value), acc|
    acc[label_map[label]] = value
  end

  phone = details[:telephone]
  details[:telephone] = [phone] unless phone.nil? || phone.is_a?(Array)

  site = details[:website]
  details[:website] = site.split(',').map(&:strip) if site && site.include?(',')

  details[:address] = [details[:address]] unless details[:address].is_a?(Array)

  details
end
name()
click to toggle source
# File lib/bookshark/extractors/publisher_extractor.rb, line 64
# Returns the text of the `h1.page_title` element(s) with surrounding
# whitespace removed.
#
# @return [String]
def name
  title_nodes = @nodeset.css('h1.page_title')
  title_nodes.text.strip
end
owner()
click to toggle source
# File lib/bookshark/extractors/publisher_extractor.rb, line 68
# Returns the stripped text found between the page title and the details
# table: the text nodes that follow the first `page_title` h1 intersected
# with the text nodes that precede the first `book_details` table.
# Presumably this is the publisher's owner - confirm against the page layout.
#
# @return [String]
def owner
  after_title = @nodeset.xpath("//h1[@class='page_title'][1]/following::text()")
  before_table = @nodeset.xpath("//table[@class='book_details'][1]/preceding::text()")

  (after_title & before_table).text.strip
end