class Biblionet::Extractors::PublisherDataExtractor

Attributes

nodeset[R]

Public Class Methods

new(document) click to toggle source
# File lib/bookshark/extractors/publisher_extractor.rb, line 48
def initialize(document)
  # No need to operate on whole page. Just on part containing the content.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
  if (content_re.match(document)).nil?
    puts document
  end
  content = content_re.match(document)[0] unless (content_re.match(document)).nil?

  # If content is nil, there is something wrong with the html, so return nil
  if content.nil?
    @nodeset = nil
  else
    @nodeset = Nokogiri::HTML(content)
  end                 
end

Public Instance Methods

bookstores() click to toggle source
# File lib/bookshark/extractors/publisher_extractor.rb, line 107
# Extracts the bookstores listed in the justified paragraph of the page.
#
# The paragraph's inner HTML is split into groups on "<br><br>" and each
# group into items on "<br>". A group or item ending with ":" names the
# bookstore that the following items belong to. Every other item is
# classified, in this order, as fax, telephone, address line (postal code),
# email, website or, failing all of those, a plain address line.
#
# When a group starts while the current bookstore already holds data, a new
# entry is opened by numbering the key: "X" is followed by "X 2", "X 3", ...
#
# @return [Hash{String => Hash}] bookstore name => details, where details
#   may hold :address (Array), :telephone (Array), :fax, :email and :website
#   (Strings). Bookstores without any details are dropped.
def bookstores
  bookstores_hash = Hash.new { |h, k| h[k] = {} }
  address_array   = []
  tel_array       = []

  regex_tel   = /\d{3,5} \d{5,7}/
  regex_tk    = /\d{3} \d{2}/
  regex_email = /([\w+\-].?)+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+/i
  regex_url   = /((http(?:s)?\:\/\/)?[a-zA-Z0-9\-]+(?:\.[a-zA-Z0-9\-]+)*\.[a-zA-Z]{2,6}(?:\/?|(?:\/[\w\-]+)*)(?:\/?|\/\w+\.[a-zA-Z]{2,4}(?:\?[\w]+\=[\w\-]+)?)?(?:\&[\w]+\=[\w\-]+)*)/ix
  # Everything except digits and spaces is stripped from phone and fax items.
  regex_not_phone = /[^\d ]/

  # Default key in case there is none.
  key = 'Βιβλιοπωλείο'

  item_groups = @nodeset.css('//p[align="justify"]').inner_html
                        .split('<br><br>').map(&:strip).reject(&:empty?)

  item_groups.each do |item_group|
    if item_group.end_with?(':')
      key           = item_group[0..-2]
      address_array = []
      tel_array     = []
      next
    end

    if bookstores_hash[key].any?
      # The current bookstore already has data, so this group is another
      # store. Bump an existing trailing number, otherwise start at 2.
      key = if key =~ / \d+\z/
              key.sub(/\d+\z/) { |number| (number.to_i + 1).to_s }
            else
              "#{key} 2"
            end
      address_array = []
      tel_array     = []
    end

    item_group.split('<br>').each do |item|
      if item.end_with?(':')
        key           = item[0..-2]
        address_array = []
        tel_array     = []
      elsif item.start_with?('Fax', 'fax') && item =~ regex_tel
        bookstores_hash[key][:fax]       = item.gsub(regex_not_phone, '').strip
      elsif item =~ regex_tel
        tel_array << item.gsub(regex_not_phone, '').strip
        bookstores_hash[key][:telephone] = tel_array
      elsif item =~ regex_tk
        address_array << item.gsub(/,$/, '').strip
        bookstores_hash[key][:address]   = address_array
      elsif item =~ regex_email
        bookstores_hash[key][:email]     = regex_email.match(item)[0]
      elsif item =~ regex_url
        bookstores_hash[key][:website]   = item[regex_url, 1]
      else
        address_array << item.gsub(/,$/, '').strip
        bookstores_hash[key][:address]   = address_array
      end
    end
  end

  # Looking up a key creates an empty entry; drop the ones never filled.
  bookstores_hash.delete_if { |_name, details| details.empty? }
  bookstores_hash
end
headquarters() click to toggle source
# File lib/bookshark/extractors/publisher_extractor.rb, line 72
# Extracts the headquarters details from the rows of the first
# "book_details" table.
#
# Each row is read as a label cell and a value cell. A label ending with ":"
# starts a new entry; a row without such a label continues the previous
# entry, turning its value into an Array of lines. Labels are then renamed
# to the same keys that #bookstores uses.
#
# @return [Hash] with :address (always an Array, empty when the table has no
#   address row), :telephone (Array when present), :fax, :email and :website
#   (String, or Array when the value is a comma-separated list). Labels
#   missing from the mapping table end up under the nil key.
def headquarters
  headquarters_hash = {}
  temp_array        = []
  current_key       = nil
  last_key          = nil

  @nodeset.xpath("//table[@class='book_details'][1]//tr").each do |item|
    key         = item.children[0].text.strip
    current_key = key.end_with?(':') ? key[0..-2] : last_key
    value       = item.children[1].text.strip

    unless key.empty? && value.empty?
      if current_key == last_key
        # Continuation row: seed the list with the value stored so far.
        temp_array << headquarters_hash[current_key] unless headquarters_hash[current_key].is_a?(Array)
        temp_array << value.gsub(/,$/, '').strip unless value.empty?
        headquarters_hash[current_key] = temp_array
      else
        temp_array                     = []
        headquarters_hash[current_key] = value.gsub(/,$/, '').strip
      end
    end

    last_key = current_key
  end

  # Change keys. Use the same as in bookstores.
  mappings = {
    'Διεύθυνση' => :address,
    'Τηλ'       => :telephone,
    'FAX'       => :fax,
    'E-mail'    => :email,
    'Web site'  => :website
  }
  headquarters_hash = Hash[headquarters_hash.map { |k, v| [mappings[k], v] }]

  telephone = headquarters_hash[:telephone]
  headquarters_hash[:telephone] = [telephone] unless telephone.nil? || telephone.is_a?(Array)

  website = headquarters_hash[:website]
  headquarters_hash[:website] = website.split(',').map(&:strip) if website && website.include?(',')

  # Array() keeps an Array as is, wraps a single String and turns a missing
  # address into [] rather than [nil].
  headquarters_hash[:address] = Array(headquarters_hash[:address])

  headquarters_hash
end
name() click to toggle source
# File lib/bookshark/extractors/publisher_extractor.rb, line 64
# Returns the stripped text of the nodes matching "h1.page_title".
#
# @return [String]
def name
  title_nodes = @nodeset.css('h1.page_title')
  title_text  = title_nodes.text
  title_text.strip
end
owner() click to toggle source
# File lib/bookshark/extractors/publisher_extractor.rb, line 68
# Returns the stripped text found between the first page title heading and
# the first "book_details" table: the text nodes following the heading,
# intersected with the text nodes preceding the table.
#
# @return [String]
def owner
  after_title  = @nodeset.xpath("//h1[@class='page_title'][1]/following::text()")
  before_table = @nodeset.xpath("//table[@class='book_details'][1]/preceding::text()")
  between      = after_title & before_table
  between.text.strip
end