class Biblionet::Extractors::CategoryExtractor

Attributes

categories[R]

Public Class Methods

new(uri=nil) click to toggle source
Calls superclass method Biblionet::Extractors::Base::new
# File lib/bookshark/extractors/category_extractor.rb, line 12
# Builds the extractor and, when possible, extracts categories right away.
#
# uri - passed straight to the superclass initializer (default: nil).
#
# Extraction is skipped when no uri was given or when @page is still nil
# after the superclass initializer has run.
def initialize(uri=nil)
  super(uri)
  return if uri.nil? || @page.nil?

  extract_categories
end

Public Instance Methods

extract_categories(category_page=@page) click to toggle source
# File lib/bookshark/extractors/category_extractor.rb, line 17
# Parses a category page and builds a hash describing the categories it lists.
#
# Every <a class="menu"> whose href contains '/index/' becomes one entry,
# keyed by the last path segment of its href (a String). Each entry is the
# hash returned by proccess_category for the link text, plus :parent - the
# key of the entry it is nested under (nil until a deeper indent is seen).
# Nesting is inferred from the length of the text node preceding each link.
#
# When present?(@categories) is truthy, a :current entry is added: a copy of
# the entry stored under @biblionet_id.to_s, with :b_id set to @biblionet_id.
#
# category_page - HTML of the page to parse (default: @page).
#
# Returns the categories hash (also kept in @categories), or nil when
# present?(@categories) is falsy.
def extract_categories(category_page=@page)
  page = Nokogiri::HTML(category_page)
  parent = previous_indent = previous_id = nil

  # NOTE(review): the guard looks at @page rather than category_page, so an
  # explicitly passed page is ignored while @page is nil - confirm intended.
  unless @page.nil?
    links = page.xpath("//a[@class='menu' and @href[contains(.,'/index/') ]]")

    @categories = links.each_with_object({}) do |link, categories|
      # The id used by biblionet is the last path segment of the href.
      # --- DDC url http://biblionet.gr/index/id ---
      biblionet_id = link[:href].split(/\//).last

      # The text before <a> is expected to be a run of space characters;
      # its length is the indent. TODO: make sure text is only spaces
      indent = link.previous_sibling.text.size

      # Determine parent-child-sibling relationships based on indent.
      # Indent sizes seem to be inconsistent, so they are only compared with
      # each other, never used as absolute levels. A smaller or equal indent
      # leaves both parent and previous_indent untouched.
      if previous_indent.nil?
        previous_indent = indent
      elsif indent > previous_indent
        parent = previous_id
        previous_indent = indent
      end

      previous_id = biblionet_id

      # Extract DDC id and DDC text, then attach the parent id.
      # A later link with the same id overwrites the earlier entry.
      categories[biblionet_id] = proccess_category(link.text).merge!(parent: parent)
    end
  end

  return nil unless present?(@categories)

  # NOTE(review): this raises when no entry exists for @biblionet_id.to_s,
  # because the lookup then yields nil - confirm the page always lists it.
  @categories[:current] = @categories[@biblionet_id.to_s].clone
  @categories[:current][:b_id] = @biblionet_id
  @categories
end
extract_categories_from(uri=nil) click to toggle source
# File lib/bookshark/extractors/category_extractor.rb, line 58
# Loads the page at the given uri and extracts its categories.
#
# uri - passed to load_page (default: nil).
#
# Returns whatever extract_categories returns, or nil when no uri was given
# or @page is nil after load_page has run.
def extract_categories_from(uri=nil)
  load_page(uri)
  return if uri.nil? || @page.nil?

  extract_categories
end

Private Instance Methods

proccess_category(category) click to toggle source
# File lib/bookshark/extractors/category_extractor.rb, line 66
# Splits a category label into its DDC code and its plain name.
#
# category - label text such as:
#            [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
#
# Returns a hash { ddc: String, name: String }. :ddc is the content of the
# bracketed code without the brackets ("" when there is none); :name is the
# label with every [..] and (digits) group removed, stripped of surrounding
# whitespace.
def proccess_category(category)
  # Matches a bracketed code made of digit groups separated by dots or
  # whitespace, e.g. [889.09300]. The separator class is [.\s]: inside a
  # character class '|' is a literal, not alternation, so it is left out.
  ddc_re = /(\[\d*(?:[.\s]\d*)*\])/

  # Matches [anything] and (digits), together with surrounding whitespace.
  non_text_re = /\s*(\[.*\]|\(\d*\))\s*/

  {
    ddc: category.scan(ddc_re).join.gsub(/[\[\]]/, ''),
    name: category.gsub(non_text_re, '').strip
  }
end