class Biblionet::Extractors::CategoryExtractor
Attributes
categories[R]
Public Class Methods
new(uri=nil)
click to toggle source
Calls superclass method
Biblionet::Extractors::Base::new
# File lib/bookshark/extractors/category_extractor.rb, line 12
# Builds the extractor. The uri is handed to the superclass constructor
# first; categories are then extracted immediately, but only when a uri
# was supplied and @page is set afterwards.
# NOTE(review): @page is presumably populated by the superclass
# constructor -- confirm against Biblionet::Extractors::Base.
def initialize(uri=nil)
  super(uri)
  # Nothing to extract without a uri or without a loaded page.
  return if uri.nil? || @page.nil?

  extract_categories
end
Public Instance Methods
extract_categories(category_page=@page)
click to toggle source
# File lib/bookshark/extractors/category_extractor.rb, line 17
# Extracts the category tree from a biblionet category page.
#
# Every <a class="menu"> whose href contains '/index/' becomes one entry,
# keyed by the id found at the end of its href:
#
#   { "<biblionet id>" => { ddc: ..., name: ..., parent: <id or nil> }, ... }
#
# When the extractor's own @biblionet_id is among the entries, a copy of
# that entry (plus :b_id) is stored under :current.
#
# category_page - the page markup to parse (defaults to @page).
#
# Stores the result in @categories and returns it, or returns nil when
# there is no page or nothing was extracted.
def extract_categories(category_page=@page)
  # Guard on the argument that is actually parsed, not on @page.
  return nil if category_page.nil?

  page = Nokogiri::HTML(category_page)
  parent = nil
  previous_indent = nil
  previous_id = nil

  links = page.xpath("//a[@class='menu' and @href[contains(.,'/index/') ]]")
  @categories = links.each_with_object({}) do |link, categories|
    # The id used by biblionet is the last path segment of the href
    # (DdC url: http://biblionet.gr/index/id).
    biblionet_id = link[:href].split(/\//).last

    # The text node before the <a> is expected to hold only indentation
    # spaces. TODO: make sure the text is only spaces.
    # A link without a preceding node is treated as having no indentation.
    sibling = link.previous_sibling
    indent = sibling ? sibling.text.size : 0

    # Parent/child/sibling relationships are inferred from the indent.
    # Indent sizes seem to be inconsistent, so they are only compared with
    # each other, never used as absolute levels.
    # NOTE(review): a smaller indent than the previous one does not move
    # back up the tree; the parent is kept as-is in that case.
    if previous_indent.nil?
      previous_indent = indent
    elsif indent > previous_indent
      parent = previous_id
      previous_indent = indent
    end
    previous_id = biblionet_id

    # Extract DdC id and DdC text, then record the parent.
    categories[biblionet_id] = proccess_category(link.text).merge(parent: parent)
  end

  return nil unless present?(@categories)

  # Only expose :current when the extractor's own id was actually found;
  # otherwise there is no entry to copy.
  current = @categories[@biblionet_id.to_s]
  unless current.nil?
    @categories[:current] = current.clone
    @categories[:current][:b_id] = @biblionet_id
  end

  @categories
end
extract_categories_from(uri=nil)
click to toggle source
# File lib/bookshark/extractors/category_extractor.rb, line 58
# Loads the page for the given uri via load_page and then extracts its
# categories. Returns the result of extract_categories, or nil when no uri
# was given or @page is not set after loading.
# NOTE(review): load_page is called even when uri is nil -- presumably it
# tolerates that; confirm against its definition.
def extract_categories_from(uri=nil)
  load_page(uri)
  # Nothing to extract without a uri or without a loaded page.
  return nil if uri.nil? || @page.nil?

  extract_categories
end
Private Instance Methods
proccess_category(category)
click to toggle source
# File lib/bookshark/extractors/category_extractor.rb, line 66
# Splits a category label into its DDC code and its name.
#
# Expected label shape:
#   "[889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)"
#
# category - the label text (nil is treated as an empty label).
#
# Returns a Hash { ddc: String, name: String }. :ddc is the content of the
# bracketed groups made of digits, dots and whitespace (brackets removed);
# :name is the label with every [...] group and every (digits) group
# stripped.
def proccess_category(category)
  label = category.to_s

  # Captures what is inside [] when it consists only of digits, dots and
  # whitespace, e.g. "889.09300" from "[889.09300]".
  ddc_re = /\[([\d.\s]*)\]/
  # Matches one [...] group or one (digits) group plus the whitespace around
  # it. The bracket part stops at the first "]" so that text lying between
  # two bracketed groups is not swallowed.
  non_text_re = /\s*(?:\[[^\]]*\]|\(\d*\))\s*/

  category_ddc = label.scan(ddc_re).join
  # Replace with a single space (not nothing) so the words on either side of
  # a removed group are not glued together; the outer spaces are stripped.
  category_name = label.gsub(non_text_re, ' ').strip

  { ddc: category_ddc, name: category_name }
end