class Biblionet::Extractors::BibliographicalBookDataExtractor

Attributes

nodeset[R]

Public Class Methods

new(document) click to toggle source
# File lib/bookshark/extractors/bibliographical_book_extractor.rb, line 59
def initialize(document)
  # No need to operate on whole page. Just on part containing the book.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
  if (content_re.match(document)).nil?
    puts document
  end
  content = content_re.match(document)[0] unless (content_re.match(document)).nil?

  # If content is nil, there is something wrong with the html, so return nil
  if content.nil?
    @nodeset = nil
  else
    @nodeset = Nokogiri::HTML(content)
  end
end

Public Instance Methods

details() click to toggle source
# File lib/bookshark/extractors/bibliographical_book_extractor.rb, line 85
# Collects the bibliographical details of the book held in @nodeset.
#
# Three parts of the page are read:
# * the first <span class="small">, split on "<br>", for ISBNs, last update,
#   cover type, availability, price, original language and original title;
# * the text preceding that span, for series, physical size and (when the
#   publisher is a 'booklink') edition number and place of publication;
# * the text around the title / publisher / author links, for format,
#   publisher and publication year.
#
# @return [Hash] always contains :series, :physical_size, :format, :publisher
#   and :publication; :isbn, :isbn_13, :last_update, :cover_type,
#   :availability, :price, :original_language and :original_title are present
#   only when the corresponding text was found.
# @raise [NoMethodError] when @nodeset is nil or when the page has no
#   '/book/' booklink anchor followed by a sibling node (no guard is applied).
def details
  details_hash = {}

  # Patterns probed against every line of the details span. A key is written
  # only when its pattern matches, so a value found on an earlier line
  # survives lines that do not mention it.
  line_patterns = {
    isbn:         /(?<= )\d+-\d+-\d+-\d+(?= |,)/,
    isbn_13:      /\d+-\d+-\d+-\d+-\d+/,
    last_update:  /\d{1,2}\/\d{1,2}\/\d{2,4}/,
    cover_type:   /(?<=\()\p{Word}+( \p{Word}+)?(?=\))/,
    availability: /(?<=\[).+(?=\])/,
    price:        /(?<=€ )\d+,\d*/
  }

  @nodeset.xpath("//span[@class='small'][1]").inner_html.split('<br>').each do |detail|
    detail = BibliographicalBookExtractor.decode_text(detail)

    if detail.start_with? "Γλώσσα πρωτοτύπου:"
      details_hash[:original_language] = detail.gsub(/Γλώσσα πρωτοτύπου:/, "").strip
    elsif detail.start_with? "Τίτλος πρωτοτύπου:"
      details_hash[:original_title] = detail.gsub(/Τίτλος πρωτοτύπου:/, "").strip
    end

    line_patterns.each do |key, pattern|
      matched = detail[pattern]
      details_hash[key] = matched if matched
    end
  end

  # Splits an availability carrying a date, like "Υπό Έκδοση 4/2017", into
  # availability and last update.
  availability = details_hash[:availability]
  unless availability.nil?
    details_hash[:availability] = "Υπό Έκδοση" if availability.include? "Υπό Έκδοση"

    if details_hash[:last_update].nil?
      last_update = availability[/(\d{1,2}\/)?\d{1,2}\/\d{2,}/]
      if last_update
        # A two-part date has no day component; it is prefixed with "10/".
        last_update = "10/" + last_update if last_update.split('/').length == 2
        details_hash[:last_update] = last_update
      end
    end
  end

  pre_details_text = @nodeset.xpath("//span[@class='small'][1]/preceding::text()").text
  pre_details_text = BibliographicalBookExtractor.decode_text(pre_details_text)

  series_regex        = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)* · \d+(?=\))/
  series_regex_no_vol = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)*(?=\))/
  series_name_regex   = /\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= ·)/
  series_volume_regex = /(?<=· )\d+/
  physical_size_regex = /\d+x\d+/

  # Prefer the "(Name · volume)" form; fall back to a bare "(Name)".
  series_hash = {}
  if (series = pre_details_text[series_regex])
    series_name   = series[series_name_regex]
    series_volume = series[series_volume_regex]
    series_hash[:name]   = series_name if series_name
    series_hash[:volume] = series_volume if series_volume
  elsif (series = pre_details_text[series_regex_no_vol])
    series_hash[:name]   = series
    series_hash[:volume] = nil
  end

  details_hash[:series] = series_hash

  details_hash[:physical_size] = pre_details_text[physical_size_regex]

  # The format, when given, is the bracketed text right after the title link;
  # otherwise the default 'Βιβλίο' is used.
  format_regex = /(?<=\[).+(?=\])/

  after_title_text = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/book/') ]][1]").first.next_sibling.text.strip
  details_hash[:format] = after_title_text[format_regex] || 'Βιβλίο'

  publisher_node = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]][1]").first
  if !publisher_node.nil?
    publisher_hash = {}
    publisher_hash[:text] = publisher_node.text
    publisher_hash[:b_id] = (publisher_node[:href].split("/"))[2]

    # Only the text following the publisher link is needed. The text before
    # it was never used, and reading it failed when the link had no previous
    # sibling, so it is no longer fetched.
    after_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.next_sibling.text)

    publication_hash = {}
    publication_hash[:year] = after_publisher_text[/(?<=, )\d+(?=\.)/]
    publication_hash[:version] = pre_details_text[/(?<=- )\d+(?=η)/]
    publication_hash[:place] = pre_details_text[/(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/]

    details_hash[:publisher] = publisher_hash
    details_hash[:publication] = publication_hash
  else
    # No 'booklink' publisher: look for a 'subjectlink' one instead.
    publisher_node = @nodeset.xpath("//a[@class='subjectlink' and @href[contains(.,'/com/') ]][1]").first
    if !publisher_node.nil?
      details_hash[:publisher] = {
        text: publisher_node.text,
        b_id: (publisher_node[:href].split("/"))[2]
      }
      last_author = @nodeset
        .xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]][last()]").last

      # Publication data follows the last author link or, when there are no
      # author links, the last book link.
      if !last_author.nil?
        after_last_author_text = last_author.next_sibling.text.strip
      else
        last_book = @nodeset
          .xpath("//a[@class='booklink' and @href[contains(.,'/book/') ]][last()]").last
        after_last_author_text = last_book.next_sibling.text.strip
      end

      details_hash[:publication] = {
        year: after_last_author_text[/(?<=: )\d+(?=\.)/],
        version: after_last_author_text[/(?<=- )\d+(?=η)/],
        place: after_last_author_text[/(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/]
      }
    else
      details_hash[:publisher] = {text: nil, b_id: nil}
      details_hash[:publication] = {year: nil, version: nil, place: nil}
    end
  end

  details_hash
end
series() click to toggle source
# File lib/bookshark/extractors/bibliographical_book_extractor.rb, line 79
# Declares the patterns describing a "(Name · volume)" series annotation.
# Nothing is matched here; the value of the method is simply the last
# pattern, the one for the volume number.
#
# @return [Regexp] the series volume pattern
def series
  whole_pattern  = /(?<=\()\p{Word}+( \p{Word}+)* · \d+(?=\))/
  name_pattern   = /\p{Word}+( \p{Word}+)*(?= ·)/
  volume_pattern = /(?<=· )\d+/
  volume_pattern
end
size() click to toggle source
# File lib/bookshark/extractors/bibliographical_book_extractor.rb, line 75
# Gives the pattern for a "<digits>x<digits>" size; nothing is matched here.
#
# @return [Regexp] the size pattern
def size
  /\d+x\d+/
end