class Bookshark::Extractor

Attributes

format[RW]
site[RW]

Public Class Methods

new(options = {}) click to toggle source
# File lib/bookshark.rb, line 37
# Builds an extractor from the caller's options layered over DEFAULTS.
#
# @param options [Hash] overrides for DEFAULTS; only :site and :format are
#   read here.
def initialize(options = {})
  settings = DEFAULTS.merge(options)
  @site, @format = settings.values_at(:site, :format)
end

Public Instance Methods

author(options = {}) click to toggle source
# File lib/bookshark.rb, line 43
# Extracts one author's metadata and returns it in the requested format.
#
# @param options [Hash] passed to process_options to resolve the page URI;
#   :format falls back to @format when it is missing or nil.
# @return [Object] the result of change_format for { author: [...] }; the
#   list is empty when the extractor returned nil.
def author(options = {})
  uri = process_options(options, __method__)
  options[:format] ||= @format

  extracted = Biblionet::Extractors::AuthorExtractor.new.load_and_extract_author(uri)
  payload   = { author: extracted.nil? ? [] : [extracted] }

  change_format(payload, options[:format])
end
authors_from_storage() click to toggle source

# Runs extract_from_storage_and_save for book pages.
# NOTE(review): the 'book' branch inside extract_from_storage_and_save appears
# to be commented out in this file — confirm this call still yields records.
def books_from_storage
  source_dir = 'html_book_pages'
  target_dir = 'json_book_pages'
  extract_from_storage_and_save('book', source_dir, target_dir)
end

# File lib/bookshark.rb, line 171
# Runs extract_from_storage_and_save for author pages.
def authors_from_storage
  source_dir = 'html_author_pages'
  target_dir = 'json_author_pages'
  extract_from_storage_and_save('author', source_dir, target_dir)
end
book(options = {}) click to toggle source
# File lib/bookshark.rb, line 72
# Extracts one book's metadata from the selected site.
#
# @param options [Hash] keys read by this method:
#   :site   - 'biblionet' or 'nlg'; falls back to @site.
#   :format - handed to change_format; falls back to @format.
#   :isbn   - (biblionet) when the extractor's present? accepts it, the id is
#             looked up through Search#search_by_isbn.
#   :id, :uri, :local - (biblionet) forwarded to process_options;
#             (nlg) only :id is used.
#   :eager  - (biblionet) when truthy, eager_extract_book is used.
#   :nilify - (biblionet) when truthy, nil is returned instead of an empty
#             result.
# @return [Object, nil] the formatted { book: [...] } response, or nil when the
#   site is not recognised or when :nilify is set and nothing was extracted.
def book(options = {})
  # Work on a copy: the ISBN lookup overwrites :id (and process_options may
  # store keys too), which must not leak into the hash the caller passed in.
  options = options.dup
  options[:site]   ||= @site
  options[:format] ||= @format

  case options[:site]
  when 'biblionet'
    book_extractor = Biblionet::Extractors::BookExtractor.new

    if book_extractor.present?(options[:isbn])
      options[:id] = Biblionet::Extractors::Search.new.search_by_isbn(options[:isbn])
    end

    uri  = process_options(options, __method__)
    book = options[:eager] ? eager_extract_book(uri) : book_extractor.load_and_extract_book(uri)

    return nil if book.nil? && options[:nilify]

    response = change_format({ book: book.nil? ? [] : [book] }, options[:format])

    # The previous guard compared response.class with the literal "String",
    # which is never equal, so textual responses were never decoded.
    response.is_a?(String) ? book_extractor.decode_text(response) : response
  when 'nlg'
    # The MARCXML export-URL construction is disabled in this file; the
    # extractor is given the raw :id.
    book = Nlg::Extractors::BookExtractor.new.load_and_extract_book(options[:id])

    # Previously this branch set :format but never applied it and implicitly
    # returned the bare array; it now returns the formatted response like the
    # biblionet branch does.
    change_format({ book: book.nil? ? [] : [book] }, options[:format])
  end
end
categories_from_storage() click to toggle source
# File lib/bookshark.rb, line 179
# Runs extract_from_storage_and_save for category pages.
def categories_from_storage
  source_dir = 'html_category_pages'
  target_dir = 'json_category_pages'
  extract_from_storage_and_save('category', source_dir, target_dir)
end
category(options = {}) click to toggle source

# Usage example: prints whatever bibliographical_book returns for id 103788,
# with the extractor's format set to 'pretty_json'.
# NOTE(review): bibliographical_book is not defined in the visible part of this
# file — confirm it exists before relying on this example.
puts Bookshark::Extractor.new(format: 'pretty_json').bibliographical_book(id: 103788)

# File lib/bookshark.rb, line 139
# Extracts category metadata and returns it in the requested format.
#
# @param options [Hash] passed to process_options to resolve the page URI;
#   :format falls back to @format when it is missing or nil.
# @return [Object] the result of change_format for { category: [...] }; the
#   list is empty when the extractor returned nil.
def category(options = {})
  uri = process_options(options, __method__)
  options[:format] ||= @format

  extracted = Biblionet::Extractors::CategoryExtractor.new.extract_categories_from(uri)

  wrapped = []
  wrapped << extracted unless extracted.nil?

  change_format({ category: wrapped }, options[:format])
end
extract_books_from_storage_and_save(start_id, finish_id, format = 'pretty_json') click to toggle source
# File lib/bookshark.rb, line 183
# Extracts every book id in start_id..finish_id from local storage and saves
# each record that was found.
#
# @param start_id [Integer] first book id (inclusive).
# @param finish_id [Integer] last book id (inclusive).
# @param format [String] format passed to book for each record.
def extract_books_from_storage_and_save(start_id, finish_id, format = 'pretty_json')
  start_id.upto(finish_id) do |id|
    json_record = book(id: id, local: true, format: format, nilify: true)

    # Records are grouped in sub-directories of 1000 ids each.
    bucket      = (id - 1) / 1000
    target_file = "#{Bookshark.path_to_storage}/json_book_records/#{bucket}/book_#{id}.json"

    next if json_record.nil?

    save_to(target_file, json_record)
  end
end
extract_from_storage_and_save(metadata_type, source_dir, target_dir) click to toggle source
# File lib/bookshark.rb, line 194
# Walks every directory under <storage>/<source_dir>, extracts a record from
# each HTML file and saves it under the matching target_dir path.
#
# @param metadata_type [String] 'author', 'publisher' or 'category'. The
#   'book' branch is disabled, so any other value (including 'book') leaves the
#   record nil, and that nil is still passed to save_to.
# @param source_dir [String] directory name replaced in each path.
# @param target_dir [String] directory name substituted for source_dir.
def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
  source_root = "#{Bookshark.path_to_storage}/#{source_dir}"

  list_directories(path: source_root).each do |html_dir|
    json_dir = html_dir.gsub(source_dir, target_dir)

    list_files(path: html_dir, extension: 'html', all: true).each do |html_file|
      puts "Extracting from file: #{html_file}"

      # Extract the metadata from the local file.
      request = { uri: html_file, format: 'pretty_json', local: true }
      record =
        case metadata_type
        when 'author'    then author(request)
        when 'publisher' then publisher(request)
        when 'category'  then category(request)
        end

      # Same file name, .json extension, under the target directory.
      save_to("#{json_dir}#{File.basename(html_file, '.*')}.json", record)
    end
  end
end
parse_all_books() click to toggle source
# File lib/bookshark.rb, line 246
# Parses every HTML page under storage/raw_html_pages and saves each book as
# pretty-printed JSON under the matching 'books' directory. A source directory
# is skipped when its target directory already exists.
def parse_all_books
  extractor = Biblionet::Extractors::BookExtractor.new

  list_directories(path: 'storage/raw_html_pages').each do |html_dir|
    json_dir = html_dir.gsub(/raw_html_pages/, 'books')
    next if File.directory?(json_dir)

    list_files(path: html_dir, extension: 'html', all: true).each do |html_file|
      # Load the book from the HTML file and parse its data.
      pp html_file
      parsed = extractor.load_and_extract_book(html_file)

      # Same file name, .json extension, under the target directory.
      target = "#{json_dir}#{File.basename(html_file, '.*')}.json"
      extractor.save_to(target, JSON.pretty_generate(parsed))
    end
  end
end
parse_all_categories(will_save=false) click to toggle source
# File lib/bookshark.rb, line 226
# Merges the categories extracted from every HTML file under
# storage/raw_ddc_pages into one hash.
#
# @param will_save [Boolean] when truthy, the merged hash is also written as
#   JSON to storage/all_categories.json.
# @return [Hash] the merged categories.
def parse_all_categories(will_save=false)
  extractor  = Biblionet::Extractors::CategoryExtractor.new
  html_files = list_files(path: 'storage/raw_ddc_pages', extension: 'html', all: true)

  merged = html_files.each_with_object({}) do |html_file, collected|
    found = extractor.extract_categories_from(html_file)
    next if found.nil? || found.empty?

    collected.merge!(found)
  end

  save_to('storage/all_categories.json', merged.to_json) if will_save

  merged
end
publisher(options = {}) click to toggle source
# File lib/bookshark.rb, line 56
# Extracts one publisher's metadata, formats it, and passes the formatted
# response through the extractor's decode_text.
#
# @param options [Hash] passed to process_options to resolve the page URI;
#   :format falls back to @format when it is missing or nil.
# @return [Object] whatever decode_text returns for the formatted
#   { publisher: [...] } response; the list is empty when nothing was extracted.
def publisher(options = {})
  uri = process_options(options, __method__)
  options[:format] ||= @format

  extractor = Biblionet::Extractors::PublisherExtractor.new
  found     = extractor.load_and_extract_publisher(uri)
  formatted = change_format({ publisher: [found].compact }, options[:format])

  extractor.decode_text(formatted)
end
publishers_from_storage() click to toggle source
# File lib/bookshark.rb, line 175
# Runs extract_from_storage_and_save for publisher pages.
def publishers_from_storage
  source_dir = 'html_publisher_pages'
  target_dir = 'json_publisher_pages'
  extract_from_storage_and_save('publisher', source_dir, target_dir)
end

Private Instance Methods

change_format(hash, format) click to toggle source
# File lib/bookshark.rb, line 304
# Converts a response hash to the requested representation.
#
# @param hash [Hash] the response to convert.
# @param format [String, nil] 'json' or 'pretty_json' serialise the hash;
#   'hash' and any other value return it unchanged.
# @return [Hash, String] the hash itself or its JSON text.
def change_format(hash, format)
  case format
  when 'json'        then hash.to_json
  when 'pretty_json' then JSON.pretty_generate(hash)
  else                    hash
  end
end
eager_extract_book(uri) click to toggle source
# File lib/bookshark.rb, line 316
# Extracts a book and then replaces each reference it holds (authors,
# contributors, categories, publisher) with the full record fetched from the
# corresponding biblionet.gr page.
#
# @param uri [String] location handed to BookExtractor#load_and_extract_book.
# @return [Hash, nil] the expanded book, or nil when the book itself could not
#   be extracted (previously this raised NoMethodError on nil).
def eager_extract_book(uri)
  book = Biblionet::Extractors::BookExtractor.new.load_and_extract_book(uri)
  # The non-eager path in this class treats a nil book as "nothing found";
  # return nil here too so the caller's nil handling applies.
  return nil if book.nil?

  author_extractor    = Biblionet::Extractors::AuthorExtractor.new
  publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
  category_extractor  = Biblionet::Extractors::CategoryExtractor.new

  fetch_author = lambda do |reference|
    author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{reference[:b_id]}")
  end

  # A missing section is treated as empty instead of raising.
  book[:author] = (book[:author] || []).map(&fetch_author)

  # Contributors are grouped by job; keep the grouping and expand each person.
  book[:contributors] = (book[:contributors] || {}).each_with_object({}) do |(job, people), expanded|
    expanded[job] = people.map(&fetch_author)
  end

  book[:category] = (book[:category] || []).map do |category|
    category_extractor.extract_categories_from("http://www.biblionet.gr/index/#{category[:b_id]}")
  end

  publisher = book[:publisher]
  book[:publisher] =
    if publisher.nil?
      []
    else
      [publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{publisher[:b_id]}")]
    end

  book
end
process_options(options = {}, caller = nil) click to toggle source
# File lib/bookshark.rb, line 272
# Resolves the location (remote URL or local storage path) of the page a
# public extractor method should read.
#
# An explicit options[:uri] always wins. Otherwise, when options[:id] is given,
# the location is built from the id and the calling method's name. The options
# hash is only read, never modified: writing the resolved URI back into it
# made a reused hash return the previous call's page.
#
# @param options [Hash] reads :id, :uri and :local.
# @param caller [Symbol, String, nil] name of the calling method ('author',
#   'publisher', 'book' or 'category').
# @return [String, nil] the resolved location, or nil when neither :uri nor
#   :id was supplied.
def process_options(options = {}, caller = nil)
  id  = options[:id]
  url = nil

  if id
    # NOTE(review): publisher pages use buckets of 100 while the others use
    # 1000, and 'category' reads from html_ddc_pages — both kept as found;
    # confirm against the actual storage layout.
    url_method, local_dir, file_prefix, bucket_size =
      case caller.to_s
      when 'author'    then ['author', 'html_author_pages',    'author',    1000]
      when 'publisher' then ['com',    'html_publisher_pages', 'publisher', 100]
      when 'book'      then ['book',   'html_book_pages',      'book',      1000]
      when 'category'  then ['index',  'html_ddc_pages',       'ddc',       1000]
      else
        puts "Called from unknown method. Probably its rspec."
        []
      end

    if options[:local]
      # The bucket is computed only when a local path is needed, and via to_i,
      # so an id given as a numeric string no longer raises.
      local_path = "#{local_dir}/#{(id.to_i - 1) / bucket_size}/#{file_prefix}_#{id}.html" if local_dir
      url = "#{Bookshark::path_to_storage}/#{local_path}"
    else
      url = "http://www.biblionet.gr/#{url_method}/#{id}"
    end
  end

  options[:uri] || url
end