class Bookshark::Extractor
Attributes
format[RW]
site[RW]
Public Class Methods
new(options = {})
click to toggle source
# File lib/bookshark.rb, line 37
# Builds an extractor, taking any setting the caller omitted from DEFAULTS.
#
# @param options [Hash] overrides merged on top of DEFAULTS; the :site and
#   :format entries of the merged hash are stored on the instance.
def initialize(options = {})
  settings = DEFAULTS.merge(options)
  @site, @format = settings.values_at(:site, :format)
end
Public Instance Methods
book(options = {})
click to toggle source
# File lib/bookshark.rb, line 72
# Extracts the metadata of a single book from the selected site.
#
# Defaults are written back into +options+ (:site, :format, and for
# 'biblionet' also :eager, :nilify, and :id when an ISBN lookup is performed).
#
# @param options [Hash] keys read by this method:
#   :site   - 'biblionet' or 'nlg'; falls back to the instance's site.
#   :isbn   - (biblionet) when the book extractor reports it as present, it is
#             resolved to an id through the search engine.
#   :id     - forwarded to process_options (biblionet) or handed directly to
#             the extractor (nlg).
#   :format - passed to change_format; falls back to the instance's format.
#   :eager  - (biblionet) when truthy, the book is loaded via eager_extract_book.
#   :nilify - (biblionet) when truthy, nil is returned instead of an empty result.
# @return for 'biblionet': the formatted response, or nil when nothing was
#   extracted and :nilify is set. For 'nlg': an Array holding the book, or an
#   empty Array. For any other site: nil.
def book(options = {})
  options[:site] ||= @site

  case options[:site]
  when 'biblionet'
    book_extractor = Biblionet::Extractors::BookExtractor.new

    if book_extractor.present?(options[:isbn])
      search_engine = Biblionet::Extractors::Search.new
      options[:id] = search_engine.search_by_isbn(options[:isbn])
    end

    uri = process_options(options, __method__)
    options[:format] ||= @format
    options[:eager]  ||= false
    options[:nilify] ||= false

    book =
      if options[:eager]
        eager_extract_book(uri)
      else
        book_extractor.load_and_extract_book(uri)
      end

    response = { book: book.nil? ? [] : [book] }
    return nil if response[:book].empty? && options[:nilify]

    response = change_format(response, options[:format])
    # Only serialized (String) output is decoded. The previous check compared
    # the class object with the string "String", which is never equal, so the
    # decoding step was silently skipped.
    response = book_extractor.decode_text(response) if response.is_a?(String)
    response
  when 'nlg'
    book_extractor = Nlg::Extractors::BookExtractor.new
    options[:format] ||= @format

    book = book_extractor.load_and_extract_book(options[:id])

    # NOTE(review): unlike the 'biblionet' branch, the result is returned as a
    # bare Array and options[:format] is never applied. Kept as-is so existing
    # callers are not affected -- confirm whether change_format should be used.
    book.nil? ? [] : [book]
  end
end
categories_from_storage()
click to toggle source
# File lib/bookshark.rb, line 179
# Runs extract_from_storage_and_save for 'category' metadata, reading from
# the 'html_category_pages' directory and writing to 'json_category_pages'.
def categories_from_storage
  source_dir = 'html_category_pages'
  target_dir = 'json_category_pages'
  extract_from_storage_and_save('category', source_dir, target_dir)
end
category(options = {})
click to toggle source
puts Bookshark::Extractor.new(format: 'pretty_json').bibliographical_book(id: 103788)
# File lib/bookshark.rb, line 139
# Extracts category metadata and returns it in the requested format.
#
# @param options [Hash] forwarded to process_options to obtain the uri;
#   :format falls back to the instance's format and is written back into
#   the hash.
# @return the result of change_format applied to { category: [...] }, where
#   the list is empty when the extractor returned nil.
def category(options = {})
  uri = process_options(options, __method__)
  options[:format] ||= @format

  extractor = Biblionet::Extractors::CategoryExtractor.new
  extracted = extractor.extract_categories_from(uri)

  payload = { category: extracted.nil? ? [] : [extracted] }
  change_format(payload, options[:format])
end
extract_books_from_storage_and_save(start_id, finish_id, format = 'pretty_json')
click to toggle source
# File lib/bookshark.rb, line 183
# Extracts every book with an id in start_id..finish_id from local storage
# and saves each non-nil record as a JSON file.
#
# Records are written under "json_book_records/<bucket>/book_<id>.json"
# inside Bookshark.path_to_storage, where bucket is (id - 1) / 1000.
#
# @param start_id  first book id (inclusive)
# @param finish_id last book id (inclusive)
# @param format    format passed to #book (default 'pretty_json')
def extract_books_from_storage_and_save(start_id, finish_id, format = 'pretty_json')
  start_id.upto(finish_id) do |book_id|
    record = book(id: book_id, local: true, format: format, nilify: true)

    bucket      = (book_id - 1) / 1000
    target_file = Bookshark.path_to_storage + "/json_book_records/#{bucket}/book_#{book_id}.json"

    # #book is called with nilify: true, so a nil record means nothing to save.
    save_to(target_file, record) unless record.nil?
  end
end
extract_from_storage_and_save(metadata_type, source_dir, target_dir)
click to toggle source
# File lib/bookshark.rb, line 194
# Walks the stored HTML pages of one metadata type, extracts a record from
# each file and saves it as JSON in a mirrored directory.
#
# @param metadata_type 'author', 'publisher' or 'category'; selects which
#   extraction method is called. Any other value yields a nil record.
# @param source_dir directory name (under Bookshark.path_to_storage) to read from
# @param target_dir name substituted for source_dir in each directory path
#   to build the output location
def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
  source_root = Bookshark.path_to_storage + '/' + source_dir

  list_directories(path: source_root).each do |dir|
    dir_to_save = dir.gsub(source_dir, target_dir)

    list_files(path: dir, extension: 'html', all: true).each do |file|
      puts "Extracting from file: " + file.to_s

      # Extract the metadata from the local file.
      options = { uri: file, format: 'pretty_json', local: true }
      record =
        case metadata_type
        when 'author'    then author(options)
        when 'publisher' then publisher(options)
        when 'category'  then category(options)
        end

      # Same base name as the source file, with a .json extension.
      save_to("#{dir_to_save}#{File.basename(file, '.*')}.json", record)
    end
  end
end
parse_all_books()
click to toggle source
# File lib/bookshark.rb, line 246
# Parses every stored raw book page and saves the result as pretty-printed
# JSON. Directories whose target ('books') directory already exists are skipped.
def parse_all_books
  extractor = Biblionet::Extractors::BookExtractor.new

  list_directories(path: 'storage/raw_html_pages').each do |dir|
    dir_to_save = dir.gsub(/raw_html_pages/, 'books')
    # An existing target directory means this batch is not processed again.
    next if File.directory?(dir_to_save)

    list_files(path: dir, extension: 'html', all: true).each do |file|
      pp file
      # Load the book from the html file and parse the data.
      parsed = extractor.load_and_extract_book(file)

      # Same base name as the source file, with a .json extension.
      json_path = "#{dir_to_save}#{File.basename(file, '.*')}.json"
      extractor.save_to(json_path, JSON.pretty_generate(parsed))
    end
  end
end
parse_all_categories(will_save=false)
click to toggle source
# File lib/bookshark.rb, line 226
# Extracts the categories from every html file under 'storage/raw_ddc_pages'
# and merges them into a single hash.
#
# @param will_save when truthy, the merged hash is also written as JSON to
#   'storage/all_categories.json'
# @return [Hash] the merged categories
def parse_all_categories(will_save=false)
  extractor = Biblionet::Extractors::CategoryExtractor.new
  all_categories = {}

  list_files(path: 'storage/raw_ddc_pages', extension: 'html', all: true).each do |file|
    found = extractor.extract_categories_from(file)
    next if found.nil? || found.empty?

    all_categories.merge!(found)
  end

  save_to('storage/all_categories.json', all_categories.to_json) if will_save

  all_categories
end
publisher(options = {})
click to toggle source
# File lib/bookshark.rb, line 56
# Extracts publisher metadata and returns it in the requested format.
#
# @param options [Hash] forwarded to process_options to obtain the uri;
#   :format falls back to the instance's format and is written back into
#   the hash.
# @return the formatted { publisher: [...] } response after it has been
#   passed through the extractor's decode_text; the list is empty when the
#   extractor returned nil.
def publisher(options = {})
  uri = process_options(options, __method__)
  options[:format] ||= @format

  extractor = Biblionet::Extractors::PublisherExtractor.new
  extracted = extractor.load_and_extract_publisher(uri)

  payload   = { publisher: extracted.nil? ? [] : [extracted] }
  formatted = change_format(payload, options[:format])
  extractor.decode_text(formatted)
end
publishers_from_storage()
click to toggle source
# File lib/bookshark.rb, line 175
# Runs extract_from_storage_and_save for 'publisher' metadata, reading from
# the 'html_publisher_pages' directory and writing to 'json_publisher_pages'.
def publishers_from_storage
  source_dir = 'html_publisher_pages'
  target_dir = 'json_publisher_pages'
  extract_from_storage_and_save('publisher', source_dir, target_dir)
end
search(options = {})
click to toggle source
# File lib/bookshark.rb, line 153
# Performs a search and returns the results in the requested format.
#
# @param options [Hash] passed as-is to the search engine's perform_search.
#   :format falls back to the instance's format and :results_type to
#   'metadata'; both defaults are written back into the hash.
# @return the result of change_format applied to { book: search_results }
def search(options = {})
  options[:format]       ||= @format
  options[:results_type] ||= 'metadata'

  results = Biblionet::Extractors::Search.new.perform_search(options)
  change_format({ book: results }, options[:format])
end
Private Instance Methods
change_format(hash, format)
click to toggle source
# File lib/bookshark.rb, line 304
# Converts a response hash into the requested output format.
#
# @param hash   the response to convert
# @param format 'json' for compact JSON, 'pretty_json' for pretty-printed
#   JSON; given as a String or a Symbol. 'hash' and any unrecognised value
#   (including nil) return the input unchanged.
# @return [String] for the JSON formats, otherwise +hash+ itself
def change_format(hash, format)
  # to_s lets :json / :pretty_json behave like their String counterparts
  # instead of silently falling through to the untouched hash.
  case format.to_s
  when 'json'        then hash.to_json
  when 'pretty_json' then JSON.pretty_generate(hash)
  else hash
  end
end
eager_extract_book(uri)
click to toggle source
# File lib/bookshark.rb, line 316
# Extracts a book and replaces its author, contributor, category and
# publisher references with the records extracted for each of them.
#
# Each reference's :b_id is used to build the biblionet.gr url that is
# handed to the matching extractor.
#
# @param uri location of the book page, passed to the book extractor
# @return the book with :author, :contributors, :category and :publisher
#   replaced by extracted records (:publisher becomes a one-element Array),
#   or nil when the book extractor returns nil
def eager_extract_book(uri)
  book_extractor      = Biblionet::Extractors::BookExtractor.new
  author_extractor    = Biblionet::Extractors::AuthorExtractor.new
  publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
  category_extractor  = Biblionet::Extractors::CategoryExtractor.new

  book = book_extractor.load_and_extract_book(uri)
  # The caller (#book) already treats a nil book as "nothing found"; without
  # this guard the lookups below would raise NoMethodError on nil.
  return nil if book.nil?

  book[:author] = book[:author].map do |author|
    author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
  end

  # Contributors are grouped by job; keep the grouping, expand each member.
  book[:contributors] = book[:contributors].each_with_object({}) do |(job, contributors), by_job|
    by_job[job] = contributors.map do |contributor|
      author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{contributor[:b_id]}")
    end
  end

  book[:category] = book[:category].map do |category|
    category_extractor.extract_categories_from("http://www.biblionet.gr/index/#{category[:b_id]}")
  end

  book[:publisher] = [
    publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
  ]

  book
end
process_options(options = {}, caller = nil)
click to toggle source
# File lib/bookshark.rb, line 272
# Resolves the uri an extraction method should load.
#
# When options[:id] is set, a location is derived from the calling method's
# name: a file under Bookshark::path_to_storage when options[:local] is
# truthy, otherwise a biblionet.gr url. An explicit options[:uri] always
# takes precedence over the derived location.
#
# Side effects on +options+: :local defaults to false when an id is given,
# and :uri is set to the resolved value.
#
# @param options [Hash] reads :id, :local and :uri
# @param caller  name of the calling method ('author', 'publisher', 'book'
#   or 'category'); anything else prints a notice to stdout
# @return the resolved uri, or nil when neither :id nor :uri is given
def process_options(options = {}, caller = nil)
  id  = options[:id]
  url = nil

  if id
    # NOTE(review): publisher pages are bucketed per 100 ids while every other
    # type uses 1000 -- presumably this mirrors the storage layout; confirm.
    url_method, local_path =
      case caller.to_s
      when 'author'
        ['author', "html_author_pages/#{((id-1)/1000)}/author_#{id}.html"]
      when 'publisher'
        ['com', "html_publisher_pages/#{((id-1)/100)}/publisher_#{id}.html"]
      when 'book'
        ['book', "html_book_pages/#{((id-1)/1000)}/book_#{id}.html"]
      when 'category'
        ['index', "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"]
      else
        puts "Called from unknown method. Probably its rspec."
        [nil, nil]
      end

    options[:local] ||= false
    url =
      if options[:local]
        "#{Bookshark::path_to_storage}/#{local_path}"
      else
        "http://www.biblionet.gr/#{url_method}/#{id}"
      end
  end

  # A uri supplied by the caller wins over the one derived from the id.
  options[:uri] ||= url
end