class EPUBChop::Chop
Attributes
base[R]
book[R]
resource_allowed_word_count[R]
resource_word_count[R]
text1[R]
text2[R]
words[R]
Public Class Methods
new(input, options ={})
click to toggle source
# File lib/EPUBChop/chop.rb, line 12
# Prepares a chopper for the given input.
#
# Option defaults are applied first (see #set_defaults), then the input is
# validated and the per-resource word counts returned by #count_words are
# stored in @resource_word_count.
#
# input   - passed straight to #count_words; the error message below calls it
#           an input file name.
# options - Hash handed unchanged to #set_defaults.
#
# Raises RuntimeError when +input+ is nil.
#
# NOTE(review): nothing visible here assigns @resource_allowed_word_count,
# although #chop_files_in_tmp_dir reads it through its reader -- confirm where
# it is populated.
def initialize(input, options = {})
  set_defaults(options)

  if input.nil?
    raise 'Please supply an input file name'
  end

  # count the number of words in a file
  @resource_word_count = count_words(input)
end
Public Instance Methods
chop(options = {})
click to toggle source
# File lib/EPUBChop/chop.rb, line 31
# Produces a chopped copy of the book.
#
# The EPUB is unpacked into a temporary directory, the content files are
# shortened, unused media is removed and the directory is zipped again.
#
# options - Hash forwarded to #set_defaults.
#
# Returns whatever #rebuild_epub_from_tmp_dir returns (the path of the new
# EPUB).
# Raises RuntimeError wrapping any Zip::Error or other StandardError; the
# original backtrace is preserved on the re-raised error.
def chop(options = {})
  set_defaults(options)

  # Declared up front so the ensure clause can tell whether extraction ever
  # produced a directory.
  extract_dir = nil

  original_zip_file = @book.table_of_contents.parser.zip_file
  extract_dir = extract_epub_to_tmp_dir(original_zip_file)
  chop_files_in_tmp_dir(extract_dir)
  remove_unused_media_from_tmp_dir(extract_dir)

  rebuild_epub_from_tmp_dir(extract_dir)
rescue Zip::Error => e
  raise RuntimeError, "Error processing EPUB #{@book.table_of_contents.parser.path}.\n #{e.message}", e.backtrace
rescue StandardError => e
  # StandardError (not Exception) so Interrupt/SystemExit are not converted
  # into RuntimeError. The backtrace travels with the re-raised error, so it
  # is only printed when verbose output was requested.
  puts e.backtrace.join("\n") if @verbose
  raise RuntimeError, "Chopping went wrong for #{@book.table_of_contents.parser.path}.\n #{e.message}", e.backtrace
ensure
  # Without the guard a failure before/during extraction would call
  # remove_entry_secure(nil), raising TypeError and hiding the real error.
  FileUtils.remove_entry_secure(extract_dir) if extract_dir && File.exist?(extract_dir)
end
total_words()
click to toggle source
# File lib/EPUBChop/chop.rb, line 22
# Sum of the word counts stored in @resource_word_count.
#
# Returns 0 when no resources were counted.
def total_words
  counts = @resource_word_count.values
  counts.reduce(0, :+)
end
Private Instance Methods
allowed_words(words, base)
click to toggle source
# File lib/EPUBChop/chop.rb, line 343
# Converts the configured limit into an absolute number of words.
#
# words - the limit; a percentage of #total_words when +base+ is
#         'percentage'/:percentage, otherwise already a word count.
# base  - anything whose #to_s is 'percentage' selects percentage mode.
#
# Returns the number of words that may be kept (truncated to an Integer in
# percentage mode, +words+ itself otherwise).
#
# The result is recomputed on every call. The previous `||=` memo ignored the
# arguments, so a later call with a different limit (e.g. after #chop was
# given new options) silently returned the first result. @allowed_words is
# still assigned so it always holds the most recent value.
def allowed_words(words, base)
  @allowed_words =
    if base.to_s == 'percentage'
      (total_words * (words / 100.0)).to_i
    else
      words
    end
end
chop_file(resource, processed_file_size)
click to toggle source
# File lib/EPUBChop/chop.rb, line 103 def chop_file(resource, processed_file_size) #TODO: get a better algorithm to determine where to chop return resource if resource.nil? resource.css('script').remove resource.css('style').remove resource_text = resource.at_css('body').text.split[0..processed_file_size] # get a string that can be found data = nil window_begin = default_window_begin = 5 window_end = 0 while data.nil? puts "data window:#{(processed_file_size - window_begin)}..#{(processed_file_size - window_end)}" if @verbose processed_window_begin = processed_file_size - window_begin processed_window_end = processed_file_size - window_end processed_window_begin = 0 if processed_window_begin < 0 processed_window_end = processed_file_size look_for = resource_text[processed_window_begin..processed_window_end] if look_for.nil? window_begin = default_window_begin += 5 window_end = 0 else look_for.map! {|m| m.gsub("'", "\'")} data = resource.at_css("p:contains(\"#{look_for.join(' ')}\")") data = resource.at_css("body:contains(\"#{look_for.join(' ')}\")") if data.nil? window_begin -= 1 window_end += 1 if window_begin == window_end window_begin = default_window_begin += 5 window_end = 0 end end end #limit on found string if data next_data = data.next_element while next_data in_resource = resource.css(next_data.css_path) in_resource.remove next_data = data.nil? || data.next_element.to_s.length == 1 ? nil : data.next_element end end meta = Nokogiri::XML::Node.new('meta', resource) meta['http-equiv'] = "Content-Type" meta['content'] = "text/html; charset=UTF-8" # meta_charset = Nokogiri::XML::Node.new('meta', resource) # meta_charset['charset'] = 'UTF-8' resource.css('head').first << meta # resource.css('head').first << meta_charset resource end
chop_files_in_tmp_dir(extract_dir)
click to toggle source
# File lib/EPUBChop/chop.rb, line 66
# Rewrites the counted content files inside +extract_dir+.
#
# For every file in @resource_word_count the original word count is compared
# with the allowed one from #resource_allowed_word_count:
# * equal      -> file is left untouched
# * allowed 0  -> file is replaced by the page from #empty_file_with_cover
# * otherwise  -> file is parsed, shortened by #chop_file and written back as
#                 UTF-8 XHTML
#
# extract_dir - directory holding the unpacked EPUB.
#
# Returns the list of file names that were considered.
def chop_files_in_tmp_dir(extract_dir)
  # fix spine files
  @resource_word_count.keys.each do |filename|
    word_count = @resource_word_count[filename]
    allowed_count = resource_allowed_word_count[filename]
    next if word_count == allowed_count

    target = "#{extract_dir}/#{filename}"

    if allowed_count == 0
      FileUtils.rm(target, :force => true)
      FileUtils.touch target
      File.open(target, 'w') { |f| f.puts empty_file_with_cover(filename) }
    else
      #noinspection RubyResolve
      markup = @book.table_of_contents.resources[filename].force_encoding('UTF-8')
      resource = Nokogiri::HTML(markup) do |config|
        config.noblanks.nonet
      end
      resource = chop_file(resource, allowed_count)

      # persist page
      save_options = Nokogiri::XML::Node::SaveOptions::FORMAT |
                     Nokogiri::XML::Node::SaveOptions::NO_DECLARATION |
                     Nokogiri::XML::Node::SaveOptions::AS_XHTML
      File.open(target, 'w:UTF-8') do |f|
        f.puts resource.serialize(:encoding => 'UTF-8', :save_with => save_options)
      end
    end
  end
end
count_words(input)
click to toggle source
# File lib/EPUBChop/chop.rb, line 321
# Loads the book and counts the words of each content document.
#
# input - handed to EPUBInfo.get; the result is stored in @book.
#
# The documents come from the NCX entries when @chop_by is :ncx, otherwise
# from the spine. <script> and <style> elements are removed before the body
# text is split on whitespace.
#
# Returns a Hash mapping each resource URI to its word count ({} when the
# book could not be loaded).
def count_words(input)
  @book = EPUBInfo.get(input)
  counts = {}
  return counts unless @book

  # Result unused in the original as well; the call is kept because it cannot
  # be ruled out from here that it has side effects.
  @book.table_of_contents.resources.to_a

  entries =
    if @chop_by.eql?(:ncx)
      @book.table_of_contents.resources.ncx
    else
      @book.table_of_contents.resources.spine
    end

  entries.each do |resource|
    uri = resource[:uri]
    raw = Nokogiri::HTML(@book.table_of_contents.resources[uri]) do |config|
      #noinspection RubyResolve
      config.noblanks.nonet
    end

    raw.css('script').remove
    raw.css('style').remove

    counts[uri] = raw.at_css('body').text.split.size
  end

  counts
end
empty_file_with_cover(filename)
click to toggle source
# File lib/EPUBChop/chop.rb, line 271
# Builds the XHTML placeholder page that replaces a fully removed file.
#
# filename - path of the replaced file relative to the EPUB root; its depth
#            decides how many '../' segments prefix the cover path.
#
# The page shows @text1/@text2, the cover image, and the first title, first
# creator name and rights of @book. Every interpolated value -- now including
# the cover path, which previously went into the src attribute unescaped --
# is HTML-escaped.
#
# Returns the page as a String.
def empty_file_with_cover(filename)
  # Clamp at 0: an empty filename would otherwise give a depth of -1.
  depth = [filename.split('/').size - 1, 0].max

  cover = @book.cover
  # `exists?` is used for its value here: its #to_s is appended as the path.
  cover_location = cover && cover.exists?
  cover_path = ('../' * depth) + (cover_location ? cover_location.to_s : '')

  first_title = @book.titles.first
  first_creator = @book.creators.first

  <<DATA
<?xml version="1.0" encoding="utf-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>Read more</title>
  </head>
  <body>
    <div style="margin-top:100px;width:500px;margin-left:auto;margin-right:auto;">
      <div style='text-align:center;'>
        <h2>#{CGI.escape_html(@text1 ? @text1 : '')}</h2>
        <span>#{CGI.escape_html(@text2 ? @text2 : '')}</span>
      </div>
      <div style="margin-top:20px;">
        <div style="float:left;margin-right:30px;max-height: 190px; min-height: 120px; width: 125px;">
          <img id="epubchop_coverimg" src="#{CGI.escape_html(cover_path)}" alt="" style="width:100%" />
        </div>
        <div style='padding-top:10px;'>
          <h3>#{CGI.escape_html(first_title ? first_title : '')}</h3>
        </div>
        <div>
          <h4>#{CGI.escape_html(first_creator ? first_creator.name : '')}</h4>
        </div>
      </div>
      <br />
      <div style="clear:both;text-align:center;font-size:0.5em;">
        #{CGI.escape_html(@book.rights ? @book.rights : '')}
      </div>
    </div>
  </body>
</html>
DATA
end
extract_epub_to_tmp_dir(original_zip_file)
click to toggle source
# File lib/EPUBChop/chop.rb, line 53
# Unpacks every entry of the EPUB archive into a fresh temporary directory.
#
# original_zip_file - archive object responding to #entries (each entry
#                     responding to #name) and #extract(entry, destination).
#
# Returns the path of the temporary directory; the caller owns it and must
# remove it.
# Raises RuntimeError for an entry whose name resolves outside of the
# extraction directory; errors from the archive object propagate unchanged.
#
# Changes compared to the previous version:
# * no `Dir.exists?` (removed in Ruby 3.2) -- mkdir_p is already a no-op for
#   existing directories;
# * the temporary directory is removed again when extraction fails, because
#   the caller never learns its path in that case;
# * entries such as '../x' are rejected instead of being written outside the
#   temporary directory.
def extract_epub_to_tmp_dir(original_zip_file)
  extract_dir = Dir.mktmpdir('epub_extract')
  extracted = false
  root = File.expand_path(extract_dir)

  original_zip_file.entries.each do |e|
    destination = File.join(extract_dir, e.name)
    resolved = File.expand_path(destination)
    unless resolved == root || resolved.start_with?(root + File::SEPARATOR)
      raise "Refusing to extract '#{e.name}': it resolves outside of the extraction directory"
    end

    file_dir = File.split(e.name)[0]
    FileUtils.mkdir_p(File.join(extract_dir, file_dir)) unless file_dir.eql?('.')
    original_zip_file.extract(e, destination)
  end

  extracted = true
  extract_dir
ensure
  FileUtils.remove_entry_secure(extract_dir) if extract_dir && !extracted
end
files_allowed(allowed_words)
click to toggle source
# File lib/EPUBChop/chop.rb, line 355
# Distributes the word budget over the resources, in reading order.
#
# allowed_words - total number of words that may be kept.
#
# Each resource in @resource_word_count receives as many words as are still
# available, capped at its own word count; once the budget is used up the
# remaining resources receive 0.
#
# Returns a Hash with the same keys (and key order) as @resource_word_count,
# mapping each resource to its allowed word count.
#
# This fixes two boundary problems of the previous implementation:
# * a resource that used up the budget exactly was set to 0 instead of being
#   kept whole (so a 100% budget emptied the last file);
# * a budget larger than the whole book compared an Integer with nil and
#   raised ArgumentError.
def files_allowed(allowed_words)
  words_left = allowed_words

  @resource_word_count.each_with_object({}) do |(name, word_count), allowed|
    granted = [word_count, words_left].min
    granted = 0 if granted < 0 # negative budgets grant nothing

    allowed[name] = granted
    words_left -= granted
  end
end
rebuild_epub_from_tmp_dir(extract_dir)
click to toggle source
# File lib/EPUBChop/chop.rb, line 168
# Zips the contents of +extract_dir+ into a new EPUB file.
#
# extract_dir - directory holding the (already chopped) unpacked EPUB.
#
# Returns the path of the newly written archive.
#
# Changes compared to the previous version:
# * a missing 'mimetype' file no longer crashes: `mimetype.sub` used to run
#   before the nil guard that was meant to protect it;
# * dashes are only stripped from the file name, not from the directory part
#   of the path, so a temp directory containing '-' keeps working;
# * the placeholder Tempfile (used only to obtain a unique name) is closed and
#   unlinked right away instead of lingering until garbage collection.
def rebuild_epub_from_tmp_dir(extract_dir)
  #zip new ebook
  Zip.setup do |z|
    z.write_zip64_support = false
    z.on_exists_proc = true
    z.continue_on_exists_proc = true
    z.unicode_names = true
    z.default_compression = Zlib::BEST_COMPRESSION
  end

  placeholder = Tempfile.new(['epub', '.epub'], Dir.tmpdir)
  placeholder_path = placeholder.path
  # The archive is written next to the placeholder under a dash-free name, so
  # the archive library creates a brand new file rather than opening the empty
  # placeholder.
  new_ebook_name_path = File.join(File.dirname(placeholder_path),
                                  File.basename(placeholder_path).gsub('-', ''))
  placeholder.close!

  zipfile = Zip::File.open(new_ebook_name_path, Zip::File::CREATE)

  epub_files = Dir[File.join(extract_dir, '**', '**')]

  #mimetype should be the first entry and should not be zipped. Else FIDO will not know that this is an EPUB
  mimetype = epub_files.delete("#{extract_dir}/mimetype")
  unless mimetype.nil?
    mimetype_entry = Zip::Entry.new(zipfile, #@zipfile
                                    mimetype.sub("#{extract_dir}/", ''), #@name
                                    '', #@comment
                                    '', #@extra
                                    0, #@compressed_size
                                    0, #@crc
                                    Zip::Entry::STORED) #@compression_method
    zipfile.add(mimetype_entry, mimetype)
  end

  #all the other files
  epub_files.each do |file|
    zipfile.add(file.sub("#{extract_dir}/", ''), file)
  end

  zipfile.close

  new_ebook_name_path
end
remove_unused_images_from_tmp_dir(extract_dir)
click to toggle source
noinspection RubyInstanceMethodNamingConvention
# File lib/EPUBChop/chop.rb, line 216
# Deletes images that no HTML resource references any more.
#
# extract_dir - directory holding the unpacked EPUB.
#
# An image counts as used when some HTML resource contains an <img> whose src
# ends with the image's file name. Unused images are deleted from disk and
# every manifest item whose href contains their base name is removed from the
# package metadata file, which is then rewritten as UTF-8.
#
# Returns the list of image URIs that were considered unused (may contain nil
# when an image resource has no :uri).
#
# Uses File.exist? -- the `File.exists?` alias was removed in Ruby 3.2.
def remove_unused_images_from_tmp_dir(extract_dir)
  puts 'removing unused media' if @verbose

  all_images = @book.table_of_contents.resources.images.map { |i| i[:uri] }
  not_to_be_deleted_images = []

  @book.table_of_contents.resources.html.each do |resource|
    file = Nokogiri::HTML(File.read("#{extract_dir}/#{resource[:uri]}"))

    all_images.each do |image|
      next if image.nil?

      # Matching is done on the file name only, so identically named images
      # in different directories are treated as the same image.
      image_name = image.split('/').last
      not_to_be_deleted_images << image if file.at_css("img[src$='#{image_name}']")
    end
  end

  metadata_file = "#{extract_dir}/#{@book.table_of_contents.parser.metadata_path}"
  metadata = Nokogiri::XML(File.read(metadata_file))

  to_be_deleted_images = all_images - not_to_be_deleted_images
  to_be_deleted_images.each do |image|
    next if image.nil?

    puts "\t\tremoving #{image}" if @verbose
    image_file = "#{extract_dir}/#{image}"
    File.delete(image_file) if File.exist?(image_file)

    image_in_metadata = metadata.xpath("//default:item[contains(@href,'#{File.basename(image)}' )]", :default => 'http://www.idpf.org/2007/opf')
    image_in_metadata.remove if image_in_metadata.count > 0
  end

  File.open(metadata_file, 'wb') do |f|
    f.puts metadata.to_xml(encoding: 'UTF-8')
  end

  to_be_deleted_images
end
remove_unused_media_from_tmp_dir(extract_dir)
click to toggle source
noinspection RubyInstanceMethodNamingConvention
# File lib/EPUBChop/chop.rb, line 209
# Removes media that is no longer referenced after chopping.
#
# Only images are handled for now; the work is delegated to
# #remove_unused_images_from_tmp_dir and its result is returned as is.
#
# extract_dir - directory holding the unpacked EPUB.
#
# TODO: remove other media
# TODO: rebuild toc.ncx and content.opf
def remove_unused_media_from_tmp_dir(extract_dir)
  remove_unused_images_from_tmp_dir(extract_dir)
end
set_defaults(options)
click to toggle source
# File lib/EPUBChop/chop.rb, line 256
# Copies the supported options into instance variables, falling back to the
# defaults below for anything missing (nil or false).
#
# options - Hash with the optional keys
#           :words   (default 10)
#           :base    (default :percentage)
#           :text    String, or Array of [headline, sub line]
#           :chop_by (default :spine)
#           :verbose (default false)
#
# Returns the resulting value of @verbose (the last assignment).
def set_defaults(options)
  @words = options[:words] || 10
  @base = options[:base] || :percentage

  text = options[:text]
  case text
  when Array
    headline, sub_line = text[0], text[1]
    @text1 = headline || 'Continue reading?'
    @text2 = sub_line || 'Go to your local library or buy the book.'
  else
    @text1 = text || 'Continue reading? Go to your local library or buy the book.'
    @text2 = ''
  end

  @chop_by = options[:chop_by] || :spine
  @verbose = options[:verbose] || false
end