class EPUBChop::Chop

Attributes

base[R]
book[R]
resource_allowed_word_count[R]
resource_word_count[R]
text1[R]
text2[R]
words[R]

Public Class Methods

new(input, options ={}) click to toggle source
# File lib/EPUBChop/chop.rb, line 12
# Prepares a chopper for the EPUB stored at +input+.
#
# The option defaults are applied first; afterwards the book is parsed and
# the word count of each of its content documents is remembered in
# @resource_word_count.
#
# @param input [String] file name of the EPUB to process
# @param options [Hash] settings handed to set_defaults
# @raise [RuntimeError] when +input+ is nil
def initialize(input, options = {})
  set_defaults(options)

  if input.nil?
    raise 'Please supply an input file name'
  end

  # word count per content document, as computed by count_words
  @resource_word_count = count_words(input)
end

Public Instance Methods

chop(options = {}) click to toggle source
# File lib/EPUBChop/chop.rb, line 31
# Produces the chopped copy of the book.
#
# The original archive is unpacked into a temporary directory, the content
# documents are truncated, unused media is removed and a new archive is built
# from the result. The temporary directory is always removed afterwards.
#
# @param options [Hash] settings handed to set_defaults
# @return whatever rebuild_epub_from_tmp_dir returns for the rebuilt book
# @raise [RuntimeError] when any step fails; the message names the book and
#   carries the message and backtrace of the underlying error
def chop(options = {})
  set_defaults(options)

  # Declared up front so the ensure clause can tell "never extracted" apart
  # from "extracted and needs cleaning up".
  extract_dir = nil
  original_zip_file = @book.table_of_contents.parser.zip_file
  extract_dir = extract_epub_to_tmp_dir(original_zip_file)

  chop_files_in_tmp_dir(extract_dir)
  remove_unused_media_from_tmp_dir(extract_dir)

  rebuild_epub_from_tmp_dir(extract_dir)
rescue Zip::Error => e
  raise RuntimeError, "Error processing EPUB #{@book.table_of_contents.parser.path}.\n #{e.message}", e.backtrace
rescue StandardError => e
  # StandardError rather than Exception: interrupts and exit requests must
  # not be converted into a RuntimeError.
  puts e.backtrace.join("\n")
  raise RuntimeError, "Chopping went wrong for #{@book.table_of_contents.parser.path}.\n #{e.message}", e.backtrace
ensure
  # Without the guard a failure before extraction would raise a TypeError
  # here and hide the real error.
  FileUtils.remove_entry_secure(extract_dir) if extract_dir
end
total_words() click to toggle source
# File lib/EPUBChop/chop.rb, line 22
# Adds up the word counts of every counted resource.
#
# @return [Integer] sum of all values in @resource_word_count, 0 when it is empty
def total_words
  @resource_word_count.each_value.reduce(0, :+)
end

Private Instance Methods

allowed_words(words, base) click to toggle source
# File lib/EPUBChop/chop.rb, line 343
# Number of words the chopped book may contain.
#
# With a base of 'percentage' (string or symbol) +words+ is read as a
# percentage of total_words; with any other base it is the word count itself.
# The result is cached, but only for the arguments it was computed from, so a
# later call with different settings is not answered with a stale value.
#
# @param words [Numeric] percentage or absolute number of words
# @param base [#to_s] 'percentage' or anything else for an absolute count
# @return [Numeric] the allowed number of words
def allowed_words(words, base)
  request = [words, base.to_s]

  unless @allowed_words_request == request
    @allowed_words = if base.to_s == 'percentage'
                       # Multiply before dividing: 100 * (29 / 100.0) is
                       # 28.999... and would truncate to 28.
                       (total_words * words / 100.0).to_i
                     else
                       words
                     end
    @allowed_words_request = request
  end

  @allowed_words
end
chop_file(resource, processed_file_size) click to toggle source
# File lib/EPUBChop/chop.rb, line 103
# Truncates a parsed content document after roughly +processed_file_size+ words.
#
# The method looks for an element containing the last few allowed words,
# removes the sibling elements that follow it and appends a Content-Type
# <meta> element to <head>. The document is modified in place.
#
# resource            - parsed document (it is queried with css/at_css); may be nil
# processed_file_size - number of words of the document that may be kept
#
# Returns the modified document, or nil when +resource+ is nil.
def chop_file(resource, processed_file_size)
  #TODO: get a better algorithm to determine where to chop
  return resource if resource.nil?

  # scripts and styles are dropped from the document altogether
  resource.css('script').remove
  resource.css('style').remove
  # whitespace-separated words of the body; the range is inclusive, so this
  # keeps up to processed_file_size + 1 words
  resource_text = resource.at_css('body').text.split[0..processed_file_size]

  # Find a phrase made of the last allowed words that can be located in the
  # document. The phrase starts window_begin words before the cut-off.
  # NOTE(review): the loop only ends once a match is found - confirm it
  # terminates for documents in which the phrase can never be matched.
  data = nil
  window_begin = default_window_begin = 5
  window_end = 0
  while data.nil?
    puts "data window:#{(processed_file_size - window_begin)}..#{(processed_file_size - window_end)}" if @verbose
    processed_window_begin = processed_file_size - window_begin
    processed_window_end   = processed_file_size - window_end

    processed_window_begin = 0 if processed_window_begin < 0
    # NOTE(review): this overwrites the value computed from window_end above,
    # so the phrase always ends at the cut-off and window_end only drives the
    # reset check further down - confirm this is intended.
    processed_window_end   = processed_file_size

    # nil when the range starts beyond the end of resource_text
    look_for = resource_text[processed_window_begin..processed_window_end]

    if look_for.nil?
      # nothing to search for: start over with a window that is 5 words larger
      window_begin = default_window_begin += 5
      window_end = 0
    else
      # NOTE(review): inside double quotes "\'" is just "'", so this
      # substitution looks like a no-op - confirm whether escaping was intended.
      look_for.map! {|m| m.gsub("'", "\'")}
      # prefer a paragraph containing the phrase, fall back to the whole body
      data = resource.at_css("p:contains(\"#{look_for.join(' ')}\")")
      data = resource.at_css("body:contains(\"#{look_for.join(' ')}\")") if data.nil?

      # the next attempt uses a phrase that starts one word later
      window_begin -= 1
      window_end += 1

      # once both counters meet, restart with a window that is 5 words larger
      if window_begin == window_end
        window_begin = default_window_begin += 5
        window_end = 0
      end
    end
  end

  # Remove the sibling elements that follow the matched element, one at a time.
  if data
    next_data = data.next_element
    while next_data
      # the sibling is looked up again in the document through its CSS path
      in_resource = resource.css(next_data.css_path)
      in_resource.remove

      # continue with the new next sibling; stop when there is none or when
      # its serialised form is exactly one character long
      next_data = data.nil? || data.next_element.to_s.length == 1 ? nil : data.next_element
    end
  end

  # declare the content type and charset in the document head
  meta = Nokogiri::XML::Node.new('meta', resource)
  meta['http-equiv'] = "Content-Type"
  meta['content'] = "text/html; charset=UTF-8"

#  meta_charset = Nokogiri::XML::Node.new('meta', resource)
#  meta_charset['charset'] = 'UTF-8'

  resource.css('head').first << meta
# resource.css('head').first << meta_charset

  resource
end
chop_files_in_tmp_dir(extract_dir) click to toggle source
# File lib/EPUBChop/chop.rb, line 66
    # Rewrites the extracted content documents so that each one holds only
    # the number of words it is allowed to keep.
    #
    # Documents whose allowance equals their word count are left untouched,
    # documents with an allowance of 0 are replaced by the "read more" page
    # and all others are truncated with chop_file and saved as UTF-8 XHTML.
    #
    # extract_dir - directory that holds the unpacked EPUB
    #
    # Returns the list of processed file names.
    def chop_files_in_tmp_dir(extract_dir)
      @resource_word_count.keys.each do |filename|
        allowed = resource_allowed_word_count[filename]
        # documents that are kept in full need no work
        next if @resource_word_count[filename] == allowed

        target = "#{extract_dir}/#{filename}"

        if allowed == 0
          # replace the document by the placeholder page
          FileUtils.rm(target, :force => true)
          FileUtils.touch target
          File.open(target, 'w') { |f| f.puts empty_file_with_cover(filename) }
        else
          #noinspection RubyResolve
          source = @book.table_of_contents.resources[filename].force_encoding('UTF-8')
          document = Nokogiri::HTML(source) { |config| config.noblanks.nonet }
          document = chop_file(document, allowed)

          # persist the truncated page
          save_options = Nokogiri::XML::Node::SaveOptions::FORMAT |
                         Nokogiri::XML::Node::SaveOptions::NO_DECLARATION |
                         Nokogiri::XML::Node::SaveOptions::AS_XHTML
          File.open(target, 'w:UTF-8') do |f|
            f.puts document.serialize(:encoding => 'UTF-8', :save_with => save_options)
          end
        end
      end
    end
count_words(input) click to toggle source
# File lib/EPUBChop/chop.rb, line 321
# Loads the book and counts the words of each of its content documents.
#
# The documents are taken from the NCX when @chop_by is :ncx and from the
# spine otherwise. Script and style elements are ignored while counting.
# The loaded book is stored in @book.
#
# @param input file name handed to EPUBInfo.get
# @return [Hash] word count per document, keyed by the document's :uri;
#   empty when no book could be loaded
def count_words(input)
  @book = EPUBInfo.get(input)
  counts = {}
  return counts unless @book

  # The result of this call is not used; the call is kept as it was.
  # NOTE(review): confirm whether it is needed for a side effect.
  @book.table_of_contents.resources.to_a

  documents = if @chop_by.eql?(:ncx)
                @book.table_of_contents.resources.ncx
              else
                @book.table_of_contents.resources.spine
              end

  documents.each do |document|
    uri = document[:uri]
    #noinspection RubyResolve
    parsed = Nokogiri::HTML(@book.table_of_contents.resources[uri]) { |config| config.noblanks.nonet }
    parsed.css('script').remove
    parsed.css('style').remove
    counts[uri] = parsed.at_css('body').text.split.size
  end

  counts
end
empty_file_with_cover(filename) click to toggle source
# File lib/EPUBChop/chop.rb, line 271
    # Builds the XHTML "read more" page that replaces a content document
    # which is cut from the book completely.
    #
    # The page shows @text1 and @text2, the cover image, the first title, the
    # name of the first creator and the rights statement of the book. All
    # interpolated values, including the cover path, are HTML escaped.
    #
    # filename - path of the document being replaced; one '../' is put in
    #            front of the cover path for every directory level in it
    #
    # Returns the page as a String.
    def empty_file_with_cover(filename)
      # an empty filename would give a depth of -1, hence the lower bound
      depth = [filename.split('/').size - 1, 0].max
      cover_path = '../' * depth

      cover = @book.cover
      cover_location = cover && cover.exists?
      cover_path += cover_location.to_s if cover_location

      heading = @text1 || ''
      subheading = @text2 || ''
      title = @book.titles.first || ''
      creator = @book.creators.first
      author = creator ? creator.name : ''
      rights = @book.rights || ''

      <<DATA
<?xml version="1.0" encoding="utf-8" standalone="no"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
      <title>Read more</title>
  </head>

  <body>
  <div style="margin-top:100px;width:500px;margin-left:auto;margin-right:auto;">
    <div style='text-align:center;'>
      <h2>#{CGI.escape_html(heading)}</h2>
      <span>#{CGI.escape_html(subheading)}</span>
    </div>

    <div style="margin-top:20px;">
      <div style="float:left;margin-right:30px;max-height: 190px; min-height: 120px; width: 125px;">
        <img id="epubchop_coverimg" src="#{CGI.escape_html(cover_path)}" alt="" style="width:100%" />
      </div>

      <div style='padding-top:10px;'>
        <h3>#{CGI.escape_html(title)}</h3>
      </div>

      <div>
        <h4>#{CGI.escape_html(author)}</h4>
      </div>

    </div>

    <br />

    <div style="clear:both;text-align:center;font-size:0.5em;"> #{CGI.escape_html(rights)} </div>
  </div>
</body>
</html>

DATA
    end
extract_epub_to_tmp_dir(original_zip_file) click to toggle source
# File lib/EPUBChop/chop.rb, line 53
# Unpacks every entry of the EPUB archive into a new temporary directory.
#
# Entries whose name would resolve to a location outside of that directory
# (for example through '../') are refused. When extraction fails, the
# temporary directory is removed again before the error is passed on.
#
# @param original_zip_file archive object offering +entries+ and
#   +extract(entry, destination)+
# @return [String] path of the directory holding the unpacked files
# @raise [RuntimeError] when an entry points outside of the directory
def extract_epub_to_tmp_dir(original_zip_file)
  extract_dir = Dir.mktmpdir('epub_extract')
  extraction_root = File.expand_path(extract_dir) + File::SEPARATOR
  finished = false

  original_zip_file.entries.each do |e|
    destination = File.join(extract_dir, e.name)

    # archive contents are untrusted: never write outside of extract_dir
    unless File.expand_path(destination).start_with?(extraction_root)
      raise "Refusing to extract #{e.name}: it resolves outside of the extraction directory"
    end

    # mkdir_p is a no-op for existing directories, so no existence check is needed
    FileUtils.mkdir_p(File.dirname(destination))
    original_zip_file.extract(e, destination)
  end

  finished = true
  extract_dir
ensure
  # the caller never learns the directory name when we fail, so clean up here
  FileUtils.remove_entry_secure(extract_dir) if extract_dir && !finished
end
files_allowed(allowed_words) click to toggle source
# File lib/EPUBChop/chop.rb, line 355
# Distributes the allowed number of words over the resources, in order.
#
# Resources are kept in full while the budget lasts, the resource in which
# the budget runs out keeps only the remaining words, and every resource
# after that gets 0. A resource that ends exactly on the limit is kept in
# full, and a budget larger than the whole book keeps everything.
#
# @param allowed_words [Numeric] total number of words that may be kept
# @return [Hash] allowed word count per resource, in the order of
#   @resource_word_count
def files_allowed(allowed_words)
  words_left = allowed_words

  @resource_word_count.each_with_object({}) do |(resource_name, word_count), allowance|
    # never more than the resource has, never less than nothing
    allowance[resource_name] = [[words_left, 0].max, word_count].min
    words_left -= word_count
  end
end
rebuild_epub_from_tmp_dir(extract_dir) click to toggle source
# File lib/EPUBChop/chop.rb, line 168
# Zips the contents of +extract_dir+ into a new EPUB file in the system
# temporary directory.
#
# The mimetype file, when present, is added first and stored uncompressed;
# all other files and directories follow. A missing mimetype file is skipped
# instead of failing.
#
# @param extract_dir [String] directory holding the unpacked, chopped book
# @return [String] path of the new EPUB; its file name contains no dashes
def rebuild_epub_from_tmp_dir(extract_dir)
  Zip.setup do |z|
    z.write_zip64_support = false
    z.on_exists_proc = true
    z.continue_on_exists_proc = true
    z.unicode_names = true
    z.default_compression = Zlib::BEST_COMPRESSION
  end

  # The Tempfile only serves to obtain a unique name. Dashes are removed from
  # the file name only: stripping them from the whole path would point into a
  # different directory whenever the temporary directory itself has a dash.
  placeholder = Tempfile.new(['epub', '.epub'], Dir.tmpdir)
  reserved_path = placeholder.path
  placeholder.close!
  new_ebook_name_path = File.join(File.dirname(reserved_path), File.basename(reserved_path).delete('-'))

  zipfile = Zip::File.open(new_ebook_name_path, Zip::File::CREATE)

  epub_files = Dir[File.join(extract_dir, '**', '**')]
  archive_root = "#{extract_dir}/"

  # mimetype should be the first entry and should not be compressed.
  # Else FIDO will not know that this is an EPUB.
  mimetype = epub_files.delete("#{extract_dir}/mimetype")
  if mimetype
    mimetype_entry = Zip::Entry.new(zipfile,                        #@zipfile
                                    mimetype.sub(archive_root, ''), #@name
                                    '',                             #@comment
                                    '',                             #@extra
                                    0,                              #@compressed_size
                                    0,                              #@crc
                                    Zip::Entry::STORED)             #@compression_method
    zipfile.add(mimetype_entry, mimetype)
  end

  # all the other files
  epub_files.each do |file|
    zipfile.add(file.sub(archive_root, ''), file)
  end
  zipfile.close

  new_ebook_name_path
end
remove_unused_images_from_tmp_dir(extract_dir) click to toggle source

noinspection RubyInstanceMethodNamingConvention

# File lib/EPUBChop/chop.rb, line 216
# Deletes the images that none of the extracted HTML documents refers to and
# removes the matching items from the package metadata file.
#
# An image counts as used when some document has an <img> whose src ends
# with the image's file name. Manifest items are matched on the image's base
# name being contained in their href.
#
# @param extract_dir [String] directory holding the unpacked book
# @return [Array] URIs of the images that were selected for removal
def remove_unused_images_from_tmp_dir(extract_dir)
  puts 'removing unused media' if @verbose
  all_images = @book.table_of_contents.resources.images.map { |i| i[:uri] }

  # collect every image that is referenced by at least one document
  referenced_images = []
  @book.table_of_contents.resources.html.each do |resource|
    document = Nokogiri::HTML(File.read("#{extract_dir}/#{resource[:uri]}"))

    all_images.each do |image|
      next if image.nil?
      referenced_images << image if document.at_css("img[src$='#{image.split('/').last}']")
    end
  end

  metadata_file = "#{extract_dir}/#{@book.table_of_contents.parser.metadata_path}"
  metadata = Nokogiri::XML(File.read(metadata_file))

  to_be_deleted_images = all_images - referenced_images
  to_be_deleted_images.each do |image|
    next if image.nil?
    puts "\t\tremoving #{image}" if @verbose

    image_file = "#{extract_dir}/#{image}"
    # File.exist? instead of File.exists?, which no longer exists in Ruby 3.2+
    File.delete(image_file) if File.exist?(image_file)

    image_in_metadata = metadata.xpath("//default:item[contains(@href,'#{File.basename(image)}' )]", :default => 'http://www.idpf.org/2007/opf')
    image_in_metadata.remove if image_in_metadata.count > 0
  end

  File.open(metadata_file, 'wb') do |f|
    f.puts metadata.to_xml(encoding: 'UTF-8')
  end

  to_be_deleted_images
end
remove_unused_media_from_tmp_dir(extract_dir) click to toggle source

noinspection RubyInstanceMethodNamingConvention

# File lib/EPUBChop/chop.rb, line 209
# Removes media that the chopped book no longer uses from the extracted
# files. For now only images are handled.
#
# @param extract_dir directory holding the unpacked book
# @return the result of remove_unused_images_from_tmp_dir
def remove_unused_media_from_tmp_dir(extract_dir)
  # TODO: remove other media
  # TODO: rebuild toc.ncx and content.opf
  removed_images = remove_unused_images_from_tmp_dir(extract_dir)
  removed_images
end
set_defaults(options) click to toggle source
# File lib/EPUBChop/chop.rb, line 256
# Stores the settings from +options+ in instance variables.
#
# A setting that is missing from +options+ keeps the value of an earlier
# call and only falls back to its default when it was never set. This way a
# later call with fewer (or no) options does not discard settings that were
# supplied before.
#
# Recognised keys and their defaults:
#   :words   - amount of words to keep (10)
#   :base    - how :words is interpreted (:percentage)
#   :text    - String, or Array of two Strings, for the "read more" page
#   :chop_by - :spine or :ncx (:spine)
#   :verbose - print progress information (false)
#
# @param options [Hash] settings to apply
def set_defaults(options)
  @words = options[:words] || @words || 10
  @base = options[:base] || @base || :percentage

  text = options[:text]
  if text.is_a?(Array)
    @text1 = text[0] || 'Continue reading?'
    @text2 = text[1] || 'Go to your local library or buy the book.'
  elsif text || @text1.nil?
    # a single text was given, or no text has ever been set
    @text1 = text || 'Continue reading? Go to your local library or buy the book.'
    @text2 = ''
  end

  @chop_by = options[:chop_by] || @chop_by || :spine
  # key? so that an explicit :verbose => false can switch verbosity off again
  @verbose = options.key?(:verbose) ? (options[:verbose] || false) : (@verbose || false)
end