class Transpotter

Constants

DEFAULT_SAMPLE_SIZE
ENCODING_ERRORS
MOST_COMMON_ENCODINGS

The most common character encodings, determined from: w3techs.com/technologies/overview/character_encoding/all

Attributes

filename[R]

Public Class Methods

new(filename, data = nil, samplesize = nil) click to toggle source
# File lib/transpotter.rb, line 38
# Stores the inputs used by the other methods.
#
# filename   - kept as given in @filename.
# data       - kept as given in @data (nil by default).
# samplesize - kept in @samplesize; DEFAULT_SAMPLE_SIZE is used instead
#              when the argument is nil or false.
def initialize(filename, data = nil, samplesize = nil)
  @filename = filename
  @data = data
  @samplesize = samplesize
  @samplesize ||= DEFAULT_SAMPLE_SIZE
end

Public Instance Methods

each_line() { |convert(line)| ... } click to toggle source
# File lib/transpotter.rb, line 59
# Yields each line of the input after passing it through #convert.
#
# When @filename is set, lines are read from the file using the detected
# line ending as separator; IO#each keeps that separator on every yielded
# line. Otherwise, when @data is set, the converted data is split on the
# line ending, so the yielded lines do not carry the separator.
#
# Without a block an Enumerator over the same lines is returned. With a
# block, nothing is yielded and nil is returned when no sample is available.
def each_line
  return enum_for(:each_line) unless block_given?
  return unless sample # nothing to iterate when no sample can be grabbed

  if @filename
    open_encoded_file do |io|
      # The separator has to be expressed in the file's own encoding to
      # match the bytes being read; use the accessor like every other method
      # instead of reading @encoding directly.
      io.each(line_endings.encode(encoding)) { |line| yield convert(line) }
    end
  elsif @data
    convert(@data).split(line_endings).each { |line| yield line }
  end
end
encoding() click to toggle source
# File lib/transpotter.rb, line 44
# Name of the detected encoding, cached in @encoding once a truthy value
# has been found. The #charlock result is preferred; #brute_force is only
# consulted when #charlock returns nil or false.
def encoding
  return @encoding if @encoding

  @encoding = charlock || brute_force
end
read() click to toggle source
# File lib/transpotter.rb, line 48
# Returns the whole input passed through #convert.
#
# The file named by @filename takes precedence; @data is used only when
# no filename is set. Returns nil when no sample is available or when
# neither source is set.
def read
  return unless sample # nothing to read when no sample can be grabbed
  return open_encoded_file { |io| convert(io.read) } if @filename

  convert(@data) if @data
end

Private Instance Methods

brute_force() click to toggle source
# File lib/transpotter.rb, line 107
# Tries MOST_COMMON_ENCODINGS in order and returns the name of the first
# one that #valid_encoding? accepts. Returns nil when there is no sample
# or when no candidate is accepted.
def brute_force
  return nil if sample.nil?

  match = MOST_COMMON_ENCODINGS.find { |candidate| valid_encoding?(candidate) }
  match && match.name
end
charlock() click to toggle source
# File lib/transpotter.rb, line 100
# Returns the encoding reported by the sample's #detect_encoding when its
# confidence is above 75, and nil otherwise (including when there is no
# sample).
#
# NOTE(review): #detect_encoding is not defined in this file; presumably it
# comes from charlock_holmes' String extension - confirm. The detector is
# treated as allowed to return nil or a result without :confidence, so
# those cases yield nil (letting the caller fall back) instead of raising
# NoMethodError.
def charlock
  return unless sample

  detection = sample.detect_encoding
  return unless detection

  confidence = detection[:confidence]
  detection[:encoding] if confidence && confidence > 75
end
convert(string) click to toggle source
# File lib/transpotter.rb, line 90
# Returns +string+ transcoded from the detected encoding to UTF-8.
#
# When the detected encoding is already 'UTF-8' the very same object is
# returned untouched. Otherwise the bytes are reinterpreted in the detected
# encoding and transcoded; this is done on a copy because force_encoding
# mutates its receiver, which would retag the caller's string and raise
# FrozenError on frozen input.
def convert(string)
  return string if encoding == 'UTF-8'

  string.dup.force_encoding(encoding).encode('UTF-8')
end
line_endings() click to toggle source
# File lib/transpotter.rb, line 81
# Line terminator found in the converted sample: "\r\n" if it occurs
# anywhere, otherwise "\n", otherwise "\r". Defaults to "\n" when the sample
# contains no terminator at all.
#
# The result is memoized in @line_endings so the sample is converted and
# scanned only once.
def line_endings
  @line_endings ||= case convert(sample)
                    when /\r\n/ then "\r\n"
                    when /\n/ then "\n"
                    when /\r/ then "\r"
                    else "\n"
                    end
end
open_encoded_file() { |io| ... } click to toggle source
# File lib/transpotter.rb, line 72
# Opens @filename in binary read mode, tagged with the detected encoding,
# and yields the IO to the block. File.open closes the file when the block
# finishes and returns the block's value.
#
# internal_encoding is given the same value as external_encoding.
# NOTE(review): presumably this keeps Ruby from transcoding to
# Encoding.default_internal - confirm.
def open_encoded_file
  detected = encoding
  File.open(@filename, 'rb', external_encoding: detected, internal_encoding: detected) { |io| yield io }
end
sample() click to toggle source
# File lib/transpotter.rb, line 95
# Returns the bytes used for encoding detection, memoized in @sample.
#
# When @filename names an existing regular file, up to @samplesize bytes
# are read from it; otherwise (or when that read yields nothing) @data is
# used as the sample.
def sample
  # Already memoized: skip the File.file? check, which would otherwise hit
  # the filesystem on every call.
  return @sample if @sample

  @sample = File.read(@filename, @samplesize) if File.file?(@filename.to_s)
  @sample ||= @data
end
valid_encoding?(encoding) click to toggle source
# File lib/transpotter.rb, line 115
# Reports whether the sample, interpreted in +encoding+ (an object that
# responds to #name), transcodes to valid UTF-8. Any error listed in
# ENCODING_ERRORS counts as "not valid".
#
# The check runs on a copy of the sample: force_encoding mutates its
# receiver, and the sample may be the caller-supplied data, which must not
# be retagged and may be frozen.
def valid_encoding?(encoding)
  sample.dup.force_encoding(encoding.name).encode('UTF-8').valid_encoding?
rescue *ENCODING_ERRORS
  false
end