class Transpotter
Constants
- DEFAULT_SAMPLE_SIZE
- ENCODING_ERRORS
- MOST_COMMON_ENCODINGS
Determined from here: w3techs.com/technologies/overview/character_encoding/all
Attributes
filename[R]
Public Class Methods
new(filename, data = nil, samplesize = nil)
click to toggle source
# File lib/transpotter.rb, line 38
# Stores the input source and the sample size.
#
# filename   - kept in @filename (exposed through the filename reader).
# data       - kept in @data; defaults to nil.
# samplesize - kept in @samplesize; DEFAULT_SAMPLE_SIZE is used when the
#              argument is nil or false.
def initialize(filename, data = nil, samplesize = nil)
  @filename, @data = filename, data
  @samplesize = samplesize ? samplesize : DEFAULT_SAMPLE_SIZE
end
Public Instance Methods
each_line() { |convert(line)| ... }
click to toggle source
# File lib/transpotter.rb, line 59
# Yields every line of the input, passed through convert.
#
# When @filename is set the file is opened with open_encoded_file and split on
# the detected line ending (re-encoded to @encoding so it matches the file's
# bytes); each line is converted before being yielded. Lines read this way
# keep their trailing separator, because IO#each does not strip it.
#
# Otherwise, when @data is set, the whole string is converted once and split
# on the detected line ending; String#split drops the separator.
#
# Returns an Enumerator when called without a block. Yields nothing when no
# sample is available or when neither @filename nor @data is set.
def each_line
  # Hand back an Enumerator rather than failing with LocalJumpError midway
  # through the work when the caller passes no block.
  return enum_for(:each_line) unless block_given?
  return unless sample # don't do anything if we can't grab sample

  if @filename
    open_encoded_file do |io|
      io.each(line_endings.encode(@encoding)) { |line| yield convert(line) }
    end
  elsif @data
    convert(@data).split(line_endings).each { |line| yield line }
  end
end
encoding()
click to toggle source
# File lib/transpotter.rb, line 44
# Returns the detected encoding, cached in @encoding once a truthy value has
# been found. charlock is consulted first; brute_force is the fallback.
def encoding
  return @encoding if @encoding

  @encoding = charlock || brute_force
end
read()
click to toggle source
# File lib/transpotter.rb, line 48
# Returns the whole input passed through convert.
#
# The file named by @filename takes precedence; @data is used only when
# @filename is not set. Returns nil when no sample is available or when
# neither source is set.
def read
  return unless sample # nothing to read if no sample could be taken
  return open_encoded_file { |io| convert(io.read) } if @filename

  convert(@data) if @data
end
Private Instance Methods
brute_force()
click to toggle source
# File lib/transpotter.rb, line 107
# Walks MOST_COMMON_ENCODINGS in order and returns the name of the first entry
# that valid_encoding? accepts. Returns nil when sample is nil or when no
# entry is accepted.
def brute_force
  return nil if sample.nil?

  match = MOST_COMMON_ENCODINGS.find { |candidate| valid_encoding?(candidate) }
  match && match.name
end
charlock()
click to toggle source
# File lib/transpotter.rb, line 100
# Asks the sample for its encoding via detect_encoding and returns the
# reported :encoding when the reported :confidence is above 75.
#
# Returns nil when there is no sample, when detection yields nothing, when no
# confidence is reported, or when the confidence is 75 or lower. A nil result
# lets the caller fall back to another strategy instead of raising
# NoMethodError on a missing detection result.
def charlock
  return unless sample

  detection = sample.detect_encoding
  return unless detection # detection produced no result at all

  confidence = detection[:confidence]
  detection[:encoding] if confidence && confidence > 75
end
convert(string)
click to toggle source
# File lib/transpotter.rb, line 90
# Returns +string+ as UTF-8.
#
# When the detected encoding is already 'UTF-8' the string is returned
# untouched. Otherwise the string is re-tagged in place with the detected
# encoding (force_encoding mutates the receiver) and a UTF-8 transcoded copy
# is returned.
def convert(string)
  if encoding == 'UTF-8'
    string
  else
    string.force_encoding(encoding).encode('UTF-8')
  end
end
line_endings()
click to toggle source
# File lib/transpotter.rb, line 81
# Returns the line separator found in the converted sample: "\r\n" if present,
# otherwise "\n" if present, otherwise "\r" if present. Falls back to "\n"
# when the sample contains none of them.
#
# The result is cached in @line_endings so the sample is converted and
# scanned only once, however many times the separator is requested.
def line_endings
  @line_endings ||= case convert(sample)
                    when /\r\n/ then "\r\n" # checked first: it also contains \n and \r
                    when /\n/ then "\n"
                    when /\r/ then "\r"
                    else "\n"
                    end
end
open_encoded_file() { |io| ... }
click to toggle source
# File lib/transpotter.rb, line 72
# Opens @filename in 'rb' mode with both the external and internal encoding
# set to the detected encoding, and yields the IO to the caller's block.
# The block form of File.open closes the file when the block finishes, and
# the block's value is returned.
def open_encoded_file
  io_options = { external_encoding: encoding, internal_encoding: encoding }
  File.open(@filename, 'rb', **io_options) { |io| yield io }
end
sample()
click to toggle source
# File lib/transpotter.rb, line 95
# Returns the chunk of input used for detection, cached in @sample.
#
# When @filename names an existing regular file, up to @samplesize bytes are
# read from its start. If that yields nothing (or there is no such file),
# @data is used instead. Returns nil when neither source provides anything.
def sample
  # Once a sample is cached, skip the File.file? filesystem check entirely;
  # this method is called repeatedly during detection and reading.
  return @sample if @sample

  @sample = File.read(@filename, @samplesize) if File.file?(@filename.to_s)
  @sample ||= @data
end
valid_encoding?(encoding)
click to toggle source
# File lib/transpotter.rb, line 115
# Reports whether the sample can be read as +encoding+ and transcoded to
# UTF-8 with a valid result.
#
# Note that force_encoding re-tags the sample string in place, so the sample
# keeps the last encoding that was tried. Any error listed in ENCODING_ERRORS
# raised along the way is treated as "not valid".
def valid_encoding?(encoding)
  retagged = sample.force_encoding(encoding.name)
  transcoded = retagged.encode('UTF-8')
  transcoded.valid_encoding?
rescue *ENCODING_ERRORS
  false
end