module DocDiff::CharString
Constants
- EOLChars
- Encodings
Public Class Methods
guess_encoding(string)
click to toggle source
# File lib/docdiff/charstring.rb, line 138
# Reports the name of the encoding that +string+ is currently tagged with.
#
# Returns the encoding name as a String, or nil when +string+ is nil/false.
def CharString.guess_encoding(string)
  return nil unless string

  string.encoding.to_s
end
guess_eol(string)
click to toggle source
# File lib/docdiff/charstring.rb, line 72
# Guesses which end-of-line convention +string+ uses.
#
# Returns:
#   nil       - no argument given (+string+ is nil)
#   'CR', 'LF' or 'CRLF' - exactly one EOL flavor was found
#   'NONE'    - no EOL found (might be a 1-line file)
#   'UNKNOWN' - several EOL flavors found (might be binary data)
def CharString.guess_eol(string)
  return nil if string.nil?

  # Work on a byte-oriented copy so the scan does not depend on the encoding.
  raw = string.dup.force_encoding("ASCII-8BIT")
  probes = [
    ['CR',   /(\r)(?!\n)/o],
    ['LF',   /(?:\A|[^\r])(\n)/o],
    ['CRLF', /(\r\n)/o]
  ]
  # Keep only the flavors that actually occur in the data.
  found = probes.reject { |_name, pattern| raw.scan(pattern).size == 0 }
                .map { |name, _pattern| name }

  case found.size
  when 1 then found[0]
  when 0 then 'NONE'
  else 'UNKNOWN'
  end
end
new(string)
click to toggle source
Calls superclass method
# File lib/docdiff/charstring.rb, line 12
# Hands +string+ straight to the superclass initializer.
#
# Encoding/EOL detection was once done here but is disabled as unnecessary:
#   @encoding = CharString.guess_encoding(string)
#   @eol = CharString.guess_eol(string)
def initialize(string)
  super(string)
end
register_encoding(mod)
click to toggle source
# File lib/docdiff/charstring.rb, line 64
# Registers +mod+ in Encodings under the key given by its Encoding constant.
# Returns +mod+.
def CharString.register_encoding(mod)
  key = mod::Encoding
  Encodings[key] = mod
end
register_eol(mod)
click to toggle source
# File lib/docdiff/charstring.rb, line 68
# Registers +mod+ in EOLChars under the key given by its EOL constant.
# Returns +mod+.
def CharString.register_eol(mod)
  key = mod::EOL
  EOLChars[key] = mod
end
Public Instance Methods
count_blank_char()
click to toggle source
# File lib/docdiff/charstring.rb, line 107
# Total of the Latin and Japanese blank-character counts.
def count_blank_char
  latin = count_latin_blank_char
  japanese = count_ja_blank_char
  latin + japanese
end
count_blank_line()
click to toggle source
# File lib/docdiff/charstring.rb, line 241
# Counts the lines that consist solely of blank characters (Latin or
# Japanese), optionally followed by the end-of-line sequence.
#
# The pattern is anchored at both ends: a line that merely *starts* with
# blanks (e.g. indented text) is not a blank line. A line holding nothing but
# the EOL is not counted either, because at least one blank is required.
def count_blank_line
  utf8 = Encodings['UTF-8']
  # Built once; it does not depend on the individual line.
  blank_only = Regexp.new(
    "\\A[#{utf8::BLANK}#{utf8::JA_BLANK}]+(?:#{eol_char})?\\z",
    Regexp::MULTILINE
  )
  split_to_line.count { |line| blank_only.match(line.encode('UTF-8')) }
end
count_byte()
click to toggle source
Note that some languages (such as Japanese) have no notion of 'word' or 'phrase', so some of the following methods are not 'linguistically correct'.
# File lib/docdiff/charstring.rb, line 95
# Number of elements produced by split_to_byte.
def count_byte
  split_to_byte.length
end
count_char()
click to toggle source
# File lib/docdiff/charstring.rb, line 99
# Number of elements produced by split_to_char (an EOL counts as 1 char).
def count_char
  split_to_char.length
end
count_empty_line()
click to toggle source
# File lib/docdiff/charstring.rb, line 123
# Counts the lines from split_to_line that match the empty-line pattern:
# a line beginning with the EOL sequence, or a zero-length line.
def count_empty_line
  split_to_line.count { |line| /^(?:#{eol_char})|^$/m.match(line) }
end
count_graph_char()
click to toggle source
# File lib/docdiff/charstring.rb, line 103
# Total of the Latin and Japanese graphic-character counts.
def count_graph_char
  latin = count_latin_graph_char
  japanese = count_ja_graph_char
  latin + japanese
end
count_graph_line()
click to toggle source
# File lib/docdiff/charstring.rb, line 233
# Counts the lines that contain at least one graphic character
# (Latin GRAPH or Japanese JA_GRAPH, as defined by the UTF-8 encoding module).
def count_graph_line
  utf8 = Encodings['UTF-8']
  # Built once; it does not depend on the individual line.
  graphic = Regexp.new("[#{utf8::GRAPH}#{utf8::JA_GRAPH}]", Regexp::MULTILINE)
  split_to_line.count { |line| graphic.match(line.encode('UTF-8')) }
end
count_ja_blank_char()
click to toggle source
# File lib/docdiff/charstring.rb, line 180
# Counts the characters of the UTF-8 form of the receiver that belong to the
# JA_BLANK class of the UTF-8 encoding module.
def count_ja_blank_char
  utf8_text = encode('UTF-8')
  ja_blank = Regexp.new("[#{Encodings['UTF-8']::JA_BLANK}]", Regexp::MULTILINE)
  utf8_text.scan(ja_blank).length
end
count_ja_graph_char()
click to toggle source
# File lib/docdiff/charstring.rb, line 168
# Counts the characters of the UTF-8 form of the receiver that belong to the
# JA_GRAPH class of the UTF-8 encoding module.
def count_ja_graph_char
  utf8_text = encode('UTF-8')
  ja_graph = Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]", Regexp::MULTILINE)
  utf8_text.scan(ja_graph).length
end
count_ja_valid_word()
click to toggle source
# File lib/docdiff/charstring.rb, line 213
# Counts the words from split_to_word that contain at least one character of
# the JA_GRAPH class of the UTF-8 encoding module.
def count_ja_valid_word
  # Built once; it does not depend on the individual word.
  ja_graph = Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]", Regexp::MULTILINE)
  split_to_word.count { |word| ja_graph.match(word.encode('UTF-8')) }
end
count_ja_word()
click to toggle source
# File lib/docdiff/charstring.rb, line 199
# Counts the words from split_to_word that contain at least one character of
# the JA_PRINT class of the UTF-8 encoding module.
def count_ja_word
  # Built once; it does not depend on the individual word.
  ja_print = Regexp.new("[#{Encodings['UTF-8']::JA_PRINT}]", Regexp::MULTILINE)
  split_to_word.count { |word| ja_print.match(word.encode('UTF-8')) }
end
count_latin_blank_char()
click to toggle source
# File lib/docdiff/charstring.rb, line 174
# Counts the characters of the UTF-8 form of the receiver that belong to the
# BLANK class of the UTF-8 encoding module.
def count_latin_blank_char
  utf8_text = encode('UTF-8')
  blank = Regexp.new("[#{Encodings['UTF-8']::BLANK}]", Regexp::MULTILINE)
  utf8_text.scan(blank).length
end
count_latin_graph_char()
click to toggle source
# File lib/docdiff/charstring.rb, line 162
# Counts the characters of the UTF-8 form of the receiver that belong to the
# GRAPH class of the UTF-8 encoding module.
def count_latin_graph_char
  utf8_text = encode('UTF-8')
  graph = Regexp.new("[#{Encodings['UTF-8']::GRAPH}]", Regexp::MULTILINE)
  utf8_text.scan(graph).length
end
count_latin_valid_word()
click to toggle source
# File lib/docdiff/charstring.rb, line 206
# Counts the words from split_to_word that contain at least one character of
# the ALNUM class of the UTF-8 encoding module.
def count_latin_valid_word
  # Built once; it does not depend on the individual word.
  alnum = Regexp.new("[#{Encodings['UTF-8']::ALNUM}]", Regexp::MULTILINE)
  split_to_word.count { |word| alnum.match(word.encode('UTF-8')) }
end
count_latin_word()
click to toggle source
# File lib/docdiff/charstring.rb, line 192
# Counts the words from split_to_word that contain at least one character of
# the PRINT class of the UTF-8 encoding module.
def count_latin_word
  # Built once; it does not depend on the individual word.
  printable = Regexp.new("[#{Encodings['UTF-8']::PRINT}]", Regexp::MULTILINE)
  split_to_word.count { |word| printable.match(word.encode('UTF-8')) }
end
count_line()
click to toggle source
# File lib/docdiff/charstring.rb, line 119
# Number of elements produced by split_to_line (common to all encodings).
def count_line
  split_to_line.length
end
count_valid_word()
click to toggle source
# File lib/docdiff/charstring.rb, line 115
# Total of the Latin and Japanese valid-word counts.
def count_valid_word
  latin = count_latin_valid_word
  japanese = count_ja_valid_word
  latin + japanese
end
count_word()
click to toggle source
# File lib/docdiff/charstring.rb, line 111
# Number of elements produced by split_to_word.
def count_word
  split_to_word.length
end
debug()
click to toggle source
# File lib/docdiff/charstring.rb, line 45
# Sanity-checks the encoding/EOL state of the receiver and returns a one-line
# description of it.
#
# Raises RuntimeError when @encoding or @eol is nil, when either has no entry
# in Encodings / EOLChars, or when the Encodings entry is not a Module.
#
# Returns a String holding the object id, class, contents and the registered
# encoding and EOL modules.
def debug
  raise "@encoding is nil." if @encoding.nil?

  encoding_module = Encodings[@encoding]
  raise "Encodings[@encoding(=#{@encoding})] is nil." if encoding_module.nil?
  unless encoding_module.class == Module
    raise "Encodings[@encoding].class(=#{encoding_module.class}) is not a module."
  end

  raise "@eol is nil." if @eol.nil?

  eol_module = EOLChars[@eol]
  raise "EOLChars[@eol(=#{@eol})] is nil." if eol_module.nil?

  # Object#id no longer exists as of Ruby 1.9; object_id is its replacement.
  "id: #{object_id}, class: #{self.class}, self: #{self}, " \
    "module: #{encoding_module}, #{eol_module}"
end
encoding()
click to toggle source
For Ruby 1.9.
# File lib/docdiff/charstring.rb, line 130
# Name of the receiver's encoding, as a String.
# The encoding is read from a fresh String built from the receiver.
def encoding
  plain_copy = String.new(self)
  plain_copy.encoding.to_s
end
encoding=(cs)
click to toggle source
# File lib/docdiff/charstring.rb, line 134
# Re-tags the receiver with encoding +cs+ via force_encoding (the bytes are
# left untouched). Does nothing when the receiver is nil/false.
def encoding=(cs)
  return unless self

  force_encoding(cs)
end
eol()
click to toggle source
# File lib/docdiff/charstring.rb, line 20
# Returns the stored EOL name (@eol) as is; nil when it has never been set.
# No auto-detection is performed here: the former fallback to
# CharString.guess_eol(self) is disabled.
def eol
  @eol
end
eol=(e)
click to toggle source
# File lib/docdiff/charstring.rb, line 30
# Stores +e+ as the EOL name, then extends the receiver with the module
# registered for that name in EOLChars.
def eol=(e)
  @eol = e
  eol_module = EOLChars[@eol]
  extend(eol_module)
end
eol_char()
click to toggle source
# File lib/docdiff/charstring.rb, line 35
# Returns @eol_char when it is set, otherwise nil.
# (Extending the EOL module on demand here is disabled.)
def eol_char
  @eol_char || nil
end
split_to_byte()
click to toggle source
# File lib/docdiff/charstring.rb, line 146
# Splits the receiver into an Array of one-byte ASCII-8BIT strings.
#
# The bytes are re-tagged with force_encoding on a copy rather than
# transcoded: encode("ASCII-8BIT") raises for any non-ASCII character, while
# re-tagging exposes the raw bytes of every string. The receiver itself is
# left unchanged.
def split_to_byte
  dup.force_encoding("ASCII-8BIT").scan(/./nm)
end
Also aliased as: to_bytes
split_to_char()
click to toggle source
# File lib/docdiff/charstring.rb, line 150
# Splits the receiver into an Array of characters, each converted back to the
# receiver's own encoding. When an EOL sequence is known (eol_char is set), it
# is kept together as a single element; otherwise every character is its own
# element (the string may simply have no EOL module extended).
def split_to_char
  source = eol_char ? "(?:#{eol_char})|(?:.)" : "(?:.)"
  char_pattern = Regexp.new(source, Regexp::MULTILINE)
  encode('UTF-8').scan(char_pattern).map { |char| char.encode(self.encoding) }
end
Also aliased as: to_chars
split_to_line()
click to toggle source
# File lib/docdiff/charstring.rb, line 220
# Splits the receiver into an Array of lines, each converted back to the
# receiver's own encoding. Every line keeps its trailing EOL sequence; a final
# line without EOL is returned as is.
#
# Raises RuntimeError when no module is registered in EOLChars for the
# current eol.
def split_to_line
  unless EOLChars[eol]
    raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed."
  end

  # Test the *value* of eol_char: `defined? eol_char` is truthy whenever the
  # method exists, and a nil eol_char would produce the pattern ".*?|.+",
  # which matches empty strings at every position.
  source = eol_char ? ".*?#{eol_char}|.+" : ".+"
  line_pattern = Regexp.new(source, Regexp::MULTILINE)
  encode('UTF-8').scan(line_pattern).map { |line| line.encode(self.encoding) }
end
Also aliased as: to_lines
split_to_word()
click to toggle source
# File lib/docdiff/charstring.rb, line 186
# Splits the receiver into an Array of words using WORD_REGEXP_SRC of the
# UTF-8 encoding module; each word is converted back to the receiver's own
# encoding.
def split_to_word
  utf8_text = encode('UTF-8')
  word_pattern = Regexp.new(Encodings['UTF-8']::WORD_REGEXP_SRC, Regexp::MULTILINE)
  utf8_text.scan(word_pattern).map { |word| word.encode(self.encoding) }
end
Also aliased as: to_words