module DocDiff::CharString

Constants

EOLChars
Encodings

Public Class Methods

guess_encoding(string) click to toggle source
# File lib/docdiff/charstring.rb, line 138
# Reports the name of +string+'s encoding.
#
# Returns the encoding name as a String, or nil when +string+ is nil
# (or false).
def CharString.guess_encoding(string)
  return nil unless string
  string.encoding.to_s
end
guess_eol(string) click to toggle source
# File lib/docdiff/charstring.rb, line 72
# Guesses which end-of-line convention +string+ uses by scanning its raw
# bytes for each EOL flavor.
#
# Returns:
#   nil        when +string+ is nil (argument missing)
#   'CR', 'LF' or 'CRLF' when exactly one flavor is present
#   'NONE'     when no EOL is present (might be a 1-line file)
#   'UNKNOWN'  when several flavors are present (might be binary data)
def CharString.guess_eol(string)
  return nil if string.nil?
  # Work on a binary-tagged copy so the scan is byte-wise and the
  # argument itself is left untouched.
  raw = string.dup.force_encoding("ASCII-8BIT")
  flavors = [['CR',   /(\r)(?!\n)/o],
             ['LF',   /(?:\A|[^\r])(\n)/o],
             ['CRLF', /(\r\n)/o]]
  # Keep only the names of the flavors that occur at least once.
  found = flavors.reject { |_name, pattern| raw.scan(pattern).size == 0 }
                 .map { |name, _pattern| name }
  case found.size
  when 1 then found[0]
  when 0 then 'NONE'
  else 'UNKNOWN'
  end
end
new(string) click to toggle source
Calls superclass method
# File lib/docdiff/charstring.rb, line 12
  # Initializes the object by handing +string+ to the superclass.
  #
  # Encoding and EOL are not guessed here: the former calls to
  # CharString.guess_encoding and CharString.guess_eol were marked
  # unnecessary and are no longer made.
  def initialize(string)
    super(string)
  end
register_encoding(mod) click to toggle source
# File lib/docdiff/charstring.rb, line 64
# Registers +mod+ in the Encodings table under the key given by its
# Encoding constant. Returns +mod+.
def CharString.register_encoding(mod)
  key = mod::Encoding
  Encodings[key] = mod
end
register_eol(mod) click to toggle source
# File lib/docdiff/charstring.rb, line 68
# Registers +mod+ in the EOLChars table under the key given by its EOL
# constant. Returns +mod+.
def CharString.register_eol(mod)
  key = mod::EOL
  EOLChars[key] = mod
end

Public Instance Methods

count_blank_char() click to toggle source
# File lib/docdiff/charstring.rb, line 107
# Returns the total number of blank characters: the sum of
# count_latin_blank_char and count_ja_blank_char.
def count_blank_char
  latin_blanks = count_latin_blank_char
  ja_blanks = count_ja_blank_char
  latin_blanks + ja_blanks
end
count_blank_line() click to toggle source
# File lib/docdiff/charstring.rb, line 241
# Counts the lines (as produced by split_to_line) that consist solely of
# blank characters, optionally followed by the EOL sequence.
#
# "Blank" means any character in the class built from the UTF-8 encoding
# module's BLANK and JA_BLANK constants. Each line is transcoded to UTF-8
# before matching.
#
# The pattern is anchored at both ends (\A ... \z): without the end
# anchor a line that merely *starts* with a blank (e.g. "  text") would
# be counted as blank.
#
# Returns an Integer.
def count_blank_line()
  utf8 = Encodings['UTF-8']
  # Built once; it does not depend on the individual line.
  blank_line = Regexp.new("\\A[#{utf8::BLANK}#{utf8::JA_BLANK}]+" +
                          "(?:#{eol_char})?\\z",
                          Regexp::MULTILINE)
  split_to_line.count { |line| blank_line.match(line.encode('UTF-8')) }
end
count_byte() click to toggle source

Note that some languages (such as Japanese) have no clear notion of a 'word' or 'phrase', so some of the following methods are not 'linguistically correct'.

# File lib/docdiff/charstring.rb, line 95
# Returns the number of elements produced by split_to_byte.
def count_byte
  split_to_byte.length
end
count_char() click to toggle source
# File lib/docdiff/charstring.rb, line 99
# Returns the number of elements produced by split_to_char
# (an EOL sequence counts as one character).
def count_char
  split_to_char.length
end
count_empty_line() click to toggle source
# File lib/docdiff/charstring.rb, line 123
# Counts the lines (as produced by split_to_line) that match the
# empty-line pattern: a line starting with the EOL sequence, or an
# empty line.
def count_empty_line
  split_to_line.count do |line|
    /^(?:#{eol_char})|^$/m =~ line
  end
end
count_graph_char() click to toggle source
# File lib/docdiff/charstring.rb, line 103
# Returns the total number of graphic characters: the sum of
# count_latin_graph_char and count_ja_graph_char.
def count_graph_char
  latin_graphs = count_latin_graph_char
  ja_graphs = count_ja_graph_char
  latin_graphs + ja_graphs
end
count_graph_line() click to toggle source
# File lib/docdiff/charstring.rb, line 233
# Counts the lines (as produced by split_to_line) that contain at least
# one character from the class built from the UTF-8 encoding module's
# GRAPH and JA_GRAPH constants. Each line is transcoded to UTF-8 before
# matching.
def count_graph_line
  split_to_line.count do |line|
    utf8 = Encodings['UTF-8']
    graph_pattern = Regexp.new("[#{utf8::GRAPH}#{utf8::JA_GRAPH}]",
                               Regexp::MULTILINE)
    graph_pattern.match(line.encode('UTF-8'))
  end
end
count_ja_blank_char() click to toggle source
# File lib/docdiff/charstring.rb, line 180
# Counts the characters of the UTF-8 transcoded string that belong to the
# class built from the UTF-8 encoding module's JA_BLANK constant.
def count_ja_blank_char
  utf8_text = encode('UTF-8')
  ja_blank = Regexp.new("[#{Encodings['UTF-8']::JA_BLANK}]",
                        Regexp::MULTILINE)
  utf8_text.scan(ja_blank).length
end
count_ja_graph_char() click to toggle source
# File lib/docdiff/charstring.rb, line 168
# Counts the characters of the UTF-8 transcoded string that belong to the
# class built from the UTF-8 encoding module's JA_GRAPH constant.
def count_ja_graph_char
  utf8_text = encode('UTF-8')
  ja_graph = Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]",
                        Regexp::MULTILINE)
  utf8_text.scan(ja_graph).length
end
count_ja_valid_word() click to toggle source
# File lib/docdiff/charstring.rb, line 213
# Counts the words (as produced by split_to_word) that contain at least
# one character from the class built from the UTF-8 encoding module's
# JA_GRAPH constant. Each word is transcoded to UTF-8 before matching.
def count_ja_valid_word
  split_to_word.count do |word|
    ja_graph = Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]",
                          Regexp::MULTILINE)
    ja_graph.match(word.encode('UTF-8'))
  end
end
count_ja_word() click to toggle source
# File lib/docdiff/charstring.rb, line 199
# Counts the words (as produced by split_to_word) that contain at least
# one character from the class built from the UTF-8 encoding module's
# JA_PRINT constant. Each word is transcoded to UTF-8 before matching.
def count_ja_word
  split_to_word.count do |word|
    ja_print = Regexp.new("[#{Encodings['UTF-8']::JA_PRINT}]",
                          Regexp::MULTILINE)
    ja_print.match(word.encode('UTF-8'))
  end
end
count_latin_blank_char() click to toggle source
# File lib/docdiff/charstring.rb, line 174
# Counts the characters of the UTF-8 transcoded string that belong to the
# class built from the UTF-8 encoding module's BLANK constant.
def count_latin_blank_char
  utf8_text = encode('UTF-8')
  blank = Regexp.new("[#{Encodings['UTF-8']::BLANK}]",
                     Regexp::MULTILINE)
  utf8_text.scan(blank).length
end
count_latin_graph_char() click to toggle source
# File lib/docdiff/charstring.rb, line 162
# Counts the characters of the UTF-8 transcoded string that belong to the
# class built from the UTF-8 encoding module's GRAPH constant.
def count_latin_graph_char
  utf8_text = encode('UTF-8')
  graph = Regexp.new("[#{Encodings['UTF-8']::GRAPH}]",
                     Regexp::MULTILINE)
  utf8_text.scan(graph).length
end
count_latin_valid_word() click to toggle source
# File lib/docdiff/charstring.rb, line 206
# Counts the words (as produced by split_to_word) that contain at least
# one character from the class built from the UTF-8 encoding module's
# ALNUM constant. Each word is transcoded to UTF-8 before matching.
def count_latin_valid_word
  split_to_word.count do |word|
    alnum = Regexp.new("[#{Encodings['UTF-8']::ALNUM}]",
                       Regexp::MULTILINE)
    alnum.match(word.encode('UTF-8'))
  end
end
count_latin_word() click to toggle source
# File lib/docdiff/charstring.rb, line 192
# Counts the words (as produced by split_to_word) that contain at least
# one character from the class built from the UTF-8 encoding module's
# PRINT constant. Each word is transcoded to UTF-8 before matching.
def count_latin_word
  split_to_word.count do |word|
    printable = Regexp.new("[#{Encodings['UTF-8']::PRINT}]",
                           Regexp::MULTILINE)
    printable.match(word.encode('UTF-8'))
  end
end
count_line() click to toggle source
# File lib/docdiff/charstring.rb, line 119
# Returns the number of lines produced by split_to_line.
# This is common to all encodings.
def count_line
  split_to_line.length
end
count_valid_word() click to toggle source
# File lib/docdiff/charstring.rb, line 115
# Returns the total number of valid words: the sum of
# count_latin_valid_word and count_ja_valid_word.
def count_valid_word
  latin_words = count_latin_valid_word
  ja_words = count_ja_valid_word
  latin_words + ja_words
end
count_word() click to toggle source
# File lib/docdiff/charstring.rb, line 111
# Returns the number of words produced by split_to_word.
def count_word
  split_to_word.length
end
debug() click to toggle source
# File lib/docdiff/charstring.rb, line 45
# Checks that this object's encoding and EOL settings are usable and
# returns a one-line diagnostic String of the form
#   "id: <object_id>, class: <class>, self: <self>, module: <encoding module>, <eol module>"
#
# Raises RuntimeError (checked in this order) when:
#   * @encoding is nil
#   * Encodings has no entry for @encoding
#   * that entry's class is not Module
#   * @eol is nil
#   * EOLChars has no entry for @eol
#
# The object identifier is read with object_id; Object#id no longer
# exists (removed in Ruby 1.9), so calling it raised NoMethodError.
#
# NOTE(review): nothing visible in this file assigns @encoding -- confirm
# it is set elsewhere before relying on this method.
def debug()
  raise "@encoding is nil." if @encoding.nil?
  encoding_module = Encodings[@encoding]
  raise "Encodings[@encoding(=#{@encoding})] is nil." if encoding_module.nil?
  unless encoding_module.class == Module
    raise "Encodings[@encoding].class(=#{encoding_module.class}) is not a module."
  end
  raise "@eol is nil." if @eol.nil?
  eol_module = EOLChars[@eol]
  raise "EOLChars[@eol(=#{@eol})] is nil." if eol_module.nil?

  ["id: #{object_id}, class: #{self.class}, self: #{self}, ",
   "module: #{encoding_module}, #{eol_module}"].join
end
encoding() click to toggle source

for Ruby-1.9

# File lib/docdiff/charstring.rb, line 130
# Returns the name of this string's encoding as a String.
#
# The encoding is read from a plain String copy of self, so the
# built-in String#encoding is what gets consulted.
def encoding
  plain_copy = String.new(self)
  plain_copy.encoding.to_s
end
encoding=(cs) click to toggle source
# File lib/docdiff/charstring.rb, line 134
# Re-tags this string with encoding +cs+ via force_encoding (the bytes
# are not transcoded). Nothing is done when self is nil or false.
def encoding=(cs)
  return nil unless self
  force_encoding(cs)
end
eol() click to toggle source
# File lib/docdiff/charstring.rb, line 20
  # Returns the EOL name stored in @eol, or nil when none has been set.
  #
  # No auto-detection happens here: a former fallback that assigned
  # CharString.guess_eol(self) when @eol was unset is disabled.
  def eol
    @eol
  end
eol=(e) click to toggle source
# File lib/docdiff/charstring.rb, line 30
# Stores +e+ in @eol and extends this object with the module that
# EOLChars holds for that key.
def eol=(e)
  @eol = e
  eol_module = EOLChars[e]
  extend(eol_module)
end
eol_char() click to toggle source
# File lib/docdiff/charstring.rb, line 35
  # Returns @eol_char when it holds a truthy value, otherwise nil.
  #
  # A former fallback that extended the object with EOLChars[eol] and
  # retried is disabled.
  def eol_char
    @eol_char || nil
  end
split_to_byte() click to toggle source
# File lib/docdiff/charstring.rb, line 146
# Splits the string into an Array of one-byte Strings tagged ASCII-8BIT.
#
# The bytes are reinterpreted on a copy with force_encoding rather than
# transcoded with encode: encode("ASCII-8BIT") raises
# Encoding::UndefinedConversionError for any non-ASCII character, so
# multi-byte text could not be split at all. The receiver itself is not
# modified.
def split_to_byte()
  dup.force_encoding("ASCII-8BIT").scan(/./nm)
end
Also aliased as: to_bytes
split_to_char() click to toggle source
# File lib/docdiff/charstring.rb, line 150
# Splits the string into an Array of characters.
#
# When eol_char is set, an EOL sequence is kept together as a single
# element; otherwise (no EOL module seems to have been extended) every
# character is its own element. Matching is done on a UTF-8 transcoded
# copy and each element is transcoded back to this string's encoding.
def split_to_char
  terminator = eol_char
  char_source = terminator ? "(?:#{terminator})|(?:.)" : "(?:.)"
  utf8_text = encode('UTF-8')
  char_pattern = Regexp.new(char_source, Regexp::MULTILINE)
  utf8_text.scan(char_pattern).map { |char| char.encode(self.encoding) }
end
Also aliased as: to_chars
split_to_line() click to toggle source
# File lib/docdiff/charstring.rb, line 220
# Splits the string into an Array of lines, each keeping its trailing
# EOL sequence (the last line may have none).
#
# Matching is done on a UTF-8 transcoded copy and each line is
# transcoded back to this string's encoding.
#
# The branch is chosen by the *value* of eol_char. The previous
# `defined? eol_char` test was always truthy once the method exists, so a
# nil eol_char produced the pattern ".*?|.+", which matches the empty
# string at every position and returned empty strings instead of lines.
# With no EOL character the whole string is now returned as one line.
#
# Raises RuntimeError when EOLChars has no entry for eol.
def split_to_line()
  raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
  terminator = eol_char
  line_source = terminator ? ".*?#{terminator}|.+" : ".+"
  line_pattern = Regexp.new(line_source, Regexp::MULTILINE)
  encode('UTF-8').scan(line_pattern).map { |line| line.encode(self.encoding) }
end
Also aliased as: to_lines
split_to_word() click to toggle source
# File lib/docdiff/charstring.rb, line 186
# Splits the string into an Array of words using the pattern whose source
# is the UTF-8 encoding module's WORD_REGEXP_SRC constant.
#
# Matching is done on a UTF-8 transcoded copy and each word is
# transcoded back to this string's encoding.
def split_to_word
  utf8_text = encode('UTF-8')
  word_pattern = Regexp.new(Encodings['UTF-8']::WORD_REGEXP_SRC,
                            Regexp::MULTILINE)
  words = utf8_text.scan(word_pattern)
  words.map { |word| word.encode(self.encoding) }
end
Also aliased as: to_words
to_bytes()
Alias for: split_to_byte
to_chars()
Alias for: split_to_char
to_lines()
Alias for: split_to_line
to_words()
Alias for: split_to_word