class CharDet::UniversalDetector
Attributes
Public Class Methods
Source
# File lib/rchardet/universaldetector.rb, line 41
#
# Creates a detector with no probers allocated yet and puts it into its
# initial state via #reset.
def initialize
  # reset iterates over @charSetProbers and checks @escCharSetProber, so both
  # must exist before it is called.
  @charSetProbers = []
  @escCharSetProber = nil

  # Matches an ESC byte or the two-character sequence "~{".
  @escDetector = /(\033|\~\{)/n
  # Matches any byte with the high bit set (0x80-0xFF).
  @highBitDetector = /[\x80-\xFF]/n

  reset
end
Public Instance Methods
Source
# File lib/rchardet/universaldetector.rb, line 146
#
# Finishes detection and settles on a final result.
#
# Returns the result hash ({'encoding' => ..., 'confidence' => ...}) when this
# call decides the encoding: 'ascii' if only 7-bit input without escape
# sequences was seen, or the best high-byte prober whose confidence exceeds
# MINIMUM_THRESHOLD. Returns nil when detection was already done, when no data
# was ever fed, or when no prober is confident enough (@result is left as is).
#
# When the global $debug is truthy, diagnostics are written to $stderr.
def close
  return if @done

  unless @gotData
    $stderr << "no data received!\n" if $debug
    return
  end
  @done = true

  if @inputState == EPureAscii
    @result = {'encoding' => 'ascii', 'confidence' => 1.0}
    return @result
  end

  if @inputState == EHighbyte
    # Ask each prober for its confidence exactly once, then pick the highest.
    scored = @charSetProbers.map { |prober| [prober, prober.get_confidence] }
    best, best_confidence = scored.max_by { |_prober, confidence| confidence }
    if best && best_confidence > MINIMUM_THRESHOLD
      @result = {'encoding' => best.get_charset_name, 'confidence' => best_confidence}
      return @result
    end
  end

  if $debug
    $stderr << "no probers hit minimum threshhold\n"
    # Only the first prober's sub-probers are dumped. The list can be empty
    # (e.g. escape-sequence input never allocates it) and the first entry may
    # not expose #probers, so guard instead of raising NoMethodError.
    group = @charSetProbers.first
    if group.respond_to?(:probers)
      (group.probers || []).each do |prober|
        next unless prober
        $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n"
      end
    end
  end

  nil
end
Source
# File lib/rchardet/universaldetector.rb, line 64
#
# Feeds one chunk of input to the detector.
#
# aBuf:: a String holding the next chunk of data; empty chunks are ignored.
#
# On the first non-empty chunk the leading bytes are checked for a byte order
# mark. After that, the chunk drives the input state (pure ASCII, escape
# sequences, or high-byte data) and is handed to the matching prober(s). Once
# an encoding is found, @result is set, @done becomes true and further calls
# return immediately.
def feed(aBuf)
  return if @done
  return if aBuf.length == 0

  unless @gotData
    # If the data starts with a BOM, we know it is UTF.
    bom = bom_result(aBuf)
    @result = bom if bom
  end
  @gotData = true

  if @result['encoding'] and (@result['confidence'] > 0.0)
    @done = true
    return
  end

  if @inputState == EPureAscii
    if @highBitDetector =~ aBuf
      @inputState = EHighbyte
    elsif @escDetector =~ (@lastChar + aBuf)
      # The previous chunk's last character is prepended so a "~{" split
      # across two chunks is still seen.
      @inputState = EEscAscii
    end
  end
  @lastChar = aBuf[-1, 1]

  if @inputState == EEscAscii
    @escCharSetProber ||= EscCharSetProber.new
    if @escCharSetProber.feed(aBuf) == EFoundIt
      @result = {'encoding' => @escCharSetProber.get_charset_name,
                 'confidence' => @escCharSetProber.get_confidence}
      @done = true
    end
  elsif @inputState == EHighbyte
    if @charSetProbers.nil? || @charSetProbers.empty?
      @charSetProbers = [MBCSGroupProber.new, SBCSGroupProber.new, Latin1Prober.new]
    end
    @charSetProbers.each do |prober|
      next unless prober.feed(aBuf) == EFoundIt
      @result = {'encoding' => prober.get_charset_name,
                 'confidence' => prober.get_confidence}
      @done = true
      break
    end
  end
end

# Returns a result hash for the byte order mark at the start of aBuf, or nil
# when there is none.
#
# The check is done on raw byte values rather than with String#[] and
# String#==. String#[] indexes characters, so on a UTF-8 tagged string
# aBuf[0, 3] is the BOM plus two more characters; and String#== is false for
# non-ASCII strings whose encodings differ (binary buffer vs. source literal).
# Either way the BOM would be missed.
def bom_result(aBuf)
  head = aBuf.each_byte.first(4)

  # Four-byte marks are tested before the two-byte UTF-16 marks because
  # FF FE 00 00 and FE FF 00 00 start with the UTF-16 marks.
  if head[0, 3] == [0xEF, 0xBB, 0xBF]
    # EF BB BF  UTF-8 with BOM
    {'encoding' => "UTF-8", 'confidence' => 1.0}
  elsif head == [0xFF, 0xFE, 0x00, 0x00]
    # FF FE 00 00  UTF-32, little-endian BOM
    {'encoding' => "UTF-32LE", 'confidence' => 1.0}
  elsif head == [0x00, 0x00, 0xFE, 0xFF]
    # 00 00 FE FF  UTF-32, big-endian BOM
    {'encoding' => "UTF-32BE", 'confidence' => 1.0}
  elsif head == [0xFE, 0xFF, 0x00, 0x00]
    # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
    {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0}
  elsif head == [0x00, 0x00, 0xFF, 0xFE]
    # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
    {'encoding' => "X-ISO-10646-UCS-4-2143", 'confidence' => 1.0}
  elsif head[0, 2] == [0xFF, 0xFE]
    # FF FE  UTF-16, little endian BOM
    {'encoding' => "UTF-16LE", 'confidence' => 1.0}
  elsif head[0, 2] == [0xFE, 0xFF]
    # FE FF  UTF-16, big endian BOM
    {'encoding' => "UTF-16BE", 'confidence' => 1.0}
  elsif head[0, 3] == [0x2B, 0x2F, 0x76] && [0x38, 0x39, 0x2B, 0x2F].include?(head[3])
    # NOTE: Ruby only includes "dummy" support for UTF-7.
    # A Ruby UTF-7 string can't have methods called on it, nor can it be
    # converted to anything else, but "BINARY"/"ASCII-8BIT".
    # Still, this doesn't make detection useless, as UTF-7 encodings exist in
    # the wild, and the scenario may need to be handled.
    #
    # 2B 2F 76 38     UTF-7
    # 2B 2F 76 39     UTF-7
    # 2B 2F 76 2B     UTF-7
    # 2B 2F 76 2F     UTF-7
    # 2B 2F 76 38 2D  UTF-7 with no following character (empty string)
    {'encoding' => "UTF-7", 'confidence' => 0.99}
  end
end
private :bom_result
Source
# File lib/rchardet/universaldetector.rb, line 49
#
# Returns the detector to its initial state so it can be reused: clears the
# result and all progress flags, goes back to the pure-ASCII input state, and
# resets every prober that has already been created.
def reset
  @result = {'encoding' => nil, 'confidence' => 0.0}
  @done = false
  @start = true
  @gotData = false
  @inputState = EPureAscii
  @lastChar = ''

  @escCharSetProber.reset if @escCharSetProber
  @charSetProbers.each { |prober| prober.reset }
end