class String

extend String class

Public Class Methods

is_codepoint_nikkud_cp1255(cp) click to toggle source
# File lib/hebrew.rb, line 143
# Reports whether a Windows-1255 codepoint is one this library treats as
# nikkud: anything from 192 through 204, plus 209 and 210.
#
# @param cp [Integer] codepoint to classify
# @return [Boolean] true when cp falls in one of those ranges
def self.is_codepoint_nikkud_cp1255(cp)
  # Plain comparisons are deliberate: a NIKKUD_CP1255.include?(cp) lookup
  # was noted as cleaner, but much slower.
  return true if cp > 191 && cp < 205

  cp == 209 || cp == 210
end
is_codepoint_nikkud_utf8(cp) click to toggle source
# File lib/hebrew.rb, line 147
# Reports whether a Unicode codepoint is one this library treats as nikkud:
# anything from 0x05b0 through 0x05bc, plus 0x05c1 and 0x05c2.
#
# @param cp [Integer] codepoint to classify
# @return [Boolean] true when cp falls in one of those ranges
def self.is_codepoint_nikkud_utf8(cp)
  # Plain comparisons are deliberate: a NIKKUD_UTF8.include?(cp) lookup
  # was noted as cleaner, but much slower.
  return true if cp > 0x05af && cp < 0x05bd

  cp == 0x05c1 || cp == 0x05c2
end
is_final_by_encoding(c, encoding) click to toggle source

Returns true if the first parameter is a final letter in the encoding given by the second parameter.

# File lib/hebrew.rb, line 162
# Reports whether c appears in the final-letter table for the given encoding.
#
# @param c the character to look up (presumably a one-character String --
#   confirm against callers)
# @param encoding [Encoding] selects which final-letter table is consulted
# @return [Boolean, nil] the result of the table lookup; nil for any encoding
#   other than UTF-8 or Windows-1255
def self.is_final_by_encoding(c, encoding)
  case encoding
  when Encoding::UTF_8
    # NOTE(review): FIANLS_UTF8 looks like a misspelling of FINALS_UTF8. The
    # constant is defined outside this view, so the name is kept -- confirm.
    FIANLS_UTF8.include?(c)
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # `when` alternatives are comma-separated; `A || B` only ever tests A.
    FINALS_CP1255.include?(c)
  end
end
is_nikkud_by_encoding(c, encoding) click to toggle source

Returns true if the first parameter is a nikkud character in the encoding given by the second parameter.

# File lib/hebrew.rb, line 152
# Reports whether the first codepoint of c is nikkud under the given encoding,
# by delegating to the matching per-encoding codepoint predicate.
#
# @param c [String] character to test; only its first codepoint is examined
# @param encoding [Encoding] selects which codepoint predicate is used
# @return [Boolean, nil] the predicate's answer; nil for any encoding other
#   than UTF-8 or Windows-1255
def self.is_nikkud_by_encoding(c, encoding)
  case encoding
  when Encoding::UTF_8
    self.is_codepoint_nikkud_utf8(c.codepoints.first)
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # `when` alternatives are comma-separated; `A || B` only ever tests A.
    self.is_codepoint_nikkud_cp1255(c.codepoints.first)
  # TODO: add Mac encoding?
  end
end

Public Instance Methods

any_hebrew?() click to toggle source

Returns true if the string contains any Hebrew character (short-circuits on the first match).

# File lib/hebrew.rb, line 77
# Reports whether the string contains at least one Hebrew codepoint, stopping
# at the first match.
#
# Only UTF-8 and Windows-1255 strings are inspected; a string in any other
# encoding yields false.
#
# @return [Boolean]
def any_hebrew?
  case self.encoding
  when Encoding::UTF_8
    each_codepoint.any? { |cp| is_hebrew_codepoint_utf8(cp) }
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # `when` alternatives are comma-separated; `A || B` only ever tests A.
    each_codepoint.any? { |cp| is_hebrew_codepoint_cp1255(cp) }
  else
    false
  end
end
any_nikkud?() click to toggle source
# File lib/hebrew.rb, line 116
# Reports whether the string contains at least one nikkud codepoint, stopping
# at the first match.
#
# Only UTF-8 and Windows-1255 strings are inspected; a string in any other
# encoding yields false.
#
# @return [Boolean]
def any_nikkud?
  func = case self.encoding
    when Encoding::UTF_8
      :is_codepoint_nikkud_utf8
    when Encoding::WINDOWS_1255, Encoding::CP1255
      # `when` alternatives are comma-separated; `A || B` only ever tests A.
      :is_codepoint_nikkud_cp1255
    end
  # No predicate exists for other encodings. Answer false directly rather than
  # dispatching a per-codepoint call to String with a placeholder method name,
  # which raised NoMethodError on any non-empty string.
  return false if func.nil?

  each_codepoint.any? { |cp| String.send(func, cp) }
end
falsehood() click to toggle source
# File lib/hebrew.rb, line 90
# Constant predicate: takes no arguments and always answers false.
#
# @return [false]
def falsehood
  return false
end
is_hebrew_codepoint_cp1255(cp) click to toggle source
# File lib/hebrew.rb, line 129
# Reports whether a Windows-1255 codepoint is Hebrew.
#
# Recognised values are the vowel points 192..201, 203, 204, 209 and 210,
# plus the Hebrew letters alef..tav, which Windows-1255 places at
# 0xE0..0xFA (224..250). The letter range was previously missing, so text
# consisting only of Hebrew letters was not recognised as Hebrew at all.
#
# @param cp [Integer] codepoint to classify
# @return [Boolean]
def is_hebrew_codepoint_cp1255(cp)
  return true if cp > 191 && cp < 202
  return true if [203, 204, 209, 210].include?(cp)

  cp >= 224 && cp <= 250 # alef (0xE0) through tav (0xFA)
end
is_hebrew_codepoint_utf8(cp) click to toggle source
# File lib/hebrew.rb, line 132
# Reports whether a Unicode codepoint lies within the inclusive range
# HEB_UTF8_START..HEB_UTF8_END.
#
# @param cp [Integer] codepoint to classify
# @return [Boolean]
def is_hebrew_codepoint_utf8(cp)
  cp.between?(HEB_UTF8_START, HEB_UTF8_END)
end
is_nikkud(c) click to toggle source

Returns true if the parameter is a nikkud character.

# File lib/hebrew.rb, line 139
# Reports whether c is a nikkud character, interpreting it in the receiver's
# own encoding.
#
# @param c character to test; passed through unchanged to the class-level
#   is_nikkud_by_encoding
# @return whatever is_nikkud_by_encoding returns for (c, receiver encoding)
def is_nikkud(c)
  # The class-level helper does the work; this string only supplies the encoding.
  owner = self.class
  owner.is_nikkud_by_encoding(c, encoding)
end
naive_full_nikkud() click to toggle source

This will add matres lectionis (yods and vavs used as vowels) after the diacritics that denote those vowels. The result won't always be morphologically correct Hebrew, but it is useful for generating the most likely variants users may search for when typing input (almost no Hebrew users know how to produce diacritics on the keyboard).

# File lib/hebrew.rb, line 95
# Builds a copy of the string with vowel letters inserted after certain
# diacritics. Per character:
# * a HEB_UTF8_QUBBUTS mark is replaced by the literal vav-plus-shuruk string;
# * a yod is appended after a HEB_UTF8_XIRIK mark;
# * a vav is appended after a HEB_UTF8_XOLAM mark, unless the preceding
#   character is already a vav.
# A final pair of substitutions removes yods that were doubled because the
# loop does not look ahead.
#
# @return [String, nil] the expanded string for UTF-8 receivers; nil for any
#   other encoding
def naive_full_nikkud
  # not implemented for other encodings for now.
  return nil unless self.encoding == Encoding::UTF_8

  expanded = ''
  previous = nil
  each_char do |char|
    point = char.codepoints[0]
    # replace Qubbuts with vav and shuruk
    expanded << (point == HEB_UTF8_QUBBUTS ? 'וּ' : char)
    expanded << 'י' if point == HEB_UTF8_XIRIK
    expanded << 'ו' if point == HEB_UTF8_XOLAM && previous != 'ו'
    previous = char
  end
  # get rid of extraneous yods possibly added because we weren't looking ahead
  expanded.gsub("\u05b4יי","\u05b4י").gsub("\u05b4י\u05bcי", "\u05b4\u05bcי")
end
strip_hebrew() click to toggle source
# File lib/hebrew.rb, line 23
# Dispatches to the encoding-specific Hebrew-stripping routine for this
# string's encoding.
#
# @return the result of strip_hebrew_utf8 or strip_hebrew_cp1255; nil for any
#   encoding other than UTF-8 or Windows-1255
def strip_hebrew
  case self.encoding
  when Encoding::UTF_8
    strip_hebrew_utf8
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # `when` alternatives are comma-separated; `A || B` only ever tests A.
    strip_hebrew_cp1255
  end
end
strip_hebrew_cp1255() click to toggle source
# File lib/hebrew.rb, line 40
# Builds a new windows-1255 string holding every codepoint of the receiver
# that is neither nikkud (per the class-level CP1255 predicate) nor Hebrew
# (per is_hebrew_codepoint_cp1255). The receiver is not modified.
#
# @return [String] the remaining characters, tagged windows-1255
def strip_hebrew_cp1255
  each_codepoint.each_with_object(''.force_encoding('windows-1255')) do |codepoint, kept|
    next if self.class.is_codepoint_nikkud_cp1255(codepoint)
    next if self.is_hebrew_codepoint_cp1255(codepoint)

    kept << codepoint.chr(Encoding::CP1255) # is there a neater way?
  end
end
strip_hebrew_utf8() click to toggle source
# File lib/hebrew.rb, line 31
# Builds a new string holding every codepoint of the receiver that is neither
# nikkud (per the class-level UTF-8 predicate) nor Hebrew (per
# is_hebrew_codepoint_utf8). The receiver is not modified.
#
# @return [String] the remaining characters
def strip_hebrew_utf8
  each_codepoint.each_with_object('') do |codepoint, kept|
    next if self.class.is_codepoint_nikkud_utf8(codepoint)
    next if self.is_hebrew_codepoint_utf8(codepoint)

    kept << codepoint.chr(Encoding::UTF_8)
  end
end
strip_nikkud() click to toggle source

Returns a copy of the string with all Hebrew nikkud characters removed.

# File lib/hebrew.rb, line 50
# Dispatches to the encoding-specific nikkud-stripping routine for this
# string's encoding.
#
# @return the result of strip_nikkud_utf8 or strip_nikkud_cp1255; nil for any
#   encoding other than UTF-8 or Windows-1255
def strip_nikkud
  case self.encoding
  when Encoding::UTF_8
    strip_nikkud_utf8
  when Encoding::WINDOWS_1255, Encoding::CP1255
    # `when` alternatives are comma-separated; `A || B` only ever tests A.
    strip_nikkud_cp1255
  end
end
strip_nikkud_cp1255() click to toggle source
# File lib/hebrew.rb, line 58
# Builds a new windows-1255 string holding every codepoint of the receiver
# that the class-level CP1255 nikkud predicate rejects. The receiver is not
# modified.
#
# @return [String] the remaining characters, tagged windows-1255
def strip_nikkud_cp1255
  nikkud_free = ''.force_encoding('windows-1255')
  each_codepoint do |codepoint|
    next if self.class.is_codepoint_nikkud_cp1255(codepoint)

    nikkud_free << codepoint.chr(Encoding::CP1255) # is there a neater way?
  end
  nikkud_free
end
strip_nikkud_utf8() click to toggle source
# File lib/hebrew.rb, line 67
def strip_nikkud_utf8
  target = ''
  self.each_codepoint {|cp|
    unless self.class.is_codepoint_nikkud_utf8(cp)
      target << cp.chr(Encoding::UTF_8)
    end
  }
  return target
end