class Twkorean::TwitterKoreanText

Public Class Methods

new(normalization = true, stemming = true) click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 4
# Starts the Java bridge with every jar found in the sibling "jars" directory
# on the classpath.
#
# normalization and stemming are accepted but not read anywhere in this
# method.
def initialize(normalization = true, stemming = true)
  jar_pattern = "#{File.dirname(__FILE__)}/../jars/*.jar"
  classpath = Dir.glob(jar_pattern).join(File::PATH_SEPARATOR)
  # '-Xmx512M' is handed to Rjb as a JVM argument.
  Rjb.load(classpath, ['-Xmx512M'])
end

Public Instance Methods

extract_phrases(tokens) click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 34
# Extracts phrases from a token sequence and returns them as Ruby strings.
#
# tokens           - token sequence as returned by #tokenize.
# filter_spam      - forwarded as the second argument of the processor's
#                    extractPhrases (default: true, the previously
#                    hard-coded value).
# include_hashtags - forwarded as the third argument of extractPhrases
#                    (default: true, the previously hard-coded value).
#
# NOTE(review): the keyword names follow the Java signature
# extractPhrases(tokens, filterSpam, includeHashtags) — confirm against the
# bundled jar version.
#
# Returns an Array with the toString value of every extracted phrase.
def extract_phrases(tokens, filter_spam: true, include_hashtags: true)
  phrases = korean_processor.extractPhrases(tokens, filter_spam, include_hashtags)
  phrases.toArray.map { |phrase| phrase.toString }
end
normalize(text) click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 9
# Passes text to the processor's normalize and returns the toString value of
# whatever it hands back.
def normalize(text)
  normalized = korean_processor.normalize(text)
  normalized.toString
end
stem(tokens) click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 28
def stem(tokens)
  # Deprecated: retained so that legacy code written against versions
  # earlier than 0.0.6 keeps working.
  # Simply delegates to tokens_to_token_list and returns its result.
  tokens_to_token_list(tokens)
end
tokenize(text) click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 13
# Tokenizes text with the processor and returns the processor's result
# unchanged (no conversion to Ruby values happens here).
def tokenize(text)
  korean_processor.tokenize(text)
end
tokens_to_string_list(tokens) click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 18
# Converts a token sequence into an Array of Ruby strings.
#
# The tokens go through the processor's tokensToJavaStringList; each element
# of the resulting list is then rendered with toString.
def tokens_to_string_list(tokens)
  java_strings = korean_processor.tokensToJavaStringList(tokens).toArray
  java_strings.map { |java_string| java_string.toString }
end
tokens_to_token_list(tokens) click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 23
# Converts a token sequence into an Array of parsed token descriptions.
#
# The tokens go through the processor's tokensToJavaKoreanTokenList; the
# toString form of each element is then fed to #parser, so every entry of the
# result is whatever #parser returns for that string.
def tokens_to_token_list(tokens)
  java_tokens = korean_processor.tokensToJavaKoreanTokenList(tokens).toArray
  java_tokens.map do |java_token|
    parser(java_token.toString)
  end
end

Private Instance Methods

korean_processor() click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 44
# Returns the imported OpenKoreanTextProcessorJava class, importing it through
# Rjb on first use and reusing the cached reference afterwards.
def korean_processor
  @korean_processor ||= begin
    java_class_name = 'org.openkoreantext.processor.OpenKoreanTextProcessorJava'
    Rjb.import(java_class_name)
  end
end
parser(text) click to toggle source
# File lib/twkorean/twitter_korean_text.rb, line 40
# Splits the string form of a token into its parts.
#
# The expected shape is "<text>(<Letters>[(<anything>)]: <digits>, <digits>)".
# On a match the result is
#   [whole match, text, letters, optional parenthesised group or nil,
#    first number, second number]
# with every captured value left as a String. When the pattern does not match
# an empty Array is returned.
#
# NOTE(review): the two numbers are presumably the token's offset and length —
# confirm against the Java token's toString format.
def parser(text)
  token_pattern = /(.*)\(([a-zA-Z]*)(\(.*\))?: ([0-9]+), ([0-9]+)\)/
  match_data = text.match(token_pattern)
  match_data.to_a
end