class Corenlp::Token

Constants

Enclitics
IGNORED_ENTITIES
NumberRegexp
PunctRegexp
STANFORD_TEXT_REPLACEMENTS

The character replacements that Stanford performs, which clean_stanford_text reverses (each original string is mapped to the replacement Stanford substitutes for it).

WebsiteRegexp
WordRegexp

Attributes

index[RW]
ner[RW]
penn_treebank_tag[RW]
stanford_lemma[RW]
text[RW]
type[RW]

Public Class Methods

clean_stanford_text(text) click to toggle source
# File lib/corenlp/token.rb, line 33
# Reverses the character replacements Stanford applied to +text+: every
# replacement listed in Token::STANFORD_TEXT_REPLACEMENTS is turned back into
# its original.
#
# An unfrozen +text+ is modified in place and returned, so callers relying on
# the in-place mutation keep working. A frozen +text+ cannot be modified, so a
# cleaned copy is returned instead of raising.
#
# @param text [String] token text as emitted by Stanford
# @return [String] the cleaned text
def self.clean_stanford_text(text)
  # gsub! needs a mutable receiver; copy only when we have to.
  text = text.dup if text.frozen?
  Token::STANFORD_TEXT_REPLACEMENTS.each_pair do |original, replacement|
    text.gsub!(replacement, original)
  end
  text
end
new(attrs = {}) click to toggle source
# File lib/corenlp/token.rb, line 5
# Builds a token from an attribute hash. Recognised keys are :index, :text,
# :penn_treebank_tag, :stanford_lemma, :type and :ner; any key that is absent
# leaves the matching attribute set to whatever +attrs+ returns for it (nil
# for a plain Hash).
#
# @param attrs [Hash] attribute values keyed by symbol
def initialize(attrs = {})
  @index, @text, @penn_treebank_tag, @stanford_lemma, @type, @ner =
    attrs.values_at(:index, :text, :penn_treebank_tag, :stanford_lemma, :type, :ner)
end
token_subclass_from_text(text) click to toggle source
# File lib/corenlp/token.rb, line 58
# Picks the token class that should represent +text+. Checks run in this
# order and the first hit wins:
#
# 1. Enclitic    - +text+ is listed in Enclitics
# 2. Word        - +text+ matches WordRegexp (and is not a bare '-') or
#                  matches WebsiteRegexp
# 3. Punctuation - +text+ matches PunctRegexp
# 4. Number      - +text+ matches NumberRegexp
# 5. Token       - anything else
#
# @param text [String] raw token text
# @return [Class] the class to instantiate for this text
def self.token_subclass_from_text(text)
  return Enclitic if Enclitics.include?(text)

  # A lone hyphen may satisfy WordRegexp, so it is explicitly excluded here
  # and left for the later checks.
  word_like = text =~ WordRegexp && text != '-'
  return Word if word_like || text =~ WebsiteRegexp
  return Punctuation if text =~ PunctRegexp
  return Number if text =~ NumberRegexp

  Token
end

Public Instance Methods

==(other) click to toggle source
# File lib/corenlp/token.rb, line 24
# Two tokens are equal when their index, Penn Treebank tag and type all
# match. Text, lemma and NER label take no part in the comparison.
#
# Any object that does not expose all three compared readers (nil, a String,
# ...) is simply not equal; such a comparison used to raise NoMethodError.
#
# @param other [Object] the object to compare against
# @return [Boolean]
def ==(other)
  compared = [:index, :penn_treebank_tag, :type]
  return false unless compared.all? { |reader| other.respond_to?(reader) }

  index == other.index &&
    penn_treebank_tag == other.penn_treebank_tag &&
    type == other.type
end
content?() click to toggle source
# File lib/corenlp/token.rb, line 16
# Whether this token is a Word or an Enclitic (instances of their subclasses
# count too).
#
# @return [Boolean]
def content?
  case self
  when Word, Enclitic then true
  else false
  end
end
ignored_entity?() click to toggle source
# File lib/corenlp/token.rb, line 54
# Whether this token's NER label is one of the labels in IGNORED_ENTITIES.
#
# @return [Boolean] result of IGNORED_ENTITIES.include? for the token's +ner+
def ignored_entity?
  entity_label = ner
  IGNORED_ENTITIES.include?(entity_label)
end
top_level_penn_treebank_category() click to toggle source
# File lib/corenlp/token.rb, line 20
# First character of the token's Penn Treebank tag (for example 'N' for
# 'NNP').
#
# A token built without a tag has +penn_treebank_tag+ set to nil; in that
# case nil is returned rather than raising NoMethodError.
#
# @return [String, nil] the leading tag character, or nil when there is no
#   tag (or the tag is empty)
def top_level_penn_treebank_category
  tag = penn_treebank_tag
  tag && tag[0]
end
website_text?() click to toggle source
# File lib/corenlp/token.rb, line 29
# Whether the token text contains a web URL scheme, either http:// or
# https://. Only http:// used to be recognised, so secure URLs were missed.
#
# @return [Integer, nil] offset of the first match (truthy), or nil when the
#   text contains no URL scheme or is nil
def website_text?
  text =~ %r{https?://}
end