class Corenlp::Token
Constants
- Enclitics
- IGNORED_ENTITIES
- NumberRegexp
- PunctRegexp
- STANFORD_TEXT_REPLACEMENTS
The character replacements that Stanford performs, which we reverse.
- WebsiteRegexp
- WordRegexp
Attributes
index[RW]
ner[RW]
penn_treebank_tag[RW]
stanford_lemma[RW]
text[RW]
type[RW]
Public Class Methods
clean_stanford_text(text)
click to toggle source
# File lib/corenlp/token.rb, line 33
#
# Reverses the character replacements that Stanford performs: every
# occurrence of a replacement string in +text+ is turned back into its
# original form, using the (original, replacement) pairs of
# Token::STANFORD_TEXT_REPLACEMENTS in iteration order.
#
# An unfrozen string is rewritten in place and that same object is returned,
# so callers relying on the mutation keep working. A frozen string cannot be
# mutated, so a cleaned copy is returned instead. +nil+ is passed through.
#
# text - the String to clean (may be nil).
#
# Returns the cleaned String, or nil when +text+ is nil.
def self.clean_stanford_text(text)
  return text if text.nil?

  # gsub! raises on frozen receivers (e.g. string literals under
  # `frozen_string_literal: true`), so switch to a private copy for those.
  text = text.dup if text.frozen?
  Token::STANFORD_TEXT_REPLACEMENTS.each_pair do |original, replacement|
    text.gsub!(replacement, original)
  end
  text
end
new(attrs = {})
click to toggle source
# File lib/corenlp/token.rb, line 5
#
# Builds a token from a hash of attributes.
#
# attrs - an object indexed with the keys :index, :text, :penn_treebank_tag,
#         :stanford_lemma, :type and :ner (defaults to an empty Hash). The
#         value returned for each key is stored in the instance variable of
#         the same name, so a key missing from a plain Hash leaves that
#         attribute nil.
def initialize(attrs = {})
  %i[index text penn_treebank_tag stanford_lemma type ner].each do |field|
    instance_variable_set("@#{field}", attrs[field])
  end
end
token_subclass_from_text(text)
click to toggle source
# File lib/corenlp/token.rb, line 58
#
# Picks the token class that fits the given text. The checks run in this
# order and the first one that succeeds decides the result:
#
# 1. Enclitic    - text is included in Enclitics
# 2. Word        - text matches WordRegexp (a lone '-' is excluded), or
#                  text matches WebsiteRegexp
# 3. Punctuation - text matches PunctRegexp
# 4. Number      - text matches NumberRegexp
# 5. Token       - fallback when none of the above matched
#
# text - the token text to classify.
#
# Returns the selected class itself, not an instance of it.
def self.token_subclass_from_text(text)
  return Enclitic if Enclitics.include?(text)

  wordlike = (text =~ WordRegexp) && text != '-'
  return Word if wordlike || text =~ WebsiteRegexp
  return Punctuation if text =~ PunctRegexp
  return Number if text =~ NumberRegexp

  Token
end
Public Instance Methods
==(other)
click to toggle source
# File lib/corenlp/token.rb, line 24
#
# Two tokens are considered equal when their index, Penn Treebank tag and
# type all match. The text, lemma and NER label take no part in the
# comparison.
#
# other - the object to compare against. An object that does not respond to
#         +index+, +penn_treebank_tag+ and +type+ (nil, an Integer, ...) is
#         reported as unequal instead of raising NoMethodError.
#
# Returns false for such objects, otherwise the result of comparing the
# three attributes.
def ==(other)
  comparable = %i[index penn_treebank_tag type].all? { |attr| other.respond_to?(attr) }
  return false unless comparable

  index == other.index &&
    penn_treebank_tag == other.penn_treebank_tag &&
    type == other.type
end
content?()
click to toggle source
# File lib/corenlp/token.rb, line 16
#
# Tells whether this token is a Word or an Enclitic (instances of their
# subclasses count as well).
#
# Returns true or false.
def content?
  return true if is_a?(Word)

  is_a?(Enclitic)
end
ignored_entity?()
click to toggle source
# File lib/corenlp/token.rb, line 54
#
# Tells whether this token's NER value is one of the entries of
# IGNORED_ENTITIES.
#
# Returns the result of IGNORED_ENTITIES.include? for the +ner+ attribute.
def ignored_entity?
  entity = ner
  IGNORED_ENTITIES.include?(entity)
end
top_level_penn_treebank_category()
click to toggle source
# File lib/corenlp/token.rb, line 20
#
# Returns the first element of the Penn Treebank tag, i.e. its first
# character when the tag is a String ("NNP" gives "N").
#
# A token built without a tag has +penn_treebank_tag+ set to nil; nil is
# returned in that case instead of raising NoMethodError. An empty tag also
# yields nil.
def top_level_penn_treebank_category
  tag = penn_treebank_tag
  tag && tag[0]
end
website_text?()
click to toggle source
# File lib/corenlp/token.rb, line 29
#
# Tells whether the token text contains a web address scheme. Both
# "http://" and "https://" are recognised.
#
# Returns the Integer offset of the first match (truthy) or nil when the text
# has no match or is nil.
def website_text?
  text =~ %r{https?://}
end