class CorpusProcessor::Parsers::Lampada

The parser for the corpus in LâMPADA format.

Attributes

categories[RW]
current_category[RW]

Public Class Methods

new(categories = CorpusProcessor::Categories.default) click to toggle source

@param (see Generators::StanfordNer#initialize)

# File lib/corpus-processor/parsers/lampada.rb, line 5
# Build a parser that recognizes the input categories of the given
# category definitions.
#
# @param categories [#fetch] category definitions; only the value stored
#   under +:input+ is kept. Defaults to
#   +CorpusProcessor::Categories.default+.
def initialize(categories = CorpusProcessor::Categories.default)
  input_categories = categories.fetch(:input)
  self.categories  = input_categories
end

Public Instance Methods

parse(corpus) click to toggle source

Parse the corpus in LâMPADA format.

@param corpus [String] the original corpus.
@return [Array<CorpusProcessor::Token>] the tokens extracted from corpus.

# File lib/corpus-processor/parsers/lampada.rb, line 13
# Parse a corpus and turn every +P+ element found in it into tokens.
#
# @param corpus [String] the original corpus, as XML markup.
# @return [Array<CorpusProcessor::Token>] the tokens extracted from corpus.
def parse(corpus)
  document   = Nokogiri::XML(corpus)
  paragraphs = document.css('P')
  process_nodes(paragraphs)
end

Protected Instance Methods

extract(categories_string) click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 89
# Pick the first known category out of a "|"-separated list of names.
#
# @param categories_string [String] category names joined by "|".
# @return [Object, nil] the value +categories+ maps the first recognized
#   name to; when no name is recognized the result is +categories[nil]+.
def extract(categories_string)
  candidates = categories_string.split('|')
  recognized = candidates.find do |candidate|
    categories.include?(candidate)
  end

  categories[recognized]
end
period_token() click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 105
# The "." token appended to paragraphs lacking final punctuation.
# It is built on first use and the same instance is returned afterwards.
#
# @return [CorpusProcessor::Token] the memoized period token.
def period_token
  @period_token = CorpusProcessor::Token.new('.') unless @period_token
  @period_token
end
process_alt(alt) click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 65
# Process an ALT element. Its inner markup lists alternative annotations
# separated by "|"; every alternative is tokenized and the one with the
# most categorized tokens is kept (the first one wins a tie).
#
# @param alt [#inner_html] the ALT element.
# @return [Array<CorpusProcessor::Token>] tokens of the chosen alternative,
#   or an empty array when the element holds no alternatives.
def process_alt(alt)
  markup = alt.inner_html.encode('UTF-8')

  # Only pipes outside tags separate alternatives. A pipe that is followed
  # by ">" with no "<" in between sits inside a tag, e.g. CATEG="A|B" (the
  # list syntax #extract splits on), and must not cut the markup in half.
  # NOTE(review): assumes the serialized markup escapes a literal ">" in
  # text content - confirm against the serializer in use.
  alternatives = markup.split(/\|(?![^<>]*>)/)

  alternatives_tokens = alternatives.map { |alternative|
    fake_xml = Nokogiri::XML("<document>#{ alternative }</document>")
    process_nodes(fake_xml.children)
  }

  best = alternatives_tokens.max_by { |alternative_tokens|
    alternative_tokens.count { |alternative_token|
      !alternative_token.category.nil?
    }
  }

  # max_by yields nil for an ALT without content; keep the return an Array.
  best || []
end
process_element(element) click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 46
# Dispatch an element to the handler for its tag name. +P+, +EM+ and +ALT+
# have dedicated handlers; any other element is treated as transparent and
# only its children are processed.
#
# @param element [#name, #children] the element to process.
# @return [Array<CorpusProcessor::Token>] whatever the chosen handler returns.
def process_element(element)
  handlers = {
    'P'   => :process_p,
    'EM'  => :process_em,
    'ALT' => :process_alt
  }
  handler = handlers[element.name]
  return process_nodes(element.children) unless handler

  send(handler, element)
end
process_em(em) click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 61
# Process an EM element: its children are tokenized while the category
# named by its +CATEG+ attribute is active.
#
# The argument is parenthesised on purpose. Without parentheses the brace
# block attaches to the nearest call (the +[]+ lookup) rather than to
# +with_category+, which would then be invoked without a block.
#
# @param em [#attributes, #children] the EM element.
# @return [Array<CorpusProcessor::Token>] tokens built from the children.
def process_em(em)
  with_category(em.attributes['CATEG']) { process_nodes(em.children) }
end
process_node(node) click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 26
# Turn a single XML node into tokens: text nodes are tokenized directly,
# elements are dispatched by tag name.
#
# @param node [Nokogiri::XML::Text, Nokogiri::XML::Element] the node.
# @return [Array<CorpusProcessor::Token>] the tokens found in the node.
# @raise [ArgumentError] when the node is neither text nor an element.
def process_node(node)
  return process_text(node.text) if node.is_a?(Nokogiri::XML::Text)
  return process_element(node)   if node.is_a?(Nokogiri::XML::Element)

  raise ArgumentError, "#{ node } cannot be handled by #{ self.class }. " \
                       'This is probably a bug, please report.'
end
process_nodes(nodes) click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 22
# Process every node in order and gather the resulting tokens in one flat
# array.
#
# @param nodes [Enumerable] the nodes to process.
# @return [Array<CorpusProcessor::Token>] all tokens, in document order.
def process_nodes(nodes)
  nodes.each_with_object([]) do |node, collected|
    # Splatting keeps a nil result (no tokens) from adding an element.
    collected.push(*process_node(node))
  end
end
process_p(p) click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 55
# Process a P element and make sure a non-empty paragraph ends with
# punctuation, appending the period token when the last word has none.
#
# @param p [#children] the P element.
# @return [Array<CorpusProcessor::Token>] the paragraph tokens.
def process_p(p)
  tokens = process_nodes(p.children)
  return tokens if tokens.empty?

  tokens << period_token unless tokens.last.word =~ punct
  tokens
end
process_text(text) click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 37
# Split raw text into tokens. Each punctuation character becomes a token of
# its own, and every token is tagged with the category currently in effect.
#
# @param text [String] the text content of a node.
# @return [Array<CorpusProcessor::Token>] one token per word or punctuation
#   character; empty when the text is blank.
def process_text(text)
  # Surround punctuation with spaces so it is split off neighbouring words.
  spaced = text.gsub(punct, ' \0 ')
  words  = spaced.strip.split(spaces)

  words.map do |word|
    CorpusProcessor::Token.new(word, current_category)
  end
end
punct() click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 97
# Pattern matching a single punctuation character (POSIX +[[:punct:]]+).
#
# @return [Regexp]
def punct
  %r{[[:punct:]]}
end
spaces() click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 101
# Pattern matching a run of one or more whitespace characters.
#
# @return [Regexp]
def spaces
  %r{\s+}
end
with_category(categories_attribute, &block) click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 80
# Run the block with +current_category+ set from a CATEG attribute, then
# put back the category that was active before the call.
#
# The previous category is restored rather than cleared, so text following
# a nested element keeps the enclosing category. The restore sits in
# +ensure+ so an exception raised by the block cannot leave a stale
# category behind.
#
# @param categories_attribute [#text, nil] the attribute holding the
#   "|"-separated category names; when nil the current category is left
#   untouched for the duration of the block.
# @yield the processing to perform under the category.
# @return [Object] the value returned by the block.
def with_category(categories_attribute, &block)
  previous_category = current_category
  unless categories_attribute.nil?
    self.current_category = extract(categories_attribute.text)
  end
  block.call
ensure
  self.current_category = previous_category
end