class CorpusProcessor::Parsers::Lampada
The parser for the corpus in LâMPADA format.
Attributes
categories[RW]
current_category[RW]
Public Class Methods
new(categories = CorpusProcessor::Categories.default)
click to toggle source
@param (see Generators::StanfordNer#initialize)
# File lib/corpus-processor/parsers/lampada.rb, line 5
# Build a parser from a categories definition.
#
# Only the +:input+ entry of the definition is kept; it is stored through
# the +categories+ writer. +fetch+ is called without a fallback, so a
# definition lacking +:input+ is rejected by +fetch+ itself.
#
# @param categories the categories definition; defaults to
#   CorpusProcessor::Categories.default.
def initialize(categories = CorpusProcessor::Categories.default)
  input_categories = categories.fetch(:input)
  self.categories = input_categories
end
Public Instance Methods
parse(corpus)
click to toggle source
Parse the corpus in LâMPADA format.
@param corpus [String] the original corpus.
@return [Array<CorpusProcessor::Token>] the tokens extracted from corpus.
# File lib/corpus-processor/parsers/lampada.rb, line 13
# Parse the corpus in LâMPADA format.
#
# The corpus is loaded as an XML document and every node matched by the
# CSS selector 'P' is handed to +process_nodes+.
#
# @param corpus [String] the original corpus.
# @return [Array<CorpusProcessor::Token>] the tokens extracted from corpus.
def parse(corpus)
  document = Nokogiri::XML(corpus)
  paragraphs = document.css('P')
  process_nodes(paragraphs)
end
Protected Instance Methods
extract(categories_string)
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 89
# Resolve a '|'-separated list of category names to a configured category.
#
# The list is scanned left to right and the first name that +categories+
# includes is used as the lookup key. When no name is known the lookup is
# done with +nil+ as the key.
#
# @param categories_string [String] category names joined by '|'.
# @return the value +categories+ holds for the first known name.
def extract(categories_string)
  known_name = categories_string.split('|').find do |candidate|
    categories.include?(candidate)
  end
  categories[known_name]
end
period_token()
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 105
# Token for a single '.', created on first use and cached on the parser.
#
# Every call returns the same cached object.
#
# @return [CorpusProcessor::Token] the cached period token.
def period_token
  return @period_token if @period_token

  @period_token = CorpusProcessor::Token.new('.')
end
process_alt(alt)
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 65
# Process an ALT element, which holds alternatives separated by '|'.
#
# Each alternative is wrapped in a throw-away <document> root so it can be
# parsed as XML on its own, then tokenized with +process_nodes+. The
# alternative whose tokens include the most categorized ones (non-nil
# +category+) wins; on a tie +max_by+ keeps the earliest alternative.
#
# @param alt the ALT element (responds to +inner_html+).
# @return the tokens of the winning alternative, or nil when +max_by+ has
#   nothing to choose from.
def process_alt(alt)
  markup = alt.inner_html.encode('UTF-8')
  # All alternatives are parsed before any of them is tokenized.
  documents = markup.split('|').map do |piece|
    Nokogiri::XML("<document>#{ piece }</document>")
  end
  candidates = documents.map { |document| process_nodes(document.children) }
  candidates.max_by do |tokens|
    tokens.count { |token| !token.category.nil? }
  end
end
process_element(element)
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 46
# Dispatch an element to the handler for its tag name.
#
# 'P', 'EM' and 'ALT' have dedicated handlers; any other element is treated
# as a transparent wrapper and only its children are processed.
#
# @param element the element to process (responds to +name+ and
#   +children+).
# @return the tokens produced by the selected handler.
def process_element(element)
  tag = element.name
  if tag == 'P'
    process_p(element)
  elsif tag == 'EM'
    process_em(element)
  elsif tag == 'ALT'
    process_alt(element)
  else
    process_nodes(element.children)
  end
end
process_em(em)
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 61
# Process an EM element: its children are tokenized while the category
# taken from the element's CATEG attribute is in effect.
#
# The argument is parenthesized on purpose. Written without parentheses,
# as `with_category em.attributes['CATEG'] { ... }`, the braces attach to
# the `[]` call (which ignores the block), so +with_category+ would be
# invoked without a block.
#
# @param em the EM element (responds to +attributes+ and +children+).
# @return the tokens produced by +process_nodes+ for the element's children.
def process_em(em)
  with_category(em.attributes['CATEG']) { process_nodes(em.children) }
end
process_node(node)
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 26
# Turn a single XML node into tokens.
#
# Text nodes are tokenized from their text; element nodes are dispatched by
# tag through +process_element+. The text check comes first, matching the
# order in which the node kinds are tested.
#
# @param node the node to process.
# @return the tokens produced for the node.
# @raise [ArgumentError] when the node is neither a Nokogiri::XML::Text nor
#   a Nokogiri::XML::Element.
def process_node(node)
  return process_text(node.text) if node.is_a?(Nokogiri::XML::Text)
  return process_element(node) if node.is_a?(Nokogiri::XML::Element)

  raise ArgumentError,
        "#{ node } cannot be handled by #{ self.class }. " \
        "This is probably a bug, please report."
end
process_nodes(nodes)
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 22
# Process a collection of nodes and gather their tokens into one flat list.
#
# Each node's result is splatted into the accumulator, so a nil result
# contributes nothing.
#
# @param nodes an enumerable of nodes.
# @return [Array] the tokens of all nodes, in node order.
def process_nodes(nodes)
  nodes.each_with_object([]) do |node, collected|
    collected.push(*process_node(node))
  end
end
process_p(p)
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 55
# Process a P element, making sure its tokens end with punctuation.
#
# The cached period token is appended when the element produced at least
# one token and the last token's word contains no punctuation character.
# An element that produced no tokens is returned empty.
#
# @param p the P element (responds to +children+).
# @return [Array] the element's tokens, possibly with a period appended.
def process_p(p)
  tokens = process_nodes(p.children)
  already_closed = tokens.empty? || tokens.last.word =~ punct
  tokens.push(period_token) unless already_closed
  tokens
end
process_text(text)
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 37
# Tokenize a run of text.
#
# Every punctuation character is first padded with a space on each side so
# it becomes a word of its own; the text is then stripped and split on
# whitespace. Each word becomes a token tagged with the category that is
# current at the time of the call.
#
# @param text [String] the text to tokenize.
# @return [Array<CorpusProcessor::Token>] one token per word.
def process_text(text)
  # ' \0 ' re-inserts the matched punctuation character between two spaces.
  padded = text.gsub(punct, ' \0 ')
  words = padded.strip.split(spaces)
  words.map { |word| CorpusProcessor::Token.new(word, current_category) }
end
punct()
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 97
# Pattern matching a single punctuation character (POSIX bracket class
# +[:punct:]+).
#
# @return [Regexp] the punctuation pattern.
def punct
  %r{[[:punct:]]}
end
spaces()
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 101
# Pattern matching a run of one or more whitespace characters.
#
# @return [Regexp] the whitespace pattern.
def spaces
  %r{\s+}
end
with_category(categories_attribute, &block)
click to toggle source
# File lib/corpus-processor/parsers/lampada.rb, line 80
# Run a block with the current category switched to the one named by an
# attribute, then put the previous category back.
#
# When +categories_attribute+ is nil the current category is left as it is
# for the duration of the block. The category that was current before the
# call is restored afterwards (rather than being cleared to nil), so a
# nested call does not wipe the category of the enclosing one. Restoration
# happens in +ensure+, so an exception raised by +extract+ or by the block
# cannot leave a stale category on the parser.
#
# @param categories_attribute the attribute holding the '|'-separated
#   category names (responds to +text+), or nil.
# @param block the work to run while the category is in effect.
# @return whatever the block returns.
def with_category(categories_attribute, &block)
  previous_category = current_category
  unless categories_attribute.nil?
    self.current_category = extract(categories_attribute.text)
  end
  block.call
ensure
  self.current_category = previous_category
end