class BioInterchange::TextMining::PDFxXMLReader::MyListener
Public Class Methods
new()
click to toggle source
# File lib/biointerchange/textmining/pdfx_xml_reader.rb, line 56
#
# Creates the listener's bookkeeping hash (@map).
#
# Two arrays are set up from the start and used as parallel stacks,
# because sections can nest:
#   'sec_s' - one entry per currently open section (pushed in tag_start)
#   'sec_l' - running length of each currently open section
def initialize
  @map = { 'sec_s' => [], 'sec_l' => [] }
end
Public Instance Methods
document()
click to toggle source
# File lib/biointerchange/textmining/pdfx_xml_reader.rb, line 158
#
# Returns the document object stored in @doc.
#
# @doc is assigned in #text while a <job> element is open, so this is
# nil until that has happened.
def document
  @doc
end
tag_end(name)
click to toggle source
TODO: add handling for <author>-type tags.
# File lib/biointerchange/textmining/pdfx_xml_reader.rb, line 121
#
# Handles a closing tag.
#
# name - the tag name as reported by the parser.
#
# Behaviour per tag:
#   job            - clears the 'id' flag and marks the job id as seen.
#   article-title,
#   abstract, body,
#   article        - clears the tag's "open" flag, adds a Content built from
#                    the recorded start ('*_s') and length ('*_l') to @doc,
#                    then sets the tag's '*_done' flag.
#   section        - pops the innermost start/length pair off the section
#                    stacks and adds it to @doc as a SECTION Content.
# Any other tag is ignored.
#
# Note that <body> is stored with the SECTION content type, the same type
# used for <section>.
#
# Raises a RuntimeError when the two section stacks differ in size.
def tag_end(name)
  # Builds a Content for the given span, attaches it to the document and
  # returns whatever @doc.add returns.
  publish = lambda do |offset, length, kind|
    content = BioInterchange::TextMining::Content.new(offset, length, kind, @process)
    content.setContext(@doc)
    @doc.add(content)
  end

  case name
  when /^job$/
    @map['id'] = false
    @map['id_done'] = true
  when /^article-title$/
    @map['title'] = false
    publish.call(@map['title_s'], @map['title_l'], BioInterchange::TextMining::Content::TITLE)
    @map['title_done'] = true
  when /^abstract$/
    @map['abs'] = false
    publish.call(@map['abs_s'], @map['abs_l'], BioInterchange::TextMining::Content::ABSTRACT)
    @map['abs_done'] = true
  when /^body$/
    @map['body'] = false
    publish.call(@map['body_s'], @map['body_l'], BioInterchange::TextMining::Content::SECTION)
    @map['body_done'] = true
  when /^article$/
    @map['art'] = false
    publish.call(@map['art_s'], @map['art_l'], BioInterchange::TextMining::Content::DOCUMENT)
    @map['art_done'] = true
  when /^section$/
    if @map['sec_s'].size != @map['sec_l'].size
      raise 'Error with section stack, stacks not equal in size'
    end
    publish.call(@map['sec_s'].pop, @map['sec_l'].pop, BioInterchange::TextMining::Content::SECTION)
  end
end
tag_start(name, attr)
click to toggle source
# File lib/biointerchange/textmining/pdfx_xml_reader.rb, line 64
#
# Handles an opening tag.
#
# name - the tag name as reported by the parser.
# attr - the tag's attributes; not used by this method.
#
# Behaviour per tag:
#   job            - sets the 'id' flag.
#   article-title,
#   abstract, body - set the tag's "open" flag, record the current article
#                    length ('art_l') as the start offset and reset the
#                    tag's length counter to 0.
#   article        - sets the 'art' flag and resets article start and
#                    length to 0.
#   section        - pushes the current article length and a zero length
#                    onto the section stacks (sections may nest).
# Any other tag is ignored.
#
# Raises BioInterchange::Exceptions::InputFormatError when one of the
# single-occurrence tags has already been closed once, or when the two
# section stacks differ in size.
def tag_start(name, attr)
  case name
  when /^job$/
    if @map['id_done']
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <job> tags, cannot parse multiple documents within a single file.'
    end
    @map['id'] = true
  when /^article-title$/
    if @map['title_done']
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <article-title> tags defined, cannot parse multiple documents within a single file.'
    end
    @map['title'] = true
    @map['title_s'] = @map['art_l']
    @map['title_l'] = 0
  when /^abstract$/
    if @map['abs_done']
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <abstract> tags defined, cannot parse multiple documents within a single file.'
    end
    @map['abs'] = true
    @map['abs_s'] = @map['art_l']
    @map['abs_l'] = 0
  when /^body$/
    if @map['body_done']
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <body> tags defined, cannot parse multiple documents within a single file.'
    end
    @map['body'] = true
    @map['body_s'] = @map['art_l']
    @map['body_l'] = 0
  when /^article$/
    if @map['art_done']
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <article> tags defined, cannot parse multiple documents within a single file.'
    end
    @map['art'] = true
    @map['art_s'] = 0
    @map['art_l'] = 0
  when /^section$/
    if @map['sec_s'].size != @map['sec_l'].size
      raise BioInterchange::Exceptions::InputFormatError, 'Error with section stack, stacks not equal in size: Possibly not a well formed XML input file. Check <section> tags all match up and do not overlap (nesting is fine).'
    end
    @map['sec_s'].push @map['art_l']
    @map['sec_l'].push 0
  end
end
text(data)
click to toggle source
# File lib/biointerchange/textmining/pdfx_xml_reader.rb, line 96
#
# Handles a run of character data.
#
# data - the text; presumably the string passed by the XML stream parser
#        (confirm against the caller).
#
# Adds data.length to the length counter of every region that is currently
# open: the article, the body, and each open section. Of job id, title and
# abstract only the first open one is handled:
#   - inside <job>, the text is appended to the PDFX base URI and a new
#     Document is created from it and stored in @doc;
#   - otherwise the title counter is increased if a title is open;
#   - otherwise the abstract counter is increased if an abstract is open.
#
# Returns the number of open sections, or nil when no section is open.
def text(data)
  @map['art_l'] += data.length if @map['art']

  if @map['id']
    @doc = BioInterchange::TextMining::Document.new("http://pdfx.cs.man.ac.uk/" + data)
  elsif @map['title']
    @map['title_l'] += data.length
  elsif @map['abs']
    @map['abs_l'] += data.length
  end

  @map['body_l'] += data.length if @map['body']

  # Nested sections all contain this text, so every open section grows.
  open_lengths = @map['sec_l']
  open_lengths.map! { |so_far| so_far + data.length }.size unless open_lengths.empty?
end