class YDocx::Parser
Attributes
code[RW]
images[RW]
indecies[RW]
lang[RW]
result[RW]
space[RW]
Public Class Methods
new(doc, rel) { |self| ... }
click to toggle source
# File lib/ydocx/parser.rb, line 12
# Builds a parser from two XML strings: the document body (+doc+) and the
# relationships part (+rel+). Resets all collected state, calls the +init+
# hook, and finally yields the new parser when a block is given.
def initialize(doc, rel)
  # parsed XML inputs
  @doc = Nokogiri::XML.parse(doc)
  @rel = Nokogiri::XML.parse(rel)
  @coder = HTMLEntities.new

  # collected output
  @indecies = []
  @images = []
  @result = []

  # rendering defaults
  @space = ' '
  @image_path = 'images'
  @image_style = ''

  init
  yield self if block_given?
end
Public Instance Methods
chapters()
click to toggle source
Fachinfo Chapters
1. name
2. composition
3. galenic form
4. indications
5. usage
6. contra_indications
7. restrictions
8. interactions
9. pregnancy
10. driving_ability
11. unwanted_effects
12. overdose
13. effects
14. kinetic
15. preclinic
16. other_advice
17. iksnrs
18. packages
19. registration_owner
20. date
# File lib/ydocx/templates/fachinfo.rb, line 37
# Returns the heading patterns of the Fachinfo chapters for the current
# language as a Hash of chapter key (String) => Regexp.
#
# The French table is returned when @lang is 'fr' or :fr; any other value
# (including nil) selects the German table. The trailing "# n" comments give
# the chapter number; 'fabrication' and 'company' are unnumbered extras.
def chapters
  chapters = {
    :de => {
      'name'                => /^Name\s+des\s+Präparates$/u, # 1
      # "Hilf?sstoffe" accepts the correct spelling "Hilfsstoffe" as well as
      # the previously hard-coded misspelling "Hilsstoffe".
      'composition'         => /^Zusammensetzung|Wirkstoffe|Hilf?sstoffe/u, # 2
      'galenic_form'        => /^Galenische\s+Form\s*(und|\/)\s*Wirkstoffmenge\s+pro\s+Einheit$/iu, # 3
      'indications'         => /^Indikationen(\s+|\s*(\/|und)\s*)Anwendungsmöglichkeiten$/u, # 4
      'usage'               => /^Dosierung\s*(\/|und)\s*Anwendung/u, # 5
      'contra_indications'  => /^Kontraindikationen($|\s*\(\s*absolute\s+Kontraindikationen\s*\)$)/u, # 6
      'restrictions'        => /^Warnhinweise\s+und\s+Vorsichtsmassnahmen($|\s*\/\s*(relative\s+Kontraindikationen|Warnhinweise\s*und\s*Vorsichtsmassnahmen)$)/u, # 7
      'interactions'        => /^Interaktionen$/u, # 8
      'pregnancy'           => /^Schwangerschaft(,\s*|\s*\/\s*|\s+und\s+)Stillzeit$/u, # 9
      'driving_ability'     => /^Wirkung\s+auf\s+die\s+Fahrtüchtigkeit\s+und\s+auf\s+das\s+Bedienen\s+von\s+Maschinen$/u, # 10
      'unwanted_effects'    => /^Unerwünschte\s+Wirkungen$/u, # 11
      'overdose'            => /^Überdosierung$/u, # 12
      # "Pharmakodyn?amik" also matches the correct "Pharmakodynamik".
      'effects'             => /^Eigenschaften\s*\/\s*Wirkungen($|\s*\(\s*(ATC\-Code|Wirkungsmechanismus|Pharmakodyn?amik|Klinische\s+Wirksamkeit)\s*\)\s*$)/iu, # 13
      # "Metabolismu?s" also matches the correct "Metabolismus".
      'kinetic'             => /^Pharmakokinetik($|\s*\((Absorption,\s*Distribution,\s*Metabolismu?s,\s*Elimination\s|Kinetik\s+spezieller\s+Patientengruppen)*\)$)/iu, # 14
      'preclinic'           => /^Präklinische\s+Daten$/u, # 15
      'other_advice'        => /^Sonstige\s*Hinweise($|\s*\(\s*(Inkompatibilitäten|Beeinflussung\s*diagnostischer\s*Methoden|Haltbarkeit|Besondere\s*Lagerungshinweise|Hinweise\s+für\s+die\s+Handhabung)\s*\)$)|^Remarques/u, # 16
      'iksnrs'              => /^Zulassungsnummer(n|:|$|\s*\(\s*Swissmedic\s*\)$)/u, # 17
      'packages'            => /^Packungen($|\s*\(\s*mit\s+Angabe\s+der\s+Abgabekategorie\s*\)$)/u, # 18
      'registration_owner'  => /^Zulassungsinhaberin($|\s*\(\s*Firma\s+und\s+Sitz\s+gemäss\s*Handelsregisterauszug\s*\))/u, # 19
      'date'                => /^Stand\s+der\s+Information$/iu, # 20
      'fabrication'         => /^Herstellerin/u,
      'company'             => /^Vertriebsfirma/u,
    },
    :fr => {
      'name'                => /^Nom$/u, # 1
      'composition'         => /^Composition$/u, # 2
      'galenic_form'        => /^Forme\s+galénique\s+et\s+quantité\s+de\s+principe\s+actif\s+par\s+unité|^Forme\s*gal.nique/iu, # 3
      'indications'         => /^Indications/u, # 4
      # Was /^Posologiei/u: the case-insensitive flag had slipped inside the
      # pattern, so no "Posologie ..." heading could ever match.
      'usage'               => /^Posologie/iu, # 5
      'contra_indications'  => /^Contre\-indications/iu, # 6
      'restrictions'        => /^Mises/u, # 7
      'interactions'        => /^Interactions/u, # 8
      'pregnancy'           => /^Grossesse\s*\/\s*Allaitement/u, # 9
      # NOTE(review): the ';' after 'à' looks like entity residue - confirm
      # against real headings before changing it.
      'driving_ability'     => /^Effet\s+sur\s+l'aptitude\s+à;\s+la\s+conduite\s+et\s+l'utilisation\s+de\s+machines/u, # 10
      'unwanted_effects'    => /^Effets/u, # 11
      'overdose'            => /^Surdosage$/u, # 12
      'effects'             => /^Propriétés/iu, # 13
      'kinetic'             => /^Pharmacocinétique$/iu, # 14
      'preclinic'           => /^Données\s+précliniques$/u, # 15
      'other_advice'        => /^Remarques/u, # 16
      'iksnrs'              => /^Numéro\s+d'autorisation$/u, # 17
      'packages'            => /^Présentation/iu, # 18
      'registration_owner'  => /^Titulaire\s+de\s+l'autorisation$/u, # 19
      'date'                => /^Mise à jour/iu, # 20
      'fabrication'         => /^Fabricant$/u,
      'company'             => /^Distributeur/u,
    }
  }
  if @lang == 'fr' || @lang == :fr
    chapters[:fr]
  else
    chapters[:de]
  end
end
init()
click to toggle source
# File lib/ydocx/parser.rb, line 27
# Initialization hook invoked at the end of +initialize+.
# Does nothing here and returns nil.
def init
  nil
end
parse()
click to toggle source
# File lib/ydocx/parser.rb, line 29
# Walks the direct children of the document body and appends one parsed
# element per supported node to @result:
# 'p' and 'text' nodes go through parse_paragraph, 'tbl' nodes through
# parse_table; every other node is skipped.
# Returns @result.
def parse
  body_nodes = @doc.xpath('//w:document//w:body').children
  body_nodes.each do |node|
    case node.node_name
    when 'text', 'p'
      @result << parse_paragraph(node)
    when 'tbl'
      @result << parse_table(node)
    end
  end
  @result
end
Private Instance Methods
apply_align(rpr, text)
click to toggle source
# File lib/ydocx/parser.rb, line 64
# Applies vertical alignment (sub-/superscript) from the run properties +rpr+
# to +text+.
#
# * no w:vertAlign element       -> +text+ unchanged
# * 'subscript'                  -> markup(:sub, text)
# * 'superscript', text 1, 2, 3  -> the named entity "&sup1;" / "&sup2;" / "&sup3;"
# * 'superscript', anything else -> markup(:sup, text)
# * any other value              -> +text+ unchanged
def apply_align(rpr, text)
  vert_align = rpr.xpath('w:vertAlign')
  return text if vert_align.empty?

  # to_s keeps a missing attribute from raising; it then matches no branch.
  case vert_align.first['val'].to_s
  when 'subscript'
    markup(:sub, text)
  when 'superscript'
    # Only sup1, sup2 and sup3 are HTML named entities. Other digits used to
    # produce invalid references such as "&sup5;". \A/\z (instead of ^/$)
    # makes sure the whole text is that single digit.
    if text =~ /\A[1-3]\z/
      "&sup#{text};"
    else
      markup(:sup, text)
    end
  else
    text
  end
end
apply_fonts(rpr, text)
click to toggle source
# File lib/ydocx/parser.rb, line 45
# Translates text set in the "Symbol" font.
#
# When any w:rFonts element of +rpr+ carries the value 'Symbol', every code
# point of +text+ is passed (as lowercase hex) to character_replace and the
# replacements are joined into a new string. Otherwise +text+ is returned
# unchanged.
#
# character_replace returns nil for code points it does not know. Appending
# that nil used to raise a TypeError; such characters are now kept as they
# are instead.
def apply_fonts(rpr, text)
  symbol = rpr.xpath('w:rFonts').any? do |font|
    font.values.include?('Symbol')
  end
  return text unless symbol

  text.unpack('U*').each_with_object(''.dup) do |codepoint, replaced|
    replaced << (character_replace(codepoint.to_s(16)) || [codepoint].pack('U'))
  end
end
character_encode(text)
click to toggle source
# File lib/ydocx/parser.rb, line 79
# Tags +text+ as UTF-8 (in place, via force_encoding) and returns the result
# of encoding it with @coder using the :named instruction.
def character_encode(text)
  utf8_text = text.force_encoding('utf-8')
  # NOTE
  # :named only for escape at Builder
  @coder.encode(utf8_text, :named)
end
character_replace(code)
click to toggle source
# File lib/ydocx/parser.rb, line 86
# Maps a Symbol-font code point to its replacement text.
#
# +code+ is the code point as lowercase hex digits without prefix
# (e.g. 'f0b1'). Returns the replacement string, or nil when the code point
# is not in the table.
#
# Two entries used to be unreachable because their +when+ value duplicated an
# earlier one: "+" (0xf02b, shadowed by "*") and the down arrow (shadowed by
# the up arrow's 0xf0ad). "*" now uses 0xf02a and the down arrow 0xf0af,
# matching the decimal values noted beside them.
def character_replace(code)
  code = '0x' + code
  # NOTE
  # replace with rsemble html character ref
  # Symbol Font to HTML Character named ref
  case code
  when '0xf020' # '61472'
    ""
  when '0xf025' # '61477'
    "%"
  when '0xf02a' # '61482'
    "*"
  when '0xf02b' # '61483'
    "+"
  when '0xf02d' # '61485'
    "-"
  when '0xf02f' # '61487'
    "/"
  when '0xf03c' # '61500'
    "<"
  when '0xf03d' # '61501'
    "="
  when '0xf03e' # '61502'
    ">"
  when '0xf040' # '61504'
    "≅"
  when '0xf068' # '61544'
    "η"
  when '0xf071' # '61553'
    "θ"
  when '0xf06d' # '61549'
    "μ"
  when '0xf0a3' # '61603'
    "≤"
  when '0xf0ab' # '61611'
    "↔"
  when '0xf0ac' # '61612'
    "←"
  when '0xf0ad' # '61613'
    "↑"
  when '0xf0ae' # '61614'
    "→"
  when '0xf0af' # '61615'
    "↓"
  when '0xf0b1' # '61617'
    "±"
  when '0xf0b2' # '61618'
    "″"
  when '0xf0b3' # '61619'
    "≥"
  when '0xf0b4' # '61620'
    "×"
  when '0xf0b7' # '61623'
    "⋅"
  else
    # Unknown code point: callers receive nil.
    #p "code : " + ("&#%s;" % code)
    #p "hex  : " + code.hex.to_s
    #p "char : " + @coder.decode("&#%s;" % code.hex.to_s)
    nil
  end
end
escape_id(text)
click to toggle source
# File lib/ydocx/templates/fachinfo.rb, line 95
# Derives a URL-safe id from (entity-encoded) heading text.
#
# Steps, applied in this order:
# 1. "&Xuml;" becomes "Xe" (e.g. "&auml;" -> "ae")
# 2. apostrophes are removed
# 3. "&eacute;" and "&agrave;" both become "e"
# 4. slashes (with surrounding whitespace), whitespace runs and hyphens
#    become "_"
# 5. dots are removed
# 6. the result is lowercased and passed through CGI.escape
def escape_id(text)
  id = text.gsub(/&(.)uml;/, '\1e')
  id = id.gsub(/'/, '')
  id = id.gsub(/&(eacute|agrave);/, 'e')
  id = id.gsub(/\s*\/\s*|\s+|\/|\-/, '_')
  id = id.gsub(/\./, '')
  CGI.escape(id.downcase)
end
optional_escape(text)
click to toggle source
# File lib/ydocx/parser.rb, line 146
# Escape hook applied to run text in parse_text.
# This implementation hands +text+ back untouched.
def optional_escape(text)
  text
end
parse_block(node)
click to toggle source
# File lib/ydocx/parser.rb, line 149
# Block-element hook consulted first by parse_paragraph.
# Always returns nil here (no block element), so parse_paragraph falls
# through to its regular run handling.
def parse_block(node)
  # default no block element
  nil
end
parse_code(text)
click to toggle source
# File lib/ydocx/templates/fachinfo.rb, line 100
# Extracts the five-digit swissmedic number from +text+.
#
# Everything matching @@figure_pattern is removed first. If the remainder
# looks like "<5 digits> ... (Swissmedic)", the digits are stored in @code
# and returned; otherwise nil is returned and @code is left alone.
def parse_code(text)
  # swissmedic number
  stripped = text.gsub(@@figure_pattern, '')
  if stripped =~ /^\s*(\d{5})(.*|\s*)\s*\(\s*Swiss\s*medic\s*\)(\s*|.)$/iu
    # Keep the captured digits verbatim. The previous `"%5d" % $1` converted
    # the String with base-prefix rules, so a leading zero was read as octal
    # ("01234" became "  668") or raised ArgumentError ("09123").
    @code = Regexp.last_match(1)
  else
    nil
  end
end
parse_heading(text, id)
click to toggle source
# File lib/ydocx/templates/fachinfo.rb, line 108
# Wraps +text+ in a level-2 heading element whose id attribute is +id+.
def parse_heading(text, id)
  heading_attributes = {:id => id}
  markup(:h2, text, heading_attributes)
end
parse_image(r)
click to toggle source
# File lib/ydocx/parser.rb, line 152
# Resolves the image referenced by run +r+ and returns an :img element,
# or nil when the run holds no resolvable image.
#
# Three layouts are probed in order (old shape, drawing in anchor,
# stand-alone drawing); the first one yielding a relationship id wins.
# The id is then looked up in @rel, the image is recorded in @images and
# an :img markup with the local source path is returned.
def parse_image(r)
  ns = r.namespaces.merge(
    'xmlns:a' => 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'xmlns:pic' => 'http://schemas.openxmlformats.org/drawingml/2006/picture'
  )
  layouts = [
    { # old type shape
      :attr => 'r:id',
      :path => 'w:pict//v:shape//v:imagedata',
      :wrap => 'w:pict//v:shape//w10:wrap',
      :type => '',
    },
    { # in anchor
      :attr => 'r:embed',
      :path => 'w:drawing//wp:anchor//a:graphic//a:graphicData//pic:pic//pic:blipFill//a:blip',
      :wrap => 'w:drawing//wp:anchor//wp:wrapTight',
      :type => 'wrapText',
    },
    { # stand alone
      :attr => 'r:embed',
      :path => 'w:drawing//a:graphic//a:graphicData//pic:pic//pic:blipFill//a:blip',
      :wrap => 'w:drawing//wp:wrapTight',
      :type => 'wrapText',
    },
  ]

  id = nil
  layouts.each do |element|
    image = r.xpath(element[:path], ns)
    next if !image || image.empty?
    if r.xpath("#{element[:wrap]}", ns).first
      # TODO
      # wrap handling (currently all wrap off)
      # wrap[element[:type]] has "bothSides", "topAndBottom" and "wrapText"
      @image_style = 'display:block;'
    end
    id = image.first[element[:attr].to_s]
    break if id
  end
  return nil unless id

  # find the relationship entry carrying this id
  @rel.xpath('/').children.each do |rel|
    rel.children.each do |relation|
      next unless relation['Id'] == id && relation['Target']
      target = relation['Target']
      source = source_path(target)
      @images << {
        :origin => target,
        :source => source
      }
      attributes = {:src => source}
      attributes[:style] = @image_style unless @image_style.empty?
      return markup :img, [], attributes
    end
  end
  nil
end
parse_paragraph(node)
click to toggle source
# File lib/ydocx/parser.rb, line 217
# Converts a paragraph node into markup.
#
# If parse_block recognizes the node, its result is the only content.
# Otherwise each w:r run contributes, in order: parsed text (left-stripped
# for the first item), or - for runs without w:t - a separator for a tab
# (not at line head, not doubled), a replaced w:sym character, and an image.
#
# Returns {} when nothing was collected, the first item when the content
# contains a heading (tag h1-h9), and a :p element around the content
# otherwise.
def parse_paragraph(node)
  content = []
  block = parse_block(node)
  if block
    content << block
  else # as p
    pos = 0
    node.xpath('w:r').each do |run|
      if run.xpath('w:t').empty?
        # ignore tab at line head
        if !run.xpath('w:tab').empty? && content.last != @space && pos != 0
          content << @space
          pos += 1
        end
        symbols = run.xpath('w:sym')
        unless symbols.empty?
          code = symbols.first['w:char'].downcase # w:char
          content << character_replace(code)
          pos += 1
        end
        unless run.xpath('w:pict').empty? && run.xpath('w:drawing').empty?
          content << parse_image(run)
        end
      else
        content << parse_text(run, (pos == 0)) # rm indent
        pos += 1
      end
    end
  end
  content.compact!
  return {} if content.empty?

  has_heading = content.any? do |c|
    c.is_a?(Hash) && c[:tag].to_s =~ /^h[1-9]/u
  end
  if has_heading
    content.first
  else
    markup :p, content
  end
end
parse_table(node)
click to toggle source
# File lib/ydocx/parser.rb, line 259
# Converts a table node into nested :table / :tr / :td markup.
# A cell receives a :colspan attribute when its w:tcPr holds a w:gridSpan;
# every w:p inside a cell is parsed with parse_paragraph.
def parse_table(node)
  table = markup :table
  node.xpath('w:tr').each do |row_node|
    row = markup :tr
    row_node.xpath('w:tc').each do |cell_node|
      attributes = {}
      cell_node.xpath('w:tcPr').each do |properties|
        span = properties.xpath('w:gridSpan')
        next if !span || span.empty?
        attributes[:colspan] = span.first['val'] # w:val
      end
      cell = markup :td, [], attributes
      cell_node.xpath('w:p').each do |paragraph|
        cell[:content] << parse_paragraph(paragraph)
      end
      row[:content] << cell
    end
    table[:content] << row
  end
  table
end
parse_text(r, lstrip=false)
click to toggle source
# File lib/ydocx/parser.rb, line 280
# Builds the markup for the text of run +r+.
#
# The w:t texts are joined, entity-encoded, passed through optional_escape
# and left-stripped when +lstrip+ is true. Run properties (w:rPr) are then
# applied in this order: Symbol font replacement, vertical alignment,
# underline (span), italic (em), bold (strong).
def parse_text(r, lstrip=false)
  raw = r.xpath('w:t').map(&:text).join('')
  text = optional_escape(character_encode(raw))
  text = text.lstrip if lstrip

  rpr = r.xpath('w:rPr')
  if rpr
    text = apply_fonts(rpr, text)
    text = apply_align(rpr, text)
    if !rpr.xpath('w:u').empty?
      text = markup(:span, text, {:style => "text-decoration:underline;"})
    end
    if !rpr.xpath('w:i').empty?
      text = markup(:em, text)
    end
    if !rpr.xpath('w:b').empty?
      text = markup(:strong, text)
    end
  end
  text
end
parse_title(node, text)
click to toggle source
# File lib/ydocx/templates/fachinfo.rb, line 111
# Treats the first non-empty line of the document as the package name.
#
# Returns an :h1 element for +text+ and records a title entry in @indecies
# when no index exists yet, +text+ is not empty, +node+ has a previous
# sibling and either the parent has no previous sibling or the previous
# sibling's text is blank. Returns nil in every other case.
#
# The title id is 'titre' for French and 'titel' otherwise. @lang is
# compared via to_s so that both 'fr' and :fr select French, in line with
# how +chapters+ chooses its language table.
def parse_title(node, text)
  first_line = @indecies.empty? && !text.empty? && node.previous &&
    (node.parent.previous.nil? || node.previous.inner_text.strip.empty?)
  return nil unless first_line

  # The first line as package name
  title = (@lang.to_s == 'fr' ? 'Titre' : 'Titel')
  @indecies << {:text => title, :id => title.downcase}
  markup(:h1, text, {:id => title.downcase})
end
source_path(target)
click to toggle source
# File lib/ydocx/parser.rb, line 208
# Returns the path below @image_path under which the image +target+ is
# referenced. When Magick::Image is defined and +target+ has a .wmf
# extension, the extension is swapped for .png; otherwise the base name is
# used as is.
def source_path(target)
  source = @image_path + '/'
  wmf_ext = defined?(Magick::Image) && File.extname(target).match(/\.wmf$/).to_a[0]
  file_name =
    if wmf_ext
      File.basename(target, wmf_ext) + '.png'
    else
      File.basename(target)
    end
  source << file_name
end