class YDocx::Parser

Attributes

code[RW]
images[RW]
indecies[RW]
lang[RW]
result[RW]
space[RW]

Public Class Methods

new(doc, rel) { |self| ... } click to toggle source
# File lib/ydocx/parser.rb, line 12
# Creates a parser from two XML strings.
#
# doc - XML source parsed into @doc (queried for w:document/w:body in #parse)
# rel - XML source parsed into @rel (searched for Id/Target pairs in #parse_image)
#
# After the default state is set, #init is called and, when a block is
# given, the new parser is yielded to it.
def initialize(doc, rel)
  @doc   = Nokogiri::XML.parse(doc)
  @rel   = Nokogiri::XML.parse(rel)
  @coder = HTMLEntities.new
  @indecies, @images, @result = [], [], []
  @space       = ' '
  @image_path  = 'images'
  @image_style = ''
  init
  yield self if block_given?
end

Public Instance Methods

chapters() click to toggle source

Fachinfo Chapters

1. name
2. composition
3. galenic_form
4. indications
5. usage
6. contra_indications
7. restrictions
8. interactions
9. pregnancy
10. driving_ability
11. unwanted_effects
12. overdose
13. effects
14. kinetic
15. preclinic
16. other_advice
17. iksnrs
18. packages
19. registration_owner
20. date

# File lib/ydocx/templates/fachinfo.rb, line 37
# Returns the heading patterns for the current language.
#
# The result is a Hash mapping a chapter key (String) to the Regexp that
# recognises that chapter's heading. The French table is returned when
# @lang is 'fr' or :fr; the German table is returned otherwise.
#
# Corrections relative to the previous patterns (all are supersets of what
# matched before, so nothing that used to match stops matching):
# * fr 'usage': the "i" flag had been typed inside the pattern
#   (/^Posologiei/), so "Posologie/..." headings never matched.
# * fr 'driving_ability': a stray ";" after "à" is now optional.
# * de 'composition', 'effects', 'kinetic': the correctly spelled words
#   "Hilfsstoffe", "Pharmakodynamik" and "Metabolismus" are accepted
#   (the old misspellings still match).
# * de 'kinetic': whitespace after "Elimination" is optional, so a heading
#   ending in "Elimination)" matches.
#
# NOTE(review): in de 'composition' only the first alternative is anchored
# with ^; "Wirkstoffe"/"Hilfsstoffe" match anywhere in the text. Left as-is;
# confirm whether that is intended.
def chapters
  chapters = {
    :de => {
      'name'                 => /^Name\s+des\s+Präparates$/u, # 1
      'composition'          => /^Zusammensetzung|Wirkstoffe|Hilf?sstoffe/u, # 2
      'galenic_form'         => /^Galenische\s+Form\s*(und|\/)\s*Wirkstoffmenge\s+pro\s+Einheit$/iu, # 3
      'indications'          => /^Indikationen(\s+|\s*(\/|und)\s*)Anwendungsmöglichkeiten$/u, # 4
      'usage'                => /^Dosierung\s*(\/|und)\s*Anwendung/u, # 5
      'contra_indications'   => /^Kontraindikationen($|\s*\(\s*absolute\s+Kontraindikationen\s*\)$)/u, # 6
      'restrictions'         => /^Warnhinweise\s+und\s+Vorsichtsmassnahmen($|\s*\/\s*(relative\s+Kontraindikationen|Warnhinweise\s*und\s*Vorsichtsmassnahmen)$)/u, # 7
      'interactions'         => /^Interaktionen$/u, # 8
      'pregnancy'            => /^Schwangerschaft(,\s*|\s*\/\s*|\s+und\s+)Stillzeit$/u, # 9
      'driving_ability'      => /^Wirkung\s+auf\s+die\s+Fahrtüchtigkeit\s+und\s+auf\s+das\s+Bedienen\s+von\s+Maschinen$/u, # 10
      'unwanted_effects'     => /^Unerwünschte\s+Wirkungen$/u, # 11
      'overdose'             => /^Überdosierung$/u, # 12
      'effects'              => /^Eigenschaften\s*\/\s*Wirkungen($|\s*\(\s*(ATC\-Code|Wirkungsmechanismus|Pharmakodyn?amik|Klinische\s+Wirksamkeit)\s*\)\s*$)/iu, # 13
      'kinetic'              => /^Pharmakokinetik($|\s*\((Absorption,\s*Distribution,\s*Metabolismu?s,\s*Elimination\s*|Kinetik\s+spezieller\s+Patientengruppen)*\)$)/iu, # 14
      'preclinic'            => /^Präklinische\s+Daten$/u, # 15
      'other_advice'         => /^Sonstige\s*Hinweise($|\s*\(\s*(Inkompatibilitäten|Beeinflussung\s*diagnostischer\s*Methoden|Haltbarkeit|Besondere\s*Lagerungshinweise|Hinweise\s+für\s+die\s+Handhabung)\s*\)$)|^Remarques/u, # 16
      'iksnrs'               => /^Zulassungsnummer(n|:|$|\s*\(\s*Swissmedic\s*\)$)/u, # 17
      'packages'             => /^Packungen($|\s*\(\s*mit\s+Angabe\s+der\s+Abgabekategorie\s*\)$)/u, # 18
      'registration_owner'   => /^Zulassungsinhaberin($|\s*\(\s*Firma\s+und\s+Sitz\s+gemäss\s*Handelsregisterauszug\s*\))/u, # 19
      'date'                 => /^Stand\s+der\s+Information$/iu, # 20
      'fabrication'          => /^Herstellerin/u,
      'company'              => /^Vertriebsfirma/u,
    },
    :fr => {
      'name'                => /^Nom$/u, # 1
      'composition'         => /^Composition$/u, # 2
      'galenic_form'        => /^Forme\s+galénique\s+et\s+quantité\s+de\s+principe\s+actif\s+par\s+unité|^Forme\s*gal.nique/iu, # 3
      'indications'         => /^Indications/u, # 4
      'usage'               => /^Posologie/iu, # 5
      'contra_indications'  => /^Contre\-indications/iu, # 6
      'restrictions'        => /^Mises/u, # 7
      'interactions'        => /^Interactions/u, # 8
      'pregnancy'           => /^Grossesse\s*\/\s*Allaitement/u, # 9
      'driving_ability'     => /^Effet\s+sur\s+l'aptitude\s+à;?\s+la\s+conduite\s+et\s+l'utilisation\s+de\s+machines/u, # 10
      'unwanted_effects'    => /^Effets/u, # 11
      'overdose'            => /^Surdosage$/u, # 12
      'effects'             => /^Propriétés/iu, # 13
      'kinetic'             => /^Pharmacocinétique$/iu, # 14
      'preclinic'           => /^Données\s+précliniques$/u, # 15
      'other_advice'        => /^Remarques/u, # 16
      'iksnrs'              => /^Numéro\s+d'autorisation$/u, # 17
      'packages'            => /^Présentation/iu, # 18
      'registration_owner'  => /^Titulaire\s+de\s+l'autorisation$/u, # 19
      'date'                => /^Mise à jour/iu, # 20
      'fabrication'         => /^Fabricant$/u,
      'company'             => /^Distributeur/u,
    }
  }
  if @lang == 'fr' || @lang == :fr
    chapters[:fr]
  else
    chapters[:de]
  end
end
init() click to toggle source
# File lib/ydocx/parser.rb, line 27
# Empty hook: #initialize calls it after the default state has been set
# and before yielding to the caller's block. Does nothing here.
def init; end
parse() click to toggle source
# File lib/ydocx/parser.rb, line 29
# Walks the children of the document body and appends the parsed form of
# each supported node to @result.
#
# 'text' and 'p' nodes go through #parse_paragraph, 'tbl' nodes through
# #parse_table; every other node is skipped.
#
# Returns @result.
def parse
  body_nodes = @doc.xpath('//w:document//w:body').children
  body_nodes.each do |node|
    case node.node_name
    when 'text', 'p' then @result << parse_paragraph(node)
    when 'tbl'       then @result << parse_table(node)
    end
  end
  @result
end

Private Instance Methods

apply_align(rpr, text) click to toggle source
# File lib/ydocx/parser.rb, line 64
# Applies sub-/superscript markup according to the first w:vertAlign
# element found under +rpr+.
#
# rpr  - node (set) that responds to #xpath
# text - content to decorate
#
# Returns:
# * markup(:sub, text) for val == "subscript"
# * "&sup1;", "&sup2;" or "&sup3;" for a superscript consisting of exactly
#   that one digit; these are the only &supN; named references that exist
#   in HTML, so every other superscript (including the digits 0 and 4-9)
#   is wrapped with markup(:sup, text)
# * +text+ unchanged when there is no w:vertAlign, no val attribute, or
#   any other value
def apply_align(rpr, text)
  vert_align = rpr.xpath('w:vertAlign').first
  return text unless vert_align

  # to_s keeps a missing attribute from raising; it simply matches nothing.
  case vert_align['val'].to_s
  when 'subscript'
    markup(:sub, text)
  when 'superscript'
    # \A..\z so that "1\n" is not mistaken for a single digit.
    text =~ /\A[1-3]\z/ ? "&sup#{text};" : markup(:sup, text)
  else
    text
  end
end
apply_fonts(rpr, text) click to toggle source
# File lib/ydocx/parser.rb, line 45
# Translates text set in the "Symbol" font through #character_replace.
#
# rpr  - node (set) that responds to #xpath; it counts as Symbol-font when
#        any w:rFonts element has an attribute value equal to 'Symbol'
# text - the run's text
#
# Returns +text+ itself when the run is not in the Symbol font. Otherwise
# returns a new String in which every code point is replaced by the result
# of #character_replace (called with the lowercase hex code). Code points
# for which #character_replace returns nil are kept unchanged; previously
# appending that nil to the buffer raised TypeError.
def apply_fonts(rpr, text)
  symbol_font = rpr.xpath('w:rFonts').any? do |font|
    font.values.include?('Symbol')
  end
  return text unless symbol_font

  text.unpack('U*').map do |codepoint|
    character_replace(codepoint.to_s(16)) || [codepoint].pack('U')
  end.join
end
character_encode(text) click to toggle source
# File lib/ydocx/parser.rb, line 79
# Tags +text+ as UTF-8 in place (the argument itself is modified by
# force_encoding) and returns the result of encoding it with @coder.
#
# NOTE
# :named only for escape at Builder
def character_encode(text)
  @coder.encode(text.force_encoding('utf-8'), :named)
end
character_replace(code) click to toggle source
# File lib/ydocx/parser.rb, line 86
# NOTE
# replace with resembling html character ref
# Symbol Font to HTML Character named ref
#
# Keys are "0x" + lowercase hex code; the trailing comment on each entry is
# the same code in decimal.
SYMBOL_FONT_ENTITIES = {
  '0xf020' => "",         # 61472
  '0xf025' => "%",        # 61477
  '0xf02a' => "*",        # 61482
  '0xf02b' => "+",        # 61483
  '0xf02d' => "-",        # 61485
  '0xf02f' => "/",        # 61487
  '0xf03c' => "&lt;",     # 61500
  '0xf03d' => "=",        # 61501
  '0xf03e' => "&gt;",     # 61502
  '0xf040' => "&cong;",   # 61504
  '0xf068' => "&eta;",    # 61544
  '0xf071' => "&theta;",  # 61553
  '0xf06d' => "&mu;",     # 61549
  '0xf0a3' => "&le;",     # 61603
  '0xf0ab' => "&harr;",   # 61611
  '0xf0ac' => "&larr;",   # 61612
  '0xf0ad' => "&uarr;",   # 61613
  '0xf0ae' => "&rarr;",   # 61614
  '0xf0af' => "&darr;",   # 61615
  '0xf0b1' => "&plusmn;", # 61617
  '0xf0b2' => "&Prime;",  # 61618
  '0xf0b3' => "&ge;",     # 61619
  '0xf0b4' => "&times;",  # 61620
  '0xf0b7' => "&sdot;"    # 61623
}.freeze

# Looks up the HTML replacement for a Symbol-font character.
#
# code - lowercase hex code as a String without the "0x" prefix,
#        e.g. "f0b1"
#
# Returns the replacement String, or nil when the code is not in the table.
#
# Two entries were previously unreachable because their keys duplicated an
# earlier one: "*" was keyed 0xf02b (decimal 61482 is 0xf02a), which
# shadowed "+", and "&darr;" was keyed 0xf0ad (decimal 61615 is 0xf0af),
# which was shadowed by "&uarr;".
def character_replace(code)
  SYMBOL_FONT_ENTITIES["0x#{code}"]
end
escape_id(text) click to toggle source
# File lib/ydocx/templates/fachinfo.rb, line 95
# Derives an id string from heading text.
#
# Steps, in order: "&Xuml;" becomes "Xe", "&apos;" is removed, "&eacute;"
# and "&agrave;" become "e", slashes (with surrounding whitespace),
# whitespace runs and hyphens become "_", dots are removed, the result is
# downcased and finally passed through CGI.escape.
def escape_id(text)
  id = text.gsub(/&(.)uml;/, '\1e')
  id = id.gsub(/&apos;/, '')
  id = id.gsub(/&(eacute|agrave);/, 'e')
  id = id.gsub(/\s*\/\s*|\s+|\/|\-/, '_')
  id = id.gsub(/\./, '')
  CGI.escape(id.downcase)
end
optional_escape(text) click to toggle source
# File lib/ydocx/parser.rb, line 146
# Identity hook: returns +text+ unchanged. #parse_text passes every run's
# text through it after #character_encode.
def optional_escape(text); text; end
parse_block(node) click to toggle source
# File lib/ydocx/parser.rb, line 149
# Default: no block element is recognised, so this always returns nil and
# #parse_paragraph falls back to treating the node as a plain paragraph.
def parse_block(node); nil; end
parse_code(text) click to toggle source
# File lib/ydocx/templates/fachinfo.rb, line 100
# Extracts the five-digit Swissmedic number from +text+.
#
# Everything matching @@figure_pattern is removed first. If the remainder
# looks like "<5 digits> ... (Swissmedic)", the digits are stored in @code
# and returned; otherwise nil is returned and @code is left untouched.
#
# The captured digits are stored verbatim. The previous `"%5d" % $1`
# converted the String to an Integer with prefix detection, so a number
# with a leading zero was read as octal ("00123" became "   83") or raised
# ArgumentError ("08123"). The pattern already guarantees exactly five
# digits, so no padding is needed.
def parse_code(text) # swissmedic number
  candidate = text.gsub(@@figure_pattern, '')
  if candidate =~ /^\s*(\d{5})(.*|\s*)\s*\(\s*Swiss\s*medic\s*\)(\s*|.)$/iu
    @code = Regexp.last_match(1)
  end
end
parse_heading(text, id) click to toggle source
# File lib/ydocx/templates/fachinfo.rb, line 108
# Wraps +text+ in h2 markup whose id attribute is +id+.
def parse_heading(text, id)
  attributes = { :id => id }
  markup(:h2, text, attributes)
end
parse_image(r) click to toggle source
# File lib/ydocx/parser.rb, line 152
# Builds img markup for the picture referenced by run +r+.
#
# Three layouts are tried in order (old-style shape, anchored drawing,
# stand-alone drawing). For the first layout whose image element exists and
# carries the relationship-id attribute, that id is looked up among the
# grandchildren of @rel's root; the matching entry's Target is converted
# with #source_path, recorded in @images and used as the img src.
#
# Side effects: appends {:origin, :source} to @images; sets @image_style to
# 'display:block;' when a wrap element is present (it is not reset here).
#
# Returns the img markup, or nil when no image id or no matching
# relationship is found.
def parse_image(r)
  drawing_namespaces = {
    'xmlns:a'   => 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'xmlns:pic' => 'http://schemas.openxmlformats.org/drawingml/2006/picture'
  }
  ns = r.namespaces.merge(drawing_namespaces)
  layouts = [
    { # old type shape
      :attr => 'r:id',
      :path => 'w:pict//v:shape//v:imagedata',
      :wrap => 'w:pict//v:shape//w10:wrap',
      :type => '',
    },
    { # in anchor
      :attr => 'r:embed',
      :path => 'w:drawing//wp:anchor//a:graphic//a:graphicData//pic:pic//pic:blipFill//a:blip',
      :wrap => 'w:drawing//wp:anchor//wp:wrapTight',
      :type => 'wrapText',
    },
    { # stand alone
      :attr => 'r:embed',
      :path => 'w:drawing//a:graphic//a:graphicData//pic:pic//pic:blipFill//a:blip',
      :wrap => 'w:drawing//wp:wrapTight',
      :type => 'wrapText',
    },
  ]

  id = nil
  layouts.each do |layout|
    found = r.xpath(layout[:path], ns)
    next if !found || found.empty?
    # TODO
    # wrap handling (currently all wrap off)
    # wrap[layout[:type]] has "bothSides", "topAndBottom" and "wrapText"
    @image_style = 'display:block;' if r.xpath(layout[:wrap], ns).first
    id = found.first[layout[:attr].to_s]
    break if id
  end
  return nil unless id

  @rel.xpath('/').children.each do |root|
    root.children.each do |relationship|
      next unless relationship['Id'] == id && relationship['Target']
      target = relationship['Target']
      source = source_path(target)
      @images << { :origin => target, :source => source }
      attributes = { :src => source }
      attributes[:style] = @image_style unless @image_style.empty?
      return markup(:img, [], attributes)
    end
  end
  nil
end
parse_paragraph(node) click to toggle source
# File lib/ydocx/parser.rb, line 217
# Converts one paragraph node into markup.
#
# If #parse_block recognises the node, its result is the only content.
# Otherwise each w:r run is examined:
# * a run with w:t contributes #parse_text (left-stripped for the first
#   contribution)
# * a run without w:t may contribute, in this order, a single @space for a
#   w:tab (never at the start and never twice in a row), the
#   #character_replace result for a w:sym, and the #parse_image result for
#   a w:pict or w:drawing
#
# nil contributions are dropped. Returns {} when nothing is left, the first
# item when any item is a heading Hash (tag h1-h9), and otherwise the
# content wrapped in p markup.
def parse_paragraph(node)
  content = []
  block = parse_block(node)
  if block
    content << block
  else # as p
    pos = 0
    node.xpath('w:r').each do |r|
      if r.xpath('w:t').empty?
        # ignore tab at line head
        if !r.xpath('w:tab').empty? && content.last != @space && pos != 0
          content << @space
          pos += 1
        end
        symbols = r.xpath('w:sym')
        unless symbols.empty?
          content << character_replace(symbols.first['w:char'].downcase) # w:char
          pos += 1
        end
        has_picture = !r.xpath('w:pict').empty? || !r.xpath('w:drawing').empty?
        content << parse_image(r) if has_picture
      else
        content << parse_text(r, (pos == 0)) # rm indent
        pos += 1
      end
    end
  end
  content.compact!
  return {} if content.empty?

  heading = content.any? do |item|
    item.is_a?(Hash) && item[:tag].to_s =~ /^h[1-9]/u
  end
  heading ? content.first : markup(:p, content)
end
parse_table(node) click to toggle source
# File lib/ydocx/parser.rb, line 259
# Converts a table node into nested table / tr / td markup.
#
# Every w:tr becomes a tr and every w:tc inside it a td. A td gets a
# :colspan attribute when one of its w:tcPr elements contains w:gridSpan
# (the element's val attribute is used). Each w:p inside the cell is
# appended through #parse_paragraph.
#
# Returns the table markup.
def parse_table(node)
  table = markup(:table)
  node.xpath('w:tr').each do |row_node|
    row = markup(:tr)
    row_node.xpath('w:tc').each do |cell_node|
      cell_attributes = {}
      cell_node.xpath('w:tcPr').each do |properties|
        grid_span = properties.xpath('w:gridSpan')
        next if !grid_span || grid_span.empty?
        cell_attributes[:colspan] = grid_span.first['val'] # w:val
      end
      cell = markup(:td, [], cell_attributes)
      cell_node.xpath('w:p').each do |paragraph|
        cell[:content] << parse_paragraph(paragraph)
      end
      row[:content] << cell
    end
    table[:content] << row
  end
  table
end
parse_text(r, lstrip=false) click to toggle source
# File lib/ydocx/parser.rb, line 280
# Builds the content for one text run.
#
# The text of all w:t children is joined, passed through #character_encode
# and #optional_escape and, when +lstrip+ is true, stripped of leading
# whitespace. The run properties (w:rPr) are then applied in this order:
# #apply_fonts, #apply_align, underline span (w:u), em (w:i), strong (w:b),
# so later decorations wrap earlier ones.
#
# Returns a String, or whatever #apply_align / #markup return once a
# decoration has been applied.
def parse_text(r, lstrip=false)
  raw = r.xpath('w:t').map(&:text).join('')
  text = optional_escape(character_encode(raw))
  text = text.lstrip if lstrip

  rpr = r.xpath('w:rPr')
  return text unless rpr

  text = apply_fonts(rpr, text)
  text = apply_align(rpr, text)
  underlined = !rpr.xpath('w:u').empty?
  text = markup(:span, text, {:style => "text-decoration:underline;"}) if underlined
  italic = !rpr.xpath('w:i').empty?
  text = markup(:em, text) if italic
  bold = !rpr.xpath('w:b').empty?
  text = markup(:strong, text) if bold
  text
end
parse_title(node, text) click to toggle source
# File lib/ydocx/templates/fachinfo.rb, line 111
# Treats the first line of the document as the package name.
#
# A title is produced only when no index entry exists yet, +text+ is not
# empty, +node+ has a previous sibling, and either the parent has no
# previous sibling or the previous sibling's text is blank. In that case an
# entry {:text, :id} is appended to @indecies and h1 markup with the same
# id is returned; otherwise nil is returned.
#
# The label is 'Titre' when @lang is 'fr' or :fr and 'Titel' otherwise.
# Comparing via to_s keeps this in line with #chapters, which accepts both
# the String and the Symbol form; previously a Symbol :fr fell through to
# the German label.
def parse_title(node, text)
  first_line = @indecies.empty? && !text.empty? && node.previous &&
               (node.parent.previous.nil? || node.previous.inner_text.strip.empty?)
  return nil unless first_line

  # The first line as package name
  title = (@lang.to_s == 'fr' ? 'Titre' : 'Titel')
  @indecies << {:text => title, :id => title.downcase}
  markup(:h1, text, {:id => title.downcase})
end
source_path(target) click to toggle source
# File lib/ydocx/parser.rb, line 208
# Maps a relationship target to its path below @image_path.
#
# The result is "<@image_path>/<basename of target>". When Magick::Image is
# defined and the target's extension is exactly ".wmf" (lowercase), the
# extension is replaced by ".png".
def source_path(target)
  prefix = @image_path + '/'
  # Parentheses keep the original grouping of `defined? X and ...`.
  wmf_ext = defined?(Magick::Image) && File.extname(target)[/\.wmf$/]
  name = wmf_ext ? File.basename(target, wmf_ext) + '.png' : File.basename(target)
  prefix + name
end