class RexleParser

Attributes

doctype[R]
instructions[R]
to_a[R]

Public Class Methods

new(raw_s)
Calls superclass method
# File lib/rexleparser.rb, line 72
# Parses a markup string and stores the results in instance variables.
#
# Sets:
#   @instructions - array of [name, body] pairs scanned from the text that
#                   precedes the root element (empty array when there is none)
#   @doctype      - the "<!DOCTYPE html>" text sliced off the input, if the
#                   first line contains a DOCTYPE declaration
#   @to_a         - result of reverse(parse_node(...)) for the document body
#
# Returns early, leaving all three unset, when the stripped input is empty.
#
# @param raw_s [String] raw markup; the caller's string is not mutated
#   because all work is done on the stripped copy
def initialize(raw_s)

  super()
  s = raw_s.strip
  return if s.empty?

  # The '?' must be escaped: an unescaped /<?xml/ makes '<' optional and so
  # matches any first line that merely contains "xml" (e.g. an xmlns
  # attribute), which wrongly sent declaration-less documents down the
  # split branch.
  raw_xml, raw_instrctns = if s.lines.first =~ /<\?xml/ then
    # split just before the '?>' that is followed by the first element tag
    s.split(/(?=\?>\s*<\w)/,2).reverse
  else
    s   # raw_xml is s itself; raw_instrctns becomes nil
  end
  @instructions = raw_instrctns ? \
                            raw_instrctns.scan(/<\?([\w-]+) ([^\?]+)/) : []
  # In the no-declaration branch raw_xml and s are the same object, so the
  # slice! below also strips the doctype from the text handed to parse_node.
  @doctype = s.slice!(/<!DOCTYPE html>\n?/) if s.lines.first =~ /<\!DOCTYPE/
  # parse_node works on the reversed text; reverse() turns the tree back.
  @to_a = reverse(parse_node(raw_xml.strip.reverse))

end

Private Instance Methods

get_attributes(raw_attributes)
# File lib/rexleparser.rb, line 215
# Extracts quoted attributes from a tag string into an Attributes object.
#
# Only name='value' and name="value" pairs are recognised. Keys are symbols;
# values are strings, except for the 'class' attribute whose value is split
# on whitespace into an array.
#
# @param raw_attributes [String] text of a tag, e.g. "<a href='x'>"
# @return [Object] the result of merging each pair into Attributes.new
def get_attributes(raw_attributes)

  # Each group captures name, '=', the opening quote and the value; the
  # closing quote is matched outside the group and therefore dropped.
  single_quoted = /([\w\-:\(\)]+\='[^']*)'/
  double_quoted = /([\w\-:\(\)]+\="[^"]*)"/

  pairs = raw_attributes.scan(/#{single_quoted}|#{double_quoted}/)
                        .flat_map(&:compact)

  pairs.inject(Attributes.new) do |attrs, pair|
    name, quoted = pair.split(/=/, 2)
    value = quoted[1..-1]                  # drop the opening quote
    value = value.split if name == 'class'
    attrs.merge(name.to_sym => value)
  end
end
parse_node(r, j=nil)
# File lib/rexleparser.rb, line 174
# Consumes one node from the front of r and returns it as a triple.
#
# r holds the markup reversed, so a tag reads ">...<" and the first tag met
# is the element's closing tag. r is mutated as text is consumed.
#
# @param r [String] reversed markup, consumed destructively
# @param j [Object] accepted but not used by this method
# @return [Array, nil] [first_tag, children, last_tag]; nil when r is empty.
#   last_tag is nil if scan_next stops yielding a recognised key before an
#   end tag is seen.
# @raise [RexleParserException] when the leading tag is not self-closing and
#   does not look like a closing tag
def parse_node(r, j=nil)

  return if r.empty?

  tag_pattern = /^>[^<]+</
  # only take the tag when it sits at the very start of r
  tag = (r =~ tag_pattern) == 0 ? r.slice!(tag_pattern) : nil
  tagname = tag[/([\w!:]+)\/?<$/,1]

  # self closing tag?
  return [">/#{tagname}<", [], tag.sub(/>\//,'>')] if tag[/^>\/.*#{tagname}<$/m]

  unless tag[1..-3][/\w+$/]
    raise RexleParserException, 'invalid closing tag found ' +
              tag.reverse + '; context: ' + r[0..120].reverse.inspect
  end

  children = []

  loop do
    key, res = scan_next(r, tagname)

    case key
    when :end_tag then return [tag, children, res]
    when :child   then children << res
    when :newnode then children << parse_node(r, tagname)
    else break
    end
  end

  [tag, children, nil]
end
reverse(raw_obj)
# File lib/rexleparser.rb, line 230
# Converts a reversed parse result back into forward order.
#
# A String is returned reversed (the argument itself is left untouched).
# An Array is treated as a node triple whose last element is the reversed
# tag to read the name and attributes from, and whose middle element is the
# child list; children are processed recursively and their order is flipped.
#
# @param raw_obj [String, Array, nil]
# @return [String, Array, nil] reversed text, or
#   [name, attributes, *children]; nil when raw_obj is nil/false
def reverse(raw_obj)

  return unless raw_obj

  copy = raw_obj.clone
  return copy.reverse! if copy.is_a? String

  opening_tag = copy.pop.reverse
  # after the pop, the child list is the last remaining element
  kids = copy[-1].reverse.map { |child| reverse(child) }

  [opening_tag[/[!\-\w:\[]+/], get_attributes(opening_tag), *kids]
end
scan_next(r, tagname)
# File lib/rexleparser.rb, line 93
# Consumes the next token from the front of the reversed markup r and says
# what it was. r is mutated in every case except :newnode.
#
# Because the text is reversed, a tag reads ">...<", a comment reads
# ">--...--!<" and a CDATA section reads ">]]...[ATADC[!<".
#
# @param r [String] reversed markup, consumed destructively
# @param tagname [String] (reversed) name of the element being parsed
# @return [Array, nil] one of
#   [:end_tag, tag]   - tag ends with tagname
#   [:child, node]    - a broken tag, comment or CDATA triple
#   [:child, text]    - a run of text (a leading '>' is rewritten to ';tg&')
#   [:newnode]        - a child element starts here; nothing consumed
def scan_next(r, tagname)

  # a '>' followed by one of these starts a tag or a CDATA element
  next_markup = />(?:[\-\/"'\w]|\]\])/
  tag_pattern = /^>[^<]+</

  # it's a text value: collect until a tag or CDATA element is found
  unless r[0] == '>'
    text = r.slice!(0, r =~ next_markup)
    return [:child, text] if text
    return
  end

  tag = r[tag_pattern]

  if tag[1][/[ \w"']/] && tag[-2] != '/'

    tag = r.slice!(tag_pattern)

    # is it the end tag to match the start tag?
    closing = tag[/^>[^>]*#{tagname}<$/]
    return [:end_tag, closing] if closing

    # broken tag found
    return [:child, [nil, [], tag]] if tag[/^>[^>]*\w+<$/]

    # otherwise the leading '>' was literal text: escape it and push any
    # tag swallowed after it back onto r
    text, rest = tag.sub('>', ';tg&').split(/>/, 2)
    r.prepend('>' + rest) if rest
    return [:child, text]
  end

  # comment and CDATA sections differ only in their delimiters
  case r[0, 3]
  when '>--'
    opener, opener_width, marker = /(\-\-!<)/, 4, '-!'
  when '>]]'
    opener, opener_width, marker = /(\[ATADC\[!<)/, 9, '[!'
  end

  if marker
    r.slice!(0, 3)
    content = r.slice!(0, r =~ opener)
    r.slice!(0, opener_width)
    return [:child, [">#{marker}<", [content], ">#{marker}/<"]]
  end

  return [:newnode] if tag[/>\/|\/<$/] || tag[/^>.*[\w!]+\/<$/]

  # a stray '>' inside text: escape it, then collect up to the next tag
  r.sub!('>', ';tg&')
  text = r.slice!(0, r =~ next_markup)
  return [:child, text] if text

  nil
end