module PertinentParser

Public Class Methods

html(html) click to toggle source

We write our own traversal function so that we can manipulate the HTML representation however we like.

# File lib/pertinent_parser.rb, line 17
# Builds a text object from an HTML string and creates one wrap rule for
# every non-text node encountered while walking the parsed document.
#
# The markup is parsed with Hpricot and the document's inner text is handed
# to `text`. During traversal a running character offset is advanced by the
# size of each text node; every other node produces a rule (via `wrap_`)
# over the half-open range that starts at the current offset and spans the
# node's own inner text, using the node's start tag as the wrapper.
#
# NOTE(review): each rule is combined with the result through `+` and the
# value of that expression is discarded — presumably `+` records the rule on
# the receiver; confirm against the Text class.
#
# @param html [String] HTML markup to parse
# @return the object returned by `text` for the document's inner text
def html(html)
  document = Hpricot(html)
  offset = 0
  result = text(document.inner_text)
  document.traverse_all_element do |node|
    length = node.inner_text.size
    if node.text?
      # Text nodes only move the cursor forward.
      offset += length
    else
      # Non-text nodes wrap the stretch of text they enclose.
      result + wrap_(offset...offset + length, node.stag)
    end
  end
  result
end
new_replace(context, target, number, replacement) click to toggle source
# File lib/pertinent_parser.rb, line 72
# Creates a rule that replaces the +number+-th occurrence of +target+ inside
# +context+.
#
# The occurrence is located with `range_from_specification`; the resulting
# range is paired with a `:replacement` transform carrying +replacement+.
#
# @param context the text searched for +target+
# @param target [String] literal text to locate
# @param number [Integer] 1-based occurrence to pick
# @param replacement payload handed to the :replacement transform
# @return [Rule] the newly built rule
def new_replace(context, target, number, replacement)
  located = range_from_specification(context, target, number)
  Rule.new(located, Transform.new(:replacement, replacement))
end
new_wrap(context, target, number, tag) click to toggle source
# File lib/pertinent_parser.rb, line 57
# Creates a wrap rule around the +number+-th occurrence of +target+ inside
# +context+.
#
# The occurrence is located with `range_from_specification` and the
# resulting range is passed, together with +tag+, to `wrap_`.
#
# @param context the text searched for +target+
# @param target [String] literal text to locate
# @param number [Integer] 1-based occurrence to pick
# @param tag [String] opening tag used for the wrap
# @return whatever `wrap_` returns for that range and tag
def new_wrap(context, target, number, tag)
  wrap_(range_from_specification(context, target, number), tag)
end
offset_to_r(o) click to toggle source
# File lib/pertinent_parser.rb, line 41
# Converts a two-element offset pair into an inclusive Range.
#
# The second element is treated as an exclusive end (as in the pairs
# returned by MatchData#offset), so the range stops one before it.
#
# @param o [Array<Integer>] pair of [start, exclusive_end]
# @return [Range] inclusive range start..(exclusive_end - 1)
def offset_to_r(o)
  first_index = o[0]
  last_index = o[1] - 1
  first_index..last_index
end
range_from_specification(context, target, number) click to toggle source
# File lib/pertinent_parser.rb, line 45
# Finds the +number+-th occurrence of the literal string +target+ in
# +context+ and returns the inclusive range it covers.
#
# Occurrences are counted by distinct start position, so overlapping
# occurrences are counted separately (e.g. "aa" occurs twice in "aaa").
# +target+ is escaped before being turned into a Regexp, so regex
# metacharacters in it are matched literally.
#
# @param context object searched via `match(regexp, position)`
# @param target [String] literal text to locate
# @param number [Integer] 1-based index of the occurrence wanted
# @return [Range, nil] inclusive range of the occurrence, or nil when
#   +context+ holds fewer than +number+ occurrences
def range_from_specification(context, target, number)
  pattern = Regexp.new(Regexp.escape(target))
  count = 0
  position = 0
  while (match = context.match(pattern, position))
    bounds = match.offset(0)
    count += 1
    return offset_to_r(bounds) if count == number

    # Resume one character past the start of this match: the next search
    # cannot return the same match again, yet overlapping occurrences are
    # still found. This avoids re-running the regex at every position.
    position = bounds[0] + 1
  end
  nil
end
rule(range, transform) click to toggle source
# File lib/pertinent_parser.rb, line 62
# Convenience constructor that forwards both arguments to Rule.new.
#
# @param range first argument handed to Rule.new unchanged
# @param transform second argument handed to Rule.new unchanged
# @return [Rule] the newly created rule
def rule(range, transform)
  Rule.new(range, transform)
end
text(s) click to toggle source
# File lib/pertinent_parser.rb, line 34
# Wraps a string in a Text object whose rule is an :identity transform
# spanning the whole string.
#
# The rule's range is the inclusive span 0..(s.size - 1); the transform is
# created with the argument list ["id"].
#
# @param s [String] the raw text
# @return [Text] a Text built from +s+ with its `rule` attribute assigned
def text(s)
  whole_span = 0..(s.size - 1)
  identity_rule = Rule.new(whole_span, Transform.new(:identity, ["id"]))
  Text.new(s).tap { |wrapped| wrapped.rule = identity_rule }
end
wrap_(range, tag) click to toggle source
# File lib/pertinent_parser.rb, line 67
# Builds a rule that wraps +range+ in +tag+ and a matching closing tag.
#
# The closing tag is derived from +tag+: the run of non-whitespace
# characters after "<", up to the first whitespace or ">" the pattern can
# settle on, is taken as the element name and placed inside "</...>".
# If +tag+ does not match the pattern, indexing the nil match result raises.
#
# @param range the span the rule is created for
# @param tag [String] opening tag, e.g. "<b>" or "<a href=...>"
# @return [Rule] rule carrying a :wrap transform of [tag, closing_tag]
def wrap_(range, tag)
  tag_name = tag.match(/<(\S*)(\s|>)/)[1]
  closing_tag = "</#{tag_name}>"
  Rule.new(range, Transform.new(:wrap, [tag, closing_tag]))
end