module PertinentParser
Public Class Methods
html(html)
click to toggle source
We should write our own traversal function so that we can manipulate the HTML representation however we like.
# File lib/pertinent_parser.rb, line 17
# Builds a text object from the plain text of an HTML string and issues one
# wrap rule per non-text node found while walking the parsed document.
#
# html - markup handed to Hpricot() for parsing.
#
# Returns the object produced by text(doc.inner_text).
def html(html)
  doc = Hpricot(html)
  result = text(doc.inner_text)
  offset = 0 # running character position within the document's plain text

  doc.traverse_all_element do |node|
    if node.text?
      # Text nodes only advance the running offset.
      offset += node.inner_text.size
    else
      # Every other node spans its own inner text, starting at the current
      # offset (end-exclusive range).
      span = offset...offset + node.inner_text.size
      # NOTE(review): the value of `+` is discarded, so this presumably relies
      # on result#+ attaching the rule in place -- confirm against Text#+.
      result + wrap_(span, node.stag)
    end
  end

  result
end
new_replace(context, target, number, replacement)
click to toggle source
# File lib/pertinent_parser.rb, line 72
# Builds a Rule carrying a :replacement transform for the number-th
# occurrence of target inside context.
#
# context     - text that is searched.
# target      - literal substring to locate.
# number      - 1-based index of the occurrence to select.
# replacement - payload handed to Transform.new(:replacement, ...).
#
# Returns the new Rule. The range comes from range_from_specification and is
# passed to Rule.new as-is, including nil when no such occurrence exists.
def new_replace(context, target, number, replacement)
  span = range_from_specification(context, target, number)
  Rule.new(span, Transform.new(:replacement, replacement))
end
new_wrap(context, target, number, tag)
click to toggle source
# File lib/pertinent_parser.rb, line 57
# Builds a wrap rule for the number-th occurrence of target inside context.
#
# context - text that is searched.
# target  - literal substring to locate.
# number  - 1-based index of the occurrence to select.
# tag     - opening tag forwarded to wrap_.
#
# Returns whatever wrap_ returns for the located range. The range is passed
# on unchanged, including nil when range_from_specification finds no match.
def new_wrap(context, target, number, tag)
  wrap_(range_from_specification(context, target, number), tag)
end
offset_to_r(o)
click to toggle source
# File lib/pertinent_parser.rb, line 41
# Converts a [start, stop] offset pair with an exclusive stop into an
# inclusive Range.
#
# o - two-element indexable, e.g. the result of MatchData#offset.
#
# Returns the Range start..(stop - 1).
def offset_to_r(o)
  first_index = o[0]
  last_index = o[1] - 1
  first_index..last_index
end
range_from_specification(context, target, number)
click to toggle source
# File lib/pertinent_parser.rb, line 45
# Locates the number-th occurrence of a literal substring and returns the
# range of character positions it covers.
#
# Occurrences are counted by distinct start position from left to right, so
# overlapping occurrences count separately ("aa" occurs three times in "aaaa").
#
# context - text to search; must support match(regexp, position).
# target  - literal text to find (escaped, so regexp metacharacters are inert).
# number  - 1-based index of the wanted occurrence.
#
# Returns the inclusive Range built by offset_to_r, or nil when context holds
# fewer than number occurrences (including when number is less than 1).
def range_from_specification(context, target, number)
  pattern = Regexp.new(Regexp.escape(target))
  position = 0
  count = 0

  while (match = context.match(pattern, position))
    offsets = match.offset(0)
    count += 1
    return offset_to_r(offsets) if count == number

    # Resume one character past the start of this match: every match is
    # visited exactly once and overlapping occurrences are still found,
    # without re-running the search from each intermediate position.
    position = offsets[0] + 1
  end

  nil
end
rule(range, transform)
click to toggle source
# File lib/pertinent_parser.rb, line 62
# Convenience constructor: pairs a range with a transform in a new Rule.
#
# range     - forwarded unchanged as the first argument of Rule.new.
# transform - forwarded unchanged as the second argument of Rule.new.
#
# Returns the new Rule.
def rule(range, transform)
  Rule.new(range, transform)
end
text(s)
click to toggle source
# File lib/pertinent_parser.rb, line 34
# Wraps a string in a Text whose rule is an :identity transform covering the
# whole string.
#
# s - source string; its size determines the rule's range, 0..(s.size - 1).
#
# Returns the new Text with its rule assigned.
def text(s)
  whole = 0..(s.size - 1)
  identity_rule = Rule.new(whole, Transform.new(:identity, ["id"]))
  Text.new(s).tap { |wrapped| wrapped.rule = identity_rule }
end
wrap_(range, tag)
click to toggle source
# File lib/pertinent_parser.rb, line 67
# Builds a Rule whose :wrap transform carries an opening tag and the closing
# tag derived from it.
#
# The element name is the run of characters after "<" up to the first
# whitespace, "/" or ">", so "<a href='x'>" closes with "</a>" and a
# self-closing "<br/>" closes with "</br>" rather than "</br/>".
#
# range - range the rule applies to; forwarded unchanged to Rule.new.
# tag   - opening tag markup, e.g. "<b>" or "<a href='x'>".
#
# Returns the new Rule.
# Raises ArgumentError when no "<name" followed by whitespace, "/" or ">"
# can be found in tag.
def wrap_(range, tag)
  opening = tag.match(/<([^\s\/>]*)[\s\/>]/)
  raise ArgumentError, "cannot find a tag name in #{tag.inspect}" unless opening

  transform = Transform.new(:wrap, [tag, "</#{opening[1]}>"])
  Rule.new(range, transform)
end