class FoodIngredientParser::Loose::Scanner
Constants
- ABBREV_RE
Keep in sync with abbrev in the Common grammar, plus relevant ones from the Amount grammar.
- MARK_CHARS
- NOTE_RE
- PREFIX_RE
- SEP_CHARS
Public Class Methods
new(s, index: 0)
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 33
# Sets up the scanner state for one input string.
#
# @param s [String] the text to scan
# @param index [Integer] index in +s+ at which scanning starts
def initialize(s, index: 0)
  @s = s                          # input string
  @i = index                      # current index in string
  @cur = nil                      # current node we're populating
  @ancestors = [Node.new(@s, @i)] # nesting hierarchy
  @iterator = :beginning          # scan_iteration_<iterator> to use for parsing
  @dest = :contains               # append current node to this attribute on parent
end
Public Instance Methods
scan()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 42
# Scans the input from the current index to its end and returns the root node.
#
# Each step dispatches to the +scan_iteration_<iterator>+ method selected by
# +@iterator+. Once the input is consumed, every node that is still open is
# closed and the root node's end is set to the last scanned index.
#
# @return [Node] the root of the nesting hierarchy (first entry of +@ancestors+)
def scan
  # This is the scanner's own #loop (index-driven), not Kernel#loop.
  loop do
    send(:"scan_iteration_#{@iterator}")
  end

  close_all_ancestors
  root = @ancestors.first
  root.ends(@i - 1)
  root
end
Private Instance Methods
abbrev_len()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 163
# Matches ABBREV_RE against the input from the current index onwards.
#
# @return [Integer] end offset of the match within the remaining input, or 0
#   when there is no match. This equals the abbreviation's length provided
#   ABBREV_RE is anchored at the start of the string (its definition is not
#   visible here - confirm).
def abbrev_len
  m = @s[@i .. -1].match(ABBREV_RE)
  m ? m.end(0) : 0
end
add_child()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 184
# Finishes the current node and appends it to its parent.
#
# The node gets its name (if it has none yet), its end is set to the index
# just before the current one, and it is appended to the parent's collection
# named by +@dest+. Afterwards there is no current node.
def add_child
  name_until_here
  node = cur
  node.ends(@i - 1)
  parent.send(@dest) << node
  @cur = nil
end
c()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 135
# The character at the current index.
#
# @return [String, nil] one-character string, or nil past the end of the input
def c
  @s[@i]
end
close_all_ancestors()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 206
# Closes every open nesting level, leaving only the root in +@ancestors+.
#
# For each level above the root the current node is attached and its parent
# becomes the current node; the final +add_child+ attaches the last current
# node to the root.
def close_all_ancestors
  while @ancestors.count > 1
    add_child
    close_parent
  end
  add_child
end
close_parent()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 197
# Leaves the innermost nesting level: the most recently opened ancestor
# becomes the current node again.
#
# Does nothing when only the root is left. If the popped node has
# +auto_close+ set, it is attached to its parent right away and the next
# ancestor is popped as well, repeating until a node without +auto_close+
# is current.
def close_parent
  return unless @ancestors.count > 1

  @cur = @ancestors.pop
  while @cur.auto_close
    add_child
    @cur = @ancestors.pop
  end
end
cur()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 143
# The node currently being populated; created lazily at the current index.
#
# Calling this without using the result is how the scan iterations record a
# node's starting position.
#
# @return [Node]
def cur
  @cur ||= Node.new(@s, @i)
end
dot_is_not_sep?()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 227
# Whether a dot in this input should NOT be treated as an ingredient
# separator (in which case it can be used for note detection).
#
# The answer is computed once per scanner and cached. The cache is checked
# with +nil?+ because +false+ is a valid cached answer.
#
# @return [Boolean] true when the input has fewer than five words, or when
#   fewer than a quarter of the words carry a dot
def dot_is_not_sep?
  # if separator is dot ".", don't use it for note detection
  return @dot_is_not_sep unless @dot_is_not_sep.nil?

  # @todo if another separator is found more often, dot is not a separator
  num_words = @s.split(/\s+/).count
  num_dots = @s.count(".")
  # heuristic: 1/4+ of the words has a dot, with at least five words
  @dot_is_not_sep = num_words < 5 || 4 * num_dots < num_words
end
is_mark?(i = @i)
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 151
# Whether a mark starts at index +i+.
#
# A degree sign directly followed by C or F is treated as a temperature
# unit and therefore not as a mark.
#
# @param i [Integer] index to inspect, defaults to the current index
# @return [Boolean]
def is_mark?(i = @i)
  mark_len(i) > 0 && @s[i..i+1] !~ /\A°[CF]/
end
is_notes_start?()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 168
# Whether the notes section starts at the current index.
#
# Notes start at
# * a mark followed by an equals sign,
# * a mark that is preceded by whitespace and followed by a word,
# * text matching NOTE_RE,
# * or a dot, when the dot is not used as ingredient separator.
#
# In the first three cases +@i+ is decremented as a side effect, so that
# after the scan loop advances again the note starts at the mark/text itself.
#
# @return [Boolean]
def is_notes_start?
  # @todo use more heuristics: don't assume dot is notes when separator is a dot, and only toplevel?
  ml = mark_len
  marked = is_mark?
  after_mark = @s[@i+ml .. -1]

  # The whitespace-before-mark check needs a preceding character. Without the
  # +@i > 0+ guard, @s[-1] would wrap around and look at the LAST character
  # of the input instead.
  if ( marked && after_mark =~ /\A\s*=/ ) ||                                      # "* = Biologisch"
     ( marked && @i > 0 && @s[@i-1] =~ /\s/ && after_mark =~ /\A\s*\w/ ) ||       # " **Biologisch"
     ( @s[@i..-1] =~ NOTE_RE )                                                    # "E=", "Kan sporen van", ...
    @i -= 1 # we want to include the mark in the note
    true
  # End of sentence
  elsif dot_is_not_sep? && is_sep?(chars: ".")
    true
  else
    false
  end
end
is_sep?(chars: SEP_CHARS)
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 147
# Whether the current character is a separator.
#
# A candidate character sitting directly between two digits (as in "1,5")
# is not considered a separator.
#
# @param chars [#include?] characters that count as separators
# @return [Boolean]
def is_sep?(chars: SEP_CHARS)
  chars.include?(c) && @s[@i-1..@i+1] !~ /\A\d.\d\z/
end
loop() { |!= false| ... }
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 54
# Drives the scan: yields repeatedly until the current index reaches the end
# of the input. After each yield the index advances by one, unless the block
# returned exactly +false+ (any other value, including nil, advances).
#
# Note that within this class this takes the place of Kernel#loop.
def loop
  while @i < @s.length
    @i += 1 if yield != false
  end
end
mark_len(i = @i)
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 155
# Counts the consecutive characters from MARK_CHARS starting at index +i+.
#
# @param i [Integer] index to start at, defaults to the current index
# @return [Integer] number of mark characters, 0 when there is none
def mark_len(i = @i)
  j = i
  # stop at the end of the input or at the first non-mark character
  j += 1 while @s[j] && MARK_CHARS.include?(@s[j])
  j - i
end
name_until_here()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 214
# Gives the current node a name, unless it already has one.
#
# The name spans from the node's start (after any leading mark) up to the
# character before the current index. The name stays nil when that span is
# empty or starts with an open-nesting symbol.
#
# @return [Node, nil] the current node's name
def name_until_here
  cur.name ||= begin
    i, j = cur.interval.first, @i - 1
    i += mark_len(i) # skip any mark in front
    # Set name if there is any. There is one corner-case that needs to be avoided when
    # a nesting was opened without a name, which would set the name to the nesting text.
    # In this case, the name starts with an open-nesting symbol, which should never happen.
    if j >= i && !"([:".include?(@s[i])
      Node.new(@s, i .. j)
    end
  end
end
open_parent(**options)
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 191
# Opens a nesting level: the current node is named, pushed onto the ancestor
# stack, and a fresh current node is started at the next index.
#
# @param options keyword options passed on to the NEW child node (not to the
#   node that becomes the parent)
# @return [Node] the new current node
def open_parent(**options)
  name_until_here
  @ancestors << cur
  @cur = Node.new(@s, @i + 1, **options)
end
parent()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 139
# The innermost open nesting level, i.e. the node new children are added to.
#
# @return [Node] last entry of the ancestor stack
def parent
  @ancestors.last
end
scan_iteration_beginning()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 60
# First scan iteration: moves the index past a PREFIX_RE match, if any, and
# switches to the standard iteration.
#
# @return [false] always, so that the scan loop does not advance the index
#   and the standard iteration starts exactly where this one stopped
def scan_iteration_beginning
  # skip over some common prefixes
  m = @s[@i .. -1].match(PREFIX_RE)
  @i += m.end(0) if m
  # now continue with the standard parsing
  @iterator = :standard
  false
end
scan_iteration_colon()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 102
# Scan iteration used inside a nesting that was opened by a colon.
#
# Differs from the standard iteration in that a slash separates children,
# and a regular separator ends the colon nesting. Brackets and colons hand
# control back to the standard iteration.
def scan_iteration_colon
  if (len = abbrev_len) > 0
    # defer iterations until after any abbreviation
    cur # reference to record starting position
    @i += len - 1
  elsif "/".include?(c)     # slash separator in colon nesting only
    add_child
  elsif is_sep?             # regular separator indicates end of colon nesting
    add_child
    close_parent
    # revert to standard parsing from here on
    @iterator = :standard
    scan_iteration_standard
  elsif "([]):".include?(c) # continue with deeper nesting level
    # revert to standard parsing from here on
    @iterator = :standard
    scan_iteration_standard
  else
    # normal handling for this character
    scan_iteration_standard
  end
end
scan_iteration_notes()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 124
# Scan iteration used once the notes section has started: every dot that is
# not part of an abbreviation (or between digits) ends the current note.
def scan_iteration_notes
  if (len = abbrev_len) > 0
    # defer iterations until after any abbreviation
    cur # reference to record starting position
    @i += len - 1
  elsif is_sep?(chars: ".") # dot means new note
    add_child
  else
    cur
  end
end
scan_iteration_standard()
click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 69
# Default scan iteration: handles the character at the current index.
#
# Branches are tried in order: abbreviation, opening bracket, closing
# bracket, start of notes, separator, colon, mark, and finally any other
# character (which only makes sure a current node exists).
def scan_iteration_standard
  if (len = abbrev_len) > 0
    # defer iterations until after any abbreviation
    cur # reference to record starting position
    @i += len - 1
  elsif "([".include?(c)    # open nesting
    open_parent
  elsif ")]".include?(c)    # close nesting
    add_child
    close_parent
  elsif is_notes_start?     # usually a dot marks the start of notes
    close_all_ancestors
    @iterator = :notes
    @dest = :notes
  elsif is_sep?             # separator
    add_child
  elsif ":".include?(c)     # another open nesting
    if @s[@i+1..-1] =~ /\A\s*(\(|\[)/
      # ignore colon before an open bracket, then it's a regular nesting
      name_until_here
    else
      open_parent(auto_close: true)
      @iterator = :colon
    end
  elsif is_mark? && !cur.mark # mark after ingredient
    name_until_here
    len = mark_len
    cur.mark = Node.new(@s, @i .. @i+len-1)
    # leave the index on the last mark character; the scan loop advances past it
    @i += len - 1
  else
    cur # reference to record starting position
  end
end