class FoodIngredientParser::Loose::Scanner

Constants

ABBREV_RE

Keep in sync with abbrev in the Common grammar, plus relevant ones from the Amount grammar.

MARK_CHARS
NOTE_RE
PREFIX_RE
SEP_CHARS

Public Class Methods

new(s, index: 0) click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 33
# Sets up the scanner state for parsing +s+.
#
# @param s [String] input string to scan
# @param index [Integer] position in +s+ at which scanning starts
def initialize(s, index: 0)
  # Input string and the current index into it.
  @s = s
  @i = index
  # Node currently being populated; stays nil until one is needed.
  @cur = nil
  # Nesting hierarchy, seeded with a node positioned at the start index.
  root = Node.new(s, index)
  @ancestors = [root]
  # Name suffix of the scan_iteration_<iterator> method used for parsing.
  @iterator = :beginning
  # Attribute on the parent node that finished nodes are appended to.
  @dest = :contains
end

Public Instance Methods

scan() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 42
# Scans the input from the current index to its end.
#
# Every step is dispatched to the scan_iteration_<iterator> method selected by
# @iterator. Afterwards all open nodes are closed and the first ancestor is
# marked as ending at the last scanned index.
#
# @return [Node] the first node of the nesting hierarchy
def scan
  # NB: +loop+ resolves to this class's own #loop, not Kernel#loop.
  loop { send(:"scan_iteration_#{@iterator}") }

  close_all_ancestors
  root = @ancestors.first
  root.ends(@i - 1)
  root
end

Private Instance Methods

abbrev_len() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 163
# Matches ABBREV_RE against the input from the current index onwards.
#
# @return [Integer] end offset of the match within the remaining input (the
#   abbreviation's length when the match starts at the current index), or 0
#   when there is no match
def abbrev_len
  match = @s[@i..-1].match(ABBREV_RE)
  return 0 unless match

  match.end(0)
end
add_child() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 184
# Finishes the current node and appends it to the parent.
#
# The node gets its name (if it had none yet), is marked as ending just before
# the current index, and is appended to the parent attribute named by @dest.
# The current node is cleared afterwards.
#
# @return [nil]
def add_child
  name_until_here
  finished = cur
  finished.ends(@i - 1)
  parent.send(@dest) << finished
  @cur = nil
end
c() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 135
# @return [String, nil] the character at the current index, or nil when the
#   index is outside the input
def c
  @s.slice(@i)
end
close_all_ancestors() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 206
# Closes every open nesting level.
#
# While more than one ancestor remains, the current node is added to its
# parent and that parent is closed. The node that is left is then added too.
def close_all_ancestors
  until @ancestors.length <= 1
    add_child
    close_parent
  end
  add_child
end
close_parent() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 197
# Closes the innermost nesting level by making its node the current node again.
#
# Does nothing when only one ancestor is left. Popped nodes flagged with
# +auto_close+ are added to their own parent right away, after which the next
# ancestor is popped.
def close_parent
  return if @ancestors.length < 2

  @cur = @ancestors.pop
  # add_child reads @cur, so it has to be assigned before each call.
  while @cur.auto_close
    add_child
    @cur = @ancestors.pop
  end
end
cur() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 143
# Returns the node being populated, creating one at the current index when
# there is none yet.
#
# @return [Node]
def cur
  @cur = Node.new(@s, @i) unless @cur
  @cur
end
dot_is_not_sep?() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 227
# Tells whether a dot "." should NOT be regarded as the ingredient separator,
# so that it can be used for note detection instead. The answer is computed
# once and cached.
#
# @return [Boolean]
def dot_is_not_sep?
  return @dot_is_not_sep unless @dot_is_not_sep.nil?

  # @todo if another separator is found more often, dot is not a separator
  word_count = @s.split(/\s+/).count
  dot_count = @s.count(".")
  # heuristic: the dot is a separator when at least 1/4 of the words has a dot,
  # with at least five words; otherwise it is not.
  @dot_is_not_sep = word_count < 5 || dot_count * 4 < word_count
end
is_mark?(i = @i) click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 151
# Tells whether a mark starts at index +i+.
#
# A degree sign directly followed by C or F is not considered a mark.
#
# @param i [Integer] index to inspect, defaults to the current index
# @return [Boolean]
def is_mark?(i = @i)
  return false if mark_len(i).zero?

  @s[i..i + 1] !~ /\A°[CF]/
end
is_notes_start?() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 168
# Tells whether the notes section starts at the current index.
#
# Notes start at a mark followed by "=", at a mark that follows whitespace and
# precedes a word, or wherever NOTE_RE matches. In these cases the index is
# moved back by one, so that the character at the current index is still part
# of the note after the scan loop has advanced. A dot also starts the notes
# when the dot is not used as the separator; the index is left alone then.
#
# @return [Boolean]
def is_notes_start?
  # @todo use more heuristics: don't assume dot is notes when separator is a dot, and only toplevel?
  ml = mark_len
  after_mark = @s[@i + ml..-1]
  # Only look at the preceding character when there is one: at index 0,
  # @s[@i - 1] would wrap around to the *last* character of the input, making
  # the result depend on whether the input happens to end in whitespace.
  after_space = @i > 0 && @s[@i - 1] =~ /\s/
  if (is_mark? && after_mark =~ /\A\s*=/) ||                # "* = Biologisch"
     (is_mark? && after_space && after_mark =~ /\A\s*\w/) || # " **Biologisch"
     (@s[@i..-1] =~ NOTE_RE)                                 # "E=", "Kan sporen van", ...
    @i -= 1 # we want to include the mark in the note
    true
  # End of sentence
  elsif dot_is_not_sep? && is_sep?(chars: ".")
    true
  else
    false
  end
end
is_sep?(chars: SEP_CHARS) click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 147
# Tells whether the current character is a separator.
#
# A candidate enclosed by digits on both sides (as in "1,5") does not count.
#
# @param chars [#include?] characters regarded as separators
# @return [Boolean]
def is_sep?(chars: SEP_CHARS)
  return false unless chars.include?(c)

  around = @s[(@i - 1)..(@i + 1)]
  around !~ /\A\d.\d\z/
end
loop() { |!= false| ... } click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 54
# Yields repeatedly until the current index has reached the end of the input.
#
# The index advances by one after each yield, unless the block returns
# +false+; the block then runs again at the same index.
#
# @yieldreturn [Object] +false+ to keep the index where it is
# @return [nil]
def loop
  until @i >= @s.length
    outcome = yield
    @i += 1 unless outcome == false
  end
end
mark_len(i = @i) click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 155
# Counts the consecutive characters from MARK_CHARS starting at index +i+.
#
# @param i [Integer] index to start at, defaults to the current index
# @return [Integer] number of mark characters found, 0 when there are none
def mark_len(i = @i)
  count = 0
  count += 1 while (char = @s[i + count]) && MARK_CHARS.include?(char)
  count
end
name_until_here() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 214
# Gives the current node a name, unless it already has one.
#
# The name runs from the node's start (skipping any mark in front) up to the
# character before the current index.
#
# @return [Node, nil] the node's name, nil when there is none
def name_until_here
  node = cur
  existing = node.name
  return existing if existing

  first = node.interval.first
  first += mark_len(first) # skip any mark in front
  last = @i - 1
  # Set name if there is any. There is one corner-case that needs to be avoided when
  # a nesting was opened without a name, which would set the name to the nesting text.
  # In this case, the name starts with an open-nesting symbol, which should never happen.
  nameable = last >= first && !"([:".include?(@s[first])
  node.name = nameable ? Node.new(@s, first..last) : nil
end
open_parent(**options) click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 191
# Opens a nesting level: the current node becomes the innermost ancestor and a
# fresh current node is started right after the current index.
#
# @param options [Hash] keyword arguments passed on to the new Node
# @return [Node] the new current node
def open_parent(**options)
  name_until_here
  @ancestors.push(cur)
  child_start = @i + 1
  @cur = Node.new(@s, child_start, **options)
end
parent() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 139
# @return [Node, nil] the innermost open ancestor, i.e. the last entry of the
#   nesting hierarchy
def parent
  @ancestors[-1]
end
scan_iteration_beginning() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 60
# First scan step: moves the index past a PREFIX_RE match, if any, and hands
# over to the standard iterator.
#
# @return [false] always, so that the scan loop does not advance the index
def scan_iteration_beginning
  # skip over some common prefixes
  prefix = @s[@i..-1].match(PREFIX_RE)
  @i += prefix.end(0) if prefix
  # now continue with the standard parsing
  @iterator = :standard
  false
end
scan_iteration_colon() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 102
# Scan step used inside a nesting that was opened by a colon.
#
# Abbreviations are skipped as a whole and a slash separates children. Every
# other character is handed to the standard step; a regular separator first
# ends the colon nesting, and both separators and nesting characters switch
# back to the standard iterator.
def scan_iteration_colon
  skip = abbrev_len
  if skip > 0           # defer iterations until after any abbreviation
    cur # reference to record starting position
    @i += skip - 1
  elsif "/".include?(c) # slash separator in colon nesting only
    add_child
  else
    if is_sep?          # regular separator indicates end of colon nesting
      add_child
      close_parent
      # revert to standard parsing from here on
      @iterator = :standard
    elsif "([]):".include?(c) # continue with deeper nesting level
      # revert to standard parsing from here on
      @iterator = :standard
    end
    # normal handling for this character
    scan_iteration_standard
  end
end
scan_iteration_notes() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 124
# Scan step used within the notes section.
#
# Abbreviations are skipped as a whole, a dot finishes the current note, and
# any other character just makes sure a current node exists.
def scan_iteration_notes
  skip = abbrev_len
  if skip > 0 # defer iterations until after any abbreviation
    cur # reference to record starting position
    return @i += skip - 1
  end
  # dot means new note
  return add_child if is_sep?(chars: ".")

  cur
end
scan_iteration_standard() click to toggle source
# File lib/food_ingredient_parser/loose/scanner.rb, line 69
# Regular scan step, handling the character at the current index.
#
# The checks are tried in order: abbreviation, opening bracket, closing
# bracket, start of the notes, separator, colon, mark. Any other character
# just makes sure a current node exists.
def scan_iteration_standard
  skip = abbrev_len
  if skip > 0                 # defer iterations until after any abbreviation
    cur # reference to record starting position
    @i += skip - 1
  elsif "([".include?(c)      # open nesting
    open_parent
  elsif ")]".include?(c)      # close nesting
    add_child
    close_parent
  elsif is_notes_start?       # usually a dot marks the start of notes
    close_all_ancestors
    @iterator = :notes
    @dest = :notes
  elsif is_sep?               # separator
    add_child
  elsif ":".include?(c)       # another open nesting
    # ignore colon before an open bracket, then it's a regular nesting
    return name_until_here if @s[@i+1..-1] =~ /\A\s*(\(|\[)/

    open_parent(auto_close: true)
    @iterator = :colon
  elsif is_mark? && !cur.mark # mark after ingredient
    name_until_here
    span = mark_len
    mark_end = @i + span - 1
    cur.mark = Node.new(@s, @i..mark_end)
    # continue after the mark's last character
    @i = mark_end
  else
    cur # reference to record starting position
  end
end