module FoodIngredientParser::Strict::Grammar

grammar Common

  rule ws
    # A single whitespace character other than the line feed matched by `newline`.
    !newline [[:space:]]
  end

  rule newline
    # A single line-feed character.
    "\n"
  end

  rule char
    # A single element that may be part of a word. Alternatives are tried in order.
    !mark [[:alnum:]] /              # letter or digit, unless a footnote `mark` starts here
    fraction /                       # a fraction as defined by `fraction`
    [-/\`'"´‘’+=_{}&] /              # punctuation and symbols accepted inside words
    [®©™♣] /                         # registered, copyright, trademark and club symbols
    [¿?¯] /                        # weird characters turning up in names (e.g. encoding issues)
    [₁₂₃₄₅₆₇₈₉]                    # subscript digits, can occur with vitamins
  end

  rule mark
    # Mark referencing a footnote. Alternatives, tried in order:
    #   1. a superscript digit/letter or ordinal indicator, optionally followed by '⁾'
    #   2. the same kind of character wrapped in superscript parentheses '⁽' … '⁾'
    #   3. a caret-like character followed by a digit
    #   4. a single symbol from the listed set (dagger, bullet, degree, hash, ...)
    #   5. one or more asterisks
    #   6. a symbol from that set, or asterisks, inside regular parentheses
    #      (whitespace is allowed just inside the parentheses)
    [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? /
    '⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' /
    [˄^] digit /
    [†‡⁺•°▪◊#˄^~˛] /
    '*'+ /
    '(' ws* ( [†‡⁺•°▪◊#˄^~˛] / '*'+ ) ws* ')'
  end

  rule digit
    # A single ASCII decimal digit.
    [0-9]
  end

  rule fraction
    # A fraction: either one precomposed fraction character,
    # or two runs of digits separated by a slash (e.g. "1/2").
    [½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒] /
    digit+ '/' digit+
  end

  rule percent
    # A single percent sign: ASCII '%' or one of its Unicode variants
    # (Arabic percent sign, commercial minus sign, fullwidth and small percent signs).
    # Each character is listed once; a repeated member adds nothing to a character class
    # and makes Ruby warn about a duplicated range.
    [%٪⁒％﹪]
  end

  rule number
    # A number. Alternatives, tried in order:
    #   decimal with ',' or '.' as separator (e.g. "1,5"), integer followed by a
    #   fraction with optional whitespace in between (e.g. "1 1/2"), a bare
    #   fraction, or a plain run of digits.
    digit+ [,.] digit+ / digit+ ws* fraction / fraction / digit+
  end

  rule dash
    # A single dash-like character: ASCII hyphen-minus plus Unicode hyphens and dashes,
    # including the small and fullwidth hyphen-minus variants.
    # The class must not contain spaces: every character between the brackets is matched
    # literally, so a space here would make `dash` accept a plain blank (and e.g. let
    # `chem_systematic_name` swallow "word 12 word" as a single name).
    [-֊‐‑‒–—―﹘﹣－]
  end

  rule word
    # A word: a known abbreviation, else a complex word, else one or more `char`.
    abbrev / word_complex / char+
  end

  rule and
    # Conjunction: the literal words 'and', 'en' or 'und' (case-sensitive, and not
    # directly followed by another `char`), or an ampersand.
    ( 'and' / 'en' / 'und' ) !char / '&'
  end

  rule e_number
    # E-number of a food additive, e.g. "E450", "e 322", "E-160a" or "E 1422".
    # First part: 'E' (any case), an optional dash or whitespace, three or four digits
    # (four-digit numbers such as E1422 exist), and an optional letter suffix.
    # Second part: an optional roman-numeral qualifier in parentheses or brackets;
    # without one, the number must not run on into further letters or digits.
    ( 'E'i ( '-' / ws+ )? [0-9] [0-9] [0-9] [0-9]? [[:alpha:]]? )
    ( ( ws* '(' [iIvV]+ ')' ) / ( ws* '[' [iIvV]+ ']' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
  end

  rule chem_systematic_name
    # Chemical systematic name built from letter parts and numeric parts joined by dashes.
    # First alternative: starts with a numeric part, e.g. "1-methylcyclopropene".
    # Second alternative: starts with a letter part followed by at least one
    # dash-number-dash group, e.g. "L-5-methyltetrahydrofolate".
    # Optional whitespace is accepted after the dash that closes each group.
    ( chem_systematic_name_num dash ) ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )* chem_systematic_name_word /
    ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )+ chem_systematic_name_word
  end

  rule chem_systematic_name_word
    # Letter part of a chemical name: a run of ASCII letters, optionally extended by
    # pairs of further dash-separated letter runs (so one, three, five, ... runs in total).
    [A-Za-z]+ ( dash [A-Za-z]+ dash [A-Za-z]+ )*
  end

  rule chem_systematic_name_num
    # Numeric part of a chemical name: digits followed by 'R' or 'H' (e.g. "2H"),
    # or comma-separated digit runs with an optional trailing apostrophe (e.g. "1,3" or "2'").
    digit+ [RH] /
    digit+ ( ',' digit+ )* '\''?
  end

  rule abbrev
    # Known abbreviation (matched case-insensitively), with an optional trailing dot,
    # that is not directly followed by a letter.
    #
    # These are listed explicitly to avoid incorrect interpretations, and to allow missing trailing dots.
    # To get an idea of what occurs (second one omits trailing dots):
    #   cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z]\.?)\W/ && print "$1\n"' | sort | uniq -c | sort -rn
    #   cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z])\W/ && print "$1\n"' | sort | uniq -c | sort -rn
    # Finally, you can generate the full list using this command:
    #   cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z])\W/ && print "$1\n"' | sort | uniq | sed "s/^/'/;s/$/'i \//"
    #
    # Keep this list in sync with {FoodIngredientParser::Loose::Scanner#ABBREVS}.
    # Too bad we can't use a shared array for this - https://groups.google.com/d/msg/treetop-dev/f3NveVHi7Aw/0uUogmLMb8wJ
    (
      'a.o.p'i /
      'b.g.a'i /
      'b.o.b'i /
      'c.a'i /
      'c.i'i /
      'd.e'i /
      'd.m.v'i /
      'd.o.c'i /
      'd.o.p'i /
      'd.s'i /
      'e.a'i /
      'e.g'i /
      'e.u'i /
      'f.i.l'i /
      'f.o.s'i /
      'i.a'i /
      'i.d'i /
      'i.e'i /
      'i.g.m.e'i /
      'i.g.p'i /
      'i.m.v'i /
      'i.o'i /
      'i.v.m'i /
      'l.s.l'i /
      'n.a'i /
      'n.b'i /
      'n.o'i /
      'n.v.t'i /
      'o.a'i /
      'o.b.v'i /
      'p.d.o'i /
      'p.g.i'i /
      'q.s'i /
      's.l'i /
      's.s'i /
      't.o.v'i /
      'u.h.t'i /
      'v.g'i /
      'v.s'i /
      'w.a'i /
      'w.o'i /
      'w.v'i /
      # not auto-generated additions
      'nr.'i /
      'vit'i /   # vitamin
      'denat'i / # denatured
      'alc'i /   # alcohol
      'vol'i /   # volume
      'conc'i /  # concentration
      'subsp'i / # subspecies
      'www.'i [-_\/:%.A-Za-z0-9]+ # web address starting with "www."
    )
    '.'? ![[:alpha:]] # optional trailing dot; must not run on into a letter
  end

  rule word_complex
    # Complex words that contain characters that would otherwise be considered non-words
    # (whitespace, dots, parentheses, quotes, degree signs). Alternatives are tried in
    # order, and the match must not be directly followed by a letter.
    (
      'N°'i /
      '°C'i /
      # metal name, roman numeral I's in parentheses and a following word, e.g. "ijzer (II) fumaraat"
      ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+ /
      # "L(+)" prefix joined to a word by a dash or a single whitespace
      'L(+)' ('-' / ws) [[:alnum:]]+ /
      # Abbreviated genus followed by a species name. Every species name is matched
      # case-insensitively, so upper-case input such as "L. CASEI" is handled like the rest.
      'L.' ws+ 'rhamnosus'i / 'L.' ws+ 'acidophilus'i / 'L.' ws+ 'casei'i / 'B.'i ws+ 'lactis'i / 'A.'i ws+ 'oryzae'i /
      'S.' ws+ 'thermophilus'i / 'L.' ws+ 'bulgaricus'i /
      'T.' ws* 'aestivum'i (ws+ 'vitt.'i)? /
      # species name followed by the abbreviation "L."
      'nucifera'i ws+ 'L.'i /
      # quoted numeric type, e.g. 'type "00"'
      'type'i ws+ '"' [0-9]+ '"' /
      e_number /
      chem_systematic_name
    ) ![[:alpha:]]
  end
end

end