module FoodIngredientParser::Strict::Grammar
  # Low-level lexical rules (whitespace, characters, numbers, words, ...).
  grammar Common

    # A single whitespace character that is not a newline.
    rule ws
      !newline [[:space:]]
    end

    # A line feed.
    rule newline
      "\n"
    end

    # A single character (or fraction) that may be part of a word.
    # Sequence binds tighter than choice, so +!mark+ guards the first alternative only.
    rule char
      !mark [[:alnum:]] /
      fraction /
      [-/\`'"´‘’+=_{}&] /
      [®©™♣] /
      [¿?¯] / # weird characters turning up in names (e.g. encoding issues)
      [₁₂₃₄₅₆₇₈₉] # can occur with vitamins
    end

    rule mark
      # mark referencing a footnote
      [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾'? /
      '⁽' [¹²³⁴⁵ᵃᵇᶜᵈᵉᶠᵍªº] '⁾' /
      [˄^] digit /
      [†‡⁺•°▪◊#˄^~˛] /
      '*'+ /
      '(' ws* ( [†‡⁺•°▪◊#˄^~˛] / '*'+ ) ws* ')'
    end

    # A single ASCII digit.
    rule digit
      [0-9]
    end

    # A single-glyph vulgar fraction, or digits separated by a slash (e.g. "1/2").
    rule fraction
      [½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒] /
      digit+ '/' digit+
    end

    # A percent sign in one of several Unicode variants.
    # NOTE(review): '%' appears twice in this class; one occurrence may originally
    # have been a different code point (e.g. a fullwidth percent sign) - confirm.
    rule percent
      [%٪⁒%﹪]
    end

    # A decimal number with ',' or '.' separator, an integer followed by a fraction,
    # a lone fraction, or a plain integer (tried in that order).
    rule number
      digit+ [,.] digit+ /
      digit+ ws* fraction /
      fraction /
      digit+
    end

    # A dash in one of several Unicode variants.
    # NOTE(review): this class appears to contain whitespace-looking characters and a
    # duplicated '-'; confirm the intended code points against the upstream file.
    rule dash
      [-֊ ‐ ‑ ‒ – — ― ﹘﹣-]
    end

    # A word: a known abbreviation, a complex word, or else a plain run of chars.
    rule word
      abbrev /
      word_complex /
      char+
    end

    # A conjunction: one of the words 'and', 'en', 'und' not followed by a char, or '&'.
    rule and
      ( 'and' / 'en' / 'und' ) !char /
      '&'
    end

    # An E-number: "E", an optional dash or whitespace, three digits and an optional
    # letter, followed by a parenthesised or bracketed i/v suffix or a non-alphanumeric.
    rule e_number
      ( 'E'i ( '-' / ws+ )? [0-9] [0-9] [0-9] [[:alpha:]]? )
      ( ( ws* '(' [iIvV]+ ')' ) / ( ws* '[' [iIvV]+ ']' ) / ![[:alnum:]] ) # e.g. "E450 (iii)"
    end

    # Dash-joined alternation of numeric parts and alphabetic words, either starting
    # with a number (e.g. "1,2-propanediol") or with a word (e.g. "propane-1,2-diol").
    rule chem_systematic_name
      ( chem_systematic_name_num dash )
      ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )*
      chem_systematic_name_word /
      ( chem_systematic_name_word dash chem_systematic_name_num dash ws? )+
      chem_systematic_name_word
    end

    # ASCII letters, optionally extended by pairs of dash-separated letter groups.
    rule chem_systematic_name_word
      [A-Za-z]+ ( dash [A-Za-z]+ dash [A-Za-z]+ )*
    end

    # Digits followed by 'R' or 'H', or comma-separated digit groups with an
    # optional trailing apostrophe.
    rule chem_systematic_name_num
      digit+ [RH] /
      digit+ ( ',' digit+ )* '\''?
    end

    rule abbrev
      # These are listed explicitly to avoid incorrect interpretations, and allow missing trailing dots.
      # To get an idea of what occurs (second one omits trailing dots):
      #   cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z]\.?)\W/ && print "$1\n"' | sort | uniq -c | sort -rn
      #   cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z])\W/ && print "$1\n"' | sort | uniq -c | sort -rn
      # Finally, you can generate the full list using this command:
      #   cat data/ingredient-samples-nl | perl -ne '$_=lc($_); /\b(([a-z]\.)+[a-z])\W/ && print "$1\n"' | sort | uniq | sed "s/^/'/;s/$/'i \//"
      #
      # Keep this list in sync with {FoodIngredientParser::Loose::Scanner#ABBREVS}.
      # too bad we can't use a shared array for this - https://groups.google.com/d/msg/treetop-dev/f3NveVHi7Aw/0uUogmLMb8wJ
      (
        'a.o.p'i /
        'b.g.a'i /
        'b.o.b'i /
        'c.a'i /
        'c.i'i /
        'd.e'i /
        'd.m.v'i /
        'd.o.c'i /
        'd.o.p'i /
        'd.s'i /
        'e.a'i /
        'e.g'i /
        'e.u'i /
        'f.i.l'i /
        'f.o.s'i /
        'i.a'i /
        'i.d'i /
        'i.e'i /
        'i.g.m.e'i /
        'i.g.p'i /
        'i.m.v'i /
        'i.o'i /
        'i.v.m'i /
        'l.s.l'i /
        'n.a'i /
        'n.b'i /
        'n.o'i /
        'n.v.t'i /
        'o.a'i /
        'o.b.v'i /
        'p.d.o'i /
        'p.g.i'i /
        'q.s'i /
        's.l'i /
        's.s'i /
        't.o.v'i /
        'u.h.t'i /
        'v.g'i /
        'v.s'i /
        'w.a'i /
        'w.o'i /
        'w.v'i /
        # not auto-generated additions
        'nr.'i /
        'vit'i /   # vitamin
        'denat'i / # denatured
        'alc'i /   # alcohol
        'vol'i /   # volume
        'conc'i /  # concentration
        'subsp'i / # subspecies
        'www.'i [-_\/:%.A-Za-z0-9]+
      )
      '.'?
      ![[:alpha:]]
    end

    rule word_complex
      # Complex words that contain characters that would otherwise be considered non-words.
      (
        'N°'i /
        '°C'i /
        ( 'ijzer'i / 'chroom'i / 'koper'i ) ws* '(' 'I'i+ ')' ws* [[:alnum:]]+ /
        'L(+)' ('-' / ws) [[:alnum:]]+ /
        'L.' ws+ 'rhamnosus'i /
        'L.' ws+ 'acidophilus'i /
        'L.' ws+ 'casei'i / # case-insensitive like the other species names, so "L. CASEI" matches
        'B.'i ws+ 'lactis'i /
        'A.'i ws+ 'oryzae'i /
        'S.' ws+ 'thermophilus'i /
        'L.' ws+ 'bulgaricus'i /
        'T.' ws* 'aestivum'i (ws+ 'vitt.'i)? /
        'nucifera'i ws+ 'L.'i / # both parts case-insensitive, so "NUCIFERA L." matches
        'type'i ws+ '"' [0-9]+ '"' /
        e_number /
        chem_systematic_name
      )
      ![[:alpha:]]
    end

  end
end