class SportDb::MapperV2
note: this was/is a cut-n-paste (inline) copy of TextUtils::TitleMapper2
see https://github.com/textkit/textutils/blob/master/textutils/lib/textutils/title_mapper2.rb
Constants
- MappingStruct
key: e.g. augsburg; name: e.g. FC Augsburg; length (of the name(!!) - not of the regex pattern): e.g. 11 – do not count dots (e.g. U.S.A. => 3 or 6)? why? why not?
- Record
convenience helper - (auto)build ActiveRecord-like team records/structs
Attributes
known_names[R]
Public Class Methods
new( records_or_mapping, tag )
click to toggle source
# File lib/sportdb/formats/match/mapper.rb, line 58
## Set up the mapper.
##
##  records_or_mapping - one of
##    1) a text block (string), or
##    2) an array of lines (strings)  - both get converted via build_records, or
##    3) an array of records, or
##    4) a "custom" mapping hash table (name=>record)
##  tag - tag name used for the placeholders e.g. team, brewery, etc.
def initialize( records_or_mapping, tag )
  ## for convenience allow easy (auto-)convert of text (lines) to records
  text_input = records_or_mapping.is_a?( String ) ||
               (records_or_mapping.is_a?( Array ) && records_or_mapping[0].is_a?( String ))
  records_or_mapping = build_records( records_or_mapping )  if text_input

  ## build mapping lookup table
  @known_names = case records_or_mapping
                 when Hash   ## assume "custom" mapping hash table (name=>record)
                   build_name_table_for_mapping( records_or_mapping )
                 else        ## assume array of records
                   build_name_table_for_records( records_or_mapping )
                 end

  ## build lookup hash by record (e.g. team/club/etc.) key
  ##   note: for a hash use the values assuming all are records (might include duplicates)
  records = records_or_mapping.is_a?( Array ) ? records_or_mapping : records_or_mapping.values
  @records = records.each_with_object( {} ) { |rec, by_key| by_key[rec.key] = rec }

  ## todo: rename tag to attrib or attrib_name - why ?? why not ???
  @tag = tag   # e.g. tag name use for @@brewery@@ @@team@@ etc.
end
Public Instance Methods
build_records( txt_or_lines )
click to toggle source
# File lib/sportdb/formats/match/mapper.rb, line 25
## Convert text into records - one record per line.
##
##  txt_or_lines - a text block (string) or an array of lines (strings);
##                 line format is:   name | alt name | alt name ...
##
##  Empty lines and comment lines (starting with #) get skipped
##  for both input forms.
##
##  Returns an array of records built with Record.new( key, name, alt_names )
##  where alt_names is the |-joined rest of the line or nil if there is none.
def build_records( txt_or_lines )
  ## todo/fix: use ParserHelper read_lines !!! ????
  raw_lines = txt_or_lines.is_a?( String ) ? txt_or_lines.each_line : txt_or_lines

  ## note: skip empty and comment lines - applies to an array of lines too;
  ##   otherwise an empty line ends up with no name (nil) and crashes below
  lines = raw_lines.map    { |line| line.strip }
                   .reject { |line| line.empty? || line.start_with?( '#' ) }

  lines.map do |line|
    values = line.split( '|' ).map { |value| value.strip }

    name = values[0]
    ## note: quick hack - auto-generate key, that is,
    ##   downcase and remove all chars except a to z
    key = name.downcase.gsub( /[^a-z]/, '' )

    alt_names = values.size > 1 ? values[1..-1].join( '|' ) : nil

    Record.new( key, name, alt_names )
  end
end
find_rec!( line )
click to toggle source
# File lib/sportdb/formats/match/mapper.rb, line 94
## Look up the record for the first mapped name marker in line
##   using the mapper's tag; delegates to find_rec_for! (changes line in place).
def find_rec!( line )
  find_rec_for!(
    @tag,
    line,
    @records
  )
end
find_recs!( line )
click to toggle source
# File lib/sportdb/formats/match/mapper.rb, line 98
## Collect records (plural!) by calling find_rec_for! with a numbered tag
##   (tag1, tag2, ...) until a lookup returns nothing.
##
##  Returns an array of records (empty if the first lookup returns nothing).
def find_recs!( line )
  found = []
  loop do
    ## note: the counter starts at 1, that is, number of records found so far plus one
    rec = find_rec_for!( "#{@tag}#{found.size + 1}", line, @records )
    break unless rec
    found << rec
  end
  found
end
map_names!( line )
click to toggle source
# File lib/sportdb/formats/match/mapper.rb, line 88
## Call map_name_for! on line (again and again) for as long as it reports a match.
##   todo: rename to just map! - why?? why not???
def map_names!( line )
  loop do
    break unless map_name_for!( @tag, line, @known_names )
  end
end
Private Instance Methods
build_name_table_for_mapping( mapping )
click to toggle source
# File lib/sportdb/formats/match/mapper.rb, line 113
## Build the known names table from a "custom" mapping (name=>record).
##
##  Returns an array of MappingStruct entries sorted by name length
##  (largest goes first - best match).
def build_name_table_for_mapping( mapping )
  entries = mapping.map do |name, rec|
    entry = MappingStruct.new
    entry.key     = rec.key
    entry.name    = name
    entry.length  = name.length
    ## note: just use "standard" regex escape (e.g. no extras for umlauts,accents,etc.)
    entry.pattern = Regexp.escape( name )
    entry
  end

  ## note: sort here by length (largest goes first - best match)
  entries.sort { |left, right| right.length <=> left.length }
end
build_name_table_for_records( records )
click to toggle source
# File lib/sportdb/formats/match/mapper.rb, line 131
## Build the known names table w/ alt names from records e.g.
#
# [[ 'wolfsbrug', 'VfL Wolfsburg'],
#  [ 'augsburg',  'FC Augsburg'],
#  [ 'augsburg',  'Augi2'],
#  [ 'augsburg',  'Augi3' ],
#  [ 'stuttgart', 'VfB Stuttgart']]
#
##  Returns an array of MappingStruct entries sorted by name length
##  (largest goes first - best match).
def build_name_table_for_records( records )
  table = records.each_with_index.flat_map do |rec, index|
    ## helper: one table entry for this record
    build_entry = ->( name, pattern ) do
      entry = MappingStruct.new
      entry.key     = rec.key
      entry.name    = name
      entry.length  = name.length
      entry.pattern = pattern
      entry
    end

    candidates = [rec.name]
    alt_names  = rec.alt_names
    candidates += alt_names.split( '|' )  unless !alt_names || alt_names.empty?

    ## check if name includes subname e.g. Grand Prix Japan (Suzuka Circuit)
    #    make subname optional by adding name w/o subname e.g. Grand Prix Japan
    names = candidates.flat_map do |candidate|
      if candidate =~ /\(.+\)/
        # remove/delete subnames
        #  note: strip leading n trailing whitespaces too!
        #   -- todo: add squish or something if () is inline e.g. leaves two spaces?
        [candidate, candidate.gsub( /\(.+\)/, '' ).strip]
      else
        [candidate]
      end
    end

    ## note: escape for regex plus allow subs for special chars/accents
    entries = names.map { |name| build_entry.call( name, name_esc_regex( name ) ) }

    logger.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{names.join('|')}<"

    ## note: only include code field - if defined
    ##   use code for now as is (no variants allowed for now)
    if rec.respond_to?(:code) && rec.code && !rec.code.empty?
      entries << build_entry.call( rec.code, rec.code )
    end

    entries
  end

  ## note: sort here by length (largest goes first - best match)
  #   exclude code and key (key should always go last)
  table.sort { |left, right| right.length <=> left.length }
end
find_rec_for!( tag, line, records )
click to toggle source
# File lib/sportdb/formats/match/mapper.rb, line 220
## Find the first @@oo{key}oo@@ marker in line, replace it (in place)
##   with the upcased tag in brackets e.g. [TEAM] and
##   look up the key in the records hash table.
##
##  Returns the record for the key (as stored in records) or
##  nil if the line has no marker.
def find_rec_for!( tag, line, records )
  ## e.g. everything in @@ .... @@
  ##   (use non-greedy +? plus all chars but not @, that is [^@])
  marker = /@@oo([^@]+?)oo@@/

  hit = marker.match( line )
  return nil  unless hit

  key = hit[1]
  logger.debug " #{tag.downcase}: >#{key}<"

  line.sub!( marker, "[#{tag.upcase}]" )
  records[ key ]   ## note: map key to record (using records hash table mapping)
end
map_name_for!( tag, line, mappings )
click to toggle source
# File lib/sportdb/formats/match/mapper.rb, line 199
## Replace (in place) the first known name found in line with
##   a @@oo{key}oo@@ marker; mappings get tried in the order passed in.
##
##  Returns true after the first match (does NOT continue) or
##  false if no mapping matches.
def map_name_for!( tag, line, mappings )
  mappings.any? do |mapping|
    key     = mapping.key
    pattern = mapping.pattern

    ## nb: \b does NOT include space or newline for word boundary (only alphanums e.g. a-z0-9)
    ##   (thus add it, allows match for Benfica Lis. for example - note . at the end)
    ## check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
    ##   wrap with word boundary (e.g. match only whole words e.g. not wac in wacker)
    re = /\b#{pattern}(\b| |\t|$)/
    next false  unless line =~ re

    logger.debug " match for #{tag.downcase} >#{key}< >#{pattern}<"
    # make sure @@oo{key}oo@@ doesn't match itself with other key e.g. wacker, wac, etc.
    line.sub!( re, "@@oo#{key}oo@@ " )   # NB: add one space char at end
    true    # stop after first match
  end
end
name_esc_regex( name_unescaped )
click to toggle source
name helper cut-n-paste copy from TextUtils
see https://github.com/textkit/textutils/blob/master/textutils/lib/textutils/helper/title_helper.rb
# File lib/sportdb/formats/match/mapper.rb, line 239
## Turn a name into a regex pattern (source string):
##   1) escape the regex special chars and
##   2) allow alternative spellings for dashes and accented chars.
##
##  name_unescaped - name as is e.g.
##     Benfica Lis.
##     Club Atlético Colón (Santa Fe)
##     Bauer Anton (????)
##
##  Returns the pattern as a (new) string; name_unescaped is not changed.
def name_esc_regex( name_unescaped )
  ## escape regex special chars e.g.
  #  . to \.   and
  #  ( to \(
  #  ) to \)
  #  ? to \?  -- zero or one
  #  * to \*  -- zero or more
  #  + to \+  -- one or more
  #  $ to \$  -- end of line
  #  ^ to \^  -- start of line
  #  plus [ ] { } | and \ itself - otherwise a name e.g. Team [B]
  #    turns into a char class (or into an invalid regex)
  #
  ## note: cannot use Regexp.escape! will escape space ' ' to '\ '
  ##   (and the dash too - the dash gets handled by the alternatives below)
  ## note: all chars get escaped in a single pass, thus, no double escaping of \
  name = name_unescaped.gsub( /[\\.()?*+$^\[\]{}|]/ ) { |char| "\\#{char}" }

  ## match accented char with or without accents
  ##  add (ü|ue) etc.
  ## also make - optional change to (-| ) e.g. Blau-Weiss == Blau Weiss

  ## todo: add some more
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references  for more
  ##
  ##  reuse for all readers!
  alternatives = [
    ['-', '(-| )'],   ## e.g. Blau-Weiß Linz
    ['æ', '(æ|ae)'],  ## e.g.
    ['ä', '(ä|ae)'],  ## e.g.
    ['Ö', '(Ö|Oe)'],  ## e.g. Österreich
    ['ö', '(ö|oe)'],  ## e.g. Mönchengladbach
    ['ß', '(ß|ss)'],  ## e.g. Blau-Weiß Linz
    ['ü', '(ü|ue)'],  ## e.g.

    ['á', '(á|a)'],   ## e.g. Bogotá, Sársfield
    ['ã', '(ã|a)'],   ## e.g São Paulo
    ['ç', '(ç|c)'],   ## e.g. Fenerbahçe
    ['é', '(é|e)'],   ## e.g. Vélez
    ['ê', '(ê|e)'],   ## e.g. Grêmio
    ['ï', '(ï|i)'],   ## e.g. El Djazaïr
    ['ñ', '(ñ|n)'],   ## e.g. Porteño
    ['ň', '(ň|n)'],   ## e.g. Plzeň
    ['ó', '(ó|o)'],   ## e.g. Colón
    ['ō', '(ō|o)'],   ## e.g. Tōkyō
    ['ș', '(ș|s)'],   ## e.g. Bucarești
    ['ú', '(ú|u)']    ## e.g. Fútbol
  ]

  ### fix/todo: check for dot+space e.g. . and make dot optional
  ##
  # e.g. make dot (.) optional plus allow alternative optional space e.g.
  #  -- for U.S.A. => allow USA or U S A
  #
  ## e.g. U. de G. or U de G or U.de G. ??
  ##  collect some more (real-world) examples first!!!!!

  ## note: applied one after the other (in order) on the escaped name
  alternatives.reduce( name ) do |pattern, (char, variants)|
    pattern.gsub( char, variants )
  end
end