class List::Matcher

Constants

QRX

Pattern used to build a replacement for Regexp.quote that leaves alone characters which only need quoting inside character classes

Attributes

atomic[R]
backtracking[R]
bound[R]
case_insensitive[R]
encoding[R]
left_bound[R]
multiline[R]
name[R]
normalize_whitespace[R]
not_extended[R]
right_bound[R]
strip[R]
vet[R]
word_test[R]

Public Class Methods

new( atomic: true, backtracking: true, bound: false, strip: false, case_insensitive: false, multiline: false, not_extended: false, normalize_whitespace: false, symbols: {}, name: false, vet: false, encoding: Encoding::UTF_8 ) click to toggle source
# File lib/list_matcher.rb, line 26
# Configures a matcher from keyword options, validating them eagerly.
#
# atomic, backtracking, case_insensitive, multiline, not_extended, vet,
# encoding:: stored as given
# strip::                stored as +strip || normalize_whitespace+
# normalize_whitespace:: when set, registers a ' ' symbol whose pattern is '\s++'
# not_extended::         when set, registers literal ' ' (unless whitespace is
#                        normalized) and '#' symbols
# symbols::              deep-copied; every key must be a String, Symbol or Regexp
# name::                 String or Symbol that must work as a named-group name
# bound::                false, true, one of :string/:line/:word (optionally
#                        suffixed _left or _right), or a Hash with :test and at
#                        least one of :left / :right
#
# Raises Error when any option fails validation.
def initialize(
      atomic:               true,
      backtracking:         true,
      bound:                false,
      strip:                false,
      case_insensitive:     false,
      multiline:            false,
      not_extended:         false,
      normalize_whitespace: false,
      symbols:              {},
      name:                 false,
      vet:                  false,
      encoding:             Encoding::UTF_8
    )
  @atomic               = atomic
  @backtracking         = backtracking
  @strip                = strip || normalize_whitespace
  @case_insensitive     = case_insensitive
  @multiline            = multiline
  @not_extended         = not_extended
  @symbols              = deep_dup symbols
  @_bound               = bound      # raw option, kept so bud can pass it on
  @bound                = !!bound
  @normalize_whitespace = normalize_whitespace
  @vet                  = vet
  @encoding             = encoding

  if name
    unless name.is_a?(String) || name.is_a?(Symbol)
      raise Error, "name must be a string or symbol"
    end
    begin
      # compiling a throwaway regex surfaces any problem with the group name
      Regexp.new "(?<#{name}>.*)"
    rescue
      raise Error, "#{name} does not work as the name of a named group"
    end
    @name = name
  end

  case bound
  when FalseClass
    # no boundaries requested
  when TrueClass
    @word_test, @left_bound, @right_bound = /\w/, '\b', '\b'
  when Symbol
    @word_test, @left_bound, @right_bound =
      case bound
      when :string, :string_left, :string_right then [ /./,  '\A', '\z' ]
      when :line,   :line_left,   :line_right   then [ /./,  '^',  '$'  ]
      when :word,   :word_left,   :word_right   then [ /\w/, '\b', '\b' ]
      else
        raise Error, "unfamiliar value for :bound option: #{bound.inspect}"
      end
    # a one-sided variant drops the boundary on the opposite side
    label = bound.to_s
    if label.include?('_left')
      @right_bound = nil
    elsif label.include?('_right')
      @left_bound = nil
    end
  when Hash
    probe = bound[:test]
    raise Error, 'no boundary test provided' unless probe
    @word_test   = probe
    @left_bound  = bound[:left]
    @right_bound = bound[:right]
    raise Error, 'neither bound provided' if !@left_bound && !@right_bound
    case @word_test
    when Regexp
      # already usable as is
    when String
      @word_test = Regexp.new @word_test
    else
      raise Error, 'test must be Regexp or String'
    end
    [ @left_bound, @right_bound ].compact.each do |edge|
      raise Error, 'bounds must be strings' unless edge.is_a?(String)
      begin
        Regexp.new edge
      rescue
        raise Error, "bad boundary pattern: #{edge}"
      end
    end
  else
    raise Error, "unfamiliar value for :bound option: #{bound.inspect}"
  end

  symbols.keys.each do |key|
    next if key.is_a?(String) || key.is_a?(Symbol) || key.is_a?(Regexp)
    raise Error, "symbols variable #{key.inspect} is neither a string, a symbol, nor a regex"
  end

  space_pattern = if normalize_whitespace
    '\s++'
  elsif not_extended
    ' '
  end
  @symbols[' '] = { pattern: space_pattern } if space_pattern
  @symbols['#'] = { pattern: '#' } if not_extended

  Special.new( self, @symbols, [] ).verify if vet
end
pattern(list, opts={}) click to toggle source

convenience method for one-off regexen where there's no point in keeping around a pattern generator

# File lib/list_matcher.rb, line 14
# One-shot helper: builds a throwaway matcher from +opts+ and returns the
# pattern string it produces for +list+.
#
# list:: the items to match
# opts:: Hash of constructor keyword options
def self.pattern(list, opts={})
  matcher = new(**opts)
  matcher.pattern(list)
end
quote(s) click to toggle source
# File lib/list_matcher.rb, line 249
# Escapes only the parts of +s+ that match QRX, passing each match through
# Regexp.escape and leaving the rest of the string untouched.
def self.quote(s)
  s.gsub(QRX) { |special| Regexp.escape special }
end
rx(list, opts={}) click to toggle source

like self.pattern, but returns a regex rather than a string

# File lib/list_matcher.rb, line 19
# One-shot helper: builds a throwaway matcher from +opts+ and returns
# whatever its +rx+ method produces for +list+.
#
# list:: the items to match
# opts:: Hash of constructor keyword options
def self.rx(list, opts={})
  matcher = new(**opts)
  matcher.rx(list)
end

Public Instance Methods

bud(opts={}) click to toggle source

returns a new pattern matcher differing from the original only in the options specified

# File lib/list_matcher.rb, line 124
# Returns a new matcher of the same class whose options are copied from this
# one, except for those overridden in +opts+.
#
# opts:: Hash of constructor keyword options to override
#
# The encoding is carried over along with every other option; without it the
# new matcher would fall back to the constructor's default encoding.
# Vetting is inherited only when this matcher has vet set and +opts+ supplies
# :symbols; an explicit :vet in +opts+ still takes precedence.
def bud(opts={})
  inherited = {
    atomic:               @atomic,
    backtracking:         @backtracking,
    bound:                @_bound,      # the raw option, not the boolean @bound
    strip:                @strip,
    case_insensitive:     @case_insensitive,
    multiline:            @multiline,
    not_extended:         @not_extended,
    normalize_whitespace: @normalize_whitespace,
    symbols:              @symbols,
    name:                 @name,
    encoding:             @encoding,
    vet:                  @vet && opts[:symbols]
  }
  self.class.new(**inherited.merge(opts))
end
modifiers() click to toggle source
# File lib/list_matcher.rb, line 172
# Returns the inline regex flag string implied by the matcher's options
# ('i', 'm' and/or '-x', concatenated in that order), or nil when none apply.
#
# The answer is memoized inside a one-element array so that a nil result is
# cached as well.
def modifiers
  unless @modifiers
    flags = []
    flags << 'i'  if case_insensitive
    flags << 'm'  if multiline
    flags << '-x' if not_extended
    @modifiers = [ flags.empty? ? nil : flags.join ]
  end
  @modifiers.first
end
pattern( list, opts={} ) click to toggle source

converts list into a string representing a regex pattern suitable for inclusion in a larger regex

# File lib/list_matcher.rb, line 142
# Converts +list+ into a regex pattern string.
#
# list:: the items to match; nils are dropped and the rest go through to_s
# opts:: option overrides; when non-empty the work is delegated to a matcher
#        produced by bud(opts)
#
# Returns '(?!)' when +list+ has no truthy element, and nil when nothing is
# left after blank entries are removed. Otherwise returns the pattern,
# wrapped in a modifier group and/or a named group when those options are set.
def pattern( list, opts={} )
  return '(?!)' unless list.any?
  return bud(opts).pattern list unless opts.empty?

  words = list.compact.map(&:to_s).reject(&:empty?)
  words = words.map(&:strip).reject(&:empty?) if strip
  words = words.map { |w| w.gsub( /\s++/, ' ' ) } if normalize_whitespace
  return nil if words.empty?

  specializer = Special.new self, @symbols, words
  words = specializer.normalize

  root = tree words, specializer
  root.root = true
  root.flatten
  rx = root.convert

  grouped = false
  mods = modifiers
  if mods
    rx = "(?#{mods}:#{rx})"
    grouped = true
  end
  if name
    rx = "(?<#{name}>#{rx})"
    grouped = true
  end

  # an existing group is enough when backtracking is allowed
  if grouped && backtracking
    rx
  elsif atomic && !root.atomic?
    wrap rx
  else
    rx
  end
end
pfx() click to toggle source
# File lib/list_matcher.rb, line 185
# Opening delimiter for a group: '(?:' when backtracking is enabled,
# '(?>' otherwise. The result is memoized.
def pfx
  return @pfx if @pfx
  @pfx = if backtracking
    '(?:'
  else
    '(?>'
  end
end
qmark() click to toggle source
# File lib/list_matcher.rb, line 189
# Optional-quantifier suffix: '?' when backtracking is enabled, '?+'
# otherwise. The result is memoized.
def qmark
  return @qmark if @qmark
  @qmark = if backtracking
    '?'
  else
    '?+'
  end
end
quote(s) click to toggle source
# File lib/list_matcher.rb, line 253
# Instance-level shortcut that forwards +s+ to the class-level quote method
# of this object's class.
def quote(s)
  matcher_class = self.class
  matcher_class.quote(s)
end
rx(list, opts={}) click to toggle source

like pattern but it returns a regex instead of a string

# File lib/list_matcher.rb, line 181
# Like pattern, but returns a compiled Regexp instead of a string.
#
# list:: the items to match
# opts:: option overrides, passed through to pattern
#
# pattern returns nil when every entry of a non-empty list is blank, and
# Regexp.new(nil) raises TypeError. In that case fall back to '(?!)', the
# never-matching pattern that pattern itself returns for an empty list.
def rx(list, opts={})
  source = pattern(list, opts)
  Regexp.new( source || '(?!)' )
end
tree(list, symbols) click to toggle source
# File lib/list_matcher.rb, line 201
# Recursively turns a list of strings into a tree of pattern nodes
# (Leaf, Sequence, CharClass, Alternate).
#
# list::    array of strings to match
# symbols:: object indexed with [] by a string; a truthy result is used in
#           place of a plain Leaf / marks the string as special. It is also
#           handed to Alternate.new. (pattern passes a Special instance here.)
#
# Returns the node built for +list+.
def tree(list, symbols)
  if list.size == 1
    # a single word: one node per character, using the symbol's node where one exists
    leaves = list[0].chars.map do |c|
      symbols[c] || Leaf.new( self, c )
    end
    if leaves.length == 1
      leaves.first
    else
      Sequence.new self, *leaves
    end
  elsif list.all?{ |w| w.length == 1 }
    # only single-character words: the ones without a symbol entry are
    # collected into a character class when there are at least two of them
    chars = list.select{ |w| !symbols[w] }
    if chars.size > 1
      list -= chars
      c = CharClass.new self, chars
    end
    # whatever is left goes into an alternation, with the character class
    # (if any) placed first among its children
    a = Alternate.new self, symbols, list unless list.empty?
    a.children.unshift c if a && c
    a || c
  elsif c = best_prefix(list)   # found a fixed-width prefix pattern
    # c is [prefixes, suffixes]; an empty suffix means the suffix part is optional
    if optional = c[1].include?('')
      c[1].reject!{ |w| w == '' }
    end
    c1 = tree c[0], symbols
    c2 = tree c[1], symbols
    c2 = c2.optionalize optional
    Sequence.new self, c1, c2
  elsif c = best_suffix(list)   # found a fixed-width suffix pattern
    # c is [prefixes, suffixes]; an empty prefix means the prefix part is optional
    if optional = c[0].include?('')
      c[0].reject!{ |w| w == '' }   # TODO make this faster with index
    end
    c1 = tree c[0], symbols
    c1 = c1.optionalize optional
    c2 = tree c[1], symbols
    Sequence.new self, c1, c2
  else
    # no common factoring found: group the words by first character
    grouped = list.group_by{ |w| w[0] }
    # first characters whose group is exactly one single-character word with
    # no symbol entry can share a character class
    chars = grouped.select{ |_, w| w.size == 1 && w[0].size == 1 && !symbols[w[0]] }.map{ |v, _| v }
    if chars.size > 1
      list -= chars
      c = CharClass.new self, chars
    end
    a = Alternate.new self, symbols, list
    a.children.unshift c if c
    a
  end
end
wrap(s) click to toggle source
# File lib/list_matcher.rb, line 193
# Encloses +s+ in a group: the opening delimiter from pfx, then +s+, then ')'.
def wrap(s)
  opener = pfx
  opener + s + ')'
end
wrap_size() click to toggle source
# File lib/list_matcher.rb, line 197
# Number of characters wrap adds around its argument: the length of pfx plus
# one for the closing ')'. The result is memoized.
def wrap_size
  return @wrap_size if @wrap_size
  @wrap_size = pfx.length + 1
end

Protected Instance Methods

best_prefix(list) click to toggle source
# File lib/list_matcher.rb, line 271
# Looks for the fixed-width prefix split that lets +list+ be written as a
# single cross product {prefixes} x {suffixes} with the fewest total
# characters.
#
# list:: array of strings
#
# Returns [prefixes, suffixes] for the cheapest split found, or nil when no
# split is strictly cheaper than the combined length of the words themselves.
# Returns nil for an empty list. Without that guard the smallest length
# would be nil, and on Rubies with endless ranges (1..nil) never terminates.
def best_prefix(list)
  return nil if list.empty?

  lengths  = list.map(&:size)
  cheapest = lengths.reduce 0, :+
  distinct = lengths.uniq
  # equal-length words must keep at least one character in the suffix
  limit    = distinct.size == 1 ? list[0].size - 1 : distinct.min
  best     = nil
  (1..limit).each do |width|
    buckets = {}
    list.each do |word|
      ( buckets[word[0...width]] ||= [] ) << word[width..-1]
    end
    products = cross_products buckets
    next unless products.size == 1   # only a single clean cross product is usable
    cost = count products
    next unless cost < cheapest
    cheapest = cost
    best = products[0]
  end
  best
end
best_suffix(list) click to toggle source
# File lib/list_matcher.rb, line 296
# Looks for the fixed-width suffix split that lets +list+ be written as a
# single cross product {prefixes} x {suffixes} with the fewest total
# characters.
#
# list:: array of strings
#
# Returns [prefixes, suffixes] for the cheapest split found, or nil when no
# split is strictly cheaper than the combined length of the words themselves.
# Returns nil for an empty list. Without that guard the smallest length
# would be nil, and on Rubies with endless ranges (1..nil) never terminates.
def best_suffix(list)
  return nil if list.empty?

  lengths  = list.map(&:size)
  cheapest = lengths.reduce 0, :+
  distinct = lengths.uniq
  # equal-length words must keep at least one character in the prefix
  limit    = distinct.size == 1 ? list[0].size - 1 : distinct.min
  best     = nil
  (1..limit).each do |width|
    buckets = {}
    list.each do |word|
      cut = word.length - width
      ( buckets[word[cut..-1]] ||= [] ) << word[0...cut]
    end
    products = cross_products buckets
    next unless products.size == 1   # only a single clean cross product is usable
    cost = count products
    next unless cost < cheapest
    cheapest = cost
    # buckets are keyed by suffix, so flip the pair into [prefixes, suffixes]
    best = products[0].reverse
  end
  best
end
count(c) click to toggle source
# File lib/list_matcher.rb, line 327
# Total number of characters in the first cross product of +c+: the summed
# lengths of the strings in both of its halves.
#
# c:: array whose first element is a pair of string arrays
def count(c)
  first_half, second_half = c[0]
  tally = lambda { |words| words.inject(0) { |total, w| total + w.size } }
  tally.call(first_half) + tally.call(second_half)
end
cross_products(c) click to toggle source

discover cross products – e.g., {this, that} X {cat, dog}

# File lib/list_matcher.rb, line 323
# Discovers cross products, e.g. {this, that} X {cat, dog}.
#
# c:: maps a string to the list of strings it was seen paired with
#
# Keys whose lists are equal once sorted are merged. Returns an array of
# [sorted keys, sorted list] pairs, ordered by first appearance.
def cross_products(c)
  merged = Hash.new { |groups, members| groups[members] = [] }
  c.each { |key, partners| merged[partners.sort] << key }
  merged.map { |partners, keys| [ keys.sort, partners ] }
end
deep_dup(o) click to toggle source
# File lib/list_matcher.rb, line 259
# Recursively copies +o+. Hashes (keys and values) and arrays are rebuilt
# element by element; nil, symbols, true and false are returned as is;
# anything else is copied with dup.
def deep_dup(o)
  case o
  when Hash
    o.each_with_object({}) { |(k, v), copy| copy[deep_dup(k)] = deep_dup(v) }
  when Array
    o.map { |v| deep_dup v }
  when NilClass, Symbol, TrueClass, FalseClass
    o
  else
    o.dup
  end
end