class List::Matcher
Constants
- QRX
Used to build a replacement for Regexp.quote that ignores characters that only need quoting inside character classes.
Attributes
atomic[R]
backtracking[R]
bound[R]
case_insensitive[R]
encoding[R]
left_bound[R]
multiline[R]
name[R]
normalize_whitespace[R]
not_extended[R]
right_bound[R]
strip[R]
vet[R]
word_test[R]
Public Class Methods
new( atomic: true, backtracking: true, bound: false, strip: false, case_insensitive: false, multiline: false, not_extended: false, normalize_whitespace: false, symbols: {}, name: false, vet: false, encoding: Encoding::UTF_8 )
click to toggle source
# File lib/list_matcher.rb, line 26
#
# Builds a matcher, storing the given options and validating +name+, +bound+
# and the keys of +symbols+.
#
# Options, as used in this method:
# atomic, backtracking, case_insensitive, multiline, not_extended, encoding::
#   stored as given in the corresponding instance variable.
# strip::
#   stored as +strip || normalize_whitespace+, so normalizing whitespace
#   implies stripping.
# normalize_whitespace::
#   stored; when set, the symbol ' ' is mapped to the pattern '\s++'.
# symbols::
#   deep-copied before being stored; every key of the hash passed in must be
#   a String, Symbol or Regexp.
# bound::
#   +true+, +false+, a Symbol (:string, :line, :word, each optionally
#   suffixed with _left or _right) or a Hash with :test and at least one of
#   :left / :right.
# name::
#   when given, must be a String or Symbol usable as a named-group name.
# vet::
#   when set, Special#verify is run against the stored symbols.
#
# Raises Error for a bad name, an unfamiliar or malformed +bound+ value, or
# an unsupported symbols key.
def initialize( atomic: true, backtracking: true, bound: false, strip: false, case_insensitive: false, multiline: false, not_extended: false, normalize_whitespace: false, symbols: {}, name: false, vet: false, encoding: Encoding::UTF_8 )
  @atomic = atomic
  @backtracking = backtracking
  @strip = strip || normalize_whitespace
  @case_insensitive = case_insensitive
  @multiline = multiline
  @not_extended = not_extended
  # copy so that the entries added below never touch the caller's hash
  @symbols = deep_dup symbols
  # @_bound keeps the raw option value (bud passes it on); @bound is only a flag
  @_bound = bound
  @bound = !!bound
  @normalize_whitespace = normalize_whitespace
  @vet = vet
  @encoding = encoding
  if name
    raise Error, "name must be a string or symbol" unless name.is_a?(String) || name.is_a?(Symbol)
    begin
      Regexp.new "(?<#{name}>.*)" # stir up any errors that might arise from using this name in a named capture
      @name = name
    rescue
      raise Error, "#{name} does not work as the name of a named group"
    end
  end
  case bound
  when TrueClass
    # plain true behaves like :word
    @word_test = /\w/
    @left_bound = '\b'
    @right_bound = '\b'
  when FalseClass
    # no boundaries requested; nothing to set up
  when Symbol
    case bound
    when :string, :string_left, :string_right
      @word_test = /./
      @left_bound = '\A'
      @right_bound = '\z'
    when :line, :line_left, :line_right
      @word_test = /./
      @left_bound = '^'
      @right_bound = '$'
    when :word, :word_left, :word_right
      @word_test = /\w/
      @left_bound = '\b'
      @right_bound = '\b'
    else
      raise Error, "unfamiliar value for :bound option: #{bound.inspect}"
    end
    # a _left / _right suffix keeps only that side's boundary
    if /_left/ === bound.to_s
      @right_bound = nil
    elsif /_right/ === bound.to_s
      @left_bound = nil
    end
  when Hash
    # custom boundaries: :test is mandatory, plus at least one of :left / :right
    @word_test = bound[:test] || raise( Error, 'no boundary test provided' )
    @left_bound = bound[:left]
    @right_bound = bound[:right]
    raise Error, 'neither bound provided' unless @left_bound || @right_bound
    raise Error, 'test must be Regexp or String' unless @word_test.is_a?(Regexp) || @word_test.is_a?(String)
    @word_test = Regexp.new @word_test unless @word_test.is_a?(Regexp)
    [ @left_bound, @right_bound ].compact.each do |b|
      raise Error, 'bounds must be strings' unless b.is_a?(String)
      begin
        # compile once purely to check that the boundary is a valid pattern
        Regexp.new b
      rescue
        raise Error, "bad boundary pattern: #{b}"
      end
    end
  else
    raise Error, "unfamiliar value for :bound option: #{bound.inspect}"
  end
  symbols.keys.each do |k|
    raise Error, "symbols variable #{k.inspect} is neither a string, a symbol, nor a regex" unless k.is_a?(String) || k.is_a?(Symbol) || k.is_a?(Regexp)
  end
  # register ' ' (and '#' when not_extended) as symbols with fixed patterns
  if normalize_whitespace
    @symbols[' '] = { pattern: '\s++' }
  elsif not_extended
    @symbols[' '] = { pattern: ' ' }
  end
  if not_extended
    @symbols['#'] = { pattern: '#' }
  end
  if vet
    Special.new( self, @symbols, [] ).verify
  end
end
pattern(list, opts={})
click to toggle source
convenience method for one-off regexen where there's no point in keeping around a pattern generator
# File lib/list_matcher.rb, line 14
#
# One-off convenience: builds a throwaway matcher from +opts+ and returns the
# result of calling its #pattern on +list+.
def self.pattern(list, opts={})
  matcher = new(**opts)
  matcher.pattern(list)
end
quote(s)
click to toggle source
# File lib/list_matcher.rb, line 249
#
# Returns a copy of +s+ in which every substring matched by QRX has been
# escaped for use in a regular expression; everything else is left as is.
def self.quote(s)
  s.gsub(QRX) { |special| Regexp.escape(special) }
end
rx(list, opts={})
click to toggle source
like self.pattern, but returns a regex rather than a string
# File lib/list_matcher.rb, line 19
#
# One-off convenience: builds a throwaway matcher from +opts+ and returns the
# result of calling its #rx on +list+.
def self.rx(list, opts={})
  matcher = new(**opts)
  matcher.rx(list)
end
Public Instance Methods
bud(opts={})
click to toggle source
returns a new pattern matcher differing from the original only in the options specified
# File lib/list_matcher.rb, line 124
#
# Returns a new matcher of the same class that differs from this one only in
# the options given in +opts+.
#
# Every constructor option is inherited from this matcher's stored state,
# including +encoding+, so that a budded matcher does not silently fall back
# to the constructor's default encoding. Vetting is only inherited when the
# caller supplies a new :symbols hash.
def bud(opts={})
  inherited = {
    atomic:               @atomic,
    backtracking:         @backtracking,
    bound:                @_bound, # the raw option value, not the boolean flag
    strip:                @strip,
    case_insensitive:     @case_insensitive,
    multiline:            @multiline,
    not_extended:         @not_extended,
    normalize_whitespace: @normalize_whitespace,
    symbols:              @symbols,
    name:                 @name,
    encoding:             @encoding,
    vet:                  @vet && opts[:symbols]
  }
  self.class.new(**inherited.merge(opts))
end
modifiers()
click to toggle source
# File lib/list_matcher.rb, line 172
#
# Returns the inline flag string built from case_insensitive ('i'),
# multiline ('m') and not_extended ('-x'), or nil when none of them is set.
# The answer is cached inside a one-element array so that a nil result is
# remembered as well.
def modifiers
  unless @modifiers
    flags = []
    flags << 'i' if case_insensitive
    flags << 'm' if multiline
    flags << '-x' if not_extended
    @modifiers = [flags.empty? ? nil : flags.join]
  end
  @modifiers[0]
end
pattern( list, opts={} )
click to toggle source
converts list into a string representing a regex pattern suitable for inclusion in a larger regex
# File lib/list_matcher.rb, line 142
#
# Converts +list+ into a String holding a regex pattern.
#
# * Returns '(?!)' when +list+ has no truthy element.
# * When +opts+ is non-empty, delegates to a budded matcher built with them.
# * Items are stringified and blank ones dropped; they are stripped when
#   +strip+ is set and have whitespace runs collapsed to one space when
#   +normalize_whitespace+ is set.
# * Returns nil when nothing is left after that filtering.
def pattern( list, opts={} )
  return '(?!)' unless list.any?
  return bud(opts).pattern(list) unless opts.empty?

  words = list.compact.map(&:to_s).reject { |s| s.length.zero? }
  words = words.map(&:strip).reject { |s| s.length.zero? } if strip
  words = words.map { |s| s.gsub(/\s++/, ' ') } if normalize_whitespace
  return nil if words.empty?

  specializer = Special.new(self, @symbols, words)
  words = specializer.normalize
  root = tree(words, specializer)
  root.root = true
  root.flatten
  body = root.convert

  flags = modifiers
  body = "(?#{flags}:#{body})" if flags
  body = "(?<#{name}>#{body})" if name

  # a flag group or a named group already encloses the whole expression
  enclosed = flags || name
  return body if enclosed && backtracking

  atomic && !root.atomic? ? wrap(body) : body
end
pfx()
click to toggle source
# File lib/list_matcher.rb, line 185
#
# Opening delimiter used when wrapping an expression: '(?:' when
# backtracking is enabled, '(?>' otherwise. The choice is cached.
def pfx
  return @pfx if @pfx

  @pfx =
    if backtracking
      '(?:'
    else
      '(?>'
    end
end
qmark()
click to toggle source
# File lib/list_matcher.rb, line 189
#
# Optionality quantifier: '?' when backtracking is enabled, the possessive
# '?+' otherwise. The choice is cached.
def qmark
  return @qmark if @qmark

  @qmark =
    if backtracking
      '?'
    else
      '?+'
    end
end
quote(s)
click to toggle source
# File lib/list_matcher.rb, line 253
#
# Instance-level shortcut for the class method of the same name: returns
# whatever +self.class.quote+ returns for +s+.
def quote(s)
  self.class.quote(s)
end
rx(list, opts={})
click to toggle source
like pattern but it returns a regex instead of a string
# File lib/list_matcher.rb, line 181
#
# Like #pattern, but returns a compiled Regexp instead of a String.
#
# #pattern returns nil when every item of +list+ is dropped by its filtering
# (for example a list of empty strings). Regexp.new(nil) would raise a
# TypeError, so that case is compiled from '(?!)' — the same never-matching
# expression #pattern yields for an empty list.
def rx(list, opts={})
  Regexp.new(pattern(list, opts) || '(?!)')
end
tree(list, symbols)
click to toggle source
# File lib/list_matcher.rb, line 201 def tree(list, symbols) if list.size == 1 leaves = list[0].chars.map do |c| symbols[c] || Leaf.new( self, c ) end if leaves.length == 1 leaves.first else Sequence.new self, *leaves end elsif list.all?{ |w| w.length == 1 } chars = list.select{ |w| !symbols[w] } if chars.size > 1 list -= chars c = CharClass.new self, chars end a = Alternate.new self, symbols, list unless list.empty? a.children.unshift c if a && c a || c elsif c = best_prefix(list) # found a fixed-width prefix pattern if optional = c[1].include?('') c[1].reject!{ |w| w == '' } end c1 = tree c[0], symbols c2 = tree c[1], symbols c2 = c2.optionalize optional Sequence.new self, c1, c2 elsif c = best_suffix(list) # found a fixed-width suffix pattern if optional = c[0].include?('') c[0].reject!{ |w| w == '' } # TODO make this faster with index end c1 = tree c[0], symbols c1 = c1.optionalize optional c2 = tree c[1], symbols Sequence.new self, c1, c2 else grouped = list.group_by{ |w| w[0] } chars = grouped.select{ |_, w| w.size == 1 && w[0].size == 1 && !symbols[w[0]] }.map{ |v, _| v } if chars.size > 1 list -= chars c = CharClass.new self, chars end a = Alternate.new self, symbols, list a.children.unshift c if c a end end
wrap(s)
click to toggle source
# File lib/list_matcher.rb, line 193
#
# Encloses +s+ in a group: the opener returned by #pfx, then +s+, then ')'.
def wrap(s)
  opener = pfx
  opener + s + ')'
end
wrap_size()
click to toggle source
# File lib/list_matcher.rb, line 197
#
# Number of characters #wrap adds around an expression: the length of the
# opener from #pfx plus one for the closing parenthesis. Cached.
def wrap_size
  return @wrap_size if @wrap_size

  @wrap_size = pfx.length + 1
end
Protected Instance Methods
best_prefix(list)
click to toggle source
# File lib/list_matcher.rb, line 271
#
# Looks for a prefix width at which +list+ splits into a single cross
# product of prefixes and suffixes, as grouped by #cross_products.
#
# Each width from 1 up to the limit is tried (one less than the word length
# when all words are equally long, otherwise the shortest word's length). A
# split is kept only if its #count is lower than the best so far, starting
# from the combined length of all words.
#
# Returns the [prefixes, suffixes] pair of the cheapest split, or nil.
def best_prefix(list)
  lengths = list.map(&:size)
  best_cost = lengths.reduce(0, :+)
  distinct = lengths.uniq
  limit = distinct.count == 1 ? list[0].size - 1 : distinct.min
  winner = nil

  (1..limit).each do |width|
    by_head = list.each_with_object({}) do |word, acc|
      head = word[0...width]
      tail = word[width..-1]
      (acc[head] ||= []) << tail
    end
    products = cross_products(by_head)
    next unless products.size == 1

    cost = count(products)
    next unless cost < best_cost

    best_cost = cost
    winner = products[0]
  end

  winner
end
best_suffix(list)
click to toggle source
# File lib/list_matcher.rb, line 296
#
# Looks for a suffix width at which +list+ splits into a single cross
# product of prefixes and suffixes, as grouped by #cross_products.
#
# Each width from 1 up to the limit is tried (one less than the word length
# when all words are equally long, otherwise the shortest word's length). A
# split is kept only if its #count is lower than the best so far, starting
# from the combined length of all words.
#
# Returns the [prefixes, suffixes] pair of the cheapest split, or nil.
def best_suffix(list)
  lengths = list.map(&:size)
  best_cost = lengths.reduce(0, :+)
  distinct = lengths.uniq
  limit = distinct.count == 1 ? list[0].size - 1 : distinct.min
  winner = nil

  (1..limit).each do |width|
    by_tail = list.each_with_object({}) do |word, acc|
      cut = word.length - width
      head = word[0...cut]
      tail = word[cut..-1]
      (acc[tail] ||= []) << head
    end
    products = cross_products(by_tail)
    next unless products.size == 1

    cost = count(products)
    next unless cost < best_cost

    best_cost = cost
    # cross_products puts the grouping keys (the suffixes) first; flip the
    # pair so the result reads [prefixes, suffixes]
    winner = products[0].reverse
  end

  winner
end
count(c)
click to toggle source
# File lib/list_matcher.rb, line 327
#
# Cost of the first pair in +c+: the summed sizes of the elements of its
# first list plus the summed sizes of the elements of its second list.
def count(c)
  firsts = c[0][0]
  seconds = c[0][1]
  total = 0
  firsts.each { |item| total += item.size }
  seconds.each { |item| total += item.size }
  total
end
cross_products(c)
click to toggle source
discover cross products – e.g., {this, that} X {cat, dog}
# File lib/list_matcher.rb, line 323
#
# Discovers cross products – e.g., {this, that} X {cat, dog}.
#
# +c+ maps a key to a list of values. Keys whose value lists are equal once
# sorted are merged. Returns an array of [sorted keys, sorted values] pairs,
# in order of first appearance.
def cross_products(c)
  buckets = {}
  c.to_a.each do |key, values|
    (buckets[values.sort] ||= []) << key
  end
  buckets.map { |sorted_values, keys| [keys.sort, sorted_values] }
end
deep_dup(o)
click to toggle source
# File lib/list_matcher.rb, line 259
#
# Recursively copies +o+. Hashes (keys and values) and arrays are rebuilt
# from copies of their contents; nil, symbols, true and false are returned
# as they are; anything else is copied with #dup.
def deep_dup(o)
  case o
  when Hash
    o.each_with_object({}) do |(key, value), copy|
      copy[deep_dup(key)] = deep_dup(value)
    end
  when Array
    o.map { |item| deep_dup(item) }
  when NilClass, Symbol, TrueClass, FalseClass
    o
  else
    o.dup
  end
end