module Boogex

Constants

AND_REGEX
Error
NOT_REGEX
OR_REGEX
VERSION

Public Class Methods

convert(text) click to toggle source
# File lib/boogex/convertor.rb, line 7
# Converts a boolean search string into a hash of regex strings.
#
# The text is split on NOT_REGEX. The part before the split is converted and
# stored under :inclusive_regex; the part after it (when present) is converted
# and stored under :exclusive_regex. :no_links is set to true when the excluded
# part contains the literal 'HTTP'.
#
# Raises RuntimeError when the text splits into more than two parts, and
# propagates anything raised by validate_regex_syntax! (the exclusive regex is
# validated before the inclusive one).
def self.convert(text)
  inclu_text, exclu_text, *extra = text.split(NOT_REGEX)
  fail "The regex '#{text}' split more than twice on 'NOT'" unless extra.empty?

  regex_hash = { inclusive_regex: run_through_convertors(inclu_text) }

  if exclu_text
    exclusive = run_through_convertors(exclu_text)
    regex_hash[:exclusive_regex] = exclusive
    regex_hash[:no_links] = true if exclu_text.include?('HTTP')
    validate_regex_syntax!(exclusive, text)
  end

  validate_regex_syntax!(regex_hash[:inclusive_regex], text)
  regex_hash
end
run_through_convertors(text) click to toggle source
# File lib/boogex/convertor.rb, line 31
# Runs one half of a boolean string through the conversion pipeline:
# bracket structuring, OR-to-pipe conversion, regex formatting and finally
# flattening back into a single regex string.
def self.run_through_convertors(text)
  structured = array_struct(text)
  piped = ors_to_pipes(structured)
  formatted = regex_formatting(piped)
  regex_array_to_string(formatted)
end
validate_regex_syntax!(regex, text) click to toggle source
# File lib/boogex/convertor.rb, line 26
# Rejects a regex that matches a single blank space (and therefore would match
# on nothing or on empty space).
#
# regex - the regex (string) to check.
# text  - the original boolean string; accepted but not used here.
#
# Returns nil when the regex is acceptable. Raises RuntimeError when the regex
# matches ' '.
def self.validate_regex_syntax!(regex, text)
  # Note: matching also compiles the regex, so an invalid pattern raises
  # RegexpError here with a description of what went wrong.
  blank_match = ' '.match(regex)
  fail "#{regex} matched on nothing or empty space. Huh?" if blank_match
end

Private Class Methods

all_strings?(array) click to toggle source
# File lib/boogex/convertor.rb, line 222
# Returns true when every element of the array is a String (an empty array
# also returns true).
def self.all_strings?(array)
  array.all? { |item| item.is_a?(String) }
end
array_struct(text) click to toggle source

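This function converts a string into an array in which bracketed groups in the string become nested arrays, to allow further manipulation. A group without inner brackets becomes a one-element array: "a OR (b) OR c" => ["a OR ", ["b"], " OR c"]. A group with inner brackets is processed recursively and its result is wrapped in a one-element array: "a OR (b AND (c OR d)) OR e" => ["a OR ", [["b AND ", ["c OR d"]]], " OR e"].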

# File lib/boogex/convertor.rb, line 44
# Converts a string into an array in which every top-level bracketed group is
# replaced by a nested array, so later stages can work level by level.
#
#   "a OR (b) OR c"               => ["a OR ", ["b"], " OR c"]
#   "a OR (b AND (c OR d)) OR e"  => ["a OR ", [["b AND ", ["c OR d"]]], " OR e"]
#
# text - the boolean string to structure.
#
# Returns the text itself (a String) when the bracket regex finds no group,
# otherwise an Array of Strings and nested Arrays. Empty string pieces are
# dropped. Which bracket nestings are recognised depends on the pattern
# returned by get_bracket_regex.
def self.array_struct(text)
  # This regex looks for anything in brackets, including brackets nested
  # inside brackets, for each nesting shape it was generated for.
  regex = Regexp.new(get_bracket_regex)

  # Cut the text by match *position* rather than by match content, so a
  # bracket group that occurs more than once (or whose text also appears
  # inside another group) is neither lost nor split in the wrong place.
  pieces = []
  cursor = 0
  text.scan(regex) do
    match = Regexp.last_match
    pieces << text[cursor...match.begin(0)]
    # Strip the enclosing brackets and mark the group by wrapping it in an array.
    pieces << [match[0][1..-2]]
    cursor = match.end(0)
  end

  # If nothing was found then return the original text.
  return text if pieces.empty?

  pieces << text[cursor..-1]

  # String pieces are kept as they are; bracket groups are fed back through
  # array_struct so that brackets inside them are structured as well.
  pieces.reject(&:empty?).map do |piece|
    next piece if piece.is_a?(String)

    piece.map { |inner| array_struct(inner) }
  end
end
bracket_input_to_brackets(input) click to toggle source

This function first converts the sequence of 0's and 1's to open and close brackets. It then puts in a 'b' string between any close brackets that are followed by an open bracket. Finally it compresses any consecutive 'a's into a single 'a' as they are idempotent(ie. aaa == a).

# File lib/boogex/convertor.rb, line 299
# Turns a sequence of 0's and 1's into a bracket regex template.
#
# Each 0 becomes an escaped open bracket followed by 'a' and each 1 becomes
# 'a' followed by an escaped close bracket. A 'b' is then inserted between a
# close bracket that is directly followed by an open bracket, and runs of
# consecutive 'a's are collapsed into a single 'a'.
def self.bracket_input_to_brackets(input)
  tokens = ["\\(a", "a\\)"]
  template = input.map { |bit| tokens[bit] }.join('')
  template = template.gsub('\\)\\(', '\\)b\\(')
  template.gsub(/a+/, 'a')
end
construct_AND_array(array) click to toggle source
# File lib/boogex/convertor.rb, line 208
# Builds the adhoc "AND([...])" string from an array of regex strings.
#
# Each element is wrapped in single quotes, except elements that already
# contain an "AND([" construct, which are embedded unquoted. Elements are
# separated by commas.
def self.construct_AND_array(array)
  opener = 'AND(['
  closer = "])"
  members = array.map do |str|
    str.include?(opener) ? str : "'" + str + "'"
  end
  opener + members.join(',') + closer
end
contain_AND?(obj) click to toggle source
# File lib/boogex/convertor.rb, line 218
# Tells whether obj is a String that matches AND_REGEX.
#
# Returns false for anything that is not a String; for a String it returns the
# result of the match (MatchData when it matches, nil otherwise).
def self.contain_AND?(obj)
  return false unless obj.is_a?(String)

  obj.match(AND_REGEX)
end
generate_brack_regex() click to toggle source

This function generates the bracket regex. For simplicity, the regex for 'inside of a bracket' is represented by the string 'a', and the regex outside of a bracket is represented by the string 'b'. These are then substituted out at the end.

# File lib/boogex/convertor.rb, line 248
# Generates the bracket regex source string.
#
# Every bracket ordering from get_bracket_inputs is turned into a template by
# bracket_input_to_brackets, in which 'a' stands for "inside of a bracket" and
# 'b' for "outside of a bracket". Those placeholders are then substituted with
# the real character classes and all alternatives are joined with '|'.
# Prints a loading notice to stdout when called.
def self.generate_brack_regex
  puts "Loading bracket regex..."
  inside = "[^\(\)]*"
  outside = "[^\(]*"

  alternatives = get_bracket_inputs.map do |input|
    template = bracket_input_to_brackets(input)
    template.gsub('a', inside).gsub('b', outside)
  end
  alternatives.join('|')
end
get_bracket_inputs() click to toggle source

This function loads the valid permutations of the bracket regex, where 0 represents an open bracket and 1 represents a close bracket. All possible permutations of bracket ordering are generated and then only the valid bracket orderings are selected.

# File lib/boogex/convertor.rb, line 260
# Builds the list of valid bracket orderings, where 0 represents an open
# bracket and 1 a close bracket.
#
# Candidates are the binary digits of every integer from 0 to 2000 plus the
# reverse of each (when it differs). Duplicates are removed and only the
# candidates accepted by valid? are returned, in order of first appearance.
def self.get_bracket_inputs
  inputs = []
  (0..2000).each do |v|
    bits = v.to_s(2).chars.map(&:to_i)
    reversed = bits.reverse
    inputs << bits
    inputs << reversed unless bits == reversed
  end

  # De-duplicate once after the loop: uniq keeps the first occurrence of each
  # candidate, so the order is the same as de-duplicating on every iteration,
  # without rescanning the whole list each time.
  inputs.uniq.select { |candidate| valid?(candidate) }
end
get_bracket_regex() click to toggle source
# File lib/boogex/convertor.rb, line 241
# Returns the bracket regex source, generating it with generate_brack_regex on
# first use and reusing the stored value afterwards.
def self.get_bracket_regex
  return @bracket_regex if @bracket_regex

  @bracket_regex = generate_brack_regex
end
has_unclosed_brackets?(obj) click to toggle source
# File lib/boogex/convertor.rb, line 115
# Returns true when any element of obj has a different count of '(' and ')'.
# Only the elements of this level are inspected; for a nested array element,
# count compares whole elements against '(' and ')', not characters.
def self.has_unclosed_brackets?(obj)
  !obj.all? { |part| part.count('(') == part.count(')') }
end
is_AND_array?(array) click to toggle source
# File lib/boogex/convertor.rb, line 204
# Returns true when the first element of the array is the 'AND' marker.
def self.is_AND_array?(array)
  marker = array[0]
  marker == 'AND'
end
not_in_or?(text) click to toggle source

Is this text not in awe? lols. Rather, is it free of a regex `or` pipe at both ends, i.e. it neither starts nor ends with `|`. Examples: |pieceofcontent| = false; |pieceofcontent = false; pieceofcontent| = false; pieceofcontent = true.

# File lib/boogex/convertor.rb, line 233
# Returns true only when the text neither starts nor ends with a regex `|`.
#
#   "|pieceofcontent|" => false
#   "|pieceofcontent"  => false
#   "pieceofcontent|"  => false
#   "pieceofcontent"   => true
def self.not_in_or?(text)
  !(text[0] == '|' || text[-1] == '|')
end
ors_to_pipes(obj) click to toggle source

This function converts the Lucene Boolean `OR` into regex `|` and removes any quotation marks

# File lib/boogex/convertor.rb, line 104
# Converts the Lucene Boolean `OR` into regex `|` throughout a string or a
# nested array of strings.
#
# For a String: matches of OR_REGEX become '|', double quotes are removed,
# hyphens matched by the lookahead pattern below (written to skip hyphens
# inside square brackets) are escaped, and single quotes are removed, in that order.
# For an Array: raises RuntimeError if has_unclosed_brackets? reports a
# problem at this level, otherwise converts every element recursively.
def self.ors_to_pipes(obj)
  unless obj.is_a?(String)
    raise "There are unclosed brackets in this boolean string" if has_unclosed_brackets?(obj)

    # Recurse so that all levels of the array are converted.
    return obj.map { |element| ors_to_pipes(element) }
  end

  piped = obj.gsub(OR_REGEX, '|')
  unquoted = piped.gsub('"', '')
  escaped = unquoted.gsub(/\-(?=([^\[]*\[[^\]]*\])*[^\[\]]*$)/, '\-')
  escaped.gsub("'", '')
end
regex_array_to_string(obj) click to toggle source

This function converts the entire array with regex formatting into a regex string. The AND array is an adhoc format generated by Lexer, as regex doesn't have a Lucene Boolean `AND` equivalent. This adhoc regex AND is generated in the Lexer stack in the task generator.

# File lib/boogex/convertor.rb, line 188
# Flattens the regex-formatted array structure into a single regex string.
#
# A String is returned unchanged. An Array is converted element by element
# (recursively); if its first element is the 'AND' marker the remaining
# elements are combined with construct_AND_array, otherwise they are joined
# with no separator.
#
# The array passed in is not modified.
def self.regex_array_to_string(obj)
  return obj if obj.is_a?(String)

  and_array = is_AND_array?(obj)

  # Skip the leading "AND" marker without mutating the caller's array.
  elements = and_array ? obj.drop(1) : obj
  parts = elements.map { |element| regex_array_to_string(element) }

  and_array ? construct_AND_array(parts) : parts.join('')
end
regex_formatting(obj) click to toggle source

This function begins to transform the elements of the array structure into regex formatting, including:

  • (a) Any elements that are not bookended by | are then wrapped in (?:) as this modularises

the regex of the elements of the structures

  • (b) Converting any Lucene Boolean `AND` into an AND array structure where the first element is “AND” and the

remaining elements of that array are the regexes that make up the `AND` ie. [“pete AND james”] => [“AND”, “pete”, “james”] ie. [“jenny AND”, [“billy OR jimmy”]] => [“AND”, “jenny”, [“billy OR jimmy”]]

# File lib/boogex/convertor.rb, line 130
# Applies regex formatting to a string or to one level of the array structure.
#
# (a) Text that is not bookended by `|` (see not_in_or?) is wrapped in (?:)
#     via wrap_in_brackets. An array made only of strings is joined into one
#     string first and wrapped if any of its elements needs it.
# (b) Anything containing a Lucene Boolean `AND` (see contain_AND?) becomes an
#     AND array: the first element is "AND" and the remaining elements are the
#     formatted pieces obtained by splitting on AND_REGEX (empty pieces dropped).
#
# Returns a String or an Array, depending on the branch taken.
def self.regex_formatting(obj)
  # A single string: either an AND array of its pieces, or the (possibly
  # wrapped) string itself.
  if obj.is_a?(String)
    if contain_AND?(obj)
      pieces = obj.split(AND_REGEX).reject(&:empty?)
      return ['AND'] + pieces.map { |piece| regex_formatting(piece) }
    end
    return not_in_or?(obj) ? wrap_in_brackets(obj) : obj
  end

  # An array of only strings: AND array if any string has an AND, otherwise
  # one joined string that is wrapped when any element needs brackets.
  if all_strings?(obj)
    if obj.any? { |str| contain_AND?(str) }
      and_array = ['AND']
      obj.each do |str|
        str.split(AND_REGEX).reject(&:empty?).each do |piece|
          and_array << regex_formatting(piece)
        end
      end
      return and_array
    end

    joined = obj.join('')
    return obj.any? { |str| not_in_or?(str) } ? wrap_in_brackets(joined) : joined
  end

  # A mixed array: if this level of bracket contains a string with `AND` in
  # it, then consider this level an `AND` array.
  formatted = []
  formatted << 'AND' if obj.any? { |elem| contain_AND?(elem) }

  obj.each do |elem|
    if contain_AND?(elem)
      elem.split(AND_REGEX).reject(&:empty?).each do |piece|
        formatted << regex_formatting(piece)
      end
    else
      formatted << regex_formatting(elem)
    end
  end
  formatted
end
valid?(input) click to toggle source
# File lib/boogex/convertor.rb, line 272
# Decides whether a sequence of 0's (open bracket) and 1's (close bracket) is
# a valid bracket ordering.
#
# Returns true or false.
def self.valid?(input)
  # The total count of brackets must be even.
  return false if input.size.odd?

  # Only 0's and 1's are allowed as inputs.
  return false unless input.all? { |v| [0, 1].include?(v) }

  # The number of close brackets must be exactly half of all brackets.
  return false if input.sum != input.size / 2

  # Can't start with a close bracket or end with an open bracket.
  return false if input.first == 1 || input.last == 0

  # At every position, the close brackets seen so far may not exceed half of
  # the brackets seen so far.
  closes = 0
  input.each_with_index do |v, i|
    closes += v
    return false if closes > (i + 1) / 2
  end
  true
end
wrap_in_brackets(text) click to toggle source
# File lib/boogex/convertor.rb, line 237
# Wraps the text in a regex non-capturing group: (?:text)
def self.wrap_in_brackets(text)
  opener = '(?:'
  closer = ')'
  opener + text + closer
end