class ExtractPatterns

Public Class Methods

new(input, fields, match_name) click to toggle source
# File lib/extractpatterns.rb, line 6
# Set up an extractor over a JSON document.
#
# input      - JSON text; it is parsed with JSON.parse and kept in @input.
# fields     - kept in @fields (search_fields iterates it as field names).
# match_name - kept in @match_name (search_fields stores matches under it).
#
# @output starts as an empty array that search_fields appends to.
def initialize(input, fields, match_name)
  parsed = JSON.parse(input)
  @input, @fields, @match_name = parsed, fields, match_name
  @output = []
end

Public Instance Methods

comma_list_matches(value) click to toggle source

Split a comma-separated string into short candidate matches (items of at most two words)

# File lib/extractpatterns.rb, line 14
# Split a comma-separated string into short candidate matches.
#
# value - String to split; nil (or false) yields [].
#
# Items longer than two whitespace-separated words are discarded. The word
# count is taken BEFORE the conjunction is removed, so " and FOO BAR" counts
# as three words and is dropped.
#
# Returns an Array of Strings with a leading/trailing "and", all periods and
# surrounding whitespace removed. Items that end up empty are omitted.
def comma_list_matches(value)
  return [] unless value

  # Keep only items of at most two words
  candidates = value.split(",").reject { |item| item.split(" ").length > 2 }

  cleaned = candidates.map do |item|
    # Strip "and" only when it is a whole word; a plain substring replace
    # would mangle words such as "andrew" or "band".
    item.gsub(/ and\b|\band /, "").delete(".").strip
  end

  # A blank item (e.g. from "a,,b") is not a match
  cleaned.reject(&:empty?)
end
find_known_terms(item, field, extract_list) click to toggle source

Extract set terms

# File lib/extractpatterns.rb, line 44
# Look for known terms in one field of one item using TermExtractor.
#
# item         - the record to search; it is serialized as a one-element JSON array.
# field        - name of the field handed to TermExtractor.
# extract_list - path of the term-list file; it is read and passed through fixEncode.
#
# Returns the "extracted_codewords" value of the first record in the
# extractor's JSON output.
def find_known_terms(item, field, extract_list)
  item_json = JSON.pretty_generate([item])
  extractor = TermExtractor.new(item_json, [field], "extracted_codewords")

  term_source = fixEncode(File.read(extract_list))
  extractor.extractSetTerms(term_source, ["codeword"], "case_sensitive")

  records = JSON.parse(extractor.getAllOutput)
  records.first["extracted_codewords"]
end
fixEncode(str) click to toggle source

Fix encoding errors

# File lib/extractpatterns.rb, line 51
# Re-encode a string byte by byte.
#
# Every byte of str is treated as a Unicode code point and the code points
# are packed back into a UTF-8 string. Anything that is not a String is
# returned unchanged.
def fixEncode(str)
  return str unless str.is_a?(String)

  str.bytes.pack('U*')
end
get_allcaps(value, length) click to toggle source

Get words (or runs of words) in ALLCAPS that are at least a given length

# File lib/extractpatterns.rb, line 29
# Find words, or runs of words, written in ALL CAPS.
#
# value  - String to scan; nil yields [].
# length - minimum length of a match; nil yields []. It is interpolated into
#          the regex quantifier and, via to_i, used to re-check each match
#          once its surrounding whitespace has been stripped.
#
# Returns an Array of stripped Strings. Raw matches longer than 100
# characters are discarded.
def get_allcaps(value, length)
  return [] unless length && value

  min_length = length.to_i

  # A run is capital letters and whitespace between two word boundaries, so
  # a phrase like "TOP SECRET" is captured as a single match.
  candidates = value.scan(/\b(?:[A-Z]|\s){#{length},}\b/)

  # The regex counts surrounding whitespace toward the minimum, so a bare
  # run of spaces or a short word padded by spaces can match. Strip first,
  # then enforce the minimum on what is left.
  candidates
    .reject { |run| run.length > 100 }
    .map(&:strip)
    .reject { |run| run.empty? || run.length < min_length }
end
normalize_results(extracted_raw, synonym_list) click to toggle source

Normalize results by replacing matched synonyms with their canonical term, then deduplicate

# File lib/extractpatterns.rb, line 60
# Replace extracted terms with their canonical names and deduplicate.
#
# extracted_raw - Array of extracted terms (each must respond to downcase).
# synonym_list  - path of a JSON file, read through fixEncode, mapping each
#                 canonical name to an object whose "codeword" entry lists
#                 its synonyms.
#
# Matching is case-insensitive. The input array itself is not modified.
# Returns a new Array without duplicates.
def normalize_results(extracted_raw, synonym_list)
  synonym_map = JSON.parse(fixEncode(File.read(synonym_list)))
  normalized = extracted_raw.dup

  extracted_raw.each do |term|
    synonym_map.each do |canonical, entry|
      entry["codeword"].each do |synonym|
        next unless synonym.downcase == term.downcase

        # Swap the raw term for its canonical name. This runs once per
        # matching synonym, in order, and the order affects the result.
        normalized.delete(term)
        normalized.push(canonical)
      end
    end
  end

  normalized.uniq
end
ranked_hash_output(results) click to toggle source

Return the matches ranked by frequency, as [match, count] pairs sorted by ascending count

# File lib/extractpatterns.rb, line 107
# Count how often each match occurs across all results.
#
# results - Array of hash-like items. Matches are read from the key stored
#           in @match_name when the item has it, otherwise from the legacy
#           "tools_mentioned" key. Items with neither key contribute nothing.
#
# Returns an Array of [match, count] pairs sorted by ascending count.
def ranked_hash_output(results)
  counts = Hash.new(0)

  results.each do |item|
    # Prefer the configured match field over the hard-coded legacy key
    matches = (@match_name && item[@match_name]) || item["tools_mentioned"] || []
    matches.each { |match| counts[match] += 1 }
  end

  counts.sort_by { |_match, count| count }
end
search_fields(allcaps_length, extract_list, merge_field) click to toggle source

Go through all items in JSON and fields to search

# File lib/extractpatterns.rb, line 83
# Run every extractor over every configured field of every input item.
#
# allcaps_length - minimum length passed to get_allcaps.
# extract_list   - path of the term/synonym file, used both by
#                  find_known_terms and by normalize_results.
# merge_field    - key of an existing field on each item whose values are
#                  merged into the matches; a missing field adds nothing.
#
# Each item is modified in place: the normalized, de-duplicated matches are
# stored under @match_name. The item is then appended to @output.
#
# Returns @output.
def search_fields(allcaps_length, extract_list, merge_field)
  @input.each do |item|
    item[@match_name] = []

    # Same for every field of this item, so look it up once
    merge_results = Array(item[merge_field])

    @fields.each do |field|
      # Extract list results, allcaps, and known codewords from each field
      list_results = comma_list_matches(item[field])
      allcaps_results = get_allcaps(item[field], allcaps_length)
      known_terms_results = Array(find_known_terms(item, field, extract_list))

      # Merge all sources. The known-term hits used to be computed and then
      # dropped; they are now part of the union.
      candidates = allcaps_results | list_results | merge_results | known_terms_results
      item[@match_name] |= normalize_results(candidates, extract_list)
    end

    # Push updated item out
    @output.push(item)
  end

  @output
end