class ConfidentialInfoRedactor::Extractor

This class extracts proper nouns from a text, skipping words that appear in the word list for the selected language.

Constants

EXTRACT_REGEX

Rubular: rubular.com/r/qE0g4r9zR7

PUNCTUATION_REGEX

Attributes

corpus[R]
language[R]

Public Class Methods

new(**args) click to toggle source
# File lib/confidential_info_redactor/extractor.rb, line 12
# Sets up the extractor's language and the matching word list.
#
# args - keyword options; only :language is read. It defaults to 'en'
#        when missing or nil.
#
# The German word list is used when the language is 'de'; every other
# value (including the default 'en') falls back to the English list.
def initialize(**args)
  @language = args[:language] || 'en'
  word_lists = ConfidentialInfoRedactor::WordLists
  @corpus =
    case @language
    when 'de' then word_lists::DE_WORDS
    else word_lists::EN_WORDS
    end
end

Public Instance Methods

extract(text) click to toggle source
# File lib/confidential_info_redactor/extractor.rb, line 24
# Extracts candidate terms from +text+.
#
# Curly single quotes are normalised to a straight apostrophe, the text is
# split into segments by PragmaticSegmenter, and each segment's preliminary
# terms are passed to search_ngrams, which accumulates results in +found+.
#
# Returns an Array of unique terms with any literal "{}" removed; terms that
# end up one character long or empty are dropped.
def extract(text)
  normalized = text.gsub(/[’‘]/, "'")
  segmenter = PragmaticSegmenter::Segmenter.new(text: normalized, language: language)

  found = []
  segmenter.segment.each do |segment|
    search_ngrams(extract_preliminary_terms(segment), found)
  end

  found
    .map { |term| term.gsub(/\{\}/, '') }
    .reject { |term| term.length == 1 }
    .uniq
    .reject(&:empty?)
end

Private Instance Methods

clean_token(token) click to toggle source
# File lib/confidential_info_redactor/extractor.rb, line 39
# Normalises a token for comparison.
#
# Removes every PUNCTUATION_REGEX match, then an apostrophe at the end of any
# line, then a single period at the very end of the string, and finally
# strips surrounding whitespace.
#
# Returns a new String; +token+ is not modified.
def clean_token(token)
  without_punctuation = token.gsub(PUNCTUATION_REGEX, '')
  without_apostrophe = without_punctuation.gsub(/\'$/, '')
  # \z can match at most once, so sub is equivalent to gsub here.
  without_apostrophe.sub(/\.\z/, '').strip
end
extract_preliminary_terms(segment) click to toggle source
# File lib/confidential_info_redactor/extractor.rb, line 35
# Collects every EXTRACT_REGEX match in +segment+ that is not a corpus word.
#
# Each match is looked up in the corpus after downcasing, removing
# PUNCTUATION_REGEX matches and dropping an apostrophe at the end of a line.
# Matches are returned in their original form, not in the normalised form.
#
# segment - any object; it is converted with to_s before matching.
#
# Returns an Array of matched Strings in order of appearance.
def extract_preliminary_terms(segment)
  # gsub without a replacement or block yields each match in turn.
  candidates = segment.to_s.gsub(EXTRACT_REGEX)
  candidates.reject do |candidate|
    lookup_key = candidate.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '')
    corpus.include?(lookup_key)
  end
end
find_extracted_terms(string, extracted_terms) click to toggle source
# File lib/confidential_info_redactor/extractor.rb, line 68
# Decides whether +string+ (or part of it) should be recorded as a term and,
# if so, appends it to +extracted_terms+.
#
# string          - the candidate text.
# extracted_terms - Array that is appended to in place.
#
# When matching_first_token? accepts the downcased words, only the second
# word of the cleaned string is appended, and only if its downcased form is
# not in the corpus. Otherwise the whole cleaned string is appended unless
# non_confidential_token? rejects it.
#
# Returns the same +extracted_terms+ Array.
def find_extracted_terms(string, extracted_terms)
  lowered = clean_token(string.downcase)
  cased = clean_token(string)
  words = lowered.split(' ')

  if matching_first_token?(words)
    second_is_common = corpus.include?(words[1])
    extracted_terms << cased.split(' ')[1] unless second_is_common
    return extracted_terms
  end

  confidential = includes_confidential?(cased)
  extracted_terms << cased unless non_confidential_token?(lowered, confidential)
  extracted_terms
end
includes_confidential?(token) click to toggle source
# File lib/confidential_info_redactor/extractor.rb, line 56
# Reports whether +token+ is free of corpus words.
#
# A token of exactly two words whose second word is "bank" (any case) always
# returns true. Otherwise returns false as soon as any whitespace-separated
# word, downcased, is found in the corpus, and true when none is.
def includes_confidential?(token)
  words = token.split(' ')
  return true if words.length == 2 && words[1].downcase == 'bank'

  words.none? { |word| corpus.include?(word.downcase) }
end
matching_first_token?(tokens) click to toggle source
# File lib/confidential_info_redactor/extractor.rb, line 61
# Reports whether +tokens+ is a two-word sequence led by a corpus word.
#
# tokens - Array of Strings.
#
# Returns true only when there are exactly two tokens and the first one is in
# the corpus and is neither 'the' nor 'deutsche'; false otherwise.
def matching_first_token?(tokens)
  return false unless tokens.length == 2

  leading = tokens.first
  return false if %w[the deutsche].include?(leading)

  corpus.include?(leading)
end
non_confidential_token?(token, includes_confidential) click to toggle source
# File lib/confidential_info_redactor/extractor.rb, line 43
# Reports whether +token+ should be treated as non-confidential.
#
# token                 - the String to check.
# includes_confidential - result of includes_confidential? for the token.
#
# Returns true when the token itself is in the corpus, when
# +includes_confidential+ is falsy, or when singular_in_corpus? accepts it.
def non_confidential_token?(token, includes_confidential)
  return true if corpus.include?(token)
  return true unless includes_confidential

  singular_in_corpus?(token)
end
search_ngrams(tokens, extracted_terms) click to toggle source
# File lib/confidential_info_redactor/extractor.rb, line 80
# Splits each n-gram on PUNCTUATION_REGEX and passes every digit-free piece
# to find_extracted_terms.
#
# tokens          - Array of Strings (n-grams).
# extracted_terms - accumulator handed to find_extracted_terms; the value it
#                   returns is used as the accumulator for the next piece.
#
# Pieces containing at least one digit are skipped.
#
# Returns +tokens+ (the result of the outer each).
def search_ngrams(tokens, extracted_terms)
  tokens.each do |ngram|
    digit_free = ngram.split(PUNCTUATION_REGEX).reject { |piece| piece =~ /.*\d+.*/ }
    digit_free.each do |piece|
      extracted_terms = find_extracted_terms(piece, extracted_terms)
    end
  end
end
singular_in_corpus?(token) click to toggle source
# File lib/confidential_info_redactor/extractor.rb, line 47
# Reports whether +token+ looks like an inflected form of a corpus word.
#
# True when removing a trailing 's' or 'n' (one character), or a trailing
# 'en', 'es' or 'er' (two characters), leaves a string that is in the corpus.
def singular_in_corpus?(token)
  one_char_stem = token[0...-1]
  two_char_stem = token[0...-2]
  last_char = token[-1]
  last_two = token[-2..-1]

  (%w[s n].include?(last_char) && corpus.include?(one_char_stem)) ||
    (%w[en es er].include?(last_two) && corpus.include?(two_char_stem))
end