class Entifier
Constants
- DAY_NAMES
- INDEXICALS_PRECEDING_APOSTROPHE_S
- MONTH_NAMES
Public Class Methods
extract(string, options = {})
click to toggle source
# File lib/entifier.rb, line 11 def self.extract(string, options = {}) options.nested_stringify_keys! entities = [] # HACK! This allows comma separated entities to be picked up. # string.gsub!(/\,\s/, ", ") # HACK! This allows entities to be at the end and start of conjoining sentences. # string.gsub!(/\./, ".") No longer needed # HACK! Remove extra spaces between sentences. #string.gsub!(/([\.\?\!])\s\s+/, "\1") # Pre-processor: remove extra spaces (this shouldn't affect which entities are detected, # it just makes the output look better) string.gsub!(/[[:blank:]]+/, "\s") # HACK! This allows parenthesized entity following other entity to be picked up. string.gsub!(/\s\(/, " (") capitalised_word = /[ÄÅÖA-Z](?:[a-zA-ZÄÅÖÜàâæçéèêëîïôøöûùüÿñ\-\d\&]+|\.(?:[A-Z]\.)*)/ capitalised_word_phrase = %r{ (?:\d{4}\s|Dr\.\s)? #{capitalised_word} (?: (?: (?:\s+(?:of|for|on|of\sthe|\&|d\'|du|de)|\'s) )? \s+#{capitalised_word})* (?:\s\d+)? }x regex = %r{ (?: (?: (?:\A|[\.\?\!\:][\"\']?\s+|\n) # At start of string, or starting new sentence... (?:[\"\'\(])? # ...optionally started with quote marks. ) ( (?:In\s(?:\d{4}\s)?)? #{capitalised_word_phrase}(?:\'s)? ) | # --- OR --- [^\.\n\?\!\:\"][[:blank:]][\"\'\(]? # After any non-full-stop followed by a space... (#{capitalised_word_phrase}) ) }x #[\,\'\s\.\Z] string.scan(regex) do |match| #entity = match if match[0] word_count = match[0].split(" ").size if word_count > 1 entity = match[0].gsub(/\A(In(?:\s\d{4})?|The|If|But|Two|(?:One|Two)\sof)\s/, "").gsub(/\'s\Z/, "") elsif match[0][-2,2] == "'s" entity = match[0].gsub(/\'s\Z/, "") elsif match[0] =~ /\A[A-Z]+\Z/ entity = match[0] else entity = nil end else entity = match[1] end # HACK: These should really be filtered out by the regex. if entity # entity = entity.strip.gsub(/\'s\Z/, "").gsub(/\AIn\s/, "").gsub(/\AIf\s/, "") end entity = nil if DAY_NAMES.include?(entity) entity = nil if MONTH_NAMES.include?(entity) entity = nil if INDEXICALS_PRECEDING_APOSTROPHE_S.include?(entity) if entity entity.gsub!( /((I|i)n\s)(January|Feburary|March|April|May|June|July|August|September|October|November|December)/, "") entity.gsub!( /(January|Feburary|March|April|May|June|July|August|September|October|November|December)\s(\d{4}|\d{2})/, "") entity.gsub!(/(O|\so)n\s(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)/, "") entity.gsub!(/\A\d+\Z/, "") # Make string blank if it's just numbers. entity = nil if entity == "" # If there's nothing left, make it nil. end entities << entity unless entity.nil? # Don't collect the entity if it's nil end entities.uniq! return entities end