class Wordlist::Lexer
Parses arbitrary text and scans each word from it.
@api semipublic
@since 1.0.0
Constants
- ACRONYM
Regexp to match acronyms.
- SPECIAL_CHARS
Default set of punctuation characters allowed within words.
Attributes
- ignore_words
@return [Array<String, Regexp>]
- lang
@return [Symbol]
- special_chars
@return [Array<String>]
- stop_words
@return [Array<String>]
Public Class Methods
Initializes the lexer.
@param [Symbol] lang
The language to use. Defaults to {Lang.default}.
@param [Array<String>] stop_words
The explicit stop-words to ignore. If not given, default stop words will be loaded based on `lang` or {Lang.default}.
@param [Array<String, Regexp>] ignore_words
Optional list of words to ignore. Can contain Strings or Regexps.
@param [Boolean] digits
Controls whether parsed words may contain digits or not.
@param [Array<String>] special_chars
The additional special characters allowed within words.
@param [Boolean] numbers
Controls whether whole numbers will be parsed as words.
@param [Boolean] acronyms
Controls whether acronyms will be parsed as words.
@param [Boolean] normalize_case
Controls whether to convert all words to lowercase.
@param [Boolean] normalize_apostrophes
Controls whether a trailing apostrophe-s (`'s`) will be removed from the end of words.
@param [Boolean] normalize_acronyms
Controls whether acronyms will have `.` characters removed.
@raise [ArgumentError]
The `stop_words` or `ignore_words` keyword contained a value other than a String or Regexp.
# File lib/wordlist/lexer.rb, line 73
#
# Stores the lexer options and pre-compiles the regular expressions used
# while scanning: `@word` (what counts as a word), `@skip_word` (runs of
# stop/ignored words to step over) and `@not_a_word` (filler between words).
#
# @param [Symbol] lang
# @param [Array<String>] stop_words
# @param [Array<String, Regexp>] ignore_words
# @param [Boolean] digits
# @param [Array<String>] special_chars
# @param [Boolean] numbers
# @param [Boolean] acronyms
# @param [Boolean] normalize_case
# @param [Boolean] normalize_apostrophes
# @param [Boolean] normalize_acronyms
#
# @raise [ArgumentError]
#   An entry of `stop_words + ignore_words` was neither a String nor a Regexp.
def initialize(lang:                  Lang.default,
               stop_words:            StopWords[lang],
               ignore_words:          [],
               digits:                true,
               special_chars:         SPECIAL_CHARS,
               numbers:               false,
               acronyms:              true,
               normalize_case:        false,
               normalize_apostrophes: false,
               normalize_acronyms:    false)
  @lang          = lang
  @stop_words    = stop_words
  @ignore_words  = ignore_words
  @special_chars = special_chars

  @digits   = digits
  @numbers  = numbers
  @acronyms = acronyms

  @normalize_case        = normalize_case
  @normalize_apostrophes = normalize_apostrophes
  @normalize_acronyms    = normalize_acronyms

  # Escaped so the special characters can sit inside a character class.
  inner_chars = Regexp.escape(@special_chars.join)

  # A word starts with a letter; special characters may only appear inside.
  @word = if @digits
            # letters and decimal digits may follow the first letter
            /\p{L}(?:[\p{L}\p{Nd}#{inner_chars}]*[\p{L}\p{Nd}])?/
          else
            # letters only
            /\p{L}(?:[\p{L}#{inner_chars}]*\p{L})?/
          end

  # Strings become case-insensitive literal patterns; Regexps are used as-is.
  skip_patterns = (@stop_words + @ignore_words).map do |entry|
    case entry
    when Regexp then entry
    when String then /#{Regexp.escape(entry)}/i
    else
      raise(ArgumentError,"ignore_words: must contain only Strings or Regexps")
    end
  end
  skip_words = Regexp.union(skip_patterns)

  if @numbers
    # whole numbers are lexed as words, so digits are not filler
    @not_a_word = /[^\p{L}\d]+/
    @word       = /#{@word}|\d+/
    @skip_word  = /(?:#{skip_words}[[:punct:]]*(?:[[:space:]]+|$))+/i
  else
    # whole numbers are skipped together with the stop/ignored words
    @not_a_word = /[^\p{L}]+/
    @skip_word  = /(?:(?:#{skip_words}|\d+)[[:punct:]]*(?:[[:space:]]+|$))+/i
  end
end
Public Instance Methods
Determines whether acronyms will be parsed or ignored.
@return [Boolean]
# File lib/wordlist/lexer.rb, line 152
#
# Reports the `acronyms` option stored in `@acronyms`.
#
# @return [Boolean]
def acronyms?
  @acronyms
end
Determines whether parsed words may contain digits or not.
@return [Boolean]
# File lib/wordlist/lexer.rb, line 134
#
# Reports the `digits` option stored in `@digits`.
#
# @return [Boolean]
def digits?
  @digits
end
Determines whether `.` characters will be removed from acronyms.
@return [Boolean]
# File lib/wordlist/lexer.rb, line 161
#
# Reports the `normalize_acronyms` option stored in `@normalize_acronyms`.
#
# @return [Boolean]
def normalize_acronyms?
  @normalize_acronyms
end
Determines whether a trailing apostrophe-s (`'s`) will be stripped from words.
@return [Boolean]
# File lib/wordlist/lexer.rb, line 170
#
# Reports the `normalize_apostrophes` option stored in
# `@normalize_apostrophes`.
#
# @return [Boolean]
def normalize_apostrophes?
  @normalize_apostrophes
end
Determines whether all words will be converted to lowercase.
@return [Boolean]
# File lib/wordlist/lexer.rb, line 179
#
# Reports the `normalize_case` option stored in `@normalize_case`.
#
# @return [Boolean]
def normalize_case?
  @normalize_case
end
Determines whether numbers will be parsed or ignored.
@return [Boolean]
# File lib/wordlist/lexer.rb, line 143
#
# Reports the `numbers` option stored in `@numbers`.
#
# @return [Boolean]
def numbers?
  @numbers
end
Enumerates over each word in the text.
@yield [word]
The given block will be passed each word from the text.
@yieldparam [String] word
A parsed word from the text.
@return [Array<String>]
If no block is given, an Array of the parsed words will be returned instead.
# File lib/wordlist/lexer.rb, line 196
#
# Scans `text` from start to end and yields every word (and, when enabled,
# every acronym) that is not skipped.
#
# @param [String] text
#
# @yield [word]
# @yieldparam [String] word
#
# @return [Array<String>]
#   When called without a block, the parsed words collected into an Array.
def parse(text,&block)
  unless block_given?
    return enum_for(__method__,text).to_a
  end

  scanner = StringScanner.new(text)

  until scanner.eos?
    # Step over filler first, then over any run of stop/ignored words.
    scanner.skip(@not_a_word)
    scanner.skip(@skip_word)

    token = scanner.scan(ACRONYM)

    if token
      # The acronym is consumed either way; it is only emitted when enabled.
      next unless @acronyms

      token.tr!('.','') if @normalize_acronyms
      yield token
      next
    end

    token = scanner.scan(@word)
    next unless token

    token.downcase! if @normalize_case

    if @normalize_apostrophes && token.end_with?("'s")
      token.chomp!("'s")
    end

    yield token
  end
end