module Mongoid::FullTextSearch::ClassMethods
Public Instance Methods
all_ngrams(str, config, bound_number_returned = true)
click to toggle source
# File lib/mongoid/full_text_search.rb, line 205
#
# Breaks +str+ into a Hash mapping every extracted ngram (plus, depending on
# +config+, short word prefixes and full words) to its score.
#
# str    - the text to tokenize; nil yields an empty Hash.
# config - the index configuration Hash. This method reads :remove_accents,
#          :alphabet, :word_separators, :ngram_width, :max_ngrams_to_search,
#          :apply_prefix_scoring_to_all_words, :index_short_prefixes,
#          :index_full_words and :stop_words.
# bound_number_returned - when true (the default) the string is sampled in
#          evenly spaced strides sized from :max_ngrams_to_search instead of
#          extracting an ngram at every position.
#
# Returns a Hash of ngram String => numeric score.
def all_ngrams(str, config, bound_number_returned = true)
  return {} if str.nil?

  if config[:remove_accents]
    if defined?(UnicodeUtils)
      str = UnicodeUtils.nfkd(str)
    elsif defined?(DiacriticsFu)
      str = DiacriticsFu.escape(str)
    end
  end

  alphabet = config[:alphabet]
  separators = config[:word_separators]
  width = config[:ngram_width]

  # Remove any characters that aren't in the alphabet and aren't word separators
  filtered_str = str.mb_chars.downcase.to_s.split('').select { |ch| alphabet[ch] || separators[ch] }.join('')
  last_start = filtered_str.length - width

  # Figure out how many ngrams to extract from the string. If we can't afford to extract all ngrams,
  # step over the string in evenly spaced strides to extract ngrams. For example, to extract 3 3-letter
  # ngrams from 'abcdefghijk', we'd want to extract 'abc', 'efg', and 'ijk'.
  step_size = if bound_number_returned
                [(last_start.to_f / config[:max_ngrams_to_search]).ceil, 1].max
              else
                1
              end

  # Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the
  # input string using the step size that we just computed. Let score(x,y) be the score of string x
  # compared with string y - assigning scores to ngrams with the square root-based scoring function
  # below and multiplying scores of matching ngrams together yields a score function that has the
  # property that score(x,y) > score(x,z) for any string z containing y and score(x,y) > score(x,z)
  # for any string z contained in y.
  ngram_array = (0..last_start).step(step_size).map do |i|
    at_word_start = i == 0 ||
                    (config[:apply_prefix_scoring_to_all_words] && separators.key?(filtered_str[i - 1].chr))
    score = if at_word_start
              Math.sqrt(1 + 1.0 / filtered_str.length)
            else
              Math.sqrt(2.0 / filtered_str.length)
            end
    { ngram: filtered_str[i..i + width - 1], score: score }
  end

  # If an ngram appears multiple times in the query string, keep the max score
  ngram_array = ngram_array.group_by { |h| h[:ngram] }.map do |ngram, entries|
    { ngram: ngram, score: entries.map { |entry| entry[:score] }.max }
  end

  if config[:index_short_prefixes] || config[:index_full_words]
    # Regexp.union yields a never-matching pattern for an empty separator set,
    # whereas a hand-built "[...]" character class would be an invalid empty
    # char-class and raise RegexpError.
    all_words = filtered_str.split(Regexp.union(separators.keys))
  end

  # Add 'short prefix' records to the array: prefixes of the string that are length (ngram_width - 1)
  if config[:index_short_prefixes]
    prefixes_seen = {}
    all_words.each do |word|
      next if word.length < width - 1

      prefix = word[0...width - 1]
      if prefixes_seen[prefix].nil? && (config[:stop_words][word].nil? || word == filtered_str)
        ngram_array << { ngram: prefix, score: 1 + 1.0 / filtered_str.length }
        prefixes_seen[prefix] = true
      end
    end
  end

  # Add records to the array of ngrams for each full word in the string that isn't a stop word
  if config[:index_full_words]
    full_words_seen = {}
    all_words.each do |word|
      if word.length > 1 && full_words_seen[word].nil? && (config[:stop_words][word].nil? || word == filtered_str)
        ngram_array << { ngram: word, score: 1 + 1.0 / filtered_str.length }
        full_words_seen[word] = true
      end
    end
  end

  # If an ngram appears as any combination of full word, short prefix, and ngram, keep the sum of its scores
  Hash[ngram_array.group_by { |h| h[:ngram] }.map { |ngram, entries| [ngram, entries.map { |entry| entry[:score] }.sum] }]
end
create_fulltext_indexes()
click to toggle source
# File lib/mongoid/full_text_search.rb, line 70
#
# Calls fulltext_search_ensure_indexes once for every (index name, config)
# pair held in mongoid_fulltext_config. Does nothing when no configuration
# has been set.
def create_fulltext_indexes
  configs = mongoid_fulltext_config
  return unless configs

  configs.each_pair { |index_name, fulltext_config| fulltext_search_ensure_indexes(index_name, fulltext_config) }
end
fulltext_search(query_string, options = {})
click to toggle source
# File lib/mongoid/full_text_search.rb, line 125
#
# Runs a fuzzy full-text search for +query_string+ against one of this
# class's ngram indexes.
#
# query_string - the text to search for.
# options      - a Hash that may contain:
#                :max_results   - number of results to return (default 10)
#                :return_scores - when true, scores are requested from
#                                 instantiate_mapreduce_results (default false)
#                :index         - name of the index to search; required when
#                                 more than one index is configured
#                Every other key is treated as a filter and handed to
#                map_query_filters. The caller's Hash is not modified.
#
# Returns an empty Array when the query produces no ngrams, otherwise the
# value returned by instantiate_mapreduce_results.
# Raises UnspecifiedIndexError when several indexes are configured and no
# :index option was given.
def fulltext_search(query_string, options = {})
  # Work on a copy so that stripping the non-filter keys below does not
  # mutate the Hash the caller passed in.
  options = options.dup
  max_results = options.delete(:max_results) { 10 }
  return_scores = options.delete(:return_scores) { false }

  if mongoid_fulltext_config.count > 1 && !options.key?(:index)
    error_message = '%s is indexed by multiple full-text indexes. You must specify one by passing an :index parameter'
    raise UnspecifiedIndexError, error_message % name, caller
  end
  index_name = options.delete(:index) { mongoid_fulltext_config.keys.first }

  # Options hash should only contain filters after this point
  ngrams = all_ngrams(query_string, mongoid_fulltext_config[index_name])
  return [] if ngrams.empty?

  # For each ngram, construct the query we'll use to pull index documents and
  # get a count of the number of index documents containing that n-gram
  ordering = { 'score' => -1 }
  limit = mongoid_fulltext_config[index_name][:max_candidate_set_size]
  coll = collection.database[index_name]
  filter_query = map_query_filters(options) # identical for every ngram
  cursors = ngrams.map do |ngram, _score|
    query = { 'ngram' => ngram }
    query.update(filter_query)
    { ngram: ngram, count: coll.find(query).count, query: query }
  end.sort! { |record1, record2| record1[:count] <=> record2[:count] }

  # Using the queries we just constructed and the n-gram frequency counts we
  # just computed, pull in about *:max_candidate_set_size* candidates by
  # considering the n-grams in order of increasing frequency. When we've
  # spent all *:max_candidate_set_size* candidates, pull the top-scoring
  # *max_results* candidates for each remaining n-gram.
  results_so_far = 0
  candidates_list = cursors.map do |doc|
    next if doc[:count] == 0

    query_result = coll.find(doc[:query])
    if results_so_far >= limit
      query_result = query_result.sort(ordering).limit(max_results)
    elsif doc[:count] > limit - results_so_far
      query_result = query_result.sort(ordering).limit(limit - results_so_far)
    end
    results_so_far += doc[:count]
    ngram_score = ngrams[doc[:ngram]]
    Hash[query_result.map do |candidate|
      [candidate['document_id'], { clazz: candidate['class'], score: candidate['score'] * ngram_score }]
    end]
  end.compact

  # Finally, score all candidates by matching them up with other candidates that are
  # associated with the same document. This is similar to how you might process a
  # boolean AND query, except that with an AND query, you'd stop after considering
  # the first candidate list and matching its candidates up with candidates from other
  # lists, whereas here we want the search to be a little fuzzier so we'll run through
  # all candidate lists, removing candidates as we match them up.
  all_scores = []
  until candidates_list.empty?
    candidates = candidates_list.pop
    scores = candidates.map do |candidate_id, data|
      matched_elsewhere = candidates_list.map { |others| (others.delete(candidate_id) || { score: 0 })[:score] }.sum
      { id: candidate_id, clazz: data[:clazz], score: data[:score] + matched_elsewhere }
    end
    all_scores.concat(scores)
  end
  all_scores.sort! { |document1, document2| -document1[:score] <=> -document2[:score] }
  instantiate_mapreduce_results(all_scores[0..max_results - 1], return_scores: return_scores)
end
fulltext_search_ensure_indexes(index_name, config)
click to toggle source
# File lib/mongoid/full_text_search.rb, line 77
#
# Makes sure the collection named +index_name+ carries the 'fts_index'
# compound index (ngram, score, then every filter key) and an index on
# 'document_id', dropping stale ngram indexes first.
#
# index_name - name of the collection that stores the ngram documents.
# config     - the index configuration Hash; only :filters is read here.
def fulltext_search_ensure_indexes(index_name, config)
  db = collection.database
  coll = db[index_name]
  # The index API method names differ between driver generations, so probe once.
  mongoid5 = Mongoid::Compatibility::Version.mongoid5_or_newer?

  # The order of filters matters when the same index is used from two or more collections.
  filter_indexes = (config[:filters] || []).map do |key, _value|
    ["filter_values.#{key}", 1]
  end.sort_by { |filter_index| filter_index[0] }
  index_definition = [['ngram', 1], ['score', -1]].concat(filter_indexes)

  # Since the definition of the index could have changed, we'll clean up by
  # dropping every existing ngram index that does not contain all of the
  # expected keys in the expected order. Filter keys found on existing
  # indexes are remembered so they are carried over into the new definition.
  correct_keys = index_definition.map { |field_def| field_def[0] }
  all_filter_keys = filter_indexes.map { |field_def| field_def[0] }
  coll.indexes.each do |idef|
    keys = idef['key'].keys
    next unless keys.member?('ngram')

    all_filter_keys |= keys.find_all { |key| key.start_with?('filter_values.') }
    next unless keys & correct_keys != correct_keys

    Mongoid.logger.info "Dropping #{idef['name']} [#{keys & correct_keys} <=> #{correct_keys}]" if Mongoid.logger
    if mongoid5
      coll.indexes.drop_one(idef['key'])
    else
      coll.indexes.drop(idef['key'])
    end
  end

  # Widen the definition when existing indexes carried filter keys that the
  # current configuration does not list.
  if all_filter_keys.length > filter_indexes.length
    filter_indexes = all_filter_keys.map { |key| [key, 1] }.sort_by { |filter_index| filter_index[0] }
    index_definition = [['ngram', 1], ['score', -1]].concat(filter_indexes)
  end

  Mongoid.logger.info "Ensuring fts_index on #{coll.name}: #{index_definition}" if Mongoid.logger
  if mongoid5
    coll.indexes.create_one(Hash[index_definition], name: 'fts_index')
  else
    coll.indexes.create(Hash[index_definition], name: 'fts_index')
  end

  Mongoid.logger.info "Ensuring document_id index on #{coll.name}" if Mongoid.logger
  if mongoid5
    coll.indexes.create_one('document_id' => 1) # to make removes fast
  else
    coll.indexes.create('document_id' => 1) # to make removes fast
  end
end
fulltext_search_in(*args)
click to toggle source
# File lib/mongoid/full_text_search.rb, line 24
#
# Declares a full-text index on this class.
#
# args - the fields to index, optionally followed by an options Hash.
#        When no fields are given, :to_s is indexed. The options override
#        the defaults below; :index_name picks the index name, otherwise one
#        is generated from the downcased class name and the number of
#        indexes configured so far.
#
# Stores the resulting configuration under the index name in
# mongoid_fulltext_config and registers the before_save (only when
# :reindex_immediately is set) and before_destroy callbacks.
def fulltext_search_in(*args)
  self.mongoid_fulltext_config = {} if mongoid_fulltext_config.nil?

  options = if args.last.is_a?(Hash)
              args.pop
            else
              {}
            end
  index_name = options.fetch(:index_name) do
    format('mongoid_fulltext.index_%s_%s', name.downcase, mongoid_fulltext_config.count)
  end

  stop_words = %w[
    i a s t me my we he it am is be do an if or as of at by to up in on no so
    our you him his she her its who are was has had did the and but for out off
    why how all any few nor not own too can don now
    ours your hers they them what whom this that were been have does with into
    from down over then once here when both each more most some such only same
    than very will just
    yours their which these those being doing until while about after above
    below under again there where other
    myself itself theirs having during before should
    himself herself because against between through further
    yourself ourselves yourselves themselves
  ].each_with_object({}) { |word, lookup| lookup[word] = true }

  defaults = {
    alphabet: 'abcdefghijklmnopqrstuvwxyz0123456789 ',
    word_separators: "-_ \n\t",
    ngram_width: 3,
    max_ngrams_to_search: 6,
    apply_prefix_scoring_to_all_words: true,
    index_full_words: true,
    index_short_prefixes: false,
    max_candidate_set_size: 1000,
    remove_accents: true,
    reindex_immediately: true,
    stop_words: stop_words
  }
  config = defaults.merge(options)

  # Turn a string of characters into a char => char lookup Hash.
  to_lookup = ->(characters) { characters.split('').each_with_object({}) { |ch, lookup| lookup[ch] = ch } }

  config[:ngram_fields] = args.empty? ? [:to_s] : args
  config[:alphabet] = to_lookup.call(config[:alphabet])
  config[:word_separators] = to_lookup.call(config[:word_separators])
  mongoid_fulltext_config[index_name] = config

  before_save(:update_ngram_index) if config[:reindex_immediately]
  before_destroy :remove_from_ngram_index
end
instantiate_mapreduce_result(result)
click to toggle source
# File lib/mongoid/full_text_search.rb, line 193
#
# Resolves the class named by result[:clazz] and looks up the record whose
# id is result[:id] on it.
def instantiate_mapreduce_result(result)
  model_class = result[:clazz].constantize
  model_class.find(result[:id])
end
instantiate_mapreduce_results(results, options)
click to toggle source
# File lib/mongoid/full_text_search.rb, line 197
#
# Instantiates every entry of +results+ via instantiate_mapreduce_result,
# discarding entries that come back nil.
#
# With options[:return_scores] set, returns [document, score] pairs;
# otherwise returns just the documents.
def instantiate_mapreduce_results(results, options)
  unless options[:return_scores]
    return results.map { |result| instantiate_mapreduce_result(result) }.compact
  end

  scored = results.map { |result| [instantiate_mapreduce_result(result), result[:score]] }
  scored.reject { |document, _score| document.nil? }
end
remove_from_ngram_index()
click to toggle source
# File lib/mongoid/full_text_search.rb, line 281
#
# For every configured index, deletes all index documents whose 'class'
# field equals this class's name. The delete call used depends on the
# Mongoid version reported by Mongoid::Compatibility.
def remove_from_ngram_index
  mongoid_fulltext_config.each_key do |index_name|
    coll = collection.database[index_name]
    newer_driver = Mongoid::Compatibility::Version.mongoid5_or_newer?
    entries = coll.find('class' => name)
    newer_driver ? entries.delete_many : entries.remove_all
  end
end
update_ngram_index()
click to toggle source
# File lib/mongoid/full_text_search.rb, line 292
#
# Calls update_ngram_index on every document yielded by +all+.
def update_ngram_index
  all.each { |document| document.update_ngram_index }
end
Private Instance Methods
format_query_filter(operator, key, value)
click to toggle source
# File lib/mongoid/full_text_search.rb, line 312
#
# Builds one [field, condition] pair for the index query.
#
# operator - the query operator to apply, e.g. '$in' or '$all'.
# key      - the filter name; it is prefixed with 'filter_values.'.
# value    - a single value or an (arbitrarily nested) Array of values; it is
#            always flattened into a one-level Array.
#
# Returns a two-element Array: the field name and { operator => values }.
def format_query_filter(operator, key, value)
  # Interpolation (rather than String#%) keeps an Array key from being
  # spread across format arguments, and matches how the index field names
  # are built when the indexes are created.
  ["filter_values.#{key}", { operator => [value].flatten }]
end
map_query_filters(filters)
click to toggle source
Maps the given filters to query conditions that are merged into the query used for the full-text search of the ngrams.
# File lib/mongoid/full_text_search.rb, line 300
#
# Converts each (key, value) filter into a query condition via
# format_query_filter and returns them as one Hash.
#
# A Hash value must carry an :any key (mapped to '$in') or an :all key
# (mapped to '$all'); any other Hash raises UnknownFilterQueryOperator.
# A non-Hash value is matched with '$all'.
def map_query_filters(filters)
  conditions = filters.map do |key, value|
    if !value.is_a?(Hash)
      format_query_filter('$all', key, value)
    elsif value.key?(:any)
      format_query_filter('$in', key, value[:any])
    elsif value.key?(:all)
      format_query_filter('$all', key, value[:all])
    else
      raise UnknownFilterQueryOperator, value.keys.join(','), caller
    end
  end
  Hash[conditions]
end