9#include <boost/tokenizer.hpp>
26 const Xapian::Database& iDatabase)
27 : _resultHolder (NULL), _database (iDatabase),
28 _queryString (iQueryString), _hasFullTextMatched (false),
43 std::ostringstream oStr;
50 std::ostringstream oStr;
52 if (_correctedQueryString.empty() ==
false
53 && _correctedQueryString != _queryString) {
54 oStr <<
"(corrected into '" << _correctedQueryString
55 <<
"' with an edit distance/error of " << _editDistance
56 <<
" over an allowable distance of " << _allowableEditDistance
66 std::ostringstream oStr;
69 if (_documentList.empty() ==
true) {
70 oStr <<
"No match" << std::endl;
73 assert (_hasFullTextMatched ==
true);
75 unsigned short idx = 0;
76 for (DocumentList_T::const_iterator itDoc = _documentList.begin();
77 itDoc != _documentList.end(); ++itDoc, ++idx) {
80 const Xapian::Document& lXapianDoc = lDocumentPair.first;
81 const Xapian::docid& lDocID = lXapianDoc.get_docid();
83 const ScoreBoard& lScoreBoard = lDocumentPair.second;
88 oStr <<
"Doc ID: " << lDocID <<
", matching with ("
89 << lScoreBoard.
describe() <<
"), containing: '"
90 << lXapianDoc.get_data() <<
"'";
107 getDocumentPair (
const Xapian::docid& iDocID)
const {
110 DocumentMap_T::const_iterator itDoc = _documentMap.find (iDocID);
112 if (itDoc == _documentMap.end()) {
114 <<
") can not be found in the Result object "
117 assert (itDoc != _documentMap.end());
123 return oDocumentPair;
127 const Xapian::Document& Result::
128 getDocument (
const Xapian::docid& iDocID)
const {
134 const Xapian::Document& oXapianDocument = lDocumentPair.first;
137 return oXapianDocument;
151 Score_T lCorrectedScore = iScore;
152 if (_editDistance > 0) {
153 lCorrectedScore = iScore / (_editDistance * _editDistance * _editDistance);
161 lXapianScoreType, lCorrectedScore);
164 const Xapian::docid& lDocID = iDocument.get_docid();
180 _documentList.push_back (lDocumentPair);
183 const bool hasInsertBeenSuccessful =
184 _documentMap.insert (DocumentMap_T::value_type (lDocID,
185 lDocumentPair)).second;
187 if (hasInsertBeenSuccessful ==
false) {
188 std::ostringstream errorStr;
189 errorStr <<
"Error while inserting the Xapian Document pair into "
190 <<
"the internal STL map";
193 assert (hasInsertBeenSuccessful ==
true);
202 for (Xapian::MSetIterator itDoc = iMatchingSet.begin();
203 itDoc != iMatchingSet.end(); ++itDoc) {
204 const int& lXapianPercentage = itDoc.get_percent();
205 const Xapian::Document& lDocument = itDoc.get_document();
232 <<
", " << _bestCombinedWeight <<
"% [" << _bestDocData
254 return oEditDistance;
274 const std::string& lDocumentDataStr = iDocument.get_data();
303 const Score_T oEnvelopeID =
static_cast<const Score_T> (lEnvelopeIDInt);
322 Xapian::MSet& ioMatchingSet) {
323 std::string oMatchedString;
329 Xapian::QueryParser lQueryParser;
330 lQueryParser.set_database (iDatabase);
338 lQueryParser.set_default_op (Xapian::Query::OP_PHRASE);
350 Xapian::Enquire enquire (iDatabase);
358 const Xapian::Query& lXapianQuery =
359 lQueryParser.parse_query (iQueryString,
360 Xapian::QueryParser::FLAG_BOOLEAN
361 | Xapian::QueryParser::FLAG_PHRASE
362 | Xapian::QueryParser::FLAG_LOVEHATE);
365 enquire.set_query (lXapianQuery);
372 int nbMatches = ioMatchingSet.size();
376 <<
"', i.e.: `" << lXapianQuery.get_description()
377 <<
"' => " << nbMatches <<
" result(s) found");
379 if (nbMatches != 0) {
382 setEditDistance (lEditDistance);
385 setAllowableEditDistance (lEditDistance);
388 oMatchedString = iQueryString;
391 setHasFullTextMatched (
true);
395 setCorrectedQueryString (oMatchedString);
399 <<
"' provides " << nbMatches <<
" exact matches.");
401 return oMatchedString;
403 assert (ioMatchingSet.empty() ==
true);
410 const NbOfErrors_T& lAllowableEditDistance =
411 calculateEditDistance (iQueryString);
414 const std::string& lCorrectedString =
415 iDatabase.get_spelling_suggestion (iQueryString, lAllowableEditDistance);
419 if (lCorrectedString.empty() ==
true || lCorrectedString == iQueryString) {
422 << iQueryString <<
"' provides no match, "
423 <<
"and there is no spelling suggestion, "
424 <<
"even with an edit distance of "
425 << lAllowableEditDistance);
428 setHasFullTextMatched (
false);
431 return oMatchedString;
433 assert (lCorrectedString.empty() ==
false
434 && lCorrectedString != iQueryString);
438 Levenshtein::getDistance (iQueryString, lCorrectedString);
447 const Xapian::Query& lCorrectedXapianQuery =
448 lQueryParser.parse_query (lCorrectedString,
449 Xapian::QueryParser::FLAG_BOOLEAN
450 | Xapian::QueryParser::FLAG_PHRASE
451 | Xapian::QueryParser::FLAG_LOVEHATE);
455 enquire.set_query (lCorrectedXapianQuery);
456 ioMatchingSet = enquire.get_mset (0, K_DEFAULT_XAPIAN_MATCHING_SET_SIZE);
459 nbMatches = ioMatchingSet.size();
464 << lCorrectedXapianQuery.get_description()
465 <<
"' => " << nbMatches <<
" result(s) found");
467 if (nbMatches != 0) {
469 setEditDistance (lEditDistance);
472 setAllowableEditDistance (lAllowableEditDistance);
475 oMatchedString = lCorrectedString;
478 setHasFullTextMatched (
true);
481 setCorrectedQueryString (oMatchedString);
485 << iQueryString <<
"', spelling suggestion: `"
487 <<
"', with a Levenshtein edit distance of "
489 <<
" over an allowable edit distance of "
490 << lAllowableEditDistance <<
", provides "
491 << nbMatches <<
" matches.");
494 return oMatchedString;
499 << iQueryString <<
"', spelling suggestion: `"
501 <<
"', with a Levenshtein edit distance of "
503 <<
" over an allowable edit distance of "
504 << lAllowableEditDistance <<
", provides no match, "
505 <<
"which is not consistent with the existence of "
506 <<
"the spelling correction.");
509 }
catch (
const Xapian::Error& error) {
511 throw XapianException (error.get_msg());
515 setHasFullTextMatched (
false);
517 return oMatchedString;
523 std::string oMatchedString;
536 Xapian::MSet lMatchingSet;
537 if (isToBeAdded ==
true) {
538 oMatchedString =
fullTextMatch (iDatabase, iQueryString, lMatchingSet);
545 if (isToBeAdded ==
false) {
548 <<
"' is not made of searchable words");
553 }
catch (
const Xapian::Error& error) {
558 return oMatchedString;
564 for (DocumentList_T::const_iterator itDoc = _documentList.begin();
565 itDoc != _documentList.end(); ++itDoc) {
569 const Xapian::Document& lXapianDoc = lDocumentPair.first;
572 const Xapian::docid& lDocID = lXapianDoc.get_docid();
578 const ScoreBoard& lScoreBoard = lDocumentPair.second;
585 <<
"' with (" << lLocationKey <<
", doc ID = "
586 << lDocID <<
") matches at " << lXapianPct
596 DocumentMap_T::iterator itDoc = _documentMap.find (iDocID);
598 if (itDoc == _documentMap.end()) {
600 <<
") can not be found in the Result object "
603 assert (itDoc != _documentMap.end());
607 ScoreBoard& lScoreBoard = lXapianDocPair.second;
610 lScoreBoard.
setScore (iType, iScore);
616 for (DocumentList_T::iterator itDoc = _documentList.begin();
617 itDoc != _documentList.end(); ++itDoc) {
621 const Xapian::Document& lXapianDoc = lDocumentPair.first;
624 const Xapian::docid& lDocID = lXapianDoc.get_docid();
633 if (lEnvelopeIDInt != 0) {
635 <<
"] (" << lLocationKey <<
", doc ID = "
636 << lDocID <<
") has a non-null envelope ID ("
637 << lEnvelopeIDInt <<
") => match of 0.10%");
641 const Score_T lEnvelopeID =
static_cast<const Score_T> (lEnvelopeIDInt);
644 ScoreBoard& lScoreBoard = lDocumentPair.second;
655 for (DocumentList_T::iterator itDoc = _documentList.begin();
656 itDoc != _documentList.end(); ++itDoc) {
660 const Xapian::Document& lXapianDoc = lDocumentPair.first;
663 const Xapian::docid& lDocID = lXapianDoc.get_docid();
670 bool hasCodeFullyMatched =
false;
675 std::string lFilteredString (_queryString);
683 lFilteredQueryWordList);
684 const NbOfWords_T nbOfFilteredQueryWords = lFilteredQueryWordList.size();
687 if (_hasFullTextMatched ==
true) {
694 const size_t lNbOfLetters = lFilteredString.size();
695 if (nbOfFilteredQueryWords == 1
696 && lNbOfLetters >= 3 && lNbOfLetters <= 4
697 && _correctedQueryString == _queryString) {
700 std::string lUpperQueryWord;
701 lUpperQueryWord.resize (lNbOfLetters);
702 std::transform (lFilteredString.begin(), lFilteredString.end(),
703 lUpperQueryWord.begin(), ::toupper);
710 if (lUpperQueryWord == lIataCode) {
715 hasCodeFullyMatched =
true;
719 if (hasCodeFullyMatched ==
true) {
722 <<
"' matches the IATA/ICAO code ("
723 << lLocationKey <<
", doc ID = "
724 << lDocID <<
") => match of "
729 <<
"' does not match with the IATA/ICAO "
730 <<
"code (" << lLocationKey <<
", doc ID = "
731 << lDocID <<
") => match of "
737 ScoreBoard& lScoreBoard = lDocumentPair.second;
748 for (DocumentList_T::iterator itDoc = _documentList.begin();
749 itDoc != _documentList.end(); ++itDoc) {
753 const Xapian::Document& lXapianDoc = lDocumentPair.first;
756 const Xapian::docid& lDocID = lXapianDoc.get_docid();
766 <<
"] (" << lLocationKey <<
", doc ID = "
767 << lDocID <<
") has a PageRank of "
768 << lPageRank <<
"%");
771 ScoreBoard& lScoreBoard = lDocumentPair.second;
791 std::string lBestDocData;
794 Xapian::docid lBestDocID = 0;
795 for (DocumentList_T::iterator itDoc = _documentList.begin();
796 itDoc != _documentList.end(); ++itDoc) {
800 const Xapian::Document& lXapianDoc = lDocumentPair.first;
801 const Xapian::docid& lDocID = lXapianDoc.get_docid();
802 const std::string& lDocData = lXapianDoc.get_data();
808 ScoreBoard& lScoreBoard = lDocumentPair.second;
821 if (lPercentage > lMaxPercentage) {
822 lMaxPercentage = lPercentage;
824 lBestDocData = lDocData;
831 lOriginalQueryWordList);
832 const NbOfWords_T nbOfOriginalQueryWords = lOriginalQueryWordList.size();
835 if (_hasFullTextMatched ==
true) {
839 const Xapian::Document& lXapianDoc = lXapianDocPair.first;
840 const ScoreBoard& lScoreBoard = lXapianDocPair.second;
845 <<
"' matches at " << lMaxPercentage
846 <<
"% for " << lLocationKey <<
" (doc ID = "
847 << lBestDocID <<
"). Score calculation: "
857 if (nbOfOriginalQueryWords == 1 && shouldBeKept ==
true) {
863 lMaxPercentage = 100.0;
867 <<
"' does not match, but it is a non black-listed "
868 <<
"single-word string; hence, the weight is "
869 << lMaxPercentage <<
"%");
880 lMaxPercentage = std::pow (10.0, -3*nbOfOriginalQueryWords);
884 <<
"' does not match, and is either a multiple-word "
885 <<
"string or black-listed; hence, the weight is "
886 << lMaxPercentage <<
"%");
#define OPENTREP_LOG_ERROR(iToBeLogged)
#define OPENTREP_LOG_DEBUG(iToBeLogged)
#define OPENTREP_LOG_NOTIFICATION(iToBeLogged)
const Location & generateLocation()
Class modelling a place/POR (point of reference).
void setEditDistance(const NbOfErrors_T &iEditDistance)
void setDocID(const XapianDocID_T &iDocID)
void setAllowableEditDistance(const NbOfErrors_T &iAllowableEditDistance)
void setPercentage(const MatchingPercentage_T &iPercentage)
void setOriginalKeywords(const std::string &iOriginalKeywords)
const LocationKey & getKey() const
void setCorrectedKeywords(const std::string &iCorrectedKeywords)
std::string fullTextMatch(const Xapian::Database &, const TravelQuery_T &)
void setScoreOnDocMap(const Xapian::docid &, const ScoreType &, const Score_T &)
void addDocument(const Xapian::Document &, const Score_T &)
void setBestCombinedWeight(const Percentage_T &iPercentage)
void displayXapianPercentages() const
void fillResult(const Xapian::MSet &iMatchingSet)
const XapianDocumentPair_T & getDocumentPair(const Xapian::docid &) const
void setBestDocData(const std::string &iDocData)
void fillPlace(Place &) const
void toStream(std::ostream &ioOut) const
void calculatePageRanks()
void setBestDocID(const Xapian::docid &iDocID)
static Score_T getEnvelopeID(const Xapian::Document &)
static PageRank_T getPageRank(const Xapian::Document &)
std::string toString() const
void fromStream(std::istream &ioIn)
std::string describeShortKey() const
static Location retrieveLocation(const Xapian::Document &)
void calculateHeuristicWeights()
void calculateEnvelopeWeights()
void calculateCodeMatches()
static LocationKey getPrimaryKey(const Xapian::Document &)
void calculateCombinedWeights()
std::string describeKey() const
static void tokeniseStringIntoWordList(const TravelQuery_T &, WordList_T &)
std::list< Word_T > WordList_T
unsigned int NbOfLetters_T
unsigned short NbOfErrors_T
const Percentage_T K_DEFAULT_MODIFIED_MATCHING_PCT
std::string TravelQuery_T
static unsigned int calculateEditDistance(const TravelQuery_T &iPhrase)
Helper function.
unsigned int EnvelopeID_T
const Percentage_T K_DEFAULT_FULL_CODE_MATCH_PCT
std::pair< Xapian::Document, ScoreBoard > XapianDocumentPair_T
const NbOfMatches_T K_DEFAULT_XAPIAN_MATCHING_SET_SIZE
const NbOfErrors_T K_DEFAULT_SIZE_FOR_SPELLING_ERROR_UNIT
unsigned short NbOfWords_T
std::string toString(const TokenList_T &iTokenList)
static void trim(std::string &ioPhrase, const NbOfLetters_T &iMinWordLength=4)
static bool shouldKeep(const std::string &iPhrase, const std::string &iWord)
Class modelling the primary key of a location/POR (point of reference).
const IATACode_T & getIataCode() const
Structure modelling a (geographical) location.
const LocationKey & getKey() const
const EnvelopeID_T & getEnvelopeID() const
const PageRank_T & getPageRank() const
Structure holding a board for all the types of score/matching having been performed.
Percentage_T calculateCombinedWeight()
std::string describe() const
void setScore(const ScoreType &, const Score_T &)
Score_T getScore(const ScoreType &) const
Enumeration of score types.