OpenTREP Logo  0.07.18
C++ Open Travel Request Parsing Library
Loading...
Searching...
No Matches
Filter.cpp
Go to the documentation of this file.
1// //////////////////////////////////////////////////////////////////////
2// Import section
3// //////////////////////////////////////////////////////////////////////
4// STL
5#include <cassert>
6#include <sstream>
7// OpenTrep
12
13namespace OPENTREP {
14
15 // //////////////////////////////////////////////////////////////////////
16 Filter::Filter() {
17 assert (false);
18 }
19
20 // //////////////////////////////////////////////////////////////////////
21 Filter::Filter (const Filter& iFilter) {
22 assert (false);
23 }
24
25 // //////////////////////////////////////////////////////////////////////
26 Filter::~Filter() {
27 }
28
29
38 // //////////////////////////////////////////////////////////////////////
39 bool hasGoodSize (const std::string& iWord, const NbOfLetters_T& iMinWordLength) {
40 bool hasGoodSizeFlag = true;
41 //
42 const size_t lWordLength = iWord.size();
43 if (lWordLength < iMinWordLength) {
44 hasGoodSizeFlag = false;
45 }
46 return hasGoodSizeFlag;
47 }
48
52 // //////////////////////////////////////////////////////////////////////
53 bool isBlackListed (const std::string& iWord) {
54 // When the word is part of the "black list", it should obviously be
55 // filtered out.
56 BlackList_T::const_iterator itWord = K_BLACK_LIST.find (iWord);
57 const bool isBlackListedFlag = (itWord != K_BLACK_LIST.end());
58
59 // DEBUG
60 // const std::string areEqualStr = (isBlackListedFlag)?"Yes":"No";
61 // const std::string& lWord = *itWord;
62 // OPENTREP_LOG_DEBUG ("Word: '" << iWord << "', black-list word: '"
63 // << lWord << "', Equals: " << areEqualStr);
64
65 return isBlackListedFlag;
66 }
67
71 // //////////////////////////////////////////////////////////////////////
72 void rtrim (WordList_T& ioWordList, const NbOfLetters_T& iMinWordLength) {
73 // If the list is empty, obviously nothing can be done at that stage.
74 if (ioWordList.empty() == true) {
75 return;
76 }
77
78 // Take the first right outer word
79 WordList_T::reverse_iterator itWord = ioWordList.rbegin();
80 assert (itWord != ioWordList.rend());
81 const std::string& lWord = *itWord;
82
83 // Check whether that word has the good size (>= iMinWordLength) and whether it is
84 // black-listed.
85 const bool hasGoodSizeFlag = hasGoodSize (lWord, iMinWordLength);
86 const bool isBlackListedFlag = isBlackListed (lWord);
87 if (hasGoodSizeFlag == false || isBlackListedFlag == true) {
88 ioWordList.erase (--itWord.base());
89 rtrim (ioWordList, iMinWordLength);
90 }
91 }
92
96 // //////////////////////////////////////////////////////////////////////
97 void ltrim (WordList_T& ioWordList, const NbOfLetters_T& iMinWordLength) {
98 // If the list is empty, obviously nothing can be done at that stage.
99 if (ioWordList.empty() == true) {
100 return;
101 }
102
103 // Take the first left outer word
104 WordList_T::iterator itWord = ioWordList.begin();
105 assert (itWord != ioWordList.end());
106 const std::string& lWord = *itWord;
107
108 // Check whether that word has the good size (>= iMinWordLength) and whether it is
109 // black-listed.
110 const bool hasGoodSizeFlag = hasGoodSize (lWord, iMinWordLength);
111 const bool isBlackListedFlag = isBlackListed (lWord);
112 if (hasGoodSizeFlag == false || isBlackListedFlag == true) {
113 ioWordList.erase (itWord);
114 ltrim (ioWordList, iMinWordLength);
115 }
116 }
117
121 // //////////////////////////////////////////////////////////////////////
122 void trim (WordList_T& ioWordList, const NbOfLetters_T& iMinWordLength) {
123 // Trim the non-relevant left outer words
124 ltrim (ioWordList, iMinWordLength);
125
126 // Trim the non-relevant right outer words
127 rtrim (ioWordList, iMinWordLength);
128 }
129
130 // //////////////////////////////////////////////////////////////////////
131 void Filter::trim (std::string& ioPhrase, const NbOfLetters_T& iMinWordLength) {
132 // Create a list of words from the given phrase
133 WordList_T lWordList;
134 tokeniseStringIntoWordList (ioPhrase, lWordList);
135
136 // Trim the non-relevant left and right outer words
137 OPENTREP::trim (lWordList, iMinWordLength);
138
139 // Re-create the phrase from the (potentially altered) list of words
140 ioPhrase = createStringFromWordList (lWordList);
141 }
142
143 // //////////////////////////////////////////////////////////////////////
144 bool Filter::shouldKeep (const std::string& iPhrase,
145 const std::string& iWord) {
146 bool isToBeKept = true;
147
148 // If both the phrase and the word are empty, the word should obviously
149 // be filtered out.
150 if (iPhrase.empty() == true && iWord.empty() == true) {
151 isToBeKept = false;
152 return isToBeKept;
153 }
154
155 // If the term to be added is equal to the whole phrase (e.g., 'san'),
156 // it should be kept (not filtered out). Indeed, three-letter words
157 // often correspond to IATA codes, and should obviously be kept for
158 // indexation/searching.
159 if (iPhrase == iWord) {
160 return isToBeKept;
161 }
162
163 // Now, the word is part of the phrase, and not equal to it (and not empty).
164
165 // If the word has no more than two letters (e.g., 'de'), it should be
166 // filtered out. Indeed, when 'de' is part of 'charles de gaulle',
167 // for instance, it should not be indexed/searched alone (in a search,
168 // the resulting match score will be zero).
169 isToBeKept = hasGoodSize (iWord, 3);
170 if (isToBeKept == false) {
171 return isToBeKept;
172 }
173
174 // Check whether the word is black-listed
175 isToBeKept = !isBlackListed (iWord);
176
177 //
178 return isToBeKept;
179 }
180
181}
std::list< Word_T > WordList_T
unsigned int NbOfLetters_T
void tokeniseStringIntoWordList(const std::string &iPhrase, WordList_T &ioWordList)
Definition Utilities.cpp:19
std::string createStringFromWordList(const WordList_T &iWordList, const NbOfWords_T iSplitIdx, const bool iFromBeginningFlag)
Definition Utilities.cpp:43
const BlackList_T K_BLACK_LIST
Definition BasConst.cpp:196
void rtrim(WordList_T &ioWordList, const NbOfLetters_T &iMinWordLength)
Definition Filter.cpp:72
bool hasGoodSize(const std::string &iWord, const NbOfLetters_T &iMinWordLength)
Definition Filter.cpp:39
bool isBlackListed(const std::string &iWord)
Definition Filter.cpp:53
void trim(WordList_T &ioWordList, const NbOfLetters_T &iMinWordLength)
Definition Filter.cpp:122
void ltrim(WordList_T &ioWordList, const NbOfLetters_T &iMinWordLength)
Definition Filter.cpp:97
Class filtering out the words not suitable for indexing and/or searching, when part of greater string...
Definition Filter.hpp:21
static void trim(std::string &ioPhrase, const NbOfLetters_T &iMinWordLength=4)
Definition Filter.cpp:131
static bool shouldKeep(const std::string &iPhrase, const std::string &iWord)
Definition Filter.cpp:144