OpenTREP Logo  0.07.18
C++ Open Travel Request Parsing Library
Loading...
Searching...
No Matches
IndexBuilder.cpp
Go to the documentation of this file.
1// //////////////////////////////////////////////////////////////////////
2// Import section
3// //////////////////////////////////////////////////////////////////////
4// STL
5#include <cassert>
6#include <string>
7#include <vector>
8#include <exception>
9// Boost
10#include <boost/filesystem.hpp>
11#include <boost/filesystem/fstream.hpp>
12#include <boost/tokenizer.hpp>
13#include <boost/iostreams/device/file.hpp>
14#include <boost/iostreams/filtering_stream.hpp>
15#include <boost/iostreams/filter/gzip.hpp>
16#include <boost/iostreams/filter/bzip2.hpp>
17// SOCI
18#include <soci/soci.h>
19// Xapian
20#include <xapian.h>
21// OpenTrep
36
37namespace OPENTREP {
38
39 // //////////////////////////////////////////////////////////////////////
40 void addToXapian (const Place& iPlace, Xapian::Document& ioDocument,
41 Xapian::WritableDatabase& ioDatabase) {
48 Xapian::TermGenerator lTermGenerator;
49 lTermGenerator.set_database (ioDatabase);
50 lTermGenerator.set_document (ioDocument);
51
52 // DEBUG
53 // OPENTREP_LOG_DEBUG ("Indexing for " << iPlace.describeKey());
54
55 const Place::TermSetMap_T& lTermSetMap = iPlace.getTermSetMap();
56 for (Place::TermSetMap_T::const_iterator itStringSet = lTermSetMap.begin();
57 itStringSet != lTermSetMap.end(); ++itStringSet) {
58 // Retrieve the weight
59 const Weight_T& lWeight = itStringSet->first;
60 const Xapian::termcount lWDFInc =
61 static_cast<const Xapian::termcount> (lWeight);
62
63 // Retrieve the set of strings for that weight
64 const Place::StringSet_T& lTermSet = itStringSet->second;
65 for (Place::StringSet_T::const_iterator itString = lTermSet.begin();
66 itString != lTermSet.end(); ++itString) {
67 const std::string& lString = *itString;
68 lTermGenerator.index_text (lString, lWDFInc);
69
70 // DEBUG
71 //OPENTREP_LOG_DEBUG("[" << lWeight << "/" << lWDFInc << "] "<< lString);
72 }
73 }
74
75 // Spelling terms
76 const Place::StringSet_T& lSpellingSet = iPlace.getSpellingSet();
77 for (Place::StringSet_T::const_iterator itTerm = lSpellingSet.begin();
78 itTerm != lSpellingSet.end(); ++itTerm) {
79 const std::string& lTerm = *itTerm;
80 ioDatabase.add_spelling (lTerm);
81 }
82
83 // DEBUG
84 OPENTREP_LOG_DEBUG ("Added terms for '" << iPlace.describeKey()
85 << "': " << iPlace.describeSets()
86 << " into " << ioDocument.get_description());
87 }
88
89 // //////////////////////////////////////////////////////////////////////
90 void IndexBuilder::addDocumentToIndex(Xapian::WritableDatabase& ioDatabase,
91 Place& ioPlace,
92 const OTransliterator& iTransliterator) {
93
94 // Create an empty Xapian document
95 Xapian::Document lDocument;
96
97 // Retrieve the raw data string, to be stored as is within
98 // the Xapian document
99 const RawDataString_T& lRawDataString = ioPlace.getRawDataString();
100
101 // The Xapian document data is indeed the same as the one of the
102 // OPTD-maintained list of POR (points of reference), allowing the search
103 // process to use exactly the same parser as the indexation process
104 lDocument.set_data (lRawDataString);
105
106 // Build the (STL) sets of terms to be added to the Xapian index and
107 // spelling dictionary
108 ioPlace.buildIndexSets (iTransliterator);
109
110 // Add the (STL) sets of terms to the Xapian index and spelling dictionary
111 addToXapian (ioPlace, lDocument, ioDatabase);
112
113 // Add the document to the database
114 const Xapian::docid& lDocID = ioDatabase.add_document (lDocument);
115
116 // Assign back the newly generated Xapian document ID to the
117 // Place object
118 ioPlace.setDocID (lDocID);
119 }
120
121 // //////////////////////////////////////////////////////////////////////
122 NbOfDBEntries_T IndexBuilder::
123 buildSearchIndex (Xapian::WritableDatabase* ioXapianDB_ptr,
124 const DBType& iSQLDBType, soci::session* ioSociSessionPtr,
125 std::istream& iPORFileStream,
126 const shouldIndexNonIATAPOR_T& iIncludeNonIATAPOR,
127 const OTransliterator& iTransliterator) {
128 NbOfDBEntries_T oNbOfEntries = 0;
129 NbOfDBEntries_T oNbOfEntriesInPORFile = 0;
130
131 // Open the file to be parsed
132 Place& lPlace = FacPlace::instance().create();
133 std::string itReadLine;
134 while (std::getline (iPORFileStream, itReadLine)) {
135
136 /* First, if only the IATA-refernced POR must be indexed
137 * (ie, when iIncludeNonIATAPOR is set to false), the line
138 * must start with a non empty IATA code of three letters;
139 * in other words, the separator (the hat symbol) is first seen
140 * at position 3 (remember that strings in C++ start at position 0).
141 * Otherwise, the line is skipped.
142 */
143 if (!iIncludeNonIATAPOR) {
144 const unsigned short lFirstSeparatorPos = itReadLine.find_first_of ("^");
145 if (lFirstSeparatorPos != 3) {
146 // DEBUG
147 /*
148 OPENTREP_LOG_ERROR ("[" << oNbOfEntries << "] pos of sep: "
149 << lFirstSeparatorPos << ", full line: "
150 << itReadLine);
151 */
152
153 //
154 ++oNbOfEntriesInPORFile;
155
156 //
157 continue;
158 }
159 }
160
161 // Initialise the parser
162 PORStringParser lStringParser (itReadLine);
163
164 // Parse the string
165 const Location& lLocation = lStringParser.generateLocation();
166
167 // DEBUG
168 /*
169 const LocationKey& lLocationKey = lLocation.getKey();
170 OPENTREP_LOG_DEBUG ("[BEF-ADD] " << lLocationKey);
171 */
172
173 /* When the line/string is relevant, create a BOM instance from
174 * the Location structure.
175 * Otherwise, the line is skipped.
176 */
177 const std::string& lCommonName = lLocation.getCommonName();
178 if (lCommonName == "NotAvailable") {
179 continue;
180 }
181
182 // Fill the Place object with the Location structure.
183 lPlace.setLocation (lLocation);
184
185 // Add the document, associated to the Place object, to the Xapian index,
186 // if required
187 if (ioXapianDB_ptr != NULL) {
188 IndexBuilder::addDocumentToIndex (*ioXapianDB_ptr, lPlace,
189 iTransliterator);
190 }
191
192 // Add the document to the SQL database, if required
193 if (ioSociSessionPtr != NULL) {
194 DBManager::insertPlaceInDB (*ioSociSessionPtr, lPlace);
195 }
196
197 // DEBUG
198 /*
199 OPENTREP_LOG_DEBUG ("[AFT-ADD] " << lLocationKey
200 << ", Place: " << lPlace);
201 */
202
203 // Iteration
204 ++oNbOfEntries; ++oNbOfEntriesInPORFile;
205
206 // Progress status
207 if (oNbOfEntries % 1000 == 0) {
208 std::cout.imbue( std::locale (std::locale::classic(), new NumSep));
209 std::cout << "Number of actually parsed records: " << oNbOfEntries
210 << ", out of " << oNbOfEntriesInPORFile
211 << " records in the POR data file so far" << std::endl;
212 }
213
214 // DEBUG
215 OPENTREP_LOG_DEBUG ("[" << oNbOfEntries << "] " << lPlace);
216
217 // Reset for next turn
218 lPlace.resetMatrix();
219 lPlace.resetIndexSets();
220 }
221
222 return oNbOfEntries;
223 }
224
225 // //////////////////////////////////////////////////////////////////////
226 NbOfDBEntries_T IndexBuilder::
227 buildSearchIndex (const PORFilePath_T& iPORFilePath,
228 const TravelDBFilePath_T& iTravelIndexFilePath,
229 const DBType& iSQLDBType,
230 const SQLDBConnectionString_T& iSQLDBConnStr,
231 const shouldIndexNonIATAPOR_T& iIncludeNonIATAPOR,
232 const shouldIndexPORInXapian_T& iShouldIndexPORInXapian,
233 const shouldAddPORInSQLDB_T& iShouldAddPORInSQLDB,
234 const OTransliterator& iTransliterator) {
235 NbOfDBEntries_T oNbOfEntries = 0;
236 soci::session* lSociSession_ptr = NULL;
237 Xapian::WritableDatabase* lXapianDatabase_ptr = NULL;
238
247 if (iShouldIndexPORInXapian) {
248 // Delete and recreate the directory, and its full content,
249 // hosting the Xapian index / database
250 FileManager::recreateXapianDirectory (iTravelIndexFilePath);
251
252 // Recreate the Xapian index / database
253 lXapianDatabase_ptr =
254 FacXapianDB::instance().create (iTravelIndexFilePath, Xapian::DB_CREATE);
255 assert (lXapianDatabase_ptr != NULL);
256
257 // DEBUG
258 OPENTREP_LOG_DEBUG ("The Xapian index / database ('"
259 << iTravelIndexFilePath
260 << "') has been re-created, checked and opened");
261
262
271 lXapianDatabase_ptr->begin_transaction();
272
273 // DEBUG
274 OPENTREP_LOG_DEBUG ("A transaction has begun on the Xapian database ('"
275 << iTravelIndexFilePath << "')");
276 }
277
283 if (iShouldAddPORInSQLDB) {
284 /*
285 // Creation of the trep user and trep_trep database
286 bool isSuccessful = DBManager::createSQLDBUser (iSQLDBType, iSQLDBConnStr);
287 if (isSuccessful == false) {
288 std::ostringstream errorStr;
289 errorStr << "Error when trying to re-initialize the SQL database ('"
290 << iSQLDBConnStr << "')";
291 OPENTREP_LOG_ERROR (errorStr.str());
292 throw SQLDatabaseImpossibleConnectionException (errorStr.str());
293 }
294 */
295
299 if (!(iSQLDBType == DBType::NODB)) {
300 // Connection to the database
301 lSociSession_ptr =
302 DBManager::initSQLDBSession (iSQLDBType, iSQLDBConnStr);
303
304 if (lSociSession_ptr == NULL) {
305 std::ostringstream errorStr;
306 errorStr << "Error when trying to connect to the SQL database ('"
307 << iSQLDBConnStr << "')";
308 OPENTREP_LOG_ERROR (errorStr.str());
309 throw SQLDatabaseImpossibleConnectionException (errorStr.str());
310 }
311 assert (lSociSession_ptr != NULL);
312
313 // Creation of the POR table
314 DBManager::createSQLDBTables (*lSociSession_ptr);
315 }
316 }
317
321 // DEBUG
322 OPENTREP_LOG_DEBUG ("Parsing POR input file: " << iPORFilePath);
323
324 // Get a reference on the file stream corresponding to the POR file.
325 const PORFileHelper lPORFileHelper (iPORFilePath);
326 std::istream& lPORFileStream = lPORFileHelper.getFileStreamRef();
327
328 // Browse the input POR (point of reference) data file,
329 // parse every of its rows, and put the result in the Xapian database/index
330 // and, if needed, within the SQL database.
331 oNbOfEntries = buildSearchIndex (lXapianDatabase_ptr, iSQLDBType,
332 lSociSession_ptr, lPORFileStream,
333 iIncludeNonIATAPOR, iTransliterator);
334
339 if (iShouldIndexPORInXapian) {
340 assert (lXapianDatabase_ptr != NULL);
341 lXapianDatabase_ptr->commit_transaction();
342
343 // DEBUG
344 OPENTREP_LOG_DEBUG ("Xapian has indexed " << oNbOfEntries << " entries.");
345 }
346
354 if (iShouldIndexPORInXapian) {
355 assert (lXapianDatabase_ptr != NULL);
356 lXapianDatabase_ptr->close();
357 }
358
359
360 if (iShouldAddPORInSQLDB) {
364 if (!(iSQLDBType == DBType::NODB)) {
365 assert (lSociSession_ptr != NULL);
366 DBManager::createSQLDBIndexes (*lSociSession_ptr);
367 }
368
372 if (!(iSQLDBType == DBType::NODB)) {
373 assert (lSociSession_ptr != NULL);
374 DBManager::terminateSQLDBSession (iSQLDBType, iSQLDBConnStr,
375 *lSociSession_ptr);
376 }
377 }
378
379 return oNbOfEntries;
380 }
381
382}
#define OPENTREP_LOG_ERROR(iToBeLogged)
Definition Logger.hpp:24
#define OPENTREP_LOG_DEBUG(iToBeLogged)
Definition Logger.hpp:33
static void terminateSQLDBSession(const DBType &, const SQLDBConnectionString_T &, soci::session &)
static void createSQLDBTables(soci::session &)
static soci::session * initSQLDBSession(const DBType &, const SQLDBConnectionString_T &)
static void createSQLDBIndexes(soci::session &)
static void insertPlaceInDB(soci::session &, const Place &)
static FacPlace & instance()
Definition FacPlace.cpp:29
static FacXapianDB & instance()
Xapian::WritableDatabase * create(const TravelDBFilePath_T &, const int &iXapianActionFlag)
static void recreateXapianDirectory(const std::string &iTravelDBFilePath)
Class modelling a place/POR (point of reference).
Definition Place.hpp:29
std::map< const Weight_T, StringSet_T > TermSetMap_T
Definition Place.hpp:41
std::string describeSets() const
Definition Place.cpp:157
const StringSet_T & getSpellingSet() const
Definition Place.hpp:509
std::set< std::string > StringSet_T
Definition Place.hpp:40
std::string describeKey() const
Definition Place.hpp:1053
const TermSetMap_T & getTermSetMap() const
Definition Place.hpp:495
void addToXapian(const Place &iPlace, Xapian::Document &ioDocument, Xapian::WritableDatabase &ioDatabase)
unsigned short Weight_T
bool shouldAddPORInSQLDB_T
unsigned int NbOfDBEntries_T
bool shouldIndexPORInXapian_T
bool shouldIndexNonIATAPOR_T
Enumeration of database types.
Definition DBType.hpp:17