Alexandria 2.31.0
SDC-CH common library for the Euclid project
Loading...
Searching...
No Matches
AsciiReaderHelper.cpp
Go to the documentation of this file.
1/*
2 * Copyright (C) 2012-2022 Euclid Science Ground Segment
3 *
4 * This library is free software; you can redistribute it and/or modify it under
5 * the terms of the GNU Lesser General Public License as published by the Free
6 * Software Foundation; either version 3.0 of the License, or (at your option)
7 * any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
12 * details.
13 *
14 * You should have received a copy of the GNU Lesser General Public License
15 * along with this library; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
25#include "AsciiReaderHelper.h"
28#include "NdArray/NdArray.h"
29#include <boost/algorithm/string.hpp>
30#include <boost/lexical_cast.hpp>
31#include <boost/spirit/include/qi.hpp>
32#include <boost/tokenizer.hpp>
33#include <set>
34#include <sstream>
35
36namespace Euclid {
37namespace Table {
38
39using NdArray::NdArray;
40
42
43size_t countColumns(std::istream& in, const std::string& comment) {
44 StreamRewinder rewinder{in};
45 size_t count = 0;
46
47 while (in) {
48 std::string line;
49 getline(in, line);
50 // Remove any comments
51 size_t comment_pos = line.find(comment);
52 if (comment_pos != std::string::npos) {
53 line = line.substr(0, comment_pos);
54 }
55 boost::trim(line);
56 if (!line.empty()) {
57 std::string token;
58 std::stringstream line_stream(line);
59 line_stream >> boost::io::quoted(token);
60 while (line_stream) {
61 line_stream >> boost::io::quoted(token);
62 ++count;
63 }
64 break;
65 }
66 }
67 if (count == 0) {
68 throw Elements::Exception() << "No data lines found";
69 }
70 return count;
71}
72
76 // Boolean
77 {"bool", typeid(bool)},
78 {"boolean", typeid(bool)},
79 // Integers
80 {"int", typeid(int32_t)},
81 {"long", typeid(int64_t)},
82 {"int32", typeid(int32_t)},
83 {"int64", typeid(int64_t)},
84 // Floating point
85 {"float", typeid(float)},
86 {"double", typeid(double)},
87 // Strings
88 {"string", typeid(std::string)},
89 // Arrays
90 {"[bool]", typeid(std::vector<bool>)},
91 {"[boolean]", typeid(std::vector<bool>)},
92 {"[int]", typeid(std::vector<int32_t>)},
93 {"[long]", typeid(std::vector<int64_t>)},
94 {"[int32]", typeid(std::vector<int32_t>)},
95 {"[int64]", typeid(std::vector<int64_t>)},
96 {"[float]", typeid(std::vector<float>)},
97 {"[double]", typeid(std::vector<double>)},
98 // NdArrays
99 {"[int+]", typeid(NdArray<int32_t>)},
100 {"[long+]", typeid(NdArray<int64_t>)},
101 {"[int32+]", typeid(NdArray<int32_t>)},
102 {"[int64+]", typeid(NdArray<int64_t>)},
103 {"[float+]", typeid(NdArray<float>)},
104 {"[double+]", typeid(NdArray<double>)},
105};
106
108 auto i = std::find_if(KeywordTypeMap.begin(), KeywordTypeMap.end(),
109 [keyword](const std::pair<std::string, std::type_index>& p) { return p.first == keyword; });
110 if (i != KeywordTypeMap.end()) {
111 return i->second;
112 }
113 throw Elements::Exception() << "Unknown column type keyword " << keyword;
114}
115
117 StreamRewinder rewinder{in};
119 while (in) {
120 std::string line;
121 getline(in, line);
122 boost::trim(line);
123 if (line.empty()) {
124 continue; // We skip empty lines
125 }
126 if (boost::starts_with(line, comment)) {
127 // If we have a comment we remove all comment characters and check if we have
128 // a column description
129 boost::replace_all(line, comment, "");
130 boost::trim(line);
131 if (boost::starts_with(line, "Column:")) {
132 line.erase(0, 7);
133 boost::trim(line);
134 if (!line.empty()) {
135 std::string token;
136 std::stringstream line_stream(line);
137 std::string name;
138 line_stream >> boost::io::quoted(name);
139 if (descriptions.count(name) != 0) {
140 throw Elements::Exception() << "Duplicate column name " << name;
141 }
142 line_stream >> boost::io::quoted(token);
143 std::type_index type = typeid(std::string);
144 if (line_stream && !boost::starts_with(token, "(") && token != "-") {
145 type = keywordToType(token);
146 line_stream >> boost::io::quoted(token);
147 }
148 std::string unit = "";
149 if (line_stream && boost::starts_with(token, "(")) {
150 unit = token;
151 unit.erase(unit.begin());
152 unit.erase(unit.end() - 1);
153 line_stream >> boost::io::quoted(token);
154 }
155 if (line_stream && token == "-") {
156 line_stream >> boost::io::quoted(token);
157 }
159 while (line_stream) {
160 desc << token << ' ';
161 line_stream >> boost::io::quoted(token);
162 }
163 std::string desc_str = desc.str();
164 boost::trim(desc_str);
165 descriptions.emplace(std::piecewise_construct, std::forward_as_tuple(name),
166 std::forward_as_tuple(name, type, unit, desc_str));
167 }
168 }
169 } else {
170 break; // here we reached the first data line
171 }
172 }
173 return descriptions;
174}
175
176std::vector<std::string> autoDetectColumnNames(std::istream& in, const std::string& comment, size_t columns_number) {
177 StreamRewinder rewinder{in};
179
180 // Find the last comment line and at the same time read the names of the
181 // column info description comments
182 std::string last_comment{};
183 std::vector<std::string> desc_names{};
184 while (in) {
185 std::string line;
186 getline(in, line);
187 boost::trim(line);
188 if (line.empty()) {
189 continue; // We skip empty lines
190 }
191 if (boost::starts_with(line, comment)) {
192 // If we have a comment we remove all comment characters and check if we have
193 // the correct number of tokens
194 boost::replace_all(line, comment, "");
195 boost::trim(line);
196 if (!line.empty()) {
197 last_comment = line;
198 }
199 if (boost::starts_with(line, "Column:")) {
200 std::string temp = line;
201 temp.erase(0, 7);
202 boost::trim(temp);
203 auto space_i = temp.find(' ');
204 if (space_i > 0) {
205 temp = temp.substr(0, space_i);
206 }
207 desc_names.emplace_back(std::move(temp));
208 }
209 } else {
210 break; // here we reached the first data line
211 }
212 }
213
214 // Check if the last comment line contains the names of the columns
215 if (!last_comment.empty()) {
216 std::stringstream line_stream(last_comment);
217 std::string token;
218 line_stream >> boost::io::quoted(token);
219 while (line_stream) {
220 names.push_back(token);
221 line_stream >> boost::io::quoted(token);
222 }
223 if (names.size() != columns_number) {
224 names.clear();
225 }
226 }
227
228 // If the names are empty we fill them with the column descriprion ones
229 if (names.empty()) {
230 if (!desc_names.empty() && desc_names.size() != columns_number) {
231 logger.warn() << "Number of column descriptions does not matches the number"
232 << " of the columns";
233 }
234 names = desc_names;
235 }
236
237 if (names.size() < columns_number) {
238 for (size_t i = names.size() + 1; i <= columns_number; ++i) {
239 names.push_back("col" + std::to_string(i));
240 }
241 }
242 // Check for duplicate names
244 for (const auto& name : names) {
245 if (!set.insert(name).second) {
246 throw Elements::Exception() << "Duplicate column name " << name;
247 }
248 }
249 return names;
250}
251
252namespace {
253
254template <typename T>
255std::vector<T> convertStringToVector(const std::string& str) {
256 std::vector<T> result{};
257 boost::char_separator<char> sep{","};
258 boost::tokenizer<boost::char_separator<char>> tok{str, sep};
259 std::transform(tok.begin(), tok.end(), std::back_inserter(result),
260 [](const std::string& s) { return boost::get<T>(convertToCellType(s, typeid(T))); });
261 return result;
262}
263
264template <typename T>
265NdArray<T> convertStringToNdArray(const std::string& str) {
266 if (str.empty()) {
267 throw Elements::Exception() << "Cannot convert an empty string to a NdArray";
268 } else if (str[0] != '<') {
269 throw Elements::Exception() << "Unexpected initial character for a NdArray: " << str[0];
270 }
271
272 auto closing_char = str.find('>');
273 if (closing_char == std::string::npos) {
274 throw Elements::Exception() << "Could not find '>'";
275 }
276
277 auto shape_str = str.substr(1, closing_char - 1);
278 auto shape_i = convertStringToVector<int32_t>(shape_str);
279 auto data = convertStringToVector<T>(str.substr(closing_char + 1));
280
281 std::vector<size_t> shape_u;
282 std::copy(shape_i.begin(), shape_i.end(), std::back_inserter(shape_u));
283 return NdArray<T>(shape_u, data);
284}
285
286} // namespace
287
289 // Boolean
290 {typeid(bool),
291 [](const std::string& value) {
292 if (value == "true" || value == "t" || value == "yes" || value == "y" || value == "1") {
293 return true;
294 } else if (value == "false" || value == "f" || value == "no" || value == "n" || value == "0") {
295 return false;
296 }
297 throw Elements::Exception() << "Invalid boolean value " << value;
298 }},
299 // Integers
300 {typeid(int32_t), boost::lexical_cast<int32_t, const std::string&>},
301 {typeid(int64_t), boost::lexical_cast<int64_t, const std::string&>},
302 // Floating point
303 {typeid(float), boost::lexical_cast<float, const std::string&>},
304 {typeid(double), boost::lexical_cast<double, const std::string&>},
305 // String
306 {typeid(std::string), boost::lexical_cast<std::string, const std::string&>},
307 // Arrays
308 {typeid(std::vector<bool>), convertStringToVector<bool>},
309 {typeid(std::vector<int32_t>), convertStringToVector<int32_t>},
310 {typeid(std::vector<int64_t>), convertStringToVector<int64_t>},
311 {typeid(std::vector<float>), convertStringToVector<float>},
312 {typeid(std::vector<double>), convertStringToVector<double>},
313 // NdArray
314 {typeid(NdArray<int32_t>), convertStringToNdArray<int32_t>},
315 {typeid(NdArray<int64_t>), convertStringToNdArray<int64_t>},
316 {typeid(NdArray<float>), convertStringToNdArray<float>},
317 {typeid(NdArray<double>), convertStringToNdArray<double>},
318};
319
321 try {
322 auto i = sCellConverter.find(type);
323 if (i == sCellConverter.end()) {
324 throw Elements::Exception() << "Unknown type name " << type.name();
325 }
326 return i->second(value);
327 } catch (boost::bad_lexical_cast const&) {
328 throw Elements::Exception() << "Cannot convert " << value << " to " << type.name();
329 }
330}
331
332bool hasNextRow(std::istream& in, const std::string& comment) {
333 StreamRewinder rewinder{in};
334 while (in) {
335 std::string line;
336 getline(in, line);
337 size_t comment_pos = line.find(comment);
338 if (comment_pos != std::string::npos) {
339 line = line.substr(0, comment_pos);
340 }
341 boost::trim(line);
342 if (!line.empty()) {
343 return true;
344 }
345 }
346 return false;
347}
348
350 StreamRewinder rewinder{in};
351 std::size_t count = 0;
352 while (in) {
353 std::string line;
354 getline(in, line);
355 size_t comment_pos = line.find(comment);
356 if (comment_pos != std::string::npos) {
357 line = line.substr(0, comment_pos);
358 }
359 boost::trim(line);
360 if (!line.empty()) {
361 ++count;
362 }
363 }
364 return count;
365}
366
369 size_t comment_pos = line.find(comment);
370
371 if (comment_pos != std::string::npos) {
372 line = line.substr(0, comment_pos);
373 }
374 boost::trim(line);
375 if (!line.empty()) {
376 std::stringstream line_stream(line);
377 size_t count = 0;
378 std::string token;
379 line_stream >> boost::io::quoted(token);
380 while (line_stream) {
381 cells.emplace_back(token);
382 line_stream >> boost::io::quoted(token);
383 ++count;
384 }
385 }
386 return cells;
387}
388
390 StreamRewinder rewinder{in};
391 std::string line(comment);
392 while (in && boost::starts_with(line, comment)) {
393 getline(in, line);
394 }
395 return splitLine(line, comment);
396}
397
399 namespace qi = boost::spirit::qi;
400 double d;
401 long l;
402
403 auto it1 = token.begin();
404 auto it2 = it1;
405 if (qi::parse(it1, token.end(), qi::long_, l) && it1 == token.end()) {
406 return {typeid(int64_t), 0};
407 }
408 if (qi::parse(it2, token.end(), qi::double_, d) && it2 == token.end()) {
409 return {typeid(double), 0};
410 }
411 return {typeid(std::string), std::size_t(0)};
412}
413
414} // namespace Table
415} // end of namespace Euclid
T back_inserter(T... args)
T begin(T... args)
static Logging getLogger(const std::string &name="")
void warn(const std::string &logMessage)
boost::variant< bool, int32_t, int64_t, float, double, std::string, std::vector< bool >, std::vector< int32_t >, std::vector< int64_t >, std::vector< float >, std::vector< double >, NdArray::NdArray< int32_t >, NdArray::NdArray< int64_t >, NdArray::NdArray< float >, NdArray::NdArray< double > > cell_type
The possible cell types.
Definition Row.h:64
This class gets a stream as argument during construction and when it is deleted it sets the position ...
T copy(T... args)
T count(T... args)
T emplace_back(T... args)
T emplace(T... args)
T empty(T... args)
T end(T... args)
T erase(T... args)
T find(T... args)
T forward_as_tuple(T... args)
T move(T... args)
T name(T... args)
std::vector< std::string > splitLine(std::string line, const std::string &comment)
std::type_index keywordToType(const std::string &keyword)
bool hasNextRow(std::istream &in, const std::string &comment)
std::map< std::string, ColumnDescription > autoDetectColumnDescriptions(std::istream &in, const std::string &comment)
Reads the column descriptions of the given stream.
size_t countColumns(std::istream &in, const std::string &comment)
Returns the number of whitespace separated tokens of the first non commented line.
std::vector< std::string > autoDetectColumnNames(std::istream &in, const std::string &comment, size_t columns_number)
Reads the column names of the given stream.
std::pair< std::type_index, std::size_t > guessColumnType(const std::string &token)
Row::cell_type convertToCellType(const std::string &value, std::type_index type)
Converts the given value to a Row::cell_type of the given type.
std::vector< std::string > firstDataLine(std::istream &in, const std::string &comment)
const std::vector< std::pair< std::string, std::type_index > > KeywordTypeMap
std::size_t countRemainingRows(std::istream &in, const std::string &comment)
const std::map< std::type_index, std::function< Row::cell_type(const std::string &)> > sCellConverter
static Elements::Logging logger
T str(T... args)
T substr(T... args)
T to_string(T... args)
T transform(T... args)