Orcus
Loading...
Searching...
No Matches
sax_parser.hpp
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 */
7
8#ifndef INCLUDED_ORCUS_SAX_PARSER_HPP
9#define INCLUDED_ORCUS_SAX_PARSER_HPP
10
11#include <cstdint>
12
13#include "sax_parser_base.hpp"
14
15#include <string_view>
16
17namespace orcus {
18
/**
 * Default configuration for sax_parser; it is used when no ConfigT template
 * argument is supplied.
 */
struct sax_parser_default_config
{
    /**
     * Baseline XML version encoded as an integer.  sax_parser::header()
     * insists on a leading <?xml ... ?> declaration only when this value is
     * 11 or higher; with the value 10 used here the declaration is optional.
     */
    static constexpr uint8_t baseline_version = 10;
};
28
30{
31public:
38 {
39 (void)dtd;
40 }
41
49 void start_declaration(std::string_view decl)
50 {
51 (void)decl;
52 }
53
59 void end_declaration(std::string_view decl)
60 {
61 (void)decl;
62 }
63
70 {
71 (void)elem;
72 }
73
80 {
81 (void)elem;
82 }
83
98 void characters(std::string_view val, bool transient)
99 {
100 (void)val; (void)transient;
101 }
102
112 {
113 (void)attr;
114 }
115};
116
132template<typename HandlerT, typename ConfigT = sax_parser_default_config>
133class sax_parser : public sax::parser_base
134{
135public:
136 typedef HandlerT handler_type;
137 typedef ConfigT config_type;
138
139 sax_parser(std::string_view content, handler_type& handler);
140 ~sax_parser() = default;
141
142 void parse();
143
144private:
145
150 void header();
151 void body();
152 void element();
153 void element_open(std::ptrdiff_t begin_pos);
154 void element_close(std::ptrdiff_t begin_pos);
155 void special_tag();
156 void declaration(const char* name_check);
157 void cdata();
158 void doctype();
159 void characters();
160 void attribute();
161
162private:
163 handler_type& m_handler;
164};
165
166template<typename HandlerT, typename ConfigT>
167sax_parser<HandlerT,ConfigT>::sax_parser(std::string_view content, handler_type& handler) :
168 sax::parser_base(content.data(), content.size()),
169 m_handler(handler)
170{
171}
172
173template<typename HandlerT, typename ConfigT>
174void sax_parser<HandlerT,ConfigT>::parse()
175{
176 m_nest_level = 0;
177 mp_char = mp_begin;
178 header();
179 skip_space_and_control();
180 body();
181
182 assert(m_buffer_pos == 0);
183}
184
185template<typename HandlerT, typename ConfigT>
186void sax_parser<HandlerT,ConfigT>::header()
187{
188 // we don't handle multi byte encodings so we can just skip bom entry if exists.
189 skip_bom();
190
191 // Allow leading whitespace in the XML stream.
192 // TODO : Make this configurable since strictly speaking such an XML
193 // sttream is invalid.
194 skip_space_and_control();
195
196 if (!has_char() || cur_char() != '<')
197 throw malformed_xml_error("xml file must begin with '<'.", offset());
198
199 if (config_type::baseline_version >= 11)
200 {
201 // XML version 1.1 requires a header declaration whereas in 1.0 it's
202 // optional.
203 if (next_char_checked() != '?')
204 throw malformed_xml_error("xml file must begin with '<?'.", offset());
205
206 declaration("xml");
207 }
208}
209
210template<typename HandlerT, typename ConfigT>
211void sax_parser<HandlerT,ConfigT>::body()
212{
213 while (has_char())
214 {
215 if (cur_char() == '<')
216 {
217 element();
218 if (!m_root_elem_open)
219 // Root element closed. Stop parsing.
220 return;
221 }
222 else if (m_nest_level)
223 // Call characters only when in xml hierarchy.
224 characters();
225 else
226 next();
227 }
228}
229
230template<typename HandlerT, typename ConfigT>
231void sax_parser<HandlerT,ConfigT>::element()
232{
233 assert(cur_char() == '<');
234 std::ptrdiff_t pos = offset();
235 char c = next_char_checked();
236 switch (c)
237 {
238 case '/':
239 element_close(pos);
240 return;
241 case '!':
242 special_tag();
243 return;
244 case '?':
245 declaration(nullptr);
246 return;
247 }
248
249 element_open(pos);
250}
251
252template<typename HandlerT, typename ConfigT>
253void sax_parser<HandlerT,ConfigT>::element_open(std::ptrdiff_t begin_pos)
254{
256 element_name(elem, begin_pos);
257
258 while (true)
259 {
260 skip_space_and_control();
261 char c = cur_char_checked();
262 if (c == '/')
263 {
264 // Self-closing element: <element/>
265 if (next_and_char() != '>')
266 throw malformed_xml_error("expected '/>' to self-close the element.", offset());
267 next();
268 elem.end_pos = offset();
269 m_handler.start_element(elem);
270 reset_buffer_pos();
271 m_handler.end_element(elem);
272 if (!m_nest_level)
273 m_root_elem_open = false;
274#if ORCUS_DEBUG_SAX_PARSER
275 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
276#endif
277 return;
278 }
279 else if (c == '>')
280 {
281 // End of opening element: <element>
282 next();
283 elem.end_pos = offset();
284 nest_up();
285 m_handler.start_element(elem);
286 reset_buffer_pos();
287#if ORCUS_DEBUG_SAX_PARSER
288 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
289#endif
290 return;
291 }
292 else
293 attribute();
294 }
295}
296
297template<typename HandlerT, typename ConfigT>
298void sax_parser<HandlerT,ConfigT>::element_close(std::ptrdiff_t begin_pos)
299{
300 assert(cur_char() == '/');
301 nest_down();
302 next_check();
304 element_name(elem, begin_pos);
305
306 if (cur_char() != '>')
307 throw malformed_xml_error("expected '>' to close the element.", offset());
308 next();
309 elem.end_pos = offset();
310
311 m_handler.end_element(elem);
312#if ORCUS_DEBUG_SAX_PARSER
313 cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
314#endif
315 if (!m_nest_level)
316 m_root_elem_open = false;
317}
318
319template<typename HandlerT, typename ConfigT>
320void sax_parser<HandlerT,ConfigT>::special_tag()
321{
322 assert(cur_char() == '!');
323 // This can be either <![CDATA, <!--, or <!DOCTYPE.
324 size_t len = available_size();
325 if (len < 2)
326 throw malformed_xml_error("special tag too short.", offset());
327
328 switch (next_and_char())
329 {
330 case '-':
331 {
332 // Possibly comment.
333 if (next_and_char() != '-')
334 throw malformed_xml_error("comment expected.", offset());
335
336 len -= 2;
337 if (len < 3)
338 throw malformed_xml_error("malformed comment.", offset());
339
340 next();
341 comment();
342 }
343 break;
344 case '[':
345 {
346 // Possibly a CDATA.
347 expects_next("CDATA[", 6);
348 if (has_char())
349 cdata();
350 }
351 break;
352 case 'D':
353 {
354 // check if this is a DOCTYPE.
355 expects_next("OCTYPE", 6);
356 skip_space_and_control();
357 if (has_char())
358 doctype();
359 }
360 break;
361 default:
362 throw malformed_xml_error("failed to parse special tag.", offset());
363 }
364}
365
366template<typename HandlerT, typename ConfigT>
367void sax_parser<HandlerT,ConfigT>::declaration(const char* name_check)
368{
369 assert(cur_char() == '?');
370 next_check();
371
372 // Get the declaration name first.
373 std::string_view decl_name;
374 name(decl_name);
375#if ORCUS_DEBUG_SAX_PARSER
376 cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
377#endif
378
379 if (name_check && decl_name != name_check)
380 {
381 std::ostringstream os;
382 os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
383 throw malformed_xml_error(os.str(), offset());
384 }
385
386 m_handler.start_declaration(decl_name);
387 skip_space_and_control();
388
389 // Parse the attributes.
390 while (cur_char_checked() != '?')
391 {
392 attribute();
393 skip_space_and_control();
394 }
395 if (next_char_checked() != '>')
396 throw malformed_xml_error("declaration must end with '?>'.", offset());
397
398 m_handler.end_declaration(decl_name);
399 reset_buffer_pos();
400 next();
401#if ORCUS_DEBUG_SAX_PARSER
402 cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
403#endif
404}
405
406template<typename HandlerT, typename ConfigT>
407void sax_parser<HandlerT,ConfigT>::cdata()
408{
409 size_t len = available_size();
410 assert(len > 3);
411
412 // Parse until we reach ']]>'.
413 const char* p0 = mp_char;
414 size_t i = 0, match = 0;
415 for (char c = cur_char(); i < len; ++i, c = next_and_char())
416 {
417 if (c == ']')
418 {
419 // Be aware that we may encounter a series of more than two ']'
420 // characters, in which case we'll only count the last two.
421
422 if (match == 0)
423 // First ']'
424 ++match;
425 else if (match == 1)
426 // Second ']'
427 ++match;
428 }
429 else if (c == '>' && match == 2)
430 {
431 // Found ']]>'.
432 size_t cdata_len = i - 2;
433 m_handler.characters(std::string_view(p0, cdata_len), false);
434 next();
435 return;
436 }
437 else
438 match = 0;
439 }
440 throw malformed_xml_error("malformed CDATA section.", offset());
441}
442
443template<typename HandlerT, typename ConfigT>
444void sax_parser<HandlerT,ConfigT>::doctype()
445{
446 // Parse the root element first.
448 name(param.root_element);
449 skip_space_and_control();
450
451 // Either PUBLIC or SYSTEM.
452 size_t len = available_size();
453 if (len < 6)
454 throw malformed_xml_error("DOCTYPE section too short.", offset());
455
456 param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
457 char c = cur_char();
458 if (c == 'P')
459 {
460 if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
461 throw malformed_xml_error("malformed DOCTYPE section.", offset());
462
463 param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
464 }
465 else if (c == 'S')
466 {
467 if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
468 throw malformed_xml_error("malformed DOCTYPE section.", offset());
469 }
470
471 next_check();
472 skip_space_and_control();
473
474 // Parse FPI.
475 value(param.fpi, false);
476
477 has_char_throw("DOCTYPE section too short.");
478 skip_space_and_control();
479 has_char_throw("DOCTYPE section too short.");
480
481 if (cur_char() == '>')
482 {
483 // Optional URI not given. Exit.
484#if ORCUS_DEBUG_SAX_PARSER
485 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
486#endif
487 m_handler.doctype(param);
488 next();
489 return;
490 }
491
492 // Parse optional URI.
493 value(param.uri, false);
494
495 has_char_throw("DOCTYPE section too short.");
496 skip_space_and_control();
497 has_char_throw("DOCTYPE section too short.");
498
499 if (cur_char() != '>')
500 throw malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
501
502#if ORCUS_DEBUG_SAX_PARSER
503 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
504#endif
505 m_handler.doctype(param);
506 next();
507}
508
509template<typename HandlerT, typename ConfigT>
510void sax_parser<HandlerT,ConfigT>::characters()
511{
512 const char* p0 = mp_char;
513 for (; has_char(); next())
514 {
515 if (cur_char() == '<')
516 break;
517
518 if (cur_char() == '&')
519 {
520 // Text span with one or more encoded characters. Parse using cell buffer.
521 cell_buffer& buf = get_cell_buffer();
522 buf.reset();
523 buf.append(p0, mp_char-p0);
524 characters_with_encoded_char(buf);
525 if (buf.empty())
526 m_handler.characters(std::string_view{}, false);
527 else
528 m_handler.characters(buf.str(), true);
529 return;
530 }
531 }
532
533 if (mp_char > p0)
534 {
535 std::string_view val(p0, mp_char-p0);
536 m_handler.characters(val, false);
537 }
538}
539
540template<typename HandlerT, typename ConfigT>
541void sax_parser<HandlerT,ConfigT>::attribute()
542{
544 attribute_name(attr.ns, attr.name);
545
546#if ORCUS_DEBUG_SAX_PARSER
547 cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl;
548#endif
549
550 skip_space_and_control();
551
552 char c = cur_char_checked();
553 if (c != '=')
554 {
555 std::ostringstream os;
556 os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
557 throw malformed_xml_error(os.str(), offset());
558 }
559
560 next_check(); // skip the '='.
561 skip_space_and_control();
562
563 attr.transient = value(attr.value, true);
564 if (attr.transient)
565 // Value is stored in a temporary buffer. Push a new buffer.
566 inc_buffer_pos();
567
568#if ORCUS_DEBUG_SAX_PARSER
569 cout << "sax_parser::attribute: value='" << attr.value << "'" << endl;
570#endif
571
572 m_handler.attribute(attr);
573}
574
575}
576
577#endif
578/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition cell_buffer.hpp:22
Definition exception.hpp:121
Definition parser_base.hpp:23
Definition sax_parser_base.hpp:108
Definition sax_parser.hpp:30
void end_declaration(std::string_view decl)
Definition sax_parser.hpp:59
void doctype(const orcus::sax::doctype_declaration &dtd)
Definition sax_parser.hpp:37
void attribute(const orcus::sax::parser_attribute &attr)
Definition sax_parser.hpp:111
void characters(std::string_view val, bool transient)
Definition sax_parser.hpp:98
void start_declaration(std::string_view decl)
Definition sax_parser.hpp:49
void end_element(const orcus::sax::parser_element &elem)
Definition sax_parser.hpp:79
void start_element(const orcus::sax::parser_element &elem)
Definition sax_parser.hpp:69
Definition sax_parser_base.hpp:37
Definition sax_parser_base.hpp:96
Definition sax_parser_base.hpp:77
Definition sax_parser.hpp:20
static constexpr uint8_t baseline_version
Definition sax_parser.hpp:26