9#ifndef PQXX_H_ENCODINGS
10#define PQXX_H_ENCODINGS
16#include "pqxx/internal/concat.hxx"
17#include "pqxx/internal/encoding_group.hxx"
22PQXX_DECLARE_ENUM_CONVERSION(pqxx::internal::encoding_group);
32PQXX_LIBEXPORT encoding_group
enc_group(
int );
51template<
char... NEEDLE>
54 std::size_t here = 0u)
56 auto const sz{std::size(haystack)};
57 auto const data{std::data(haystack)};
60 auto next{scanner(data, sz, here)};
61 PQXX_ASSUME(next > here);
64 if ((... or (data[here] == NEEDLE)))
86template<
typename CALLABLE>
88 encoding_group enc, CALLABLE callback,
char const buffer[],
89 std::size_t buffer_len, std::size_t start = 0)
92 for (std::size_t here = start, next; here < buffer_len; here = next)
94 next = scan(buffer, buffer_len, here);
95 PQXX_ASSUME(next > here);
96 callback(buffer + here, buffer + next);
104constexpr PQXX_PURE
unsigned char
105get_byte(
char const buffer[], std::size_t offset)
noexcept
107 return static_cast<unsigned char>(buffer[offset]);
111[[noreturn]] PQXX_COLD
void throw_for_encoding_error(
112 char const *encoding_name,
char const buffer[], std::size_t start,
116 s <<
"Invalid byte sequence for encoding " << encoding_name <<
" at byte "
117 << start <<
": " << std::hex << std::setw(2) << std::setfill(
'0');
118 for (std::size_t i{0}; i < count; ++i)
120 s <<
"0x" <<
static_cast<unsigned int>(get_byte(buffer, start + i));
129constexpr PQXX_PURE
bool
130between_inc(
unsigned char value,
unsigned bottom,
unsigned top)
132 return value >= bottom and value <= top;
146 PQXX_PURE
static std::size_t
147 call(
char const buffer[], std::size_t buffer_len, std::size_t start);
160template<encoding_group ENC,
char... NEEDLE>
161PQXX_PURE
inline std::size_t
162find_ascii_char(std::string_view haystack, std::size_t here)
166 static_assert((... and ((NEEDLE & 0x80) == 0)));
168 auto const sz{std::size(haystack)};
169 auto const data{std::data(haystack)};
175 PQXX_ASSUME(next > here);
193 if ((... or (data[here] == NEEDLE)))
209template<encoding_group ENC,
char... NEEDLE>
215 static_assert((... and ((NEEDLE >> 7) == 0)));
217 auto const sz{std::size(haystack)};
218 auto const data{std::data(haystack)};
222 while ((... and (data[here] != NEEDLE)))
225 PQXX_ASSUME(next > here);
234 static PQXX_PURE
constexpr std::size_t
235 call(
char const [], std::size_t buffer_len, std::size_t start)
238 if (start >= buffer_len)
239 PQXX_UNLIKELY
return std::string::npos;
249 static PQXX_PURE std::size_t
250 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
252 if (start >= buffer_len)
253 PQXX_UNLIKELY
return std::string::npos;
255 auto const byte1{get_byte(buffer, start)};
259 if (not between_inc(byte1, 0x81, 0xfe) or (start + 2 > buffer_len))
261 throw_for_encoding_error(
"BIG5", buffer, start, 1);
263 auto const byte2{get_byte(buffer, start + 1)};
265 not between_inc(byte2, 0x40, 0x7e) and
266 not between_inc(byte2, 0xa1, 0xfe))
268 throw_for_encoding_error(
"BIG5", buffer, start, 2);
289 static PQXX_PURE std::size_t
290 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
292 if (start >= buffer_len)
293 return std::string::npos;
295 auto const byte1{get_byte(buffer, start)};
299 if (not between_inc(byte1, 0xa1, 0xf7) or start + 2 > buffer_len)
301 throw_for_encoding_error(
"EUC_CN", buffer, start, 1);
303 auto const byte2{get_byte(buffer, start + 1)};
304 if (not between_inc(byte2, 0xa1, 0xfe))
306 throw_for_encoding_error(
"EUC_CN", buffer, start, 2);
320 static PQXX_PURE std::size_t
321 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
323 if (start >= buffer_len)
324 return std::string::npos;
326 auto const byte1{get_byte(buffer, start)};
330 if (start + 2 > buffer_len)
332 throw_for_encoding_error(
"EUC_JP", buffer, start, 1);
334 auto const byte2{get_byte(buffer, start + 1)};
337 if (not between_inc(byte2, 0xa1, 0xfe))
339 throw_for_encoding_error(
"EUC_JP", buffer, start, 2);
344 if (between_inc(byte1, 0xa1, 0xfe))
346 if (not between_inc(byte2, 0xa1, 0xfe))
348 throw_for_encoding_error(
"EUC_JP", buffer, start, 2);
353 if (byte1 == 0x8f and start + 3 <= buffer_len)
355 auto const byte3{get_byte(buffer, start + 2)};
357 not between_inc(byte2, 0xa1, 0xfe) or
358 not between_inc(byte3, 0xa1, 0xfe))
360 throw_for_encoding_error(
"EUC_JP", buffer, start, 3);
365 throw_for_encoding_error(
"EUC_JP", buffer, start, 1);
373 static PQXX_PURE std::size_t
374 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
376 if (start >= buffer_len)
377 PQXX_UNLIKELY
return std::string::npos;
379 auto const byte1{get_byte(buffer, start)};
383 if (not between_inc(byte1, 0xa1, 0xfe) or start + 2 > buffer_len)
385 throw_for_encoding_error(
"EUC_KR", buffer, start, 1);
387 auto const byte2{get_byte(buffer, start + 1)};
388 if (not between_inc(byte2, 0xa1, 0xfe))
390 throw_for_encoding_error(
"EUC_KR", buffer, start, 1);
400 static PQXX_PURE std::size_t
401 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
403 if (start >= buffer_len)
405 return std::string::npos;
407 auto const byte1{get_byte(buffer, start)};
411 if (start + 2 > buffer_len)
413 throw_for_encoding_error(
"EUC_KR", buffer, start, 1);
415 auto const byte2{get_byte(buffer, start + 1)};
416 if (between_inc(byte1, 0xa1, 0xfe))
418 if (not between_inc(byte2, 0xa1, 0xfe))
420 throw_for_encoding_error(
"EUC_KR", buffer, start, 2);
425 if (byte1 != 0x8e or start + 4 > buffer_len)
427 throw_for_encoding_error(
"EUC_KR", buffer, start, 1);
430 between_inc(byte2, 0xa1, 0xb0) and
431 between_inc(get_byte(buffer, start + 2), 0xa1, 0xfe) and
432 between_inc(get_byte(buffer, start + 3), 0xa1, 0xfe))
436 throw_for_encoding_error(
"EUC_KR", buffer, start, 4);
444 static PQXX_PURE std::size_t
445 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
447 if (start >= buffer_len)
448 PQXX_UNLIKELY
return std::string::npos;
450 auto const byte1{get_byte(buffer, start)};
454 throw_for_encoding_error(
"GB18030", buffer, start, buffer_len - start);
456 if (start + 2 > buffer_len)
458 throw_for_encoding_error(
"GB18030", buffer, start, buffer_len - start);
460 auto const byte2{get_byte(buffer, start + 1)};
461 if (between_inc(byte2, 0x40, 0xfe))
465 throw_for_encoding_error(
"GB18030", buffer, start, 2);
470 if (start + 4 > buffer_len)
472 throw_for_encoding_error(
"GB18030", buffer, start, buffer_len - start);
475 between_inc(byte2, 0x30, 0x39) and
476 between_inc(get_byte(buffer, start + 2), 0x81, 0xfe) and
477 between_inc(get_byte(buffer, start + 3), 0x30, 0x39))
481 throw_for_encoding_error(
"GB18030", buffer, start, 4);
489 static PQXX_PURE std::size_t
490 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
492 if (start >= buffer_len)
493 PQXX_UNLIKELY
return std::string::npos;
495 auto const byte1{get_byte(buffer, start)};
499 if (start + 2 > buffer_len)
501 throw_for_encoding_error(
"GBK", buffer, start, 1);
503 auto const byte2{get_byte(buffer, start + 1)};
505 (between_inc(byte1, 0xa1, 0xa9) and between_inc(byte2, 0xa1, 0xfe)) or
506 (between_inc(byte1, 0xb0, 0xf7) and between_inc(byte2, 0xa1, 0xfe)) or
507 (between_inc(byte1, 0x81, 0xa0) and between_inc(byte2, 0x40, 0xfe) and
509 (between_inc(byte1, 0xaa, 0xfe) and between_inc(byte2, 0x40, 0xa0) and
511 (between_inc(byte1, 0xa8, 0xa9) and between_inc(byte2, 0x40, 0xa0) and
513 (between_inc(byte1, 0xaa, 0xaf) and between_inc(byte2, 0xa1, 0xfe)) or
514 (between_inc(byte1, 0xf8, 0xfe) and between_inc(byte2, 0xa1, 0xfe)) or
515 (between_inc(byte1, 0xa1, 0xa7) and between_inc(byte2, 0x40, 0xa0) and
520 throw_for_encoding_error(
"GBK", buffer, start, 2);
536 static PQXX_PURE std::size_t
537 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
539 if (start >= buffer_len)
540 PQXX_UNLIKELY
return std::string::npos;
542 auto const byte1{get_byte(buffer, start)};
546 if (start + 2 > buffer_len)
548 throw_for_encoding_error(
"JOHAB", buffer, start, 1);
550 auto const byte2{get_byte(buffer, start)};
552 (between_inc(byte1, 0x84, 0xd3) and
553 (between_inc(byte2, 0x41, 0x7e) or between_inc(byte2, 0x81, 0xfe))) or
554 ((between_inc(byte1, 0xd8, 0xde) or between_inc(byte1, 0xe0, 0xf9)) and
555 (between_inc(byte2, 0x31, 0x7e) or between_inc(byte2, 0x91, 0xfe))))
559 throw_for_encoding_error(
"JOHAB", buffer, start, 2);
573 static PQXX_PURE std::size_t
574 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
576 if (start >= buffer_len)
577 PQXX_UNLIKELY
return std::string::npos;
579 auto const byte1{get_byte(buffer, start)};
583 if (start + 2 > buffer_len)
585 throw_for_encoding_error(
"MULE_INTERNAL", buffer, start, 1);
587 auto const byte2{get_byte(buffer, start + 1)};
588 if (between_inc(byte1, 0x81, 0x8d) and byte2 >= 0xa0)
591 if (start + 3 > buffer_len)
593 throw_for_encoding_error(
"MULE_INTERNAL", buffer, start, 2);
596 ((byte1 == 0x9a and between_inc(byte2, 0xa0, 0xdf)) or
597 (byte1 == 0x9b and between_inc(byte2, 0xe0, 0xef)) or
598 (between_inc(byte1, 0x90, 0x99) and byte2 >= 0xa0)) and
602 if (start + 4 > buffer_len)
604 throw_for_encoding_error(
"MULE_INTERNAL", buffer, start, 3);
607 ((byte1 == 0x9c and between_inc(byte2, 0xf0, 0xf4)) or
608 (byte1 == 0x9d and between_inc(byte2, 0xf5, 0xfe))) and
609 get_byte(buffer, start + 2) >= 0xa0 and
610 get_byte(buffer, start + 4) >= 0xa0)
614 throw_for_encoding_error(
"MULE_INTERNAL", buffer, start, 4);
630 static PQXX_PURE std::size_t
631 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
633 if (start >= buffer_len)
634 return std::string::npos;
636 auto const byte1{get_byte(buffer, start)};
637 if (byte1 < 0x80 or between_inc(byte1, 0xa1, 0xdf))
641 not between_inc(byte1, 0x81, 0x9f) and
642 not between_inc(byte1, 0xe0, 0xfc))
644 throw_for_encoding_error(
"SJIS", buffer, start, 1);
646 if (start + 2 > buffer_len)
648 throw_for_encoding_error(
"SJIS", buffer, start, buffer_len - start);
650 auto const byte2{get_byte(buffer, start + 1)};
653 throw_for_encoding_error(
"SJIS", buffer, start, 2);
655 if (between_inc(byte2, 0x40, 0x9e) or between_inc(byte2, 0x9f, 0xfc))
659 throw_for_encoding_error(
"SJIS", buffer, start, 2);
667 static PQXX_PURE std::size_t
668 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
670 if (start >= buffer_len)
671 PQXX_UNLIKELY
return std::string::npos;
673 auto const byte1{get_byte(buffer, start)};
677 if (start + 2 > buffer_len)
679 throw_for_encoding_error(
"UHC", buffer, start, buffer_len - start);
681 auto const byte2{get_byte(buffer, start + 1)};
682 if (between_inc(byte1, 0x80, 0xc6))
685 between_inc(byte2, 0x41, 0x5a) or between_inc(byte2, 0x61, 0x7a) or
686 between_inc(byte2, 0x80, 0xfe))
690 throw_for_encoding_error(
"UHC", buffer, start, 2);
693 if (between_inc(byte1, 0xa1, 0xfe))
695 if (not between_inc(byte2, 0xa1, 0xfe))
697 throw_for_encoding_error(
"UHC", buffer, start, 2);
702 throw_for_encoding_error(
"UHC", buffer, start, 1);
710 static PQXX_PURE std::size_t
711 call(
char const buffer[], std::size_t buffer_len, std::size_t start)
713 if (start >= buffer_len)
714 PQXX_UNLIKELY
return std::string::npos;
716 auto const byte1{get_byte(buffer, start)};
720 if (start + 2 > buffer_len)
722 throw_for_encoding_error(
"UTF8", buffer, start, buffer_len - start);
724 auto const byte2{get_byte(buffer, start + 1)};
725 if (between_inc(byte1, 0xc0, 0xdf))
727 if (not between_inc(byte2, 0x80, 0xbf))
729 throw_for_encoding_error(
"UTF8", buffer, start, 2);
734 if (start + 3 > buffer_len)
736 throw_for_encoding_error(
"UTF8", buffer, start, buffer_len - start);
738 auto const byte3{get_byte(buffer, start + 2)};
739 if (between_inc(byte1, 0xe0, 0xef))
741 if (between_inc(byte2, 0x80, 0xbf) and between_inc(byte3, 0x80, 0xbf))
745 throw_for_encoding_error(
"UTF8", buffer, start, 3);
748 if (start + 4 > buffer_len)
750 throw_for_encoding_error(
"UTF8", buffer, start, buffer_len - start);
752 if (between_inc(byte1, 0xf0, 0xf7))
755 between_inc(byte2, 0x80, 0xbf) and between_inc(byte3, 0x80, 0xbf) and
756 between_inc(get_byte(buffer, start + 3), 0x80, 0xbf))
760 throw_for_encoding_error(
"UTF8", buffer, start, 4);
764 throw_for_encoding_error(
"UTF8", buffer, start, 1);
784constexpr inline encoding_group
789 case encoding_group::MONOBYTE:
790 case encoding_group::EUC_CN:
791 case encoding_group::EUC_JP:
792 case encoding_group::EUC_KR:
793 case encoding_group::EUC_TW:
794 case encoding_group::MULE_INTERNAL:
795 case encoding_group::UTF8:
799 return encoding_group::MONOBYTE;
801 default: PQXX_UNLIKELY
return enc;
813template<
char... NEEDLE>
820 case encoding_group::MONOBYTE:
821 return pqxx::internal::find_ascii_char<
822 encoding_group::MONOBYTE, NEEDLE...>;
823 case encoding_group::BIG5:
824 return pqxx::internal::find_ascii_char<encoding_group::BIG5, NEEDLE...>;
825 case encoding_group::GB18030:
826 return pqxx::internal::find_ascii_char<encoding_group::GB18030, NEEDLE...>;
827 case encoding_group::GBK:
828 return pqxx::internal::find_ascii_char<encoding_group::GBK, NEEDLE...>;
829 case encoding_group::JOHAB:
830 return pqxx::internal::find_ascii_char<encoding_group::JOHAB, NEEDLE...>;
831 case encoding_group::SJIS:
832 return pqxx::internal::find_ascii_char<encoding_group::SJIS, NEEDLE...>;
833 case encoding_group::UHC:
834 return pqxx::internal::find_ascii_char<encoding_group::UHC, NEEDLE...>;
838 "Unexpected encoding group: ", as_if,
" (mapped from ", enc,
").")};
847template<
char... NEEDLE>
854 case encoding_group::MONOBYTE:
856 encoding_group::MONOBYTE, NEEDLE...>;
857 case encoding_group::BIG5:
859 case encoding_group::GB18030:
861 encoding_group::GB18030, NEEDLE...>;
862 case encoding_group::GBK:
864 case encoding_group::JOHAB:
866 case encoding_group::SJIS:
868 case encoding_group::UHC:
873 "Unexpected encoding group: ", as_if,
" (mapped from ", enc,
").")};
Invalid argument passed to libpqxx, similar to std::invalid_argument.
Definition except.hxx:266
Internal error in libpqxx library.
Definition except.hxx:242
Internal items for libpqxx's own use. Do not use these yourself.
Definition encodings.cxx:33
std::string concat(TYPE... item)
Efficiently combine a bunch of items into one big string.
Definition concat.hxx:31
PQXX_PURE constexpr char_finder_func * get_char_finder(encoding_group enc)
Look up a character search function for an encoding group.
Definition encodings.hxx:815
PQXX_PURE std::size_t find_s_ascii_char(std::string_view haystack, std::size_t here)
Find first of NEEDLE ASCII chars in haystack.
Definition encodings.hxx:211
PQXX_PURE char const * name_encoding(int encoding_id)
Return PostgreSQL's name for encoding enum value.
std::size_t(char const buffer[], std::size_t buffer_len, std::size_t start) glyph_scanner_func
Function type: "find the end of the current glyph".
Definition encoding_group.hxx:53
PQXX_PURE constexpr char_finder_func * get_s_char_finder(encoding_group enc)
Look up a "sentry" character search function for an encoding group.
Definition encodings.hxx:849
pqxx::internal::encoding_group enc_group(std::string_view encoding_name)
Convert libpq encoding name to its libpqxx encoding group.
Definition encodings.cxx:35
void for_glyphs(encoding_group enc, CALLABLE callback, char const buffer[], std::size_t buffer_len, std::size_t start=0)
Iterate over the glyphs in a buffer.
Definition encodings.hxx:87
PQXX_LIBEXPORT glyph_scanner_func * get_glyph_scanner(encoding_group)
Look up the glyph scanner function for a given encoding group.
std::size_t(std::string_view haystack, std::size_t start) char_finder_func
Function type: "find first occurrence of any of a specific set of ASCII characters".
Definition encoding_group.hxx:71
std::size_t find_char(glyph_scanner_func *scanner, std::string_view haystack, std::size_t here=0u)
Find any of the ASCII characters NEEDLE in haystack.
Definition encodings.hxx:52
constexpr encoding_group map_ascii_search_group(encoding_group enc) noexcept
Just for searching an ASCII character, what encoding can we use here?
Definition encodings.hxx:785
The home of all libpqxx classes, functions, templates, etc.
Definition array.cxx:27
Wrapper struct template for "find next glyph" functions.
Definition encodings.hxx:143
static PQXX_PURE std::size_t call(char const buffer[], std::size_t buffer_len, std::size_t start)
Find the next glyph in buffer after position start.