Libparserutils
codec_utf8.c
Go to the documentation of this file.
1/*
2 * This file is part of LibParserUtils.
3 * Licensed under the MIT License,
4 * http://www.opensource.org/licenses/mit-license.php
5 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6 */
7
8#include <assert.h>
9#include <stdlib.h>
10#include <string.h>
11
13
16#include "utils/endian.h"
17#include "utils/utils.h"
18
22typedef struct charset_utf8_codec {
/* UTF-8 charset codec state.
 * NOTE(review): this listing omits several lines of the struct; the
 * base member, the three buffer arrays and the closing brace are not
 * visible here. */
/* Number of bytes in inval_buf */
25#define INVAL_BUFSIZE (32)
29 size_t inval_len; /**< Byte length of inval_buf */
30
/* Number of UCS-4 characters in read_buf */
31#define READ_BUFSIZE (8)
35 size_t read_len; /**< Character length of read_buf */
/* Number of UCS-4 characters in write_buf */
37#define WRITE_BUFSIZE (8)
41 size_t write_len; /**< Character length of write_buf */
44
/* Forward declarations of the codec's file-local functions.
 * NOTE(review): the first line of most of these prototypes is omitted
 * from this listing, so only the trailing parameter lines remain. */
45static bool charset_utf8_codec_handles_charset(const char *charset);
46static parserutils_error charset_utf8_codec_create(const char *charset,
52 const uint8_t **source, size_t *sourcelen,
53 uint8_t **dest, size_t *destlen);
56 const uint8_t **source, size_t *sourcelen,
57 uint8_t **dest, size_t *destlen);
62 const uint8_t **source, size_t *sourcelen,
63 uint8_t **dest, size_t *destlen);
66 uint32_t ucs4, uint8_t **dest, size_t *destlen);
67
/* Determine whether this codec handles a specific charset.
 *
 * charset: name of the charset to test; it is passed to strlen(), so it
 *          must be a NUL-terminated string.
 *
 * NOTE(review): two lines of the return expression are omitted from this
 * listing. What remains compares, for equality, a value computed with
 * strlen(charset) against one computed with SLEN("UTF-8") -- presumably
 * two parserutils_charset_mibenum_from_name() lookups; confirm against
 * the full source. */
74bool charset_utf8_codec_handles_charset(const char *charset)
75{
77 strlen(charset)) ==
79 SLEN("UTF-8"));
80}
81
93{
/* Create a UTF-8 codec.
 *
 * The charset argument is deliberately ignored. The codec is allocated
 * with malloc(); PARSERUTILS_NOMEM is returned if that fails. On success
 * the three state buffers are marked empty, the new object is stored in
 * *codec and PARSERUTILS_OK is returned.
 *
 * NOTE(review): the declaration of `c` and the vtable assignments are
 * among the lines omitted from this listing. */
95
96 UNUSED(charset);
97
98 c = malloc(sizeof(charset_utf8_codec));
99 if (c == NULL)
100 return PARSERUTILS_NOMEM;
101
/* Only the first element of each buffer is cleared; the zero length
 * fields are what mark the buffers as empty. */
102 c->inval_buf[0] = '\0';
103 c->inval_len = 0;
104
105 c->read_buf[0] = 0;
106 c->read_len = 0;
107
108 c->write_buf[0] = 0;
109 c->write_len = 0;
110
111 /* Finally, populate vtable */
116
/* Hand the codec back through its base-class pointer type */
117 *codec = (parserutils_charset_codec *) c;
118
119 return PARSERUTILS_OK;
120}
121
134
163 const uint8_t **source, size_t *sourcelen,
164 uint8_t **dest, size_t *destlen)
165{
/* Encode a chunk of UCS-4 (big endian) data into UTF-8.
 *
 * source/sourcelen: input cursor and remaining byte count; both are
 *                   advanced by 4 for every character consumed.
 * dest/destlen:     output cursor and remaining space, handed to
 *                   UTF8_FROM_UCS4 for each character.
 *
 * Returns PARSERUTILS_OK once the input is exhausted. Returns
 * PARSERUTILS_NOMEM when a character could not be written; that
 * character is kept in c->write_buf (host-endian) and is emitted first
 * on the next call.
 *
 * NOTE(review): the first signature line and the declaration of `c` are
 * among the lines omitted from this listing. */
167 uint32_t ucs4;
168 uint32_t *towrite;
169 size_t towritelen;
170 parserutils_error error;
171
172 /* Process any outstanding characters from the previous call */
173 if (c->write_len > 0) {
174 uint32_t *pwrite = c->write_buf;
175
176 while (c->write_len > 0) {
177 UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
178 if (error != PARSERUTILS_OK) {
179 uint32_t len;
180 assert(error == PARSERUTILS_NOMEM);
181
182 /* Insufficient output buffer space */
/* Move the still-unwritten characters to the front of
 * write_buf so the next call resumes from index 0. */
183 for (len = 0; len < c->write_len; len++) {
184 c->write_buf[len] = pwrite[len];
185 }
186
187 return PARSERUTILS_NOMEM;
188 }
189
190 pwrite++;
191 c->write_len--;
192 }
193 }
194
195 /* Now process the characters for this call */
/* NOTE(review): *sourcelen is reduced by 4 per iteration, so a length
 * that is not a multiple of 4 would wrap the size_t and read past the
 * input -- presumably callers always pass whole UCS-4 units; confirm. */
196 while (*sourcelen > 0) {
/* Loads 4 bytes through a uint32_t pointer.
 * NOTE(review): assumes *source is suitably aligned -- TODO confirm. */
197 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
198 towrite = &ucs4;
199 towritelen = 1;
200
201 /* Output current characters */
202 while (towritelen > 0) {
203 UTF8_FROM_UCS4(towrite[0], dest, destlen, error);
204 if (error != PARSERUTILS_OK) {
205 uint32_t len;
206 assert(error == PARSERUTILS_NOMEM);
207
208 /* Insufficient output space */
209 assert(towritelen < WRITE_BUFSIZE);
210
211 c->write_len = towritelen;
212
213 /* Copy pending chars to save area, for
214 * processing next call. */
215 for (len = 0; len < towritelen; len++)
216 c->write_buf[len] = towrite[len];
217
218 /* Claim character we've just buffered,
219 * so it's not reprocessed */
220 *source += 4;
221 *sourcelen -= 4;
222
223 return PARSERUTILS_NOMEM;
224 }
225
226 towrite++;
227 towritelen--;
228 }
229
230 *source += 4;
231 *sourcelen -= 4;
232 }
233
234 return PARSERUTILS_OK;
235}
236
279 const uint8_t **source, size_t *sourcelen,
280 uint8_t **dest, size_t *destlen)
281{
/* Decode a chunk of UTF-8 data into UCS-4 (big endian).
 *
 * Works in three stages:
 *  1. flush characters left in c->read_buf by a previous call;
 *  2. if the previous call ended in an incomplete sequence, append new
 *     input to c->inval_buf and process the combined bytes;
 *  3. process the remaining input until *sourcelen reaches 0.
 *
 * Returns PARSERUTILS_OK when all input has been handled,
 * PARSERUTILS_NOMEM when the output buffer is too small, or whatever
 * other error the per-character step reports.
 *
 * NOTE(review): the first signature line, the declaration of `c` and
 * the first line of both inner calls are omitted from this listing;
 * presumably the calls are to charset_utf8_codec_read_char -- confirm. */
283 parserutils_error error;
284
285 if (c->read_len > 0) {
286 /* Output left over from last decode */
287 uint32_t *pread = c->read_buf;
288
/* Characters are only written while there is room for ALL of the
 * remaining buffered characters (4 bytes each). */
289 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
290 *((uint32_t *) (void *) *dest) =
291 endian_host_to_big(pread[0]);
292
293 *dest += 4;
294 *destlen -= 4;
295
296 pread++;
297 c->read_len--;
298 }
299
300 if (*destlen < c->read_len * 4) {
301 /* Ran out of output buffer */
302 size_t i;
303
304 /* Shuffle remaining output down */
305 for (i = 0; i < c->read_len; i++)
306 c->read_buf[i] = pread[i];
307
308 return PARSERUTILS_NOMEM;
309 }
310 }
311
312 if (c->inval_len > 0) {
313 /* The last decode ended in an incomplete sequence.
314 * Fill up inval_buf with data from the start of the
315 * new chunk and process it. */
316 uint8_t *in = c->inval_buf;
317 size_t ol = c->inval_len;
/* Take as much new input as fits while leaving one byte of
 * inval_buf spare. */
318 size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
319 size_t orig_l = l;
320
321 memcpy(c->inval_buf + ol, *source, l);
322
/* l becomes the total byte count in inval_buf: old + newly copied */
323 l += c->inval_len;
324
/* The call receives &in and &l, so it can advance the cursor and
 * shrink l as it consumes bytes from inval_buf. */
326 (const uint8_t **) &in, &l, dest, destlen);
327 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
328 return error;
329 }
330
331 /* And now, fix up source pointers */
/* Advance by (orig_l - l), clamped at 0 when that difference is
 * negative after the signed cast. */
332 *source += max((signed) (orig_l - l), 0);
333 *sourcelen -= max((signed) (orig_l - l), 0);
334
335 /* Failed to resolve an incomplete character and
336 * ran out of buffer space. No recovery strategy
337 * possible, so explode everywhere. */
338 assert((orig_l + ol) - l != 0);
339
340 /* Report memory exhaustion case from above */
341 if (error != PARSERUTILS_OK)
342 return error;
343 }
344
345 /* Finally, the "normal" case; process all outstanding characters */
346 while (*sourcelen > 0) {
348 source, sourcelen, dest, destlen);
349 if (error != PARSERUTILS_OK) {
350 return error;
351 }
352 }
353
354 return PARSERUTILS_OK;
355}
356
364{
/* Clear a UTF-8 codec's encoding state.
 *
 * Marks the incomplete-input, pending-decode and pending-encode buffers
 * as empty and always returns PARSERUTILS_OK.
 *
 * NOTE(review): the signature and the declaration of `c` are among the
 * lines omitted from this listing. */
366
367 c->inval_buf[0] = '\0';
368 c->inval_len = 0;
369
370 c->read_buf[0] = 0;
371 c->read_len = 0;
372
373 c->write_buf[0] = 0;
374 c->write_len = 0;
375
376 return PARSERUTILS_OK;
377}
378
379
409 const uint8_t **source, size_t *sourcelen,
410 uint8_t **dest, size_t *destlen)
411{
/* Read one character from the UTF-8 input and output it as UCS-4
 * (big endian).
 *
 * Outcomes, by the result of UTF8_TO_UCS4:
 *  - PARSERUTILS_OK:       the character is output and the input cursor
 *                          advanced past it.
 *  - PARSERUTILS_NEEDDATA: the remaining input is stashed in
 *                          c->inval_buf, all input is consumed and
 *                          PARSERUTILS_OK is returned.
 *  - PARSERUTILS_INVALID:  in strict error mode PARSERUTILS_INVALID is
 *                          returned; otherwise U+FFFD is output and the
 *                          cursor skips to the next sequence.
 *
 * NOTE(review): the first signature line and the first line of each
 * output call are omitted from this listing; presumably the calls are
 * to charset_utf8_codec_output_decoded_char -- confirm. */
412 uint32_t ucs4;
413 size_t sucs4;
414 parserutils_error error;
415
416 /* Convert a single character */
417 {
/* Local copies are handed to the macro; the caller's cursor is only
 * advanced explicitly further down. */
418 const uint8_t *src = *source;
419 size_t srclen = *sourcelen;
420 uint32_t *uptr = &ucs4;
421 size_t *usptr = &sucs4;
422 UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
423 }
424 if (error == PARSERUTILS_OK) {
425 /* Read a character */
427 ucs4, dest, destlen);
/* NOMEM is treated like success here: the input is consumed in both
 * cases (presumably because the output routine has stashed the
 * character in read_buf -- confirm). */
428 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
429 /* output succeeded; update source pointers */
430 *source += sucs4;
431 *sourcelen -= sucs4;
432 }
433
434 /* Clear inval buffer */
435 c->inval_buf[0] = '\0';
436 c->inval_len = 0;
437
438 return error;
439 } else if (error == PARSERUTILS_NEEDDATA) {
440 /* Incomplete input sequence */
441 assert(*sourcelen < INVAL_BUFSIZE);
442
/* memmove rather than memcpy: *source may point into inval_buf
 * itself when called from the decode fix-up path. */
443 memmove(c->inval_buf, *source, *sourcelen);
444 c->inval_buf[*sourcelen] = '\0';
445 c->inval_len = *sourcelen;
446
447 *source += *sourcelen;
448 *sourcelen = 0;
449
450 return PARSERUTILS_OK;
451 } else if (error == PARSERUTILS_INVALID) {
452 /* Illegal input sequence */
453 uint32_t nextchar;
454
455 /* Strict errormode; simply flag invalid character */
456 if (c->base.errormode ==
458 /* Clear inval buffer */
459 c->inval_buf[0] = '\0';
460 c->inval_len = 0;
461
462 return PARSERUTILS_INVALID;
463 }
464
465 /* Find next valid UTF-8 sequence.
466 * We're processing client-provided data, so let's
467 * be paranoid about its validity. */
468 {
469 const uint8_t *src = *source;
470 size_t srclen = *sourcelen;
471 uint32_t off = 0;
472 uint32_t *ncptr = &nextchar;
473
474 UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
475 }
476 if (error != PARSERUTILS_OK) {
477 if (error == PARSERUTILS_NEEDDATA) {
478 /* Need more data to be sure */
479 assert(*sourcelen < INVAL_BUFSIZE);
480
481 memmove(c->inval_buf, *source, *sourcelen);
482 c->inval_buf[*sourcelen] = '\0';
483 c->inval_len = *sourcelen;
484
485 *source += *sourcelen;
486 *sourcelen = 0;
487
/* Input is already fully consumed; skip nothing more below */
488 nextchar = 0;
489 } else {
490 return error;
491 }
492 }
493
/* NOTE(review): this reset is unconditional, so when the NEEDDATA
 * branch above has just stashed the input in inval_buf, inval_len is
 * set straight back to 0 and those bytes are dropped -- confirm this
 * is intended. */
494 /* Clear inval buffer */
495 c->inval_buf[0] = '\0';
496 c->inval_len = 0;
497
498 /* output U+FFFD and continue processing. */
500 0xFFFD, dest, destlen);
501 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
502 /* output succeeded; update source pointers */
503 *source += nextchar;
504 *sourcelen -= nextchar;
505 }
506
507 return error;
508 }
509
/* Reached only if UTF8_TO_UCS4 reported some other error code */
510 return PARSERUTILS_OK;
511}
512
524 uint32_t ucs4, uint8_t **dest, size_t *destlen)
525{
/* Output a UCS-4 character (big endian).
 *
 * ucs4:         character to output, in host byte order.
 * dest/destlen: output cursor and remaining space; advanced by 4 bytes
 *               on success.
 *
 * Returns PARSERUTILS_OK when the character was written. When fewer
 * than 4 bytes of output remain, the character is saved as the sole
 * entry of c->read_buf and PARSERUTILS_NOMEM is returned.
 *
 * NOTE(review): the first signature line is omitted from this listing. */
526 if (*destlen < 4) {
527 /* Run out of output buffer */
528 c->read_len = 1;
529 c->read_buf[0] = ucs4;
530
531 return PARSERUTILS_NOMEM;
532 }
533
/* Stores 4 bytes through a uint32_t pointer.
 * NOTE(review): assumes *dest is suitably aligned -- TODO confirm. */
534 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
535 *dest += 4;
536 *destlen -= 4;
537
538 return PARSERUTILS_OK;
539}
540
541
546
@ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
Abort processing if unrepresentable character encountered.
Definition codec.h:64
size_t len
Definition codec_8859.c:23
static parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Decode a chunk of UTF-8 data into UCS-4 (big endian)
Definition codec_utf8.c:278
#define INVAL_BUFSIZE
Definition codec_utf8.c:25
const parserutils_charset_handler charset_utf8_codec_handler
Definition codec_utf8.c:542
static bool charset_utf8_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition codec_utf8.c:74
static parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Encode a chunk of UCS-4 (big endian) data into UTF-8.
Definition codec_utf8.c:162
static parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Read a single character from the UTF-8 input and output it as UCS-4 (big endian)
Definition codec_utf8.c:408
#define READ_BUFSIZE
Definition codec_utf8.c:31
struct charset_utf8_codec charset_utf8_codec
UTF-8 charset codec.
static parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
Output a UCS-4 character (big endian)
Definition codec_utf8.c:523
#define WRITE_BUFSIZE
Definition codec_utf8.c:37
static parserutils_error charset_utf8_codec_create(const char *charset, parserutils_charset_codec **codec)
Create a UTF-8 codec.
Definition codec_utf8.c:91
static parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
Clear a UTF-8 codec's encoding state.
Definition codec_utf8.c:363
static parserutils_error charset_utf8_codec_destroy(parserutils_charset_codec *codec)
Destroy a UTF-8 codec.
Definition codec_utf8.c:128
static uint32_t endian_host_to_big(uint32_t host)
Definition endian.h:24
static uint32_t endian_big_to_host(uint32_t big)
Definition endian.h:32
parserutils_error
Definition errors.h:18
@ PARSERUTILS_OK
Definition errors.h:19
@ PARSERUTILS_NEEDDATA
Definition errors.h:25
@ PARSERUTILS_INVALID
Definition errors.h:23
@ PARSERUTILS_NOMEM
Definition errors.h:21
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition aliases.c:107
UTF-8 charset codec.
Definition codec_utf8.c:22
size_t write_len
Character length of write_buf.
Definition codec_utf8.c:41
parserutils_charset_codec base
Base class.
Definition codec_utf8.c:23
size_t read_len
Character length of read_buf.
Definition codec_utf8.c:35
uint32_t write_buf[WRITE_BUFSIZE]
Buffer for partial output sequences (encode) (host-endian)
Definition codec_utf8.c:38
uint8_t inval_buf[INVAL_BUFSIZE]
Buffer for fixing up incomplete input sequences.
Definition codec_utf8.c:26
uint32_t read_buf[READ_BUFSIZE]
Buffer for partial output sequences (decode) (host-endian)
Definition codec_utf8.c:32
Core charset codec definition; implementations extend this.
Definition codec_impl.h:19
parserutils_charset_codec_errormode errormode
error mode
Definition codec_impl.h:22
parserutils_error(* encode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition codec_impl.h:26
parserutils_error(* destroy)(parserutils_charset_codec *codec)
Definition codec_impl.h:25
parserutils_error(* decode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition codec_impl.h:29
struct parserutils_charset_codec::@271367034342366162232062053053007137175253257255 handler
Vtable for handler code.
parserutils_error(* reset)(parserutils_charset_codec *codec)
Definition codec_impl.h:32
Codec factory component definition.
Definition codec_impl.h:39
UTF-8 manipulation macros (implementation).
#define UTF8_TO_UCS4(s, len, ucs4, clen, error)
Convert a UTF-8 multibyte sequence into a single UCS-4 character.
Definition utf8impl.h:34
#define UTF8_FROM_UCS4(ucs4, s, len, error)
Convert a single UCS-4 character into a UTF-8 multibyte sequence.
Definition utf8impl.h:123
#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error)
Skip to start of next sequence in UTF-8 input.
Definition utf8impl.h:303
#define UNUSED(x)
Definition utils.h:25
#define min(a, b)
Definition utils.h:16
#define SLEN(s)
Definition utils.h:21
#define max(a, b)
Definition utils.h:12