Libparserutils
codec_utf16.c
Go to the documentation of this file.
1/*
2 * This file is part of LibParserUtils.
3 * Licensed under the MIT License,
4 * http://www.opensource.org/licenses/mit-license.php
5 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6 */
7
8#include <assert.h>
9#include <stdlib.h>
10#include <string.h>
11
14
16#include "utils/endian.h"
17#include "utils/utils.h"
18
/**
 * UTF-16 charset codec.
 *
 * Per-codec state: a byte buffer for incomplete input sequences and two
 * small character buffers for output that did not fit in the caller's
 * destination buffer.
 *
 * NOTE(review): this listing omits the member declarations for the base
 * class and for inval_buf, read_buf and write_buf, as well as the closing
 * brace and typedef name. Restore them from the upstream file.
 */
22typedef struct charset_utf16_codec {
/* Capacity, in bytes, of the buffer for incomplete input sequences */
25#define INVAL_BUFSIZE (32)
29 size_t inval_len; /**< Byte length of inval_buf */
30
/* Capacity, in characters, of the pending decode output buffer */
31#define READ_BUFSIZE (8)
/* Number of characters currently held in read_buf */
35 size_t read_len;
/* Capacity, in characters, of the pending encode output buffer */
37#define WRITE_BUFSIZE (8)
/* Number of characters currently held in write_buf */
41 size_t write_len;
44
/* Forward declarations of this file's codec functions.
 *
 * NOTE(review): the opening line of several prototypes (the line carrying
 * the return type and function name) is absent from this listing, so only
 * their trailing parameter lines appear below. */
45static bool charset_utf16_codec_handles_charset(const char *charset);
46static parserutils_error charset_utf16_codec_create(const char *charset,
52 const uint8_t **source, size_t *sourcelen,
53 uint8_t **dest, size_t *destlen);
56 const uint8_t **source, size_t *sourcelen,
57 uint8_t **dest, size_t *destlen);
62 const uint8_t **source, size_t *sourcelen,
63 uint8_t **dest, size_t *destlen);
66 uint32_t ucs4, uint8_t **dest, size_t *destlen);
67
/**
 * Determine whether this codec handles a specific charset.
 *
 * \param charset  Charset name; strlen() is applied to it, so it must be a
 *                 non-NULL, NUL-terminated string
 * \return true if the MIB enum looked up for \a charset equals the value on
 *         the right-hand side of the comparison, false otherwise
 *
 * NOTE(review): the right-hand operand of the == (original line 78) is
 * missing from this listing; presumably it is the MIB enum for "UTF-16".
 * Confirm against upstream.
 */
74bool charset_utf16_codec_handles_charset(const char *charset)
75{
76 return parserutils_charset_mibenum_from_name(charset, strlen(charset))
77 ==
79}
80
/**
 * Create a UTF-16 codec.
 *
 * Allocates a charset_utf16_codec with malloc(), marks its three buffers
 * as empty and hands the object back through *codec.
 *
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if the allocation fails (*codec is not written)
 *
 * NOTE(review): this listing omits the function signature, the declaration
 * of the local `c`, and the statements that followed the "populate vtable"
 * comment (presumably the handler function-pointer assignments). Restore
 * them from upstream.
 */
92{
94
/* The charset name is not consulted by this function */
95 UNUSED(charset);
96
97 c = malloc(sizeof(charset_utf16_codec));
98 if (c == NULL)
99 return PARSERUTILS_NOMEM;
100
/* Only the first element and the length of each buffer are initialised;
 * the length of zero is what marks the buffer as empty. */
101 c->inval_buf[0] = '\0';
102 c->inval_len = 0;
103
104 c->read_buf[0] = 0;
105 c->read_len = 0;
106
107 c->write_buf[0] = 0;
108 c->write_len = 0;
109
110 /* Finally, populate vtable */
115
/* Return the new codec through its base-class pointer type */
116 *codec = (parserutils_charset_codec *) c;
117
118 return PARSERUTILS_OK;
119}
120
133
/**
 * Encode a chunk of UCS-4 (big endian) data into UTF-16.
 *
 * \param source     Pointer to pointer to input; advanced by 4 bytes for
 *                   every character consumed
 * \param sourcelen  Pointer to input length in bytes; reduced by 4 for
 *                   every character consumed
 * \param dest       Pointer to pointer to output; advanced by the number of
 *                   bytes written
 * \param destlen    Pointer to remaining output space in bytes; reduced by
 *                   the number of bytes written
 * \return PARSERUTILS_OK once *sourcelen reaches zero,
 *         PARSERUTILS_NOMEM if the output buffer fills up first
 *
 * A character that does not fit in the output is parked in c->write_buf
 * (host endian) and flushed at the start of the next call.
 *
 * NOTE(review): this listing omits the first line of the signature, the
 * declaration of the local `c`, and the opening line of both conversion
 * calls (the statements that end in "buf, &len);"). Presumably these call
 * parserutils_charset_utf16_from_ucs4(); confirm against upstream.
 */
162 const uint8_t **source, size_t *sourcelen,
163 uint8_t **dest, size_t *destlen)
164{
166 uint32_t ucs4;
167 uint32_t *towrite;
168 size_t towritelen;
169 parserutils_error error;
170
171 /* Process any outstanding characters from the previous call */
172 if (c->write_len > 0) {
173 uint32_t *pwrite = c->write_buf;
174 uint8_t buf[4];
175 size_t len;
176
177 while (c->write_len > 0) {
/* NOTE(review): the line that opens this call is missing here */
179 pwrite[0], buf, &len);
180 assert(error == PARSERUTILS_OK);
181
182 if (*destlen < len) {
183 /* Insufficient output buffer space */
/* Move the still-pending entries to the front of write_buf;
 * write_len already holds their count. `len` is reused as
 * the loop index. */
184 for (len = 0; len < c->write_len; len++)
185 c->write_buf[len] = pwrite[len];
186
187 return PARSERUTILS_NOMEM;
188 }
189
190 memcpy(*dest, buf, len);
191
192 *dest += len;
193 *destlen -= len;
194
195 pwrite++;
196 c->write_len--;
197 }
198 }
199
200 /* Now process the characters for this call */
201 while (*sourcelen > 0) {
/* NOTE(review): reads 4 bytes through a uint32_t cast. No check is
 * visible here that *sourcelen >= 4 or that *source is suitably
 * aligned; with 1-3 trailing bytes the "-= 4" below would wrap.
 * Confirm that callers guarantee whole, aligned UCS-4 units. */
202 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
203 towrite = &ucs4;
204 towritelen = 1;
205
206 /* Output current characters */
207 while (towritelen > 0) {
208 uint8_t buf[4];
209 size_t len;
210
/* NOTE(review): the line that opens this call is missing here */
212 towrite[0], buf, &len);
213 assert(error == PARSERUTILS_OK);
214
215 if (*destlen < len) {
216 /* Insufficient output space */
217 assert(towritelen < WRITE_BUFSIZE);
218
219 c->write_len = towritelen;
220
221 /* Copy pending chars to save area, for
222 * processing next call. */
223 for (len = 0; len < towritelen; len++)
224 c->write_buf[len] = towrite[len];
225
226 /* Claim character we've just buffered,
227 * so it's not reprocessed */
228 *source += 4;
229 *sourcelen -= 4;
230
231 return PARSERUTILS_NOMEM;
232 }
233
234 memcpy(*dest, buf, len);
235
236 *dest += len;
237 *destlen -= len;
238
239 towrite++;
240 towritelen--;
241 }
242
243 *source += 4;
244 *sourcelen -= 4;
245 }
246
/* In the visible code `error` is only read inside assert(); this cast
 * presumably silences an unused-variable warning when asserts are
 * compiled out. */
247 (void) error;
248
249 return PARSERUTILS_OK;
250}
251
/**
 * Decode a chunk of UTF-16 data into UCS-4 (big endian)
 *
 * Works in three stages:
 *  1. flush any decoded characters left in c->read_buf by a previous call;
 *  2. if a previous call ended mid-sequence, top up c->inval_buf from the
 *     new input and process it;
 *  3. process the remaining input until *sourcelen reaches zero.
 *
 * \param source     Pointer to pointer to input; advanced past consumed bytes
 * \param sourcelen  Pointer to input length in bytes; reduced accordingly
 * \param dest       Pointer to pointer to output; advanced by 4 per
 *                   character written
 * \param destlen    Pointer to remaining output space in bytes
 * \return PARSERUTILS_OK when all input has been consumed,
 *         PARSERUTILS_NOMEM if pending output could not be flushed,
 *         otherwise the first non-OK value returned by a per-character call
 *
 * NOTE(review): this listing omits the first line of the signature, the
 * declaration of the local `c`, and the opening line of the two calls whose
 * argument lists continue below (presumably
 * charset_utf16_codec_read_char()). Confirm against upstream.
 */
294 const uint8_t **source, size_t *sourcelen,
295 uint8_t **dest, size_t *destlen)
296{
298 parserutils_error error;
299
300 if (c->read_len > 0) {
301 /* Output left over from last decode */
302 uint32_t *pread = c->read_buf;
303
/* The condition demands room for every remaining entry (read_len * 4
 * bytes), not just the next one, before any entry is written. */
304 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
305 *((uint32_t *) (void *) *dest) =
306 endian_host_to_big(pread[0]);
307
308 *dest += 4;
309 *destlen -= 4;
310
311 pread++;
312 c->read_len--;
313 }
314
315 if (*destlen < c->read_len * 4) {
316 /* Ran out of output buffer */
317 size_t i;
318
319 /* Shuffle remaining output down */
320 for (i = 0; i < c->read_len; i++)
321 c->read_buf[i] = pread[i];
322
323 return PARSERUTILS_NOMEM;
324 }
325 }
326
327 if (c->inval_len > 0) {
328 /* The last decode ended in an incomplete sequence.
329 * Fill up inval_buf with data from the start of the
330 * new chunk and process it. */
331 uint8_t *in = c->inval_buf;
332 size_t ol = c->inval_len;
/* Copy at most as many new bytes as fit while leaving one byte of
 * inval_buf spare. */
333 size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
334 size_t orig_l = l;
335
336 memcpy(c->inval_buf + ol, *source, l);
337
/* l now counts old + new bytes available in inval_buf */
338 l += c->inval_len;
339
/* NOTE(review): the line that opens this call is missing here; the
 * call receives `in`/`l` by address and so can advance them. */
341 (const uint8_t **) &in, &l, dest, destlen);
342 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
343 return error;
344 }
345
346 /* And now, fix up source pointers */
/* Advance by (orig_l - l) interpreted as a signed value and clamped
 * at zero, where l is what the call above left unconsumed. */
347 *source += max((signed) (orig_l - l), 0);
348 *sourcelen -= max((signed) (orig_l - l), 0);
349
350 /* Failed to resolve an incomplete character and
351 * ran out of buffer space. No recovery strategy
352 * possible, so explode everywhere. */
353 assert((orig_l + ol) - l != 0);
354
355 /* Report memory exhaustion case from above */
356 if (error != PARSERUTILS_OK)
357 return error;
358 }
359
360 /* Finally, the "normal" case; process all outstanding characters */
361 while (*sourcelen > 0) {
/* NOTE(review): the line that opens this call is missing here */
363 source, sourcelen, dest, destlen);
364 if (error != PARSERUTILS_OK) {
365 return error;
366 }
367 }
368
369 return PARSERUTILS_OK;
370}
371
/**
 * Clear a UTF-16 codec's encoding state.
 *
 * Marks the incomplete-input buffer and both pending-output buffers as
 * empty by zeroing their lengths and first elements.
 *
 * \return PARSERUTILS_OK (the visible body has no failure path)
 *
 * NOTE(review): this listing omits the function signature and the
 * declaration of the local `c`; restore them from upstream.
 */
379{
381
382 c->inval_buf[0] = '\0';
383 c->inval_len = 0;
384
385 c->read_buf[0] = 0;
386 c->read_len = 0;
387
388 c->write_buf[0] = 0;
389 c->write_len = 0;
390
391 return PARSERUTILS_OK;
392}
393
394
/**
 * Read a single UTF-16 character and output it as UCS-4 (big endian)
 *
 * The result of parserutils_charset_utf16_to_ucs4() selects one of three
 * paths:
 *  - PARSERUTILS_OK: the character is emitted and the input advanced;
 *  - PARSERUTILS_NEEDDATA: the trailing bytes are saved in c->inval_buf and
 *    the input is marked fully consumed;
 *  - PARSERUTILS_INVALID: either the error is reported (strict mode) or
 *    U+FFFD is emitted and the input is advanced to the next valid sequence.
 *
 * \param source     Pointer to pointer to input; advanced past consumed bytes
 * \param sourcelen  Pointer to input length in bytes; reduced accordingly
 * \param dest       Pointer to pointer to output buffer
 * \param destlen    Pointer to remaining output space in bytes
 * \return PARSERUTILS_OK on success or when more data is needed,
 *         PARSERUTILS_NOMEM if the output call reported it,
 *         PARSERUTILS_INVALID for bad input in strict mode,
 *         or another error passed up from the resynchronisation call
 *
 * NOTE(review): this listing omits the first line of the signature, the
 * opening line of three calls, and the right-hand side of the errormode
 * comparison. Each gap is marked below; restore from upstream.
 * NOTE(review): any error code from the initial conversion other than the
 * three handled ones falls through to the final return and is reported as
 * PARSERUTILS_OK. Confirm this is intended.
 */
424 const uint8_t **source, size_t *sourcelen,
425 uint8_t **dest, size_t *destlen)
426{
427 uint32_t ucs4;
428 size_t sucs4;
429 parserutils_error error;
430
431 /* Convert a single character */
432 error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen,
433 &ucs4, &sucs4);
434 if (error == PARSERUTILS_OK) {
435 /* Read a character */
/* NOTE(review): the line that opens this call is missing here;
 * presumably charset_utf16_codec_output_decoded_char(c, */
437 ucs4, dest, destlen);
/* The input is also advanced on NOMEM */
438 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
439 /* output succeeded; update source pointers */
440 *source += sucs4;
441 *sourcelen -= sucs4;
442 }
443
444 /* Clear inval buffer */
445 c->inval_buf[0] = '\0';
446 c->inval_len = 0;
447
448 return error;
449 } else if (error == PARSERUTILS_NEEDDATA) {
450 /* Incomplete input sequence */
451 assert(*sourcelen < INVAL_BUFSIZE);
452
/* memmove rather than memcpy: the decode path can pass a *source
 * that points into inval_buf itself, so the regions may overlap. */
453 memmove(c->inval_buf, *source, *sourcelen);
454 c->inval_buf[*sourcelen] = '\0';
455 c->inval_len = *sourcelen;
456
457 *source += *sourcelen;
458 *sourcelen = 0;
459
460 return PARSERUTILS_OK;
461 } else if (error == PARSERUTILS_INVALID) {
462 /* Illegal input sequence */
463 uint32_t nextchar;
464
465 /* Clear inval buffer */
466 c->inval_buf[0] = '\0';
467 c->inval_len = 0;
468
469 /* Strict errormode; simply flag invalid character */
/* NOTE(review): the comparand on the next (missing) line is
 * presumably PARSERUTILS_CHARSET_CODEC_ERROR_STRICT */
470 if (c->base.errormode ==
472 return PARSERUTILS_INVALID;
473 }
474
475 /* Find next valid UTF-16 sequence.
476 * We're processing client-provided data, so let's
477 * be paranoid about its validity. */
/* NOTE(review): the line that opens this call is missing here;
 * presumably parserutils_charset_utf16_next_paranoid( */
479 *source, *sourcelen, 0, &nextchar);
480 if (error != PARSERUTILS_OK) {
481 if (error == PARSERUTILS_NEEDDATA) {
482 /* Need more data to be sure */
483 assert(*sourcelen < INVAL_BUFSIZE);
484
485 memmove(c->inval_buf, *source, *sourcelen);
486 c->inval_buf[*sourcelen] = '\0';
487 c->inval_len = *sourcelen;
488
489 *source += *sourcelen;
490 *sourcelen = 0;
491
/* Everything has been claimed above, so the advance below
 * must add nothing further. */
492 nextchar = 0;
493 } else {
494 return error;
495 }
496 }
497
498 /* output U+FFFD and continue processing. */
/* NOTE(review): the line that opens this call is missing here */
500 0xFFFD, dest, destlen);
501 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
502 /* output succeeded; update source pointers */
503 *source += nextchar;
504 *sourcelen -= nextchar;
505 }
506
507 return error;
508 }
509
510 return PARSERUTILS_OK;
511}
512
524 uint32_t ucs4, uint8_t **dest, size_t *destlen)
525{
526 if (*destlen < 4) {
527 /* Run out of output buffer */
528 c->read_len = 1;
529 c->read_buf[0] = ucs4;
530
531 return PARSERUTILS_NOMEM;
532 }
533
534 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
535 *dest += 4;
536 *destlen -= 4;
537
538 return PARSERUTILS_OK;
539}
540
541
@ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
Abort processing if unrepresentable character encountered.
Definition codec.h:64
size_t len
Definition codec_8859.c:23
static parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec)
Clear a UTF-16 codec's encoding state.
#define INVAL_BUFSIZE
Definition codec_utf16.c:25
struct charset_utf16_codec charset_utf16_codec
UTF-16 charset codec.
const parserutils_charset_handler charset_utf16_codec_handler
static parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Read a single UTF-16 character and output it as UCS-4 (big endian)
static parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
Output a UCS-4 character (big endian)
#define READ_BUFSIZE
Definition codec_utf16.c:31
static parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Encode a chunk of UCS-4 (big endian) data into UTF-16.
static parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Decode a chunk of UTF-16 data into UCS-4 (big endian)
static bool charset_utf16_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition codec_utf16.c:74
static parserutils_error charset_utf16_codec_destroy(parserutils_charset_codec *codec)
Destroy a UTF-16 codec.
static parserutils_error charset_utf16_codec_create(const char *charset, parserutils_charset_codec **codec)
Create a UTF-16 codec.
Definition codec_utf16.c:90
#define WRITE_BUFSIZE
Definition codec_utf16.c:37
static uint32_t endian_host_to_big(uint32_t host)
Definition endian.h:24
static uint32_t endian_big_to_host(uint32_t big)
Definition endian.h:32
parserutils_error
Definition errors.h:18
@ PARSERUTILS_OK
Definition errors.h:19
@ PARSERUTILS_NEEDDATA
Definition errors.h:25
@ PARSERUTILS_INVALID
Definition errors.h:23
@ PARSERUTILS_NOMEM
Definition errors.h:21
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition aliases.c:107
UTF-16 charset codec.
Definition codec_utf16.c:22
parserutils_charset_codec base
Base class.
Definition codec_utf16.c:23
uint32_t write_buf[WRITE_BUFSIZE]
Buffer for partial output sequences (encode) (host-endian)
Definition codec_utf16.c:38
size_t read_len
Character length of read_buf.
Definition codec_utf16.c:35
uint8_t inval_buf[INVAL_BUFSIZE]
Buffer for fixing up incomplete input sequences.
Definition codec_utf16.c:26
uint32_t read_buf[READ_BUFSIZE]
Buffer for partial output sequences (decode) (host-endian)
Definition codec_utf16.c:32
size_t write_len
Character length of write_buf.
Definition codec_utf16.c:41
Core charset codec definition; implementations extend this.
Definition codec_impl.h:19
parserutils_charset_codec_errormode errormode
error mode
Definition codec_impl.h:22
parserutils_error(* encode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition codec_impl.h:26
parserutils_error(* destroy)(parserutils_charset_codec *codec)
Definition codec_impl.h:25
parserutils_error(* decode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition codec_impl.h:29
struct parserutils_charset_codec::@271367034342366162232062053053007137175253257255 handler
Vtable for handler code.
parserutils_error(* reset)(parserutils_charset_codec *codec)
Definition codec_impl.h:32
Codec factory component definition.
Definition codec_impl.h:39
UTF-16 manipulation functions (interface).
parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, size_t len, uint32_t *ucs4, size_t *clen)
Convert a UTF-16 sequence into a single UCS-4 character.
Definition utf16.c:27
parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s, uint32_t len, uint32_t off, uint32_t *nextoff)
Find next legal UTF-16 char in string.
Definition utf16.c:214
parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s, size_t *len)
Convert a single UCS-4 character into a UTF-16 sequence.
Definition utf16.c:70
#define UNUSED(x)
Definition utils.h:25
#define min(a, b)
Definition utils.h:16
#define SLEN(s)
Definition utils.h:21
#define max(a, b)
Definition utils.h:12