NetSurf
utf8.c
Go to the documentation of this file.
1/*
2 * Copyright 2005 John M Bell <jmb202@ecs.soton.ac.uk>
3 *
4 * This file is part of NetSurf, http://www.netsurf-browser.org/
5 *
6 * NetSurf is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; version 2 of the License.
9 *
10 * NetSurf is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19/** \file
20 * UTF-8 manipulation functions (implementation).
21 */
22
23#include <assert.h>
24#include <errno.h>
25#include <stdlib.h>
26#include <string.h>
27#include <strings.h>
28#include <iconv.h>
29#include <parserutils/charset/utf8.h>
30
31#include "utils/config.h"
32#include "utils/log.h"
33#include "utils/utf8.h"
34
35#include "netsurf/inttypes.h"
36#include "netsurf/utf8.h"
38
39
40/* exported interface documented in utils/utf8.h */
41uint32_t utf8_to_ucs4(const char *s_in, size_t l)
42{
43 uint32_t ucs4;
44 size_t len;
45 parserutils_error perror;
46
47 perror = parserutils_charset_utf8_to_ucs4((const uint8_t *) s_in, l,
48 &ucs4, &len);
49 if (perror != PARSERUTILS_OK)
50 ucs4 = 0xfffd;
51
52 return ucs4;
53}
54
55/* exported interface documented in utils/utf8.h */
56size_t utf8_from_ucs4(uint32_t c, char *s)
57{
58 uint8_t *in = (uint8_t *) s;
59 size_t len = 6;
60 parserutils_error perror;
61
62 perror = parserutils_charset_utf8_from_ucs4(c, &in, &len);
63 if (perror != PARSERUTILS_OK) {
64 s[0] = 0xef;
65 s[1] = 0xbf;
66 s[2] = 0xbd;
67 return 3;
68 }
69
70 return 6 - len;
71}
72
/* exported interface documented in utils/utf8.h */
size_t utf8_length(const char *s)
{
	/* The whole NUL-terminated string is the bound */
	const size_t byte_count = strlen(s);

	return utf8_bounded_length(s, byte_count);
}
78
79/* exported interface documented in utils/utf8.h */
80size_t utf8_bounded_length(const char *s, size_t l)
81{
82 size_t len;
83 parserutils_error perror;
84
85 perror = parserutils_charset_utf8_length((const uint8_t *) s, l, &len);
86 if (perror != PARSERUTILS_OK)
87 return 0;
88
89 return len;
90}
91
/* exported interface documented in utils/utf8.h */
size_t utf8_bounded_byte_length(const char *s, size_t l, size_t c)
{
	size_t offset;

	/* Step forward one character at a time until either c characters
	 * have been passed or the offset reaches the bound l.
	 */
	for (offset = 0; offset < l && c > 0; c--) {
		offset = utf8_next(s, l, offset);
	}

	return offset;
}
102
103/* exported interface documented in utils/utf8.h */
104size_t utf8_char_byte_length(const char *s)
105{
106 size_t len;
107 parserutils_error perror;
108
109 perror = parserutils_charset_utf8_char_byte_length((const uint8_t *) s,
110 &len);
111 assert(perror == PARSERUTILS_OK);
112
113 return len;
114}
115
116/* exported interface documented in utils/utf8.h */
117size_t utf8_prev(const char *s, size_t o)
118{
119 uint32_t prev;
120 parserutils_error perror;
121
122 perror = parserutils_charset_utf8_prev((const uint8_t *) s, o, &prev);
123 assert(perror == PARSERUTILS_OK);
124
125 return prev;
126}
127
128/* exported interface documented in utils/utf8.h */
129size_t utf8_next(const char *s, size_t l, size_t o)
130{
131 uint32_t next;
132 parserutils_error perror;
133
134 perror = parserutils_charset_utf8_next((const uint8_t *) s, l, o,
135 &next);
136 assert(perror == PARSERUTILS_OK);
137
138 return next;
139}
140
/* Cache of previous iconv conversion descriptor used by utf8_convert
 *
 * A cd value of 0 means the cache is empty (see utf8_clear_cd_cache).
 */
static struct {
	char from[32];	/**< Encoding name to convert from */
	char to[32];	/**< Encoding name to convert to */
	iconv_t cd;	/**< Iconv conversion descriptor */
} last_cd;
147
148static inline void utf8_clear_cd_cache(void)
149{
150 last_cd.from[0] = '\0';
151 last_cd.to[0] = '\0';
152 last_cd.cd = 0;
153}
154
155/**
156 * obtain a cached conversion descriptor
157 *
158 * either return the cached conversion descriptor or create one if required
159 */
160static nserror
161get_cached_cd(const char *enc_from, const char *enc_to, iconv_t *cd_out)
162{
163 iconv_t cd;
164 /* we cache the last used conversion descriptor,
165 * so check if we're trying to use it here */
166 if (strncasecmp(last_cd.from, enc_from, sizeof(last_cd.from)) == 0 &&
167 strncasecmp(last_cd.to, enc_to, sizeof(last_cd.to)) == 0 &&
168 last_cd.cd != 0) {
169 *cd_out = last_cd.cd;
170 return NSERROR_OK;
171 }
172
173 /* no match, so create a new cd */
174 cd = iconv_open(enc_to, enc_from);
175 if (cd == (iconv_t) -1) {
176 if (errno == EINVAL) {
178 }
179 /* default to no memory */
180 return NSERROR_NOMEM;
181 }
182
183 /* close the last cd - we don't care if this fails */
184 if (last_cd.cd) {
185 iconv_close(last_cd.cd);
186 }
187
188 /* and safely copy the to/from/cd data into last_cd */
189 snprintf(last_cd.from, sizeof(last_cd.from), "%s", enc_from);
190 snprintf(last_cd.to, sizeof(last_cd.to), "%s", enc_to);
191 *cd_out = last_cd.cd = cd;
192
193 return NSERROR_OK;
194}
195
196/* exported interface documented in utils/utf8.h */
198{
199 if (last_cd.cd != 0)
200 iconv_close(last_cd.cd);
201
202 /* paranoia follows */
204
205 return NSERROR_OK;
206}
207
208
209/**
210 * Convert a string from one encoding to another
211 *
212 * \param string The NULL-terminated string to convert
213 * \param slen Length of input string to consider (in bytes), or 0
214 * \param from The encoding name to convert from
215 * \param to The encoding name to convert to
216 * \param result_out Pointer to location in which to store result.
217 * \param result_len_out Pointer to location in which to store result length.
218 * \return NSERROR_OK for no error, NSERROR_NOMEM on allocation error,
219 * NSERROR_BAD_ENCODING for a bad character encoding
220 */
221static nserror
222utf8_convert(const char *string,
223 size_t slen,
224 const char *from,
225 const char *to,
226 char **result_out,
227 size_t *result_len_out)
228{
229 iconv_t cd;
230 char *temp, *out, *in, *result;
231 size_t result_len;
232 nserror res;
233
234 assert(string && from && to && result_out);
235
236 /* calculate the source length if not given */
237 if (slen==0) {
238 slen = strlen(string);
239 }
240
241 /* process the empty string separately avoiding any conversion
242 * check for the source and destination encoding being the same
243 *
244 * This optimisation is necessary on AmigaOS as iconv()
245 * returns an error if an empty string is passed.
246 */
247 if ((slen == 0) || (strcasecmp(from, to) == 0)) {
248 *result_out = strndup(string, slen);
249 if (*result_out == NULL) {
250 return NSERROR_NOMEM;
251 }
252 if (result_len_out != NULL) {
253 *result_len_out = slen;
254 }
255
256 return NSERROR_OK;
257 }
258
259 in = (char *)string;
260
261 res = get_cached_cd(from, to, &cd);
262 if (res != NSERROR_OK) {
263 return res;
264 }
265
266 /* Worst case = ASCII -> UCS4, so allocate an output buffer
267 * 4 times larger than the input buffer, and add 4 bytes at
268 * the end for the NULL terminator
269 */
270 result_len = slen * 4 + 4;
271
272 temp = out = malloc(result_len);
273 if (!out) {
274 return NSERROR_NOMEM;
275 }
276
277 /* perform conversion */
278 if (iconv(cd, (void *) &in, &slen, &out, &result_len) == (size_t)-1) {
279 free(temp);
280 /* clear the cached conversion descriptor as it's invalid */
281 if (last_cd.cd)
282 iconv_close(last_cd.cd);
284 /** \todo handle the various cases properly
285 * There are 3 possible error cases:
286 * a) Insufficiently large output buffer
287 * b) Invalid input byte sequence
288 * c) Incomplete input sequence */
289 return NSERROR_NOMEM;
290 }
291
292 result_len = out - temp;
293
294 /* resize buffer allowing for null termination */
295 result = realloc(temp, result_len + 4);
296 if (result == NULL) {
297 free(temp);
298 return NSERROR_NOMEM;
299 }
300
301 /* NULL terminate - needs 4 characters as we may have
302 * converted to UTF-32 */
303 memset(result + result_len, 0, 4);
304
305 *result_out = result;
306 if (result_len_out != NULL) {
307 *result_len_out = result_len;
308 }
309
310 return NSERROR_OK;
311}
312
313/* exported interface documented in utils/utf8.h */
314nserror utf8_to_enc(const char *string, const char *encname,
315 size_t len, char **result)
316{
317 return utf8_convert(string, len, "UTF-8", encname, result, NULL);
318}
319
320/* exported interface documented in utils/utf8.h */
321nserror utf8_from_enc(const char *string, const char *encname,
322 size_t len, char **result, size_t *result_len)
323{
324 return utf8_convert(string, len, encname, "UTF-8", result, result_len);
325}
326
327/**
328 * convert a chunk of html data
329 */
330static nserror
332 const char *chunk,
333 size_t inlen,
334 char **out,
335 size_t *outlen)
336{
337 size_t ret, esclen;
338 uint32_t ucs4;
339 char *pescape, escape[11];
340
341 while (inlen > 0) {
342 ret = iconv(cd, (void *) &chunk, &inlen, (void *) out, outlen);
343 if (ret != (size_t) -1)
344 break;
345
346 if (errno != EILSEQ)
347 return NSERROR_NOMEM;
348
349 ucs4 = utf8_to_ucs4(chunk, inlen);
350 esclen = snprintf(escape, sizeof(escape), "&#x%06"PRIx32";", ucs4);
351 pescape = escape;
352 ret = iconv(cd, (void *) &pescape, &esclen,
353 (void *) out, outlen);
354 if (ret == (size_t) -1)
355 return NSERROR_NOMEM;
356
357 esclen = utf8_next(chunk, inlen, 0);
358 chunk += esclen;
359 inlen -= esclen;
360 }
361
362 return NSERROR_OK;
363}
364
365
366
367/* exported interface documented in utils/utf8.h */
369utf8_to_html(const char *string, const char *encname, size_t len, char **result_out)
370{
371 iconv_t cd;
372 const char *in;
373 char *out, *origout, *result;
374 size_t off, prev_off, inlen, outlen, origoutlen, esclen;
375 nserror ret;
376 char *pescape, escape[11];
377 nserror res;
378
379 if (len == 0)
380 len = strlen(string);
381
382 res = get_cached_cd("UTF-8", encname, &cd);
383 if (res != NSERROR_OK) {
384 return res;
385 }
386
387 /* Worst case is ASCII -> UCS4, with all characters escaped:
388 * "&#xYYYYYY;", thus each input character may become a string
389 * of 10 UCS4 characters, each 4 bytes in length, plus four for
390 * terminating the string */
391 origoutlen = outlen = len * 10 * 4 + 4;
392 origout = out = malloc(outlen);
393 if (out == NULL) {
394 iconv_close(cd);
396 return NSERROR_NOMEM;
397 }
398
399 /* Process input in chunks between characters we must escape */
400 prev_off = off = 0;
401 while (off < len) {
402 /* Must escape '&', '<', and '>' */
403 if (string[off] == '&' || string[off] == '<' ||
404 string[off] == '>') {
405 if (off - prev_off > 0) {
406 /* Emit chunk */
407 in = string + prev_off;
408 inlen = off - prev_off;
409 ret = utf8_convert_html_chunk(cd, in, inlen,
410 &out, &outlen);
411 if (ret != NSERROR_OK) {
412 free(origout);
413 iconv_close(cd);
415 return ret;
416 }
417 }
418
419 /* Emit mandatory escape */
420 esclen = snprintf(escape, sizeof(escape),
421 "&#x%06x;", string[off]);
422 pescape = escape;
423 ret = utf8_convert_html_chunk(cd, pescape, esclen,
424 &out, &outlen);
425 if (ret != NSERROR_OK) {
426 free(origout);
427 iconv_close(cd);
429 return ret;
430 }
431
432 prev_off = off = utf8_next(string, len, off);
433 } else {
434 off = utf8_next(string, len, off);
435 }
436 }
437
438 /* Process final chunk */
439 if (prev_off < len) {
440 in = string + prev_off;
441 inlen = len - prev_off;
442 ret = utf8_convert_html_chunk(cd, in, inlen, &out, &outlen);
443 if (ret != NSERROR_OK) {
444 free(origout);
445 iconv_close(cd);
447 return ret;
448 }
449 }
450
451 /* Terminate string */
452 memset(out, 0, 4);
453 outlen -= 4;
454
455 /* Shrink-wrap */
456 result = realloc(origout, origoutlen - outlen);
457 if (result == NULL) {
458 free(origout);
459 return NSERROR_NOMEM;
460 }
461 *result_out = result;
462
463 return NSERROR_OK;
464}
465
466/* exported interface documented in utils/utf8.h */
467bool utf8_save_text(const char *utf8_text, const char *path)
468{
469 nserror ret;
470 char *conv;
471 FILE *out;
472
473 ret = guit->utf8->utf8_to_local(utf8_text, strlen(utf8_text), &conv);
474 if (ret != NSERROR_OK) {
475 NSLOG(netsurf, INFO,
476 "failed to convert to local encoding, return %d", ret);
477 return false;
478 }
479
480 out = fopen(path, "w");
481 if (out) {
482 int res = fputs(conv, out);
483 if (res < 0) {
484 NSLOG(netsurf, INFO, "Warning: writing data failed");
485 }
486
487 res = fputs("\n", out);
488 fclose(out);
489 free(conv);
490 return (res != EOF);
491 }
492 free(conv);
493
494 return false;
495}
STATIC char result[100]
Definition: arexx.c:77
char * strndup(const char *s, size_t n)
Duplicate up to n characters of a string.
Definition: utils.c:332
nserror
Enumeration of error codes.
Definition: errors.h:29
@ NSERROR_BAD_ENCODING
The character set is unknown.
Definition: errors.h:45
@ NSERROR_NOMEM
Memory exhaustion.
Definition: errors.h:32
@ NSERROR_OK
No error.
Definition: errors.h:30
struct netsurf_table * guit
The global interface table.
Definition: gui_factory.c:50
Interface to core interface table.
Interface to platform-specific utf8 operations.
Netsurf additional integer type formatting macros.
#define NSLOG(catname, level, logmsg, args...)
Definition: log.h:116
Interface to utility string handling.
nserror(* utf8_to_local)(const char *string, size_t len, char **result)
Convert a UTF-8 encoded string into the system local encoding.
Definition: utf8.h:40
struct gui_utf8_table * utf8
UTF8 table.
Definition: gui_table.h:115
static nserror utf8_convert(const char *string, size_t slen, const char *from, const char *to, char **result_out, size_t *result_len_out)
Convert a string from one encoding to another.
Definition: utf8.c:222
size_t utf8_prev(const char *s, size_t o)
Find previous legal UTF-8 char in string.
Definition: utf8.c:117
uint32_t utf8_to_ucs4(const char *s_in, size_t l)
Convert a UTF-8 multibyte sequence into a single UCS4 character.
Definition: utf8.c:41
nserror utf8_finalise(void)
Finalise the UTF-8 library.
Definition: utf8.c:197
size_t utf8_from_ucs4(uint32_t c, char *s)
Convert a single UCS4 character into a UTF-8 multibyte sequence.
Definition: utf8.c:56
size_t utf8_next(const char *s, size_t l, size_t o)
Find next legal UTF-8 char in string.
Definition: utf8.c:129
bool utf8_save_text(const char *utf8_text, const char *path)
Save the given utf8 text to a file, converting to local encoding.
Definition: utf8.c:467
iconv_t cd
Iconv conversion descriptor.
Definition: utf8.c:145
size_t utf8_bounded_length(const char *s, size_t l)
Calculated the length (in characters) of a bounded UTF-8 string.
Definition: utf8.c:80
nserror utf8_from_enc(const char *string, const char *encname, size_t len, char **result, size_t *result_len)
Convert a string in the named encoding into a UTF-8 string.
Definition: utf8.c:321
size_t utf8_length(const char *s)
Calculate the length (in characters) of a NULL-terminated UTF-8 string.
Definition: utf8.c:74
static struct @151 last_cd
nserror utf8_to_html(const char *string, const char *encname, size_t len, char **result_out)
Convert a UTF-8 encoded string into a string of the given encoding, applying HTML escape sequences wh...
Definition: utf8.c:369
static void utf8_clear_cd_cache(void)
Definition: utf8.c:148
static nserror get_cached_cd(const char *enc_from, const char *enc_to, iconv_t *cd_out)
obtain a cached conversion descriptor
Definition: utf8.c:161
nserror utf8_to_enc(const char *string, const char *encname, size_t len, char **result)
Convert a UTF8 string into the named encoding.
Definition: utf8.c:314
char from[32]
Encoding name to convert from.
Definition: utf8.c:143
size_t utf8_char_byte_length(const char *s)
Calculate the length (in bytes) of a UTF-8 character.
Definition: utf8.c:104
char to[32]
Encoding name to convert to.
Definition: utf8.c:144
size_t utf8_bounded_byte_length(const char *s, size_t l, size_t c)
Calculate the length (in bytes) of a bounded UTF-8 string.
Definition: utf8.c:93
static nserror utf8_convert_html_chunk(iconv_t cd, const char *chunk, size_t inlen, char **out, size_t *outlen)
convert a chunk of html data
Definition: utf8.c:331
UTF-8 manipulation functions (interface).
static nserror path(const struct redraw_context *ctx, const plot_style_t *pstyle, const float *p, unsigned int n, const float transform[6])
Plots a path.
Definition: plot.c:821