docs-netsurf/doxygen/utils_2utf8_8c_source.html

/*

 * Copyright 2005 John M Bell <jmb202@ecs.soton.ac.uk>

 *

 * This file is part of NetSurf, http://www.netsurf-browser.org/

 *

 * NetSurf is free software; you can redistribute it and/or modify

 * it under the terms of the GNU General Public License as published by

 * the Free Software Foundation; version 2 of the License.

 *

 * NetSurf is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 * GNU General Public License for more details.

 *

 * You should have received a copy of the GNU General Public License

 * along with this program.  If not, see <http://www.gnu.org/licenses/>.

 */


/** \file

 * UTF-8 manipulation functions (implementation).

 */


#include <assert.h>

#include <errno.h>

#include <stdlib.h>

#include <string.h>

#include <strings.h>

#include <iconv.h>

#include <parserutils/charset/utf8.h>


#include "utils/config.h"

#include "utils/log.h"

#include "utils/utf8.h"


#include "netsurf/inttypes.h"

#include "netsurf/utf8.h"

#include "desktop/gui_internal.h"


/* exported interface documented in utils/utf8.h */
uint32_t utf8_to_ucs4(const char *s_in, size_t l)
{
        const uint8_t *bytes = (const uint8_t *) s_in;
        uint32_t codepoint;
        size_t consumed; /* demanded by the library call; not used here */

        /* hand the decoding to libparserutils */
        if (parserutils_charset_utf8_to_ucs4(bytes, l, &codepoint,
                                             &consumed) == PARSERUTILS_OK) {
                return codepoint;
        }

        /* decoding failed: substitute U+FFFD REPLACEMENT CHARACTER */
        return 0xfffd;
}


/* exported interface documented in utils/utf8.h */
size_t utf8_from_ucs4(uint32_t c, char *s)
{
        /* space offered to the encoder, in bytes */
        enum { OUTPUT_CAPACITY = 6 };

        uint8_t *cursor = (uint8_t *) s;
        size_t remaining = OUTPUT_CAPACITY;

        if (parserutils_charset_utf8_from_ucs4(c, &cursor,
                                               &remaining) == PARSERUTILS_OK) {
                /* whatever is no longer "remaining" has been written */
                return OUTPUT_CAPACITY - remaining;
        }

        /* encoding failed: emit U+FFFD REPLACEMENT CHARACTER as UTF-8 */
        s[0] = 0xef;
        s[1] = 0xbf;
        s[2] = 0xbd;

        return 3;
}


/* exported interface documented in utils/utf8.h */
size_t utf8_length(const char *s)
{
        /* the whole NUL-terminated string is the bound */
        size_t nbytes = strlen(s);

        return utf8_bounded_length(s, nbytes);
}


/* exported interface documented in utils/utf8.h */
size_t utf8_bounded_length(const char *s, size_t l)
{
        const uint8_t *bytes = (const uint8_t *) s;
        size_t nchars;

        if (parserutils_charset_utf8_length(bytes, l,
                                            &nchars) == PARSERUTILS_OK) {
                return nchars;
        }

        /* the library reported an error: report a length of zero */
        return 0;
}


/* exported interface documented in utils/utf8.h */
size_t utf8_bounded_byte_length(const char *s, size_t l, size_t c)
{
        size_t offset = 0;
        size_t chars_left;

        /* step over at most c characters, never moving past byte l */
        for (chars_left = c; offset < l && chars_left > 0; chars_left--) {
                offset = utf8_next(s, l, offset);
        }

        return offset;
}


/* exported interface documented in utils/utf8.h */
size_t utf8_char_byte_length(const char *s)
{
        const uint8_t *bytes = (const uint8_t *) s;
        size_t nbytes;
        parserutils_error status;

        status = parserutils_charset_utf8_char_byte_length(bytes, &nbytes);

        /* a failure here is treated as a programming error */
        assert(status == PARSERUTILS_OK);

        return nbytes;
}


/* exported interface documented in utils/utf8.h */
size_t utf8_prev(const char *s, size_t o)
{
        const uint8_t *bytes = (const uint8_t *) s;
        uint32_t prev_off; /* the library reports the offset as 32 bit */
        parserutils_error status;

        status = parserutils_charset_utf8_prev(bytes, o, &prev_off);

        /* a failure here is treated as a programming error */
        assert(status == PARSERUTILS_OK);

        return prev_off;
}


/* exported interface documented in utils/utf8.h */
size_t utf8_next(const char *s, size_t l, size_t o)
{
        const uint8_t *bytes = (const uint8_t *) s;
        uint32_t next_off; /* the library reports the offset as 32 bit */
        parserutils_error status;

        status = parserutils_charset_utf8_next(bytes, l, o, &next_off);

        /* a failure here is treated as a programming error */
        assert(status == PARSERUTILS_OK);

        return next_off;
}


/* Single-entry cache of the most recently opened iconv conversion
 * descriptor, shared by utf8_convert and utf8_to_html.
 *
 * get_cached_cd() fills the entry and utf8_clear_cd_cache() empties it.
 * A cd value of 0 marks the entry as empty. The encoding names are
 * stored with snprintf(), so names longer than 31 bytes are kept
 * truncated.
 */
static struct {
        char from[32];  /**< Encoding name to convert from */
        char to[32];    /**< Encoding name to convert to */
        iconv_t cd;     /**< Iconv conversion descriptor, 0 when empty */
} last_cd;


/**
 * Mark the conversion descriptor cache as empty.
 *
 * The descriptor itself is not closed here; callers that own an open
 * descriptor close it with iconv_close() before calling this.
 */
static inline void utf8_clear_cd_cache(void)
{
        /* forget the descriptor */
        last_cd.cd = 0;

        /* and blank both encoding names */
        last_cd.to[0] = '\0';
        last_cd.from[0] = '\0';
}


/**
 * Obtain a conversion descriptor, reusing the cached one where possible
 *
 * The cache holds a single descriptor. It is reused when both encoding
 * names match the cached pair (compared case-insensitively); otherwise
 * a new descriptor is opened and replaces the cached one.
 *
 * \param enc_from  The encoding name to convert from
 * \param enc_to    The encoding name to convert to
 * \param cd_out    Updated with the conversion descriptor on success
 * \return NSERROR_OK on success, NSERROR_BAD_ENCODING if iconv_open()
 *         failed with EINVAL, NSERROR_NOMEM for any other failure
 */
static nserror
get_cached_cd(const char *enc_from, const char *enc_to, iconv_t *cd_out)
{
        iconv_t fresh;

        /* cache hit: a descriptor is held and both names agree */
        if (last_cd.cd != 0 &&
            strncasecmp(last_cd.from, enc_from, sizeof(last_cd.from)) == 0 &&
            strncasecmp(last_cd.to, enc_to, sizeof(last_cd.to)) == 0) {
                *cd_out = last_cd.cd;
                return NSERROR_OK;
        }

        /* cache miss: open a descriptor for the requested pair */
        fresh = iconv_open(enc_to, enc_from);
        if (fresh == (iconv_t) -1) {
                /* anything other than EINVAL is reported as no memory */
                return (errno == EINVAL) ? NSERROR_BAD_ENCODING
                                         : NSERROR_NOMEM;
        }

        /* retire the previous descriptor; a failure to close is ignored */
        if (last_cd.cd) {
                iconv_close(last_cd.cd);
        }

        /* remember the new pair; snprintf keeps the names terminated */
        snprintf(last_cd.from, sizeof(last_cd.from), "%s", enc_from);
        snprintf(last_cd.to, sizeof(last_cd.to), "%s", enc_to);
        last_cd.cd = fresh;

        *cd_out = fresh;

        return NSERROR_OK;
}


/* exported interface documented in utils/utf8.h */
nserror utf8_finalise(void)
{
        /* take the descriptor out of the cache before closing it */
        iconv_t held = last_cd.cd;

        utf8_clear_cd_cache();

        if (held != 0) {
                iconv_close(held);
        }

        return NSERROR_OK;
}


/**
 * Convert a string from one encoding to another
 *
 * \param string  The string to convert
 * \param slen    Length of input string to consider (in bytes), or 0
 *                to have the length measured with strlen()
 * \param from    The encoding name to convert from
 * \param to      The encoding name to convert to
 * \param result_out  Pointer to location in which to store result.
 *                    On success this is a heap buffer, terminated by
 *                    four zero bytes, which the caller must free.
 * \param result_len_out Pointer to location in which to store result
 *                    length in bytes (terminator excluded), or NULL
 *                    if the length is not wanted.
 * \return NSERROR_OK for no error, NSERROR_NOMEM on allocation error
 *         or when the conversion itself fails,
 *         NSERROR_BAD_ENCODING for a bad character encoding
 */
static nserror
utf8_convert(const char *string,
             size_t slen,
             const char *from,
             const char *to,
             char **result_out,
             size_t *result_len_out)
{
        iconv_t cd;
        char *temp, *out, *in, *result;
        size_t result_len;
        nserror res;

        assert(string && from && to && result_out);

        /* calculate the source length if not given */
        if (slen == 0) {
                slen = strlen(string);
        }

        /* process the empty string separately avoiding any conversion
         * check for the source and destination encoding being the same
         *
         * This optimisation is necessary on AmigaOS as iconv()
         * returns an error if an empty string is passed.
         */
        if ((slen == 0) || (strcasecmp(from, to) == 0)) {
                /* Copy exactly slen bytes and terminate with four zero
                 * bytes, exactly as the conversion path below does.
                 * A string-oriented copy would stop at the first zero
                 * byte, which is wrong for wide encodings, and would
                 * leave the reported length larger than the buffer.
                 */
                if (slen > (size_t) -1 - 4) {
                        return NSERROR_NOMEM;
                }

                result = malloc(slen + 4);
                if (result == NULL) {
                        return NSERROR_NOMEM;
                }
                memcpy(result, string, slen);
                memset(result + slen, 0, 4);

                *result_out = result;
                if (result_len_out != NULL) {
                        *result_len_out = slen;
                }

                return NSERROR_OK;
        }

        /* refuse lengths for which the buffer size below would wrap */
        if (slen > ((size_t) -1 - 4) / 4) {
                return NSERROR_NOMEM;
        }

        /* iconv() wants a non-const input pointer */
        in = (char *)string;

        res = get_cached_cd(from, to, &cd);
        if (res != NSERROR_OK) {
                return res;
        }

        /* Worst case = ASCII -> UCS4, so allocate an output buffer
         * 4 times larger than the input buffer, and add 4 bytes at
         * the end for the NULL terminator
         */
        result_len = slen * 4 + 4;

        temp = out = malloc(result_len);
        if (!out) {
                return NSERROR_NOMEM;
        }

        /* perform conversion */
        if (iconv(cd, (void *) &in, &slen, &out, &result_len) == (size_t)-1) {
                free(temp);
                /* clear the cached conversion descriptor as it's invalid */
                if (last_cd.cd)
                        iconv_close(last_cd.cd);
                utf8_clear_cd_cache();
                /** \todo handle the various cases properly
                 * There are 3 possible error cases:
                 * a) Insufficiently large output buffer
                 * b) Invalid input byte sequence
                 * c) Incomplete input sequence */
                return NSERROR_NOMEM;
        }

        /* iconv() advanced out past everything it produced */
        result_len = out - temp;

        /* resize buffer allowing for null termination */
        result = realloc(temp, result_len + 4);
        if (result == NULL) {
                free(temp);
                return NSERROR_NOMEM;
        }

        /* NULL terminate - needs 4 characters as we may have
         * converted to UTF-32 */
        memset(result + result_len, 0, 4);

        *result_out = result;
        if (result_len_out != NULL) {
                *result_len_out = result_len;
        }

        return NSERROR_OK;
}


/* exported interface documented in utils/utf8.h */
nserror utf8_to_enc(const char *string, const char *encname,
                    size_t len, char **result)
{
        /* the input is UTF-8; the length of the output is not wanted */
        const char *source_enc = "UTF-8";
        nserror res;

        res = utf8_convert(string, len, source_enc, encname, result, NULL);

        return res;
}


/* exported interface documented in utils/utf8.h */
nserror utf8_from_enc(const char *string, const char *encname,
                      size_t len, char **result, size_t *result_len)
{
        /* the output is UTF-8; its length is passed back to the caller */
        const char *target_enc = "UTF-8";
        nserror res;

        res = utf8_convert(string, len, encname, target_enc,
                           result, result_len);

        return res;
}


/**
 * convert a chunk of html data
 *
 * The chunk is passed through iconv(). Each time iconv() stops with
 * EILSEQ, the character at the stopping point is replaced in the
 * output by a hexadecimal numeric character reference and conversion
 * resumes with the following character.
 *
 * \param cd      Conversion descriptor to convert with
 * \param chunk   UTF-8 input to convert
 * \param inlen   Length of the input in bytes
 * \param out     Output position, advanced past the bytes written
 * \param outlen  Space left in the output, reduced by the bytes written
 * \return NSERROR_OK on success, NSERROR_NOMEM if iconv() fails for
 *         any reason other than EILSEQ or the escape cannot be emitted
 */
static nserror
utf8_convert_html_chunk(iconv_t cd,
                        const char *chunk,
                        size_t inlen,
                        char **out,
                        size_t *outlen)
{
        size_t ret, esclen, skip;
        uint32_t ucs4;
        int written;
        /* "&#x" + up to eight hex digits + ";" + terminator fits in 13 */
        char *pescape, escape[16];

        while (inlen > 0) {
                ret = iconv(cd, (void *) &chunk, &inlen, (void *) out, outlen);
                if (ret != (size_t) -1)
                        break;

                if (errno != EILSEQ)
                        return NSERROR_NOMEM;

                /* escape the character iconv() stopped at */
                ucs4 = utf8_to_ucs4(chunk, inlen);
                written = snprintf(escape, sizeof(escape),
                                   "&#x%06"PRIx32";", ucs4);

                /* snprintf() reports the length it wanted to write.
                 * Never pass iconv() a length larger than what is
                 * actually held in the buffer, as it would then read
                 * past the end of escape. */
                if (written < 0 || (size_t) written >= sizeof(escape))
                        return NSERROR_NOMEM;

                esclen = (size_t) written;
                pescape = escape;
                ret = iconv(cd, (void *) &pescape, &esclen,
                            (void *) out, outlen);
                if (ret == (size_t) -1)
                        return NSERROR_NOMEM;

                /* step over the character that was escaped */
                skip = utf8_next(chunk, inlen, 0);
                chunk += skip;
                inlen -= skip;
        }

        return NSERROR_OK;
}


/* exported interface documented in utils/utf8.h
 *
 * Implementation notes:
 *  - '&', '<' and '>' are always written as numeric character
 *    references; the text between them is converted in chunks by
 *    utf8_convert_html_chunk().
 *  - On success *result_out receives a heap buffer terminated by four
 *    zero bytes.
 *  - When conversion or the initial allocation fails, the conversion
 *    descriptor is closed and the descriptor cache is cleared.
 */
nserror
utf8_to_html(const char *string, const char *encname, size_t len, char **result_out)
{
        iconv_t cd;
        char *out, *origout, *result;
        size_t off, prev_off, outlen, origoutlen;
        int esclen;
        char escape[11]; /* "&#xYYYYYY;" and its terminator */
        nserror res;

        if (len == 0)
                len = strlen(string);

        /* refuse lengths for which the buffer size below would wrap */
        if (len > ((size_t) -1 - 8) / 40)
                return NSERROR_NOMEM;

        res = get_cached_cd("UTF-8", encname, &cd);
        if (res != NSERROR_OK) {
                return res;
        }

        /* Worst case is ASCII -> UCS4, with all characters escaped:
         * "&#xYYYYYY;", thus each input character may become a string
         * of 10 UCS4 characters, each 4 bytes in length, plus four for
         * terminating the string. Four further bytes are slack in case
         * the converter emits a byte order mark ahead of the text.
         * NOTE(review): confirm against the iconv implementations in
         * use. */
        origoutlen = outlen = len * 10 * 4 + 4 + 4;
        origout = out = malloc(outlen);
        if (out == NULL) {
                res = NSERROR_NOMEM;
                goto fail;
        }

        /* Process input in chunks between characters we must escape */
        prev_off = off = 0;
        while (off < len) {
                /* Must escape '&', '<', and '>' */
                if (string[off] != '&' && string[off] != '<' &&
                    string[off] != '>') {
                        off = utf8_next(string, len, off);
                        continue;
                }

                if (off > prev_off) {
                        /* Emit the chunk preceding the escaped character */
                        res = utf8_convert_html_chunk(cd, string + prev_off,
                                                      off - prev_off,
                                                      &out, &outlen);
                        if (res != NSERROR_OK)
                                goto fail;
                }

                /* Emit mandatory escape; %x takes an unsigned int */
                esclen = snprintf(escape, sizeof(escape), "&#x%06x;",
                                  (unsigned int) (unsigned char) string[off]);
                if (esclen < 0 || (size_t) esclen >= sizeof(escape)) {
                        res = NSERROR_NOMEM;
                        goto fail;
                }

                res = utf8_convert_html_chunk(cd, escape, (size_t) esclen,
                                              &out, &outlen);
                if (res != NSERROR_OK)
                        goto fail;

                prev_off = off = utf8_next(string, len, off);
        }

        /* Process final chunk */
        if (prev_off < len) {
                res = utf8_convert_html_chunk(cd, string + prev_off,
                                              len - prev_off,
                                              &out, &outlen);
                if (res != NSERROR_OK)
                        goto fail;
        }

        /* Terminate string, but never write beyond the allocation */
        if (outlen < 4) {
                res = NSERROR_NOMEM;
                goto fail;
        }
        memset(out, 0, 4);
        outlen -= 4;

        /* Shrink-wrap */
        result = realloc(origout, origoutlen - outlen);
        if (result == NULL) {
                free(origout);
                return NSERROR_NOMEM;
        }
        *result_out = result;

        return NSERROR_OK;

fail:
        /* the descriptor may be in an unknown state: drop it from the
         * cache so that the next conversion opens a fresh one */
        free(origout);
        iconv_close(cd);
        utf8_clear_cd_cache();

        return res;
}


/* exported interface documented in utils/utf8.h
 *
 * The text is converted to the local encoding through the frontend's
 * utf8 table, then written to the file followed by a newline. The
 * result is true only if both writes and the closing of the file
 * succeeded.
 */
bool utf8_save_text(const char *utf8_text, const char *path)
{
        nserror ret;
        char *conv;
        FILE *out;
        bool ok = true;

        ret = guit->utf8->utf8_to_local(utf8_text, strlen(utf8_text), &conv);
        if (ret != NSERROR_OK) {
                NSLOG(netsurf, INFO,
                      "failed to convert to local encoding, return %d", ret);
                return false;
        }

        out = fopen(path, "w");
        if (out == NULL) {
                free(conv);
                return false;
        }

        /* a failure to write the text must be reflected in the result */
        if (fputs(conv, out) < 0) {
                NSLOG(netsurf, INFO, "Warning: writing data failed");
                ok = false;
        }

        if (fputs("\n", out) == EOF) {
                ok = false;
        }

        /* buffered data is written out on close, which may also fail */
        if (fclose(out) != 0) {
                ok = false;
        }

        free(conv);

        return ok;
}

result
STATIC char result[100]
Definition: arexx.c:77

config.h

strndup
char * strndup(const char *s, size_t n)
Duplicate up to n characters of a string.
Definition: utils.c:332

nserror
nserror
Enumeration of error codes.
Definition: errors.h:29

NSERROR_BAD_ENCODING
@ NSERROR_BAD_ENCODING
The character set is unknown.
Definition: errors.h:45

NSERROR_NOMEM
@ NSERROR_NOMEM
Memory exhaustion.
Definition: errors.h:32

NSERROR_OK
@ NSERROR_OK
No error.
Definition: errors.h:30

guit
struct netsurf_table * guit
The global interface table.
Definition: gui_factory.c:50

gui_internal.h
Interface to core interface table.

utf8.h
Interface to platform-specific utf8 operations.

inttypes.h
Netsurf additional integer type formatting macros.

log.h

NSLOG
#define NSLOG(catname, level, logmsg, args...)
Definition: log.h:116

string.h
Interface to utility string handling.

gui_utf8_table::utf8_to_local
nserror(* utf8_to_local)(const char *string, size_t len, char **result)
Convert a UTF-8 encoded string into the system local encoding.
Definition: utf8.h:40

netsurf_table::utf8
struct gui_utf8_table * utf8
UTF8 table.
Definition: gui_table.h:115

utf8_convert
static nserror utf8_convert(const char *string, size_t slen, const char *from, const char *to, char **result_out, size_t *result_len_out)
Convert a string from one encoding to another.
Definition: utf8.c:222

utf8_prev
size_t utf8_prev(const char *s, size_t o)
Find previous legal UTF-8 char in string.
Definition: utf8.c:117

utf8_to_ucs4
uint32_t utf8_to_ucs4(const char *s_in, size_t l)
Convert a UTF-8 multibyte sequence into a single UCS4 character.
Definition: utf8.c:41

utf8_finalise
nserror utf8_finalise(void)
Finalise the UTF-8 library.
Definition: utf8.c:197

utf8_from_ucs4
size_t utf8_from_ucs4(uint32_t c, char *s)
Convert a single UCS4 character into a UTF-8 multibyte sequence.
Definition: utf8.c:56

utf8_next
size_t utf8_next(const char *s, size_t l, size_t o)
Find next legal UTF-8 char in string.
Definition: utf8.c:129

utf8_save_text
bool utf8_save_text(const char *utf8_text, const char *path)
Save the given utf8 text to a file, converting to local encoding.
Definition: utf8.c:467

cd
iconv_t cd
Iconv conversion descriptor.
Definition: utf8.c:145

utf8_bounded_length
size_t utf8_bounded_length(const char *s, size_t l)
Calculate the length (in characters) of a bounded UTF-8 string.
Definition: utf8.c:80

utf8_from_enc
nserror utf8_from_enc(const char *string, const char *encname, size_t len, char **result, size_t *result_len)
Convert a string in the named encoding into a UTF-8 string.
Definition: utf8.c:321

utf8_length
size_t utf8_length(const char *s)
Calculate the length (in characters) of a NULL-terminated UTF-8 string.
Definition: utf8.c:74

last_cd
static struct @151 last_cd

utf8_to_html
nserror utf8_to_html(const char *string, const char *encname, size_t len, char **result_out)
Convert a UTF-8 encoded string into a string of the given encoding, applying HTML escape sequences wh...
Definition: utf8.c:369

utf8_clear_cd_cache
static void utf8_clear_cd_cache(void)
Definition: utf8.c:148

get_cached_cd
static nserror get_cached_cd(const char *enc_from, const char *enc_to, iconv_t *cd_out)
obtain a cached conversion descriptor
Definition: utf8.c:161

utf8_to_enc
nserror utf8_to_enc(const char *string, const char *encname, size_t len, char **result)
Convert a UTF8 string into the named encoding.
Definition: utf8.c:314

from
char from[32]
Encoding name to convert from.
Definition: utf8.c:143

utf8_char_byte_length
size_t utf8_char_byte_length(const char *s)
Calculate the length (in bytes) of a UTF-8 character.
Definition: utf8.c:104

to
char to[32]
Encoding name to convert to.
Definition: utf8.c:144

utf8_bounded_byte_length
size_t utf8_bounded_byte_length(const char *s, size_t l, size_t c)
Calculate the length (in bytes) of a bounded UTF-8 string.
Definition: utf8.c:93

utf8_convert_html_chunk
static nserror utf8_convert_html_chunk(iconv_t cd, const char *chunk, size_t inlen, char **out, size_t *outlen)
convert a chunk of html data
Definition: utf8.c:331

utf8.h
UTF-8 manipulation functions (interface).

path
static nserror path(const struct redraw_context *ctx, const plot_style_t *pstyle, const float *p, unsigned int n, const float transform[6])
Plots a path.
Definition: plot.c:821