NetSurf
utf8.h
Go to the documentation of this file.
1/*
2 * Copyright 2005 John M Bell <jmb202@ecs.soton.ac.uk>
3 *
4 * This file is part of NetSurf, http://www.netsurf-browser.org/
5 *
6 * NetSurf is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; version 2 of the License.
9 *
10 * NetSurf is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19/** \file
20 * UTF-8 manipulation functions (interface).
21 */
22
23#ifndef _NETSURF_UTILS_UTF8_H_
24#define _NETSURF_UTILS_UTF8_H_
25
26#include <stdbool.h>
27#include <stdint.h>
28
29#include "utils/errors.h"
30
31/**
32 * Convert a UTF-8 multibyte sequence into a single UCS4 character
33 *
34 * RFC 3629 removed the encoding of UCS values beyond the UTF-16 range
35 * (above U+10FFFF). This function follows the older RFC 2279, however.
36 *
37 * \param[in] s The sequence to process
38 * \param[in] l Length of sequence
39 * \return UCS4 character
40 */
41uint32_t utf8_to_ucs4(const char *s, size_t l);
42
43/**
44 * Convert a single UCS4 character into a UTF-8 multibyte sequence
45 *
46 * RFC 3629 removed the encoding of UCS values beyond the UTF-16 range
47 * (above U+10FFFF). This function follows the older RFC 2279, however.
48 *
49 * \param c The character to process (0 <= c <= 0x7FFFFFFF)
50 * \param s Pointer to 6 byte long output buffer
51 * \return Length of multibyte sequence
52 */
53size_t utf8_from_ucs4(uint32_t c, char *s);
54
55
56/**
57 * Calculate the length (in characters) of a NUL-terminated UTF-8 string
58 *
59 * \param s The string
60 * \return Length of string
61 */
62size_t utf8_length(const char *s);
63
64/**
65 * Calculate the length (in characters) of a bounded UTF-8 string
66 *
67 * \param s The string
68 * \param l Maximum length of input (in bytes)
69 * \return Length of string, in characters
70 */
71size_t utf8_bounded_length(const char *s, size_t l);
72
73/**
74 * Calculate the length (in bytes) of a bounded UTF-8 string
75 *
76 * \param s The string
77 * \param l Maximum length of input (in bytes)
78 * \param c Maximum number of characters to measure
79 * \return Length of string, in bytes
80 */
81size_t utf8_bounded_byte_length(const char *s, size_t l, size_t c);
82
83/**
84 * Calculate the length (in bytes) of a UTF-8 character
85 *
86 * \param s Pointer to start of character
87 * \return Length of character, in bytes
88 */
89size_t utf8_char_byte_length(const char *s);
90
91
92/**
93 * Find previous legal UTF-8 char in string
94 *
95 * \param s The string
96 * \param o Offset in the string to start at
97 * \return Offset of first byte of previous legal character
98 */
99size_t utf8_prev(const char *s, size_t o);
100
101/**
102 * Find next legal UTF-8 char in string
103 *
104 * \param s The string
105 * \param l Maximum offset in string
106 * \param o Offset in the string to start at
107 * \return Offset of first byte of next legal character
108 */
109size_t utf8_next(const char *s, size_t l, size_t o);
110
111
112/**
113 * Convert a UTF8 string into the named encoding
114 *
115 * \param string The NUL-terminated string to convert
116 * \param encname The encoding name (suitable for passing to iconv)
117 * \param len Length of input string to consider (in bytes), or 0
118 * \param result Pointer to location to store result (allocated on heap)
119 * \return standard nserror value
120 */
121nserror utf8_to_enc(const char *string, const char *encname,
122 size_t len, char **result);
123
124/**
125 * Convert a string in the named encoding into a UTF-8 string
126 *
127 * \param string The NUL-terminated string to convert
128 * \param encname The encoding name (suitable for passing to iconv)
129 * \param len Length of input string to consider (in bytes), or 0
130 * \param result Pointer to location to store result (allocated on heap)
131 * \param result_len The length of the data placed in result.
132 * \return standard nserror value
133 */
134nserror utf8_from_enc(const char *string, const char *encname,
135 size_t len, char **result, size_t *result_len);
136
137/**
138 * Convert a UTF-8 encoded string into a string of the given encoding,
139 * applying HTML escape sequences where necessary.
140 *
141 * \param string String to convert (NUL-terminated)
142 * \param encname Name of encoding to convert to
143 * \param len Length, in bytes, of the input string, or 0
144 * \param result Pointer to location to receive result
145 * \return standard nserror code
146 */
147nserror utf8_to_html(const char *string, const char *encname,
148 size_t len, char **result);
149
150/**
151 * Save the given utf8 text to a file, converting to local encoding.
152 *
153 * \param utf8_text text to save to file
154 * \param path pathname to save to
155 * \return true iff the save succeeded
156 */
157bool utf8_save_text(const char *utf8_text, const char *path);
158
159
160/**
161 * Finalise the UTF-8 library
162 */
163nserror utf8_finalise(void);
164
165#endif
STATIC char result[100]
Definition: arexx.c:77
Error codes.
nserror
Enumeration of error codes.
Definition: errors.h:29
size_t utf8_prev(const char *s, size_t o)
Find previous legal UTF-8 char in string.
Definition: utf8.c:117
nserror utf8_finalise(void)
Finalise the UTF-8 library.
Definition: utf8.c:197
size_t utf8_from_ucs4(uint32_t c, char *s)
Convert a single UCS4 character into a UTF-8 multibyte sequence.
Definition: utf8.c:56
size_t utf8_next(const char *s, size_t l, size_t o)
Find next legal UTF-8 char in string.
Definition: utf8.c:129
bool utf8_save_text(const char *utf8_text, const char *path)
Save the given utf8 text to a file, converting to local encoding.
Definition: utf8.c:467
uint32_t utf8_to_ucs4(const char *s, size_t l)
Convert a UTF-8 multibyte sequence into a single UCS4 character.
Definition: utf8.c:41
size_t utf8_bounded_length(const char *s, size_t l)
Calculate the length (in characters) of a bounded UTF-8 string.
Definition: utf8.c:80
nserror utf8_from_enc(const char *string, const char *encname, size_t len, char **result, size_t *result_len)
Convert a string in the named encoding into a UTF-8 string.
Definition: utf8.c:321
size_t utf8_length(const char *s)
Calculate the length (in characters) of a NUL-terminated UTF-8 string.
Definition: utf8.c:74
nserror utf8_to_enc(const char *string, const char *encname, size_t len, char **result)
Convert a UTF8 string into the named encoding.
Definition: utf8.c:314
nserror utf8_to_html(const char *string, const char *encname, size_t len, char **result)
Convert a UTF-8 encoded string into a string of the given encoding, applying HTML escape sequences where necessary.
Definition: utf8.c:369
size_t utf8_char_byte_length(const char *s)
Calculate the length (in bytes) of a UTF-8 character.
Definition: utf8.c:104
size_t utf8_bounded_byte_length(const char *s, size_t l, size_t c)
Calculate the length (in bytes) of a bounded UTF-8 string.
Definition: utf8.c:93
static nserror path(const struct redraw_context *ctx, const plot_style_t *pstyle, const float *p, unsigned int n, const float transform[6])
Plots a path.
Definition: plot.c:821