NetSurf
|
UTF-8 manipulation functions (implementation). More...
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <iconv.h>
#include <parserutils/charset/utf8.h>
#include "utils/config.h"
#include "utils/log.h"
#include "utils/utf8.h"
#include "netsurf/inttypes.h"
#include "netsurf/utf8.h"
#include "desktop/gui_internal.h"
Go to the source code of this file.
Functions | |
uint32_t | utf8_to_ucs4 (const char *s_in, size_t l) |
Convert a UTF-8 multibyte sequence into a single UCS4 character. More... | |
size_t | utf8_from_ucs4 (uint32_t c, char *s) |
Convert a single UCS4 character into a UTF-8 multibyte sequence. More... | |
size_t | utf8_length (const char *s) |
Calculate the length (in characters) of a NUL-terminated UTF-8 string. More... | |
size_t | utf8_bounded_length (const char *s, size_t l) |
Calculate the length (in characters) of a bounded UTF-8 string. More... | |
size_t | utf8_bounded_byte_length (const char *s, size_t l, size_t c) |
Calculate the length (in bytes) of a bounded UTF-8 string. More... | |
size_t | utf8_char_byte_length (const char *s) |
Calculate the length (in bytes) of a UTF-8 character. More... | |
size_t | utf8_prev (const char *s, size_t o) |
Find previous legal UTF-8 char in string. More... | |
size_t | utf8_next (const char *s, size_t l, size_t o) |
Find next legal UTF-8 char in string. More... | |
static void | utf8_clear_cd_cache (void) |
static nserror | get_cached_cd (const char *enc_from, const char *enc_to, iconv_t *cd_out) |
Obtain a cached conversion descriptor. More... | |
nserror | utf8_finalise (void) |
Finalise the UTF-8 library. More... | |
static nserror | utf8_convert (const char *string, size_t slen, const char *from, const char *to, char **result_out, size_t *result_len_out) |
Convert a string from one encoding to another. More... | |
nserror | utf8_to_enc (const char *string, const char *encname, size_t len, char **result) |
Convert a UTF-8 string into the named encoding. More... | |
nserror | utf8_from_enc (const char *string, const char *encname, size_t len, char **result, size_t *result_len) |
Convert a string in the named encoding into a UTF-8 string. More... | |
static nserror | utf8_convert_html_chunk (iconv_t cd, const char *chunk, size_t inlen, char **out, size_t *outlen) |
Convert a chunk of HTML data. More... | |
nserror | utf8_to_html (const char *string, const char *encname, size_t len, char **result_out) |
Convert a UTF-8 encoded string into a string of the given encoding, applying HTML escape sequences where necessary. More... | |
bool | utf8_save_text (const char *utf8_text, const char *path) |
Save the given UTF-8 text to a file, converting to local encoding. More... | |
Variables | |
struct { | |
char from [32] | |
Encoding name to convert from. More... | |
char to [32] | |
Encoding name to convert to. More... | |
iconv_t cd | |
Iconv conversion descriptor. More... | |
} | last_cd |
UTF-8 manipulation functions (implementation).
Definition in file utf8.c.
|
static |
Obtain a cached conversion descriptor.
Either returns the cached conversion descriptor or creates one if required.
Definition at line 161 of file utf8.c.
References cd, last_cd, NSERROR_BAD_ENCODING, NSERROR_NOMEM, and NSERROR_OK.
Referenced by utf8_convert(), and utf8_to_html().
size_t utf8_bounded_byte_length | ( | const char * | s, |
size_t | l, | ||
size_t | c | ||
) |
Calculate the length (in bytes) of a bounded UTF-8 string.
s | The string |
l | Maximum length of input (in bytes) |
c | Maximum number of characters to measure |
Definition at line 93 of file utf8.c.
References utf8_next().
Referenced by textarea_insert_text(), and textarea_set_caret().
size_t utf8_bounded_length | ( | const char * | s, |
size_t | l | ||
) |
Calculate the length (in characters) of a bounded UTF-8 string.
s | The string |
l | Maximum length of input (in bytes) |
Definition at line 80 of file utf8.c.
Referenced by nsfont_width(), textarea_insert_text(), textarea_replace_text_internal(), and utf8_length().
size_t utf8_char_byte_length | ( | const char * | s | ) |
Calculate the length (in bytes) of a UTF-8 character.
s | Pointer to start of character |
Definition at line 104 of file utf8.c.
Referenced by ami_key_to_nskey().
|
inline static |
Definition at line 148 of file utf8.c.
References last_cd.
Referenced by utf8_convert(), utf8_finalise(), and utf8_to_html().
|
static |
Convert a string from one encoding to another.
string | The NULL-terminated string to convert |
slen | Length of input string to consider (in bytes), or 0 |
from | The encoding name to convert from |
to | The encoding name to convert to |
result_out | Pointer to location in which to store result. |
result_len_out | Pointer to location in which to store result length. |
Definition at line 222 of file utf8.c.
References cd, from, get_cached_cd(), last_cd, NSERROR_NOMEM, NSERROR_OK, result, strndup(), to, and utf8_clear_cd_cache().
Referenced by utf8_from_enc(), and utf8_to_enc().
|
static |
Convert a chunk of HTML data.
Definition at line 331 of file utf8.c.
References cd, NSERROR_NOMEM, NSERROR_OK, utf8_next(), and utf8_to_ucs4().
Referenced by utf8_to_html().
nserror utf8_finalise | ( | void | ) |
Finalise the UTF-8 library.
Definition at line 197 of file utf8.c.
References last_cd, NSERROR_OK, and utf8_clear_cd_cache().
Referenced by netsurf_exit().
nserror utf8_from_enc | ( | const char * | string, |
const char * | encname, | ||
size_t | len, | ||
char ** | result, | ||
size_t * | result_len | ||
) |
Convert a string in the named encoding into a UTF-8 string.
string | The NULL-terminated string to convert |
encname | The encoding name (suitable for passing to iconv) |
len | Length of input string to consider (in bytes), or 0 |
result | Pointer to location to store result (allocated on heap) |
result_len | The length of the data placed in result. |
Definition at line 321 of file utf8.c.
References result, and utf8_convert().
Referenced by ami_clipboard_cat_collection(), nsgtk_viewsource(), and utf8_from_local_encoding().
size_t utf8_from_ucs4 | ( | uint32_t | c, |
char * | s | ||
) |
Convert a single UCS4 character into a UTF-8 multibyte sequence.
Encoding of UCS values outside the UTF-16 plane has been removed from RFC3629. This function conforms to RFC2279, however.
c | The character to process (0 <= c <= 0x7FFFFFFF) |
s | Pointer to 6 byte long output buffer |
Definition at line 56 of file utf8.c.
Referenced by fire_dom_keyboard_event(), ro_textarea_key_press(), and textarea_keypress().
size_t utf8_length | ( | const char * | s | ) |
Calculate the length (in characters) of a NUL-terminated UTF-8 string.
s | The string |
Definition at line 74 of file utf8.c.
References utf8_bounded_length().
Referenced by ro_textarea_insert_text(), ro_textarea_key_press(), ro_textarea_replace_text(), ro_textarea_set_caret(), textarea_replace_text_internal(), and textarea_set_text().
size_t utf8_next | ( | const char * | s, |
size_t | l, | ||
size_t | o | ||
) |
Find next legal UTF-8 char in string.
s | The string |
l | Maximum offset in string |
o | Offset in the string to start at |
Definition at line 129 of file utf8.c.
Referenced by ami_font_bm_convert_local_to_utf8_offset(), amiga_nsfont_position_in_string(), amiga_nsfont_split(), fb_font_position(), fb_font_split(), fb_font_width(), framebuffer_plot_text(), nsgtk_cw_input_method_commit(), nsgtk_window_input_method_commit(), ro_gui_window_import_text(), ro_textarea_get_caret(), ro_textarea_insert_text(), ro_textarea_replace_text(), ro_textarea_set_caret(), ro_textarea_set_caret_xy(), textarea_char_to_byte_offset(), textarea_keypress(), textplain_coord_from_offset(), textplain_offset_from_coords(), textplain_redraw(), utf8_bounded_byte_length(), utf8_convert_html_chunk(), utf8_to_html(), and utf8_to_local_encoding().
size_t utf8_prev | ( | const char * | s, |
size_t | o | ||
) |
Find previous legal UTF-8 char in string.
s | The string |
o | Offset in the string to start at |
Definition at line 117 of file utf8.c.
Referenced by textarea_keypress().
bool utf8_save_text | ( | const char * | utf8_text, |
const char * | path | ||
) |
Save the given UTF-8 text to a file, converting to local encoding.
utf8_text | text to save to file |
path | pathname to save to |
Definition at line 467 of file utf8.c.
References guit, NSERROR_OK, NSLOG, path(), netsurf_table::utf8, and gui_utf8_table::utf8_to_local.
Referenced by ro_gui_save_content().
nserror utf8_to_enc | ( | const char * | string, |
const char * | encname, | ||
size_t | len, | ||
char ** | result | ||
) |
Convert a UTF-8 string into the named encoding.
string | The NULL-terminated string to convert |
encname | The encoding name (suitable for passing to iconv) |
len | Length of input string to consider (in bytes), or 0 |
result | Pointer to location to store result (allocated on heap) |
Definition at line 314 of file utf8.c.
References result, and utf8_convert().
Referenced by ami_font_unicode_width(), amiga_nsfont_position_in_string(), amiga_nsfont_split(), amiga_nsfont_text(), form_encode_item(), utf8_to_font_encoding(), utf8_to_local(), and utf8_to_local_encoding().
nserror utf8_to_html | ( | const char * | string, |
const char * | encname, | ||
size_t | len, | ||
char ** | result | ||
) |
Convert a UTF-8 encoded string into a string of the given encoding, applying HTML escape sequences where necessary.
string | String to convert (NUL-terminated) |
encname | Name of encoding to convert to |
len | Length, in bytes, of the input string, or 0 |
result | Pointer to location to receive result |
Definition at line 369 of file utf8.c.
References cd, get_cached_cd(), NSERROR_NOMEM, NSERROR_OK, result, utf8_clear_cd_cache(), utf8_convert_html_chunk(), and utf8_next().
Referenced by global_history_export_enter_cb(), hotlist_export_enter_cb(), save_complete_node_handler(), save_complete_rewrite_url_value(), and save_complete_write_value().
uint32_t utf8_to_ucs4 | ( | const char * | s, |
size_t | l | ||
) |
Convert a UTF-8 multibyte sequence into a single UCS4 character.
Encoding of UCS values outside the UTF-16 plane has been removed from RFC3629. This function conforms to RFC2279, however.
[in] | s | The sequence to process |
[in] | l | Length of sequence |
Definition at line 41 of file utf8.c.
Referenced by ami_key_to_nskey(), fb_font_position(), fb_font_split(), fb_font_width(), framebuffer_plot_text(), nsbeos_window_keypress_event(), nsgtk_cw_input_method_commit(), nsgtk_window_input_method_commit(), and utf8_convert_html_chunk().
iconv_t cd |
Iconv conversion descriptor.
Definition at line 145 of file utf8.c.
Referenced by get_cached_cd(), http_parse_content_disposition(), llcache_object_is_fresh(), llcache_object_rfc2616_remaining_lifetime(), utf8_convert(), utf8_convert_html_chunk(), and utf8_to_html().
char from[32] |
Encoding name to convert from.
Definition at line 143 of file utf8.c.
Referenced by bitmap__format_convert(), bitmap__format_convert_from_pma(), bitmap__format_convert_to_pma(), bitmap_format_convert(), bitmap_format_to_client(), nsbeos_rgba_to_bgra(), ro_gui_menu_window_changed(), ro_gui_wimp_event_transfer(), and utf8_convert().
struct { ... } last_cd |
Referenced by get_cached_cd(), utf8_clear_cd_cache(), utf8_convert(), and utf8_finalise().
char to[32] |
Encoding name to convert to.
Definition at line 144 of file utf8.c.
Referenced by bitmap__format_convert(), bitmap__format_convert_from_pma(), bitmap__format_convert_to_pma(), bitmap_format_convert(), bitmap_format_from_client(), nsbeos_rgba_to_bgra(), ro_gui_menu_window_changed(), ro_gui_send_datasave(), ro_gui_wimp_event_transfer(), and utf8_convert().