NetSurf
ucstables.c
Go to the documentation of this file.
1/*
2 * Copyright 2005 John M Bell <jmb202@ecs.soton.ac.uk>
3 *
4 * This file is part of NetSurf, http://www.netsurf-browser.org/
5 *
6 * NetSurf is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; version 2 of the License.
9 *
10 * NetSurf is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19/** \file
20 * UCS conversion tables and RISC OS-specific UTF-8 text handling
21 */
22
23#include <assert.h>
24#include <limits.h>
25#include <string.h>
26#include <stdlib.h>
27#include <oslib/osbyte.h>
28#include <oslib/territory.h>
29
30#include "utils/config.h"
31#include "utils/errors.h"
32#include "utils/log.h"
33#include "utils/utf8.h"
34#include "utils/utils.h"
35#include "netsurf/utf8.h"
36
37#include "riscos/ucstables.h"
38
/*
 * Common values (ASCII): the identity mapping for character codes
 * 0x00-0x7F, shared by every table below.
 *
 * The final line deliberately carries no line continuation, so the
 * definition ends here regardless of what follows it in the file.
 */
#define common \
	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, \
	 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, \
	 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, \
	 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, \
	 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, \
	 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, \
	 96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111, \
	112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127
49
/*
 * Character codes 0x8C to 0x9F (twenty entries), shared by many of the
 * encodings below.
 */
#define common2 \
	0x2026, 0x2122, 0x2030, 0x2022, 0x2018, \
	0x2019, 0x2039, 0x203A, 0x201C, 0x201D, \
	0x201E, 0x2013, 0x2014, 0x2212, 0x0152, \
	0x0153, 0x2020, 0x2021, 0xFB01, 0xFB02
55
/*
 * UCS table for the Latin1 alphabet; returned by ucstable_from_alphabet()
 * for territory_ALPHABET_LATIN1.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 */
static const int latin1_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 0x20ac, 0x0174, 0x0175, -1, -1, 0x0176, 0x0177, -1, -1, -1, -1, -1,
 common2, /* 0x8C-0x9F */
 /* 0xA0-0xFF: identity mapping */
 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
 224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
 240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
68
/*
 * UCS table for the Latin2 alphabet; returned by ucstable_from_alphabet()
 * for territory_ALPHABET_LATIN2.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 */
static const int latin2_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 0x20ac, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 common2, /* 0x8C-0x9F */
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
 0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
 0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
 0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
 0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
 0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
 0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
 0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
 0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
 0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9
};
87
/*
 * UCS table for the Latin3 alphabet; returned by ucstable_from_alphabet()
 * for territory_ALPHABET_LATIN3.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 */
static const int latin3_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 0x20ac, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 common2, /* 0x8C-0x9F */
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x0126, 0x02D8, 0x00A3, 0x00A4, -1, 0x0124, 0x00A7,
 0x00A8, 0x0130, 0x015E, 0x011E, 0x0134, 0x00AD, -1, 0x017B,
 0x00B0, 0x0127, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x0125, 0x00B7,
 0x00B8, 0x0131, 0x015F, 0x011F, 0x0135, 0x00BD, -1, 0x017C,
 0x00C0, 0x00C1, 0x00C2, -1, 0x00C4, 0x010A, 0x0108, 0x00C7,
 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
 -1, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x0120, 0x00D6, 0x00D7,
 0x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x016C, 0x015C, 0x00DF,
 0x00E0, 0x00E1, 0x00E2, -1, 0x00E4, 0x010B, 0x0109, 0x00E7,
 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
 -1, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6, 0x00F7,
 0x011D, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x016D, 0x015D, 0x02D9
};
106
/*
 * UCS table for the Latin4 alphabet; returned by ucstable_from_alphabet()
 * for territory_ALPHABET_LATIN4.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 */
static const int latin4_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 0x20ac, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 common2, /* 0x8C-0x9F */
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x0104, 0x0138, 0x0156, 0x00A4, 0x0128, 0x013B, 0x00A7,
 0x00A8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00AD, 0x017D, 0x00AF,
 0x00B0, 0x0105, 0x02DB, 0x0157, 0x00B4, 0x0129, 0x013C, 0x02C7,
 0x00B8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014A, 0x017E, 0x014B,
 0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E,
 0x010C, 0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x012A,
 0x0110, 0x0145, 0x014C, 0x0136, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
 0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC, 0x0168, 0x016A, 0x00DF,
 0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F,
 0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x012B,
 0x0111, 0x0146, 0x014D, 0x0137, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
 0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC, 0x0169, 0x016B, 0x02D9
};
125
/*
 * UCS table for the Latin5 alphabet; returned by ucstable_from_alphabet()
 * for territory_ALPHABET_LATIN5.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 */
static const int latin5_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 0x20ac, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 common2, /* 0x8C-0x9F */
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
 0x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0130, 0x015E, 0x00DF,
 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
 0x011F, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0131, 0x015F, 0x00FF
};
144
/*
 * UCS table for the Latin6 alphabet; returned by ucstable_from_alphabet()
 * for territory_ALPHABET_LATIN6.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 */
static const int latin6_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 0x20ac, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 common2, /* 0x8C-0x9F */
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x0104, 0x0112, 0x0122, 0x012A, 0x0128, 0x0136, 0x00A7,
 0x013B, 0x0110, 0x0160, 0x0166, 0x017D, 0x00AD, 0x016A, 0x014A,
 0x00B0, 0x0105, 0x0113, 0x0123, 0x012B, 0x0129, 0x0137, 0x00B7,
 0x013C, 0x0111, 0x0161, 0x0167, 0x017E, 0x2015, 0x016B, 0x014B,
 0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E,
 0x010C, 0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x00CF,
 0x00D0, 0x0145, 0x014C, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x0168,
 0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
 0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F,
 0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x00EF,
 0x00F0, 0x0146, 0x014D, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x0169,
 0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x0138
};
163
/*
 * UCS table for the Latin7 alphabet; returned by ucstable_from_alphabet()
 * for alphabet number 114.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 *
 * The 0x8C-0x9F range is spelled out rather than taken from common2:
 * it holds the same values except that 0x2019, 0x201c, 0x201d and
 * 0x201e are replaced by -1 (those values appear in the upper half of
 * this table instead).
 */
static const int latin7_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 0x20ac, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 /* 0x8C-0x9F */
 0x2026, 0x2122, 0x2030, 0x2022, 0x2018, -1, 0x2039, 0x203a,
 -1, -1, -1, 0x2013, 0x2014, 0x2212, 0x0152, 0x0153,
 0x2020, 0x2021, 0xfb01, 0xfb02,
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x201D, 0x00A2, 0x00A3, 0x00A4, 0x201E, 0x00A6, 0x00A7,
 0x00D8, 0x00A9, 0x0156, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00C6,
 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x201C, 0x00B5, 0x00B6, 0x00B7,
 0x00F8, 0x00B9, 0x0157, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00E6,
 0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112,
 0x010C, 0x00C9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012A, 0x013B,
 0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7,
 0x0172, 0x0141, 0x015A, 0x016A, 0x00DC, 0x017B, 0x017D, 0x00DF,
 0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113,
 0x010D, 0x00E9, 0x017A, 0x0117, 0x0123, 0x0137, 0x012B, 0x013C,
 0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7,
 0x0173, 0x0142, 0x015B, 0x016B, 0x00FC, 0x017C, 0x017E, 0x2019
};
184
/*
 * UCS table for the Latin8 alphabet; returned by ucstable_from_alphabet()
 * for alphabet number 115.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 */
static const int latin8_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 0x20ac, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 common2, /* 0x8C-0x9F */
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7,
 0x1E80, 0x00A9, 0x1E82, 0x1E0B, 0x1EF2, 0x00AD, 0x00AE, 0x0178,
 0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56,
 0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61,
 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
 0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A,
 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF,
 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
 0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B,
 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF
};
203
/*
 * UCS table for the Latin9 alphabet; returned by ucstable_from_alphabet()
 * for territory_ALPHABET_LATIN9.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 *
 * The 0x80-0x9F range is spelled out rather than built from common2:
 * 0x80 is -1 here, and the 0x0152/0x0153 slots of common2 are -1
 * (0x20AC, 0x0152 and 0x0153 appear in the upper half of this table
 * instead).
 */
static const int latin9_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 -1, 0x0174, 0x0175, -1, -1, 0x0176, 0x0177, -1, -1, -1, -1, -1,
 /* 0x8C-0x9F */
 0x2026, 0x2122, 0x2030, 0x2022, 0x2018, 0x2019, 0x2039, 0x203a,
 0x201c, 0x201d, 0x201e, 0x2013, 0x2014, 0x2212, -1, -1,
 0x2020, 0x2021, 0xfb01, 0xfb02,
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AC, 0x00A5, 0x0160, 0x00A7,
 0x0161, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x017D, 0x00B5, 0x00B6, 0x00B7,
 0x017E, 0x00B9, 0x00BA, 0x00BB, 0x0152, 0x0153, 0x0178, 0x00BF,
 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF
};
224
/*
 * UCS table for the Latin10 alphabet; returned by
 * ucstable_from_alphabet() for alphabet number 116.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 *
 * The whole 0x80-0x8B range is -1, and the 0x8C-0x9F range is spelled
 * out rather than taken from common2 because the 0x201d, 0x201e,
 * 0x0152 and 0x0153 slots are -1 (0x201D, 0x201E, 0x0152 and 0x0153
 * appear in the upper half of this table instead).
 */
static const int latin10_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 /* 0x8C-0x9F */
 0x2026, 0x2122, 0x2030, 0x2022, 0x2018, 0x2019, 0x2039, 0x203a,
 0x201c, -1, -1, 0x2013, 0x2014, 0x2212, -1, -1,
 0x2020, 0x2021, 0xfb01, 0xfb02,
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00a7,
 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF
};
245
/*
 * UCS table for the Welsh alphabet; returned by ucstable_from_alphabet()
 * for territory_ALPHABET_WELSH.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.
 */
static const int welsh_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x8B */
 0x20ac, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 common2, /* 0x8C-0x9F */
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
 0x1E80, 0x00A9, 0x1E82, 0x00AB, 0x1EF2, 0x00AD, 0x00AE, 0x0178,
 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
 0x1E81, 0x00B9, 0x1E83, 0x00BB, 0x1EF3, 0x1E84, 0x1E85, 0x00BF,
 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
 0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF,
 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
 0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF
};
264
/*
 * UCS table for the Greek alphabet; returned by ucstable_from_alphabet()
 * for territory_ALPHABET_GREEK.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.  The whole 0x80-0x9F
 * range is -1 in this table.
 */
static const int greek_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x9F */
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x2018, 0x2019, 0x00A3, 0x20AC, 0x20AF, 0x00A6, 0x00A7,
 0x00A8, 0x00A9, 0x037A, 0x00AB, 0x00AC, 0x00AD, 0x037E, 0x2015,
 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x0385, 0x0386, 0x0387,
 0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
 0x03A0, 0x03A1, -1, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7,
 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7,
 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7,
 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, -1
};
283
/*
 * UCS table for the Cyrillic alphabet; returned by
 * ucstable_from_alphabet() for territory_ALPHABET_CYRILLIC.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.  The whole 0x80-0x9F
 * range is -1 in this table.
 */
static const int cyrillic_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x9F */
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
 0x0408, 0x0409, 0x040A, 0x040B, 0x040C, 0x00AD, 0x040E, 0x040F,
 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,
 0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x00A7, 0x045E, 0x045F
};
302
/*
 * UCS table for the Hebrew alphabet; returned by ucstable_from_alphabet()
 * for territory_ALPHABET_HEBREW.
 *
 * 256 entries, one per 8-bit character code, each holding a UCS value.
 * -1 presumably marks a character code with no mapping -- confirm
 * against the code that consumes these tables.  The whole 0x80-0x9F
 * range is -1 in this table, as is most of 0xBF-0xDE.
 */
static const int hebrew_table[256] =
{
 common, /* 0x00-0x7F */
 /* 0x80-0x9F */
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 /* 0xA0-0xFF, eight entries per row */
 0x00A0, -1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
 0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x203E,
 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
 0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, -1,
 -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, 0x2017,
 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
 0x05E8, 0x05E9, 0x05EA, -1, -1, 0x200E, 0x200F, -1
};
321
322/**
323 * Retrieve UCS table (above), given alphabet number
324 *
325 * \param alphabet The RISC OS alphabet number
326 * \return pointer to table, or NULL if not found
327 */
328const int *ucstable_from_alphabet(int alphabet)
329{
330 const int *ucstable = NULL;
331
332 switch (alphabet) {
333 case territory_ALPHABET_LATIN1:
334 ucstable = latin1_table;
335 break;
336 case territory_ALPHABET_LATIN2:
337 ucstable = latin2_table;
338 break;
339 case territory_ALPHABET_LATIN3:
340 ucstable = latin3_table;
341 break;
342 case territory_ALPHABET_LATIN4:
343 ucstable = latin4_table;
344 break;
345 case territory_ALPHABET_LATIN5:
346 ucstable = latin5_table;
347 break;
348 case territory_ALPHABET_LATIN6:
349 ucstable = latin6_table;
350 break;
351 case 114: /* Latin7 */
352 ucstable = latin7_table;
353 break;
354 case 115: /* Latin8 */
355 ucstable = latin8_table;
356 break;
357 case 116: /* Latin10 */
358 ucstable = latin10_table;
359 break;
360 case territory_ALPHABET_LATIN9:
361 ucstable = latin9_table;
362 break;
363 case territory_ALPHABET_WELSH:
364 ucstable = welsh_table;
365 break;
366 case territory_ALPHABET_GREEK:
367 ucstable = greek_table;
368 break;
369 case territory_ALPHABET_CYRILLIC:
370 ucstable = cyrillic_table;
371 break;
372 case territory_ALPHABET_HEBREW:
373 ucstable = hebrew_table;
374 break;
375 default:
376 ucstable = NULL;
377 break;
378 }
379
380 return ucstable;
381}
382
383
/*
 * Encoding names handed to utf8_to_enc() / utf8_from_enc(), indexed by
 * (alphabet number - territory_ALPHABET_BFONT); the first entry is
 * alphabet 100 and the last is alphabet 120.  A NULL entry marks an
 * unused alphabet number, for which the converters fall back to the
 * first entry.
 *
 * Both the strings and the array itself are const: nothing modifies
 * this table at run time.
 */
static const char *const localencodings[] = {
	"ISO-8859-1//TRANSLIT", /* 100 - BFont - just use Latin1, instead */
	"ISO-8859-1//TRANSLIT", /* 101 */
	"ISO-8859-2//TRANSLIT", /* 102 */
	"ISO-8859-3//TRANSLIT", /* 103 */
	"ISO-8859-4//TRANSLIT", /* 104 */
	"ISO-8859-5//TRANSLIT", /* 105 */
	"ISO-8859-6//TRANSLIT", /* 106 */
	"ISO-8859-7//TRANSLIT", /* 107 */
	"ISO-8859-8//TRANSLIT", /* 108 */
	"ISO-8859-9//TRANSLIT", /* 109 */
	"ISO-IR-182//TRANSLIT", /* 110 */
	"UTF-8", /* 111 */
	"ISO-8859-15//TRANSLIT", /* 112 */
	"ISO-8859-10//TRANSLIT", /* 113 */
	"ISO-8859-13//TRANSLIT", /* 114 - Latin7 */
	"ISO-8859-14//TRANSLIT", /* 115 - Latin8 */
	"ISO-8859-16//TRANSLIT", /* 116 - Latin10 */
	NULL, /* 117 - UTF-16, if you believe HdrSrc (Unused) */
	NULL, /* 118 - Unused */
	NULL, /* 119 - Unused */
	"CP866//TRANSLIT" /* 120 - Cyrillic2 */
};
407
/*
 * These are the Acorn Latin1 C1 block between [0x80,0x9f], as UTF-8
 * byte sequences.  Entry i corresponds to character code 0x80 + i;
 * NULL marks a code with no character assigned.
 *
 * Every non-NULL entry is either a two-byte sequence starting 0xC5 or
 * a three-byte sequence starting 0xE2 or 0xEF; the converters below
 * rely on that to derive the sequence length from the first byte.
 *
 * Both the strings and the array itself are const: nothing modifies
 * this table at run time.
 */
static const char *const special_chars[] = {
	"\xE2\x82\xAC", /* 0x80 EURO SIGN */
	"\xC5\xB4", /* 0x81 LATIN CAPITAL LETTER W WITH CIRCUMFLEX */
	"\xC5\xB5", /* 0x82 LATIN SMALL LETTER W WITH CIRCUMFLEX */
	NULL, /* 0x83 unused */
	"\xE2\x9C\x98", /* 0x84 HEAVY BALLOT X */
	"\xC5\xB6", /* 0x85 LATIN CAPITAL LETTER Y WITH CIRCUMFLEX */
	"\xC5\xB7", /* 0x86 LATIN SMALL LETTER Y WITH CIRCUMFLEX */
	NULL, /* 0x87 unused */
	"\xE2\x87\x90", /* 0x88 LEFTWARDS DOUBLE ARROW */
	"\xE2\x87\x92", /* 0x89 RIGHTWARDS DOUBLE ARROW */
	"\xE2\x87\x93", /* 0x8A DOWNWARDS DOUBLE ARROW */
	"\xE2\x87\x91", /* 0x8B UPWARDS DOUBLE ARROW */
	"\xE2\x80\xA6", /* 0x8C HORIZONTAL ELLIPSIS */
	"\xE2\x84\xA2", /* 0x8D TRADE MARK SIGN */
	"\xE2\x80\xB0", /* 0x8E PER MILLE SIGN */
	"\xE2\x80\xA2", /* 0x8F BULLET */
	"\xE2\x80\x98", /* 0x90 LEFT SINGLE QUOTATION MARK */
	"\xE2\x80\x99", /* 0x91 RIGHT SINGLE QUOTATION MARK */
	"\xE2\x80\xB9", /* 0x92 SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
	"\xE2\x80\xBA", /* 0x93 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
	"\xE2\x80\x9C", /* 0x94 LEFT DOUBLE QUOTATION MARK */
	"\xE2\x80\x9D", /* 0x95 RIGHT DOUBLE QUOTATION MARK */
	"\xE2\x80\x9E", /* 0x96 DOUBLE LOW-9 QUOTATION MARK */
	"\xE2\x80\x93", /* 0x97 EN DASH */
	"\xE2\x80\x94", /* 0x98 EM DASH */
	"\xE2\x88\x92", /* 0x99 MINUS SIGN */
	"\xC5\x92", /* 0x9A LATIN CAPITAL LIGATURE OE */
	"\xC5\x93", /* 0x9B LATIN SMALL LIGATURE OE */
	"\xE2\x80\xA0", /* 0x9C DAGGER */
	"\xE2\x80\xA1", /* 0x9D DOUBLE DAGGER */
	"\xEF\xAC\x81", /* 0x9E LATIN SMALL LIGATURE FI */
	"\xEF\xAC\x82" /* 0x9F LATIN SMALL LIGATURE FL */
};
443
444
445/**
446 * Convert a UTF-8 encoded string into the system local encoding
447 *
448 * \param string The string to convert
449 * \param len The length (in bytes) of the string, or 0
450 * \param result Pointer to location in which to store result
451 * \return An nserror code
452 */
453nserror utf8_to_local_encoding(const char *string, size_t len, char **result)
454{
455 os_error *error;
456 int alphabet, i;
457 size_t off, prev_off;
458 char *temp, *cur_pos;
459 const char *enc;
460 nserror err;
461
462 assert(string);
463 assert(result);
464
465 /* get length, if necessary */
466 if (len == 0)
467 len = strlen(string);
468
469 /* read system alphabet */
470 error = xosbyte1(osbyte_ALPHABET_NUMBER, 127, 0, &alphabet);
471 /* Assume Latin1 for anything we know nothing about */
472 if (error || alphabet < territory_ALPHABET_BFONT ||
473 alphabet > territory_ALPHABET_CYRILLIC2)
474 alphabet = territory_ALPHABET_LATIN1;
475
476 /* UTF-8 -> simply copy string */
477 if (alphabet == territory_ALPHABET_UTF8) {
478 *result = strndup(string, len);
479 return NSERROR_OK;
480 }
481
482 /* get encoding name */
483 enc = localencodings[alphabet - territory_ALPHABET_BFONT];
484 /* Assume Latin1 for any that are unused */
485 if (enc == NULL)
486 enc = localencodings[0];
487
488 /* create output buffer */
489 *(result) = malloc(len + 1);
490 if (!(*result))
491 return NSERROR_NOMEM;
492 *(*result) = '\0';
493
494 prev_off = 0;
495 cur_pos = (*result);
496
497 /* Iterate over string, converting input between unconvertable
498 * characters and inserting appropriate output for characters
499 * that iconv can't handle. */
500 for (off = 0; off < len; off = utf8_next(string, len, off)) {
501 /* Specials only start with C5/E2/EF */
502 if (string[off] != 0xC5 &&
503 string[off] != 0xE2 && string[off] != 0xEF)
504 continue;
505
506 /* Ignore truncated input */
507 if (off + 2 + (string[off] == 0xC5 ? 0 : 1) >= len)
508 continue;
509
510 /* Search to see if this character is special */
511 for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
512 /* Skip unused special char */
513 if (special_chars[i] == NULL)
514 continue;
515
516 /* Skip 2-byte non-match */
517 if (string[off] == 0xC5 &&
518 (string[off] != special_chars[i][0] ||
519 string[off+1] != special_chars[i][1]))
520 continue;
521
522 /* Skip 3-byte non-match */
523 if (string[off] != 0xC5 &&
524 (string[off] != special_chars[i][0] ||
525 string[off+1] != special_chars[i][1] ||
526 string[off+2] != special_chars[i][2]))
527 continue;
528
529 /* 0 length has a special meaning to utf8_to_enc */
530 if (off - prev_off > 0) {
531 err = utf8_to_enc(string + prev_off, enc,
532 off - prev_off, &temp);
533 if (err != NSERROR_OK) {
534 assert(err != NSERROR_BAD_ENCODING);
535 free(*result);
536 return NSERROR_NOMEM;
537 }
538
539 strcat(cur_pos, temp);
540
541 cur_pos += strlen(temp);
542
543 free(temp);
544 }
545
546 /* Emit conversion for this special character */
547 *cur_pos = 0x80 + i;
548 *(++cur_pos) = '\0';
549 prev_off = off + 2 + (string[off] == 0xC5 ? 0 : 1);
550
551 /* Return to outer loop to process remaining input */
552 break;
553 }
554 }
555
556 /* handle last chunk
557 * NB. 0 length has a special meaning to utf8_to_enc */
558
559 if (prev_off < len) {
560 err = utf8_to_enc(string + prev_off, enc, len - prev_off,
561 &temp);
562 if (err != NSERROR_OK) {
563 assert(err != NSERROR_BAD_ENCODING);
564 free(*result);
565 return NSERROR_NOMEM;
566 }
567
568 strcat(cur_pos, temp);
569
570 free(temp);
571 }
572
573 return NSERROR_OK;
574}
575
576/**
577 * Convert a string encoded in the system local encoding to UTF-8
578 *
579 * \param string The string to convert
580 * \param len The length (in bytes) of the string, or 0
581 * \param result Pointer to location in which to store result
582 * \return An nserror code
583 */
584nserror utf8_from_local_encoding(const char *string, size_t len, char **result)
585{
586 os_error *error;
587 int alphabet, num_specials = 0, result_alloc;
588#define SPECIAL_CHUNK_SIZE 255
589 size_t off, prev_off, cur_off;
590 char *temp;
591 const char *enc;
592 nserror err;
593
594 assert(string && result);
595
596 /* get length, if necessary */
597 if (len == 0)
598 len = strlen(string);
599
600 /* read system alphabet */
601 error = xosbyte1(osbyte_ALPHABET_NUMBER, 127, 0, &alphabet);
602 /* Assume Latin1 for anything we know nothing about */
603 if (error || alphabet < territory_ALPHABET_BFONT ||
604 alphabet > territory_ALPHABET_CYRILLIC2)
605 alphabet = territory_ALPHABET_LATIN1;
606
607 /* UTF-8 -> simply copy string */
608 if (alphabet == territory_ALPHABET_UTF8) {
609 temp = strndup(string, len);
610 if (!temp)
611 return NSERROR_NOMEM;
612
613 *result = temp;
614 return NSERROR_OK;
615 }
616
617 /* get encoding name */
618 enc = localencodings[alphabet - territory_ALPHABET_BFONT];
619 /* Assume Latin1 for any that are unused */
620 if (enc == NULL)
621 enc = localencodings[0];
622
623 /* create output buffer (oversized) */
624 result_alloc = (len * 4) + (3 * SPECIAL_CHUNK_SIZE) + 1;
625
626 *(result) = malloc(result_alloc);
627 if (!(*result))
628 return NSERROR_NOMEM;
629 *(*result) = '\0';
630
631 prev_off = 0;
632 cur_off = 0;
633
634 /* Iterate over string, converting input between unconvertable
635 * characters and inserting appropriate output for characters
636 * that iconv can't handle. */
637 for (off = 0; off < len; off++) {
638 /* Skip non-special characters */
639 if (string[off] < 0x80 || string[off] > 0x9f)
640 continue;
641
642 /* 0 length has a special meaning to utf8_from_enc */
643 if (off - prev_off > 0) {
644 err = utf8_from_enc(string + prev_off, enc,
645 off - prev_off, &temp, NULL);
646 if (err != NSERROR_OK) {
647 assert(err != NSERROR_BAD_ENCODING);
648 NSLOG(netsurf, INFO, "utf8_from_enc failed");
649 free(*result);
650 return NSERROR_NOMEM;
651 }
652
653 strcat((*result) + cur_off, temp);
654
655 cur_off += strlen(temp);
656
657 free(temp);
658 }
659
660 /* Append UTF-8 encoded special character or U+FFFD if none */
661 if (special_chars[string[off]-0x80] != NULL) {
662 const char *special = special_chars[string[off]-0x80];
663 strcat((*result) + cur_off, special);
664 cur_off += 2 + (special[0] == 0xC5 ? 0 : 1);
665 } else {
666 strcat((*result) + cur_off, "\xef\xbf\xbd");
667 cur_off += 3;
668 }
669
670 prev_off = off + 1;
671
672 /* Resize output buffer if necessary */
673 num_specials++;
674 if (num_specials % SPECIAL_CHUNK_SIZE ==
675 SPECIAL_CHUNK_SIZE - 1) {
676 char *temp = realloc((*result),
677 result_alloc +
678 (3 * SPECIAL_CHUNK_SIZE));
679 if (!temp) {
680 free(*result);
681 return NSERROR_NOMEM;
682 }
683
684 *result = temp;
685 result_alloc += (3 * SPECIAL_CHUNK_SIZE);
686 }
687 }
688
689 /* handle last chunk
690 * NB. 0 length has a special meaning to utf8_from_enc */
691 if (prev_off < len) {
692 err = utf8_from_enc(string + prev_off, enc, len - prev_off,
693 &temp, NULL);
694 if (err != NSERROR_OK) {
695 assert(err != NSERROR_BAD_ENCODING);
696 NSLOG(netsurf, INFO, "utf8_from_enc failed");
697 free(*result);
698 return NSERROR_NOMEM;
699 }
700
701 strcat((*result) + cur_off, temp);
702
703 cur_off += strlen(temp);
704
705 free(temp);
706 }
707
708 /* and copy into more reasonably-sized buffer */
709 temp = realloc((*result), cur_off + 1);
710 if (!temp) {
711 NSLOG(netsurf, INFO, "realloc failed");
712 free(*result);
713 return NSERROR_NOMEM;
714 }
715 *result = temp;
716
717 return NSERROR_OK;
718}
719
720static struct gui_utf8_table utf8_table = {
722 .local_to_utf8 = utf8_from_local_encoding,
723};
724
STATIC char result[100]
Definition: arexx.c:77
char * strndup(const char *s, size_t n)
Duplicate up to n characters of a string.
Definition: utils.c:332
Error codes.
nserror
Enumeration of error codes.
Definition: errors.h:29
@ NSERROR_BAD_ENCODING
The character set is unknown.
Definition: errors.h:45
@ NSERROR_NOMEM
Memory exhaustion.
Definition: errors.h:32
@ NSERROR_OK
No error.
Definition: errors.h:30
#define NOF_ELEMENTS(array)
Definition: search.c:67
Interface to platform-specific utf8 operations.
#define NSLOG(catname, level, logmsg, args...)
Definition: log.h:116
Interface to utility string handling.
User interface utf8 characterset conversion routines.
Definition: utf8.h:31
nserror(* utf8_to_local)(const char *string, size_t len, char **result)
Convert a UTF-8 encoded string into the system local encoding.
Definition: utf8.h:40
static const int latin3_table[256]
Definition: ucstables.c:88
static const int latin5_table[256]
Definition: ucstables.c:126
static struct gui_utf8_table utf8_table
Definition: ucstables.c:720
static const int greek_table[256]
Definition: ucstables.c:265
static const int latin2_table[256]
Definition: ucstables.c:69
static const int latin1_table[256]
Definition: ucstables.c:56
struct gui_utf8_table * riscos_utf8_table
Definition: ucstables.c:725
#define common2
Definition: ucstables.c:51
#define common
Definition: ucstables.c:40
static const char * special_chars[]
Definition: ucstables.c:409
static const int latin10_table[256]
Definition: ucstables.c:225
nserror utf8_to_local_encoding(const char *string, size_t len, char **result)
Convert a UTF-8 encoded string into the system local encoding.
Definition: ucstables.c:453
static const int latin7_table[256]
Definition: ucstables.c:164
static const int welsh_table[256]
Definition: ucstables.c:246
static const int latin6_table[256]
Definition: ucstables.c:145
static const int hebrew_table[256]
Definition: ucstables.c:303
nserror utf8_from_local_encoding(const char *string, size_t len, char **result)
Convert a string encoded in the system local encoding to UTF-8.
Definition: ucstables.c:584
const int * ucstable_from_alphabet(int alphabet)
Retrieve UCS table (above), given alphabet number.
Definition: ucstables.c:328
static const int cyrillic_table[256]
Definition: ucstables.c:284
static const int latin4_table[256]
Definition: ucstables.c:107
static const int latin8_table[256]
Definition: ucstables.c:185
static const char * localencodings[]
Definition: ucstables.c:384
#define SPECIAL_CHUNK_SIZE
static const int latin9_table[256]
Definition: ucstables.c:204
UCS conversion tables (interface) This is only used if nothing claims Service_International,...
size_t utf8_next(const char *s, size_t l, size_t o)
Find next legal UTF-8 char in string.
Definition: utf8.c:129
nserror utf8_from_enc(const char *string, const char *encname, size_t len, char **result, size_t *result_len)
Convert a string in the named encoding into a UTF-8 string.
Definition: utf8.c:321
nserror utf8_to_enc(const char *string, const char *encname, size_t len, char **result)
Convert a UTF8 string into the named encoding.
Definition: utf8.c:314
UTF-8 manipulation functions (interface).
Interface to a number of general purpose functionality.