Edinburgh Speech Tools  2.1-release
charset.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
4 /* University of Edinburgh. */
5 /* */
6 /* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, */
7 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
8 /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
9 /* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
10 /* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF */
11 /* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
12 /* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
13 /* */
14 /*************************************************************************/
15 #include <stdio.h>
16 #include <stdlib.h>
17 
18 #ifdef FOR_LT
19 
20 #include "lt-memory.h"
21 
22 #define Malloc salloc
23 
24 #else
25 
26 #include "system.h"
27 
28 #endif
29 
30 #include "charset.h"
31 #include "string16.h"
32 
33 int iso_to_unicode[8][256]; /* latin-2 ... latin-9 */
34 int iso_max_val[8];
36 
/*
 * Unicode code points for byte values 0xa0..0xff of ISO-8859-2 through
 * ISO-8859-9 (row 0 is part 2, row 7 is part 9).  Entry [i][b - 0xa0]
 * is the code point for byte b; -00001 (i.e. -1) marks a byte that has
 * no character assigned.  init_charset() uses this table to build
 * iso_to_unicode, iso_max_val and unicode_to_iso; it is never modified.
 */

static const int latin_table[8][96] = {

/* ISO-8859-2 (Latin-2) */
{
    0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,
    0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,
    0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,
    0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,
    0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
    0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
    0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
    0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
    0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
    0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
    0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
    0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
},

/* ISO-8859-3 (Latin-3) */
{
    0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, -00001, 0x0124, 0x00a7,
    0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, -00001, 0x017b,
    0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7,
    0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, -00001, 0x017c,
    0x00c0, 0x00c1, 0x00c2, -00001, 0x00c4, 0x010a, 0x0108, 0x00c7,
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
    -00001, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7,
    0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,
    0x00e0, 0x00e1, 0x00e2, -00001, 0x00e4, 0x010b, 0x0109, 0x00e7,
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
    -00001, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7,
    0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9,
},

/* ISO-8859-4 (Latin-4) */
{
    0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7,
    0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,
    0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7,
    0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,
    0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,
    0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,
    0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,
    0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,
    0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,
    0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
    0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9,
},

/* ISO-8859-5 (Cyrillic) */
{
    0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
    0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f,
    0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
    0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
    0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
    0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
    0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
    0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
    0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
    0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
    0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,
    0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f,
},

/* ISO-8859-6 (Arabic) */
{
    0x00a0, -00001, -00001, -00001, 0x00a4, -00001, -00001, -00001,
    -00001, -00001, -00001, -00001, 0x060c, 0x00ad, -00001, -00001,
    -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
    -00001, -00001, -00001, 0x061b, -00001, -00001, -00001, 0x061f,
    -00001, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
    0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f,
    0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,
    0x0638, 0x0639, 0x063a, -00001, -00001, -00001, -00001, -00001,
    0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,
    0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f,
    0x0650, 0x0651, 0x0652, -00001, -00001, -00001, -00001, -00001,
    -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
},

/* ISO-8859-7 (Greek) */
{
    0x00a0, 0x02bd, 0x02bc, 0x00a3, -00001, -00001, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, -00001, 0x00ab, 0x00ac, 0x00ad, -00001, 0x2015,
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7,
    0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f,
    0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
    0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
    0x03a0, 0x03a1, -00001, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
    0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af,
    0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,
    0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,
    0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7,
    0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, -00001,
},

/* ISO-8859-8 (Hebrew) */
{
    0x00a0, -00001, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x203e,
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
    0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, -00001,
    -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
    -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
    -00001, -00001, -00001, -00001, -00001, -00001, -00001, -00001,
    -00001, -00001, -00001, -00001, -00001, -00001, -00001, 0x2017,
    0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7,
    0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df,
    0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7,
    0x05e8, 0x05e9, 0x05ea, -00001, -00001, -00001, -00001, -00001,
},

/* ISO-8859-9 (Latin-5, Turkish) */
{
    0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
    0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df,
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
    0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff,
}
};
169 
171  "unknown",
172  "unspecified-ascii-superset",
173 
174  "UTF-8",
175  "ISO-646",
176 
177  "ISO-8859-1",
178  "ISO-8859-2",
179  "ISO-8859-3",
180  "ISO-8859-4",
181  "ISO-8859-5",
182  "ISO-8859-6",
183  "ISO-8859-7",
184  "ISO-8859-8",
185  "ISO-8859-9",
186 
187  "UTF-16",
188  "UTF-16",
189  "ISO-10646-UCS-2",
190  "ISO-10646-UCS-2",
191 };
192 
194  "unknown",
195  "unspecified_ascii_superset",
196 
197  "UTF-8",
198  "ISO-646",
199 
200  "ISO-8859-1",
201  "ISO-8859-2",
202  "ISO-8859-3",
203  "ISO-8859-4",
204  "ISO-8859-5",
205  "ISO-8859-6",
206  "ISO-8859-7",
207  "ISO-8859-8",
208  "ISO-8859-9",
209 
210  "UTF-16-B",
211  "UTF-16-L",
212  "ISO-10646-UCS-2-B",
213  "ISO-10646-UCS-2-L",
214 };
215 
217  {"ASCII", CE_ISO_646},
218  {"ISO-Latin-1", CE_ISO_8859_1},
219  {"ISO-Latin-2", CE_ISO_8859_2},
220  {"ISO-Latin-3", CE_ISO_8859_3},
221  {"ISO-Latin-4", CE_ISO_8859_4},
222  {"ISO-Latin-5", CE_ISO_8859_5},
223  {"ISO-Latin-6", CE_ISO_8859_6},
224  {"ISO-Latin-7", CE_ISO_8859_7},
225  {"ISO-Latin-8", CE_ISO_8859_8},
226  {"UCS-2", CE_ISO_10646_UCS_2B},
227 };
228 const int CE_alias_count =
229  sizeof(CharacterEncodingAlias)/sizeof(CharacterEncodingAlias[0]);
230 
232 
233 void init_charset(void)
234 {
235  int i, j;
236 
237 #if CHAR_SIZE == 8
238  InternalCharacterEncoding = CE_unspecified_ascii_superset;
239 #else
240  union {char b[2]; short s;} bytes;
241 
242  /* Determine internal encoding */
243  bytes.s = 1;
244  InternalCharacterEncoding = (bytes.b[0] == 0) ? CE_UTF_16B : CE_UTF_16L;
245 #endif
246 
247  /* Make ISO-Latin-N tables */
248 
249  for(i=0; i<8; i++)
250  {
251  int max = 0x9f;
252 
253  for(j=0; j<0xa0; j++)
254  iso_to_unicode[i][j] = j;
255  for(j=0xa0; j<0x100; j++)
256  {
257  int code = latin_table[i][j-0xa0];
258  iso_to_unicode[i][j] = code;
259  if(code > max) max = code;
260  }
261 
262  iso_max_val[i] = max;
263 
264  if(!(unicode_to_iso[i] = Malloc(max+1)))
265  {
266  fprintf(stderr, "Malloc failed in charset initialisation\n");
267  exit(1);
268  }
269 
270  for(j=0; j<0xa0; j++)
271  unicode_to_iso[i][j] = j;
272  for(j=0xa0; j<=max; j++)
273  unicode_to_iso[i][j] = '?';
274  for(j=0xa0; j<0x100; j++)
275  {
276  int code = latin_table[i][j-0xa0];
277  if(code != -1)
278  unicode_to_iso[i][code] = j;
279  }
280  }
281 }
282 
283 /* Return true if the encoding has 8-bit input units and is the same
284  as ascii for characters <= 127 */
285 
287 {
288  return enc >= CE_unspecified_ascii_superset && enc <= CE_ISO_8859_9;
289 }
290 
291 /*
292  * Return true if enc1 and enc2 have the same size input units, and are
293  * the same for Unicode <= 127.
294  * If so, *enc3 is set to enc2 modified to have the same byte order as enc1.
295  */
296 
298  CharacterEncoding *enc3)
299 {
300  if(EncodingIsAsciiSuperset(enc1))
301  {
302  if(EncodingIsAsciiSuperset(enc2))
303  {
304  *enc3 = enc2;
305  return 1;
306  }
307  return 0;
308  }
309 
310  if(enc1 == CE_UTF_16B || enc1 == CE_ISO_10646_UCS_2B)
311  {
312  if(enc2 == CE_UTF_16B || enc2 == CE_UTF_16L)
313  *enc3 = CE_UTF_16B;
314  else if(enc2 == CE_ISO_10646_UCS_2B || enc2 == CE_ISO_10646_UCS_2L)
315  *enc3 = CE_ISO_10646_UCS_2B;
316  else
317  return 0;
318  return 1;
319  }
320 
321  if(enc1 == CE_UTF_16L || enc1 == CE_ISO_10646_UCS_2L)
322  {
323  if(enc2 == CE_UTF_16B || enc2 == CE_UTF_16L)
324  *enc3 = CE_UTF_16L;
325  else if(enc2 == CE_ISO_10646_UCS_2B || enc2 == CE_ISO_10646_UCS_2L)
326  *enc3 = CE_ISO_10646_UCS_2L;
327  else
328  return 0;
329  return 1;
330  }
331 
332  return 0;
333 }
334 
336 {
337  int i;
338 
339  for(i=0; i<CE_enum_count; i++)
341  return (CharacterEncoding)i;
342 
343  for(i=0; i<CE_enum_count; i++)
344  if(strcasecmp8(name, CharacterEncodingName[i]) == 0)
345  return (CharacterEncoding)i;
346 
347  for(i=0; i<CE_alias_count; i++)
348  if(strcasecmp8(name, CharacterEncodingAlias[i].name) == 0)
349  return CharacterEncodingAlias[i].enc;
350 
351  return CE_unknown;
352 }
353 
void init_charset(void)
Definition: charset.c:233
int EncodingIsAsciiSuperset(CharacterEncoding enc)
Definition: charset.c:286
int iso_to_unicode[8][256]
Definition: charset.c:33
const char8 * CharacterEncodingName[CE_enum_count]
Definition: charset.c:170
CharacterEncoding FindEncoding(char8 *name)
Definition: charset.c:335
CharacterEncoding InternalCharacterEncoding
Definition: charset.c:231
const int CE_alias_count
Definition: charset.c:228
STD_API int strcasecmp8(const char8 *, const char8 *)
Definition: string16.c:33
const char8 * name
Definition: charset.h:68
char8 * unicode_to_iso[8]
Definition: charset.c:35
float max(float a, float b)
Definition: EST_cluster.cc:143
int iso_max_val[8]
Definition: charset.c:34
enum character_encoding CharacterEncoding
Definition: charset.h:61
CharacterEncoding enc
Definition: charset.h:68
char char8
Definition: charset.h:31
void * Malloc(int bytes)
Definition: system.c:19
int EncodingsCompatible(CharacterEncoding enc1, CharacterEncoding enc2, CharacterEncoding *enc3)
Definition: charset.c:297
struct character_encoding_alias CharacterEncodingAlias[]
Definition: charset.c:216
const char8 * CharacterEncodingNameAndByteOrder[CE_enum_count]
Definition: charset.c:193