iconv support for legacy Korean encodings
[musl] / src / locale / iconv.c
1 #include <iconv.h>
2 #include <errno.h>
3 #include <wchar.h>
4 #include <string.h>
5 #include <stdlib.h>
6 #include <limits.h>
7 #include <stdint.h>
8
/* Internal charset type codes, stored as the byte that follows each alias
 * list in charmaps[]. The values are kept in octal so they can be matched
 * by eye against the "\3xx" escapes in that table.
 *
 * UTF_32BE..WCHAR_T form a contiguous run (iconv() range-tests it), and the
 * low bits of the UTF-16/UCS-2/UTF-32 codes are what get_16()/get_32() and
 * put_16()/put_32() mask off to select the byte order. Codes from EUC_JP
 * (0320) upward are refused as a destination by iconv_open(). */
enum {
	UTF_32BE  = 0300,
	UTF_16LE  = 0301,
	UTF_16BE  = 0302,
	UTF_32LE  = 0303,
	UCS2BE    = 0304,
	UCS2LE    = 0305,
	WCHAR_T   = 0306,
	US_ASCII  = 0307,
	UTF_8     = 0310,
	EUC_JP    = 0320,
	SHIFT_JIS = 0321,
	GB18030   = 0330,
	GBK       = 0331,
	GB2312    = 0332,
	EUC_KR    = 0350
};
24
25 /* FIXME: these are not implemented yet
26  * EUC:   A1-FE A1-FE
27  * GBK:   81-FE 40-7E,80-FE
28  * Big5:  A1-FE 40-7E,A1-FE
29  */
30
31 /* Definitions of charmaps. Each charmap consists of:
32  * 1. Empty-string-terminated list of null-terminated aliases.
33  * 2. Special type code or number of elided entries.
34  * 3. Character table (size determined by field 2). */
35
/* Table of all supported charsets, searched linearly by find_charmap().
 * Aliases are stored in the form fuzzycmp() compares against: lowercase
 * letters and digits only, with all punctuation removed. The byte that
 * follows each alias list's empty-string terminator is what find_charmap()
 * returns the offset of; values above 0200 are the special type codes
 * (written here as "\3xx" octal escapes) and carry no character table.
 * The entries appended from codepages.h are presumably the 8-bit codepages
 * with packed character tables -- that header is not visible here. */
36 static const unsigned char charmaps[] =
37 "utf8\0\0\310"
38 "wchart\0\0\306"
39 "ucs2\0ucs2be\0\0\304"
40 "ucs2le\0\0\305"
41 "utf16\0utf16be\0\0\302"
42 "utf16le\0\0\301"
43 "ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
44 "ucs4le\0utf32le\0\0\303"
45 "ascii\0usascii\0iso646\0iso646us\0\0\307"
46 "eucjp\0\0\320"
47 "shiftjis\0sjis\0\0\321"
48 "gb18030\0\0\330"
49 "gbk\0\0\331"
50 "gb2312\0\0\332"
51 "euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
52 #include "codepages.h"
53 ;
54
/* Character values for the 8-bit codepages. iconv() indexes this with the
 * 10-bit values unpacked from a charmap's character table; a stored value
 * of 0 is treated there as "same as the input byte" and 1 as "invalid
 * byte". Other values are presumably Unicode code points (data not
 * visible here). */
55 static const unsigned short legacy_chars[] = {
56 #include "legacychars.h"
57 };
58
/* 84 rows x 94 columns. The EUC-JP decoder in iconv() indexes it with
 * (lead byte - 0xa1, trail byte - 0xa1); the Shift_JIS decoder remaps its
 * byte pair onto the same grid. A zero entry is reported as EILSEQ. */
59 static const unsigned short jis0208[84][94] = {
60 #include "jis0208.h"
61 };
62
/* 126 rows x 190 columns. iconv() indexes it with (lead byte - 0x81) and
 * the trail byte rebased from 0x40 with the 0x7f slot squeezed out. The
 * GB18030 four-byte decoder also scans the whole table to count entries
 * inside a value range. */
63 static const unsigned short gb18030[126][190] = {
64 #include "gb18030.h"
65 };
66
/* 93 rows x 94 columns. The EUC-KR decoder in iconv() indexes it with
 * (lead byte - 0xa1, trail byte - 0xa1); a zero entry is reported as
 * EILSEQ. The extension-range decoder scans the whole table to count
 * entries inside a value range. */
67 static const unsigned short ksc[93][94] = {
68 #include "ksc.h"
69 };
70
/* Compare a caller-supplied charset name `a` against a canonical alias `b`.
 *
 * `b` must consist of lowercase ASCII letters and digits only. In `a`,
 * letters are matched case-insensitively and any byte that is neither an
 * ASCII letter nor a digit (e.g. '-', '_', ':', '[') is skipped when it
 * precedes a character to compare, so "UTF-8" and "utf_8" both match
 * "utf8". Trailing punctuation after the last alias character is not
 * skipped.
 *
 * Returns 0 on a match and nonzero otherwise. */
static int fuzzycmp(const unsigned char *a, const unsigned char *b)
{
	for (; *a && *b; a++, b++) {
		/* Each test is a single unsigned range check: anything below
		 * the base wraps around to a huge value. The bounds are
		 * exclusive (26 letters, 10 digits), so the byte just past
		 * 'z'/'Z' or '9' counts as punctuation too. */
		while (*a && (*a|32U)-'a'>=26 && *a-'0'>=10U) a++;
		if ((*a|32U) != *b) return 1;
	}
	return *a != *b;
}
79
80 static size_t find_charmap(const void *name)
81 {
82         const unsigned char *s;
83         for (s=charmaps; *s; ) {
84                 if (!fuzzycmp(name, s)) {
85                         for (; *s; s+=strlen((void *)s)+1);
86                         return s+1-charmaps;
87                 }
88                 s += strlen((void *)s)+1;
89                 if (!*s) {
90                         if (s[1] > 0200) s+=2;
91                         else s+=2+(128U-s[1])/4*5;
92                 }
93         }
94         return -1;
95 }
96
97 iconv_t iconv_open(const char *to, const char *from)
98 {
99         size_t f, t;
100
101         if ((t = find_charmap(to))==-1
102          || (f = find_charmap(from))==-1
103          || (charmaps[t] >= 0320)) {
104                 errno = EINVAL;
105                 return (iconv_t)-1;
106         }
107
108         return (void *)(f<<16 | t);
109 }
110
/* Release a conversion descriptor. The descriptor is not used here and
 * nothing is released; the call always returns 0. */
int iconv_close(iconv_t cd)
{
	(void)cd;
	return 0;
}
115
/* Read a 16-bit value from s[0..1]. Only bit 0 of `e` is used: when it is
 * clear the first byte is the most significant one, when set the second. */
static unsigned get_16(const unsigned char *s, int e)
{
	int hi = e & 1;
	int lo = hi ^ 1;

	return s[hi]<<8 | s[lo];
}
121
/* Store the low 16 bits of `c` into s[0..1]. Only bit 0 of `e` is used:
 * when it is clear the most significant byte goes first, when set last. */
static void put_16(unsigned char *s, unsigned c, int e)
{
	int hi = e & 1;

	s[hi] = c >> 8;
	s[hi ^ 1] = c;
}
128
/* Read a 32-bit value from s[0..3]. Only the low two bits of `e` are used;
 * byte s[e ^ i] supplies the i-th most significant byte, so 0 reads
 * big-endian and 3 reads little-endian. */
static unsigned get_32(const unsigned char *s, int e)
{
	unsigned v = 0;

	e &= 3;
	for (int i = 0; i < 4; i++)
		v = v<<8 | s[e^i];
	return v;
}
134
/* Store the 32-bit value `c` into s[0..3]. Only the low two bits of `e`
 * are used; the i-th most significant byte lands in s[e ^ i], so 0 writes
 * big-endian and 3 writes little-endian. */
static void put_32(unsigned char *s, unsigned c, int e)
{
	e &= 3;
	/* Peel bytes off the low end, least significant first. */
	for (int i = 3; i >= 0; i--, c >>= 8)
		s[e^i] = c;
}
143
144 /* Adapt as needed */
145 #define mbrtowc_utf8 mbrtowc
146 #define wctomb_utf8 wctomb
147
148 size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
149 {
150         size_t x=0;
151         unsigned long cd = (unsigned long)cd0;
152         unsigned to = cd & 0xffff;
153         unsigned from = cd >> 16;
154         const unsigned char *map = charmaps+from+1;
155         const unsigned char *tomap = charmaps+to+1;
156         mbstate_t st = {0};
157         wchar_t wc;
158         unsigned c, d;
159         size_t k, l;
160         int err;
161         unsigned char type = map[-1];
162         unsigned char totype = tomap[-1];
163
164         if (!in || !*in || !*inb) return 0;
165
166         for (; *inb; *in+=l, *inb-=l) {
167                 c = *(unsigned char *)*in;
168                 l = 1;
169
170                 if (c >= 128 || type-UTF_32BE < 7U) switch (type) {
171                 case UTF_8:
172                         l = mbrtowc_utf8(&wc, *in, *inb, &st);
173                         if (!l) l++;
174                         else if (l == (size_t)-1) goto ilseq;
175                         else if (l == (size_t)-2) goto starved;
176                         c = wc;
177                         break;
178                 case US_ASCII:
179                         goto ilseq;
180                 case WCHAR_T:
181                         l = sizeof(wchar_t);
182                         if (*inb < l) goto starved;
183                         c = *(wchar_t *)*in;
184                         if (0) {
185                 case UTF_32BE:
186                 case UTF_32LE:
187                         l = 4;
188                         if (*inb < 4) goto starved;
189                         c = get_32((void *)*in, type);
190                         }
191                         if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
192                         break;
193                 case UCS2BE:
194                 case UCS2LE:
195                 case UTF_16BE:
196                 case UTF_16LE:
197                         l = 2;
198                         if (*inb < 2) goto starved;
199                         c = get_16((void *)*in, type);
200                         if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
201                         if ((unsigned)(c-0xd800) < 0x400) {
202                                 if (type-UCS2BE < 2U) goto ilseq;
203                                 l = 4;
204                                 if (*inb < 4) goto starved;
205                                 d = get_16((void *)(*in + 2), type);
206                                 if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
207                                 c = ((c-0xd7c0)<<10) + (d-0xdc00);
208                         }
209                         break;
210                 case SHIFT_JIS:
211                         if (c-0xa1 <= 0xdf-0xa1) {
212                                 c += 0xff61-0xa1;
213                                 break;
214                         }
215                         l = 2;
216                         if (*inb < 2) goto starved;
217                         d = *((unsigned char *)*in + 1);
218                         if (c-129 <= 159-129) c -= 129;
219                         else if (c-224 <= 239-224) c -= 193;
220                         else goto ilseq;
221                         c *= 2;
222                         if (d-64 <= 158-64) {
223                                 if (d==127) goto ilseq;
224                                 if (d>127) d--;
225                                 d -= 64;
226                         } else if (d-159 <= 252-159) {
227                                 c++;
228                                 d -= 159;
229                         }
230                         c = jis0208[c][d];
231                         if (!c) goto ilseq;
232                         break;
233                 case EUC_JP:
234                         l = 2;
235                         if (*inb < 2) goto starved;
236                         d = *((unsigned char *)*in + 1);
237                         if (c==0x8e) {
238                                 c = d;
239                                 if (c-0xa1 > 0xdf-0xa1) goto ilseq;
240                                 c += 0xff61 - 0xa1;
241                                 break;
242                         }
243                         c -= 0xa1;
244                         d -= 0xa1;
245                         if (c >= 84 || d >= 94) goto ilseq;
246                         c = jis0208[c][d];
247                         if (!c) goto ilseq;
248                         break;
249                 case GB2312:
250                         if (c < 0xa1) goto ilseq;
251                 case GBK:
252                 case GB18030:
253                         c -= 0x81;
254                         if (c >= 126) goto ilseq;
255                         l = 2;
256                         if (*inb < 2) goto starved;
257                         d = *((unsigned char *)*in + 1);
258                         if (d < 0xa1 && type == GB2312) goto ilseq;
259                         if (d-0x40>=191 || d==127) {
260                                 if (d-'0'>9 || type != GB18030)
261                                         goto ilseq;
262                                 l = 4;
263                                 if (*inb < 4) goto starved;
264                                 c = (10*c + d-'0') * 1260;
265                                 d = *((unsigned char *)*in + 2);
266                                 if (d-0x81>126) goto ilseq;
267                                 c += 10*(d-0x81);
268                                 d = *((unsigned char *)*in + 3);
269                                 if (d-'0'>9) goto ilseq;
270                                 c += d-'0';
271                                 c += 128;
272                                 for (d=0; d<=c; ) {
273                                         k = 0;
274                                         for (int i=0; i<126; i++)
275                                                 for (int j=0; j<190; j++)
276                                                         if (gb18030[i][j]-d <= c-d)
277                                                                 k++;
278                                         d = c+1;
279                                         c += k;
280                                 }
281                                 break;
282                         }
283                         d -= 0x40;
284                         if (d>63) d--;
285                         c = gb18030[c][d];
286                         break;
287                 case EUC_KR:
288                         l = 2;
289                         if (*inb < 2) goto starved;
290                         d = *((unsigned char *)*in + 1);
291                         c -= 0xa1;
292                         d -= 0xa1;
293                         if (c >= 93 || d >= 94) {
294                                 c += (0xa1-0x81);
295                                 d += 0xa1;
296                                 if (c >= 93 || c>=0xc6-0x81 && d>0x52)
297                                         goto ilseq;
298                                 if (d-'A'<26) d = d-'A';
299                                 else if (d-'a'<26) d = d-'a'+26;
300                                 else if (d-0x81<0xff-0x81) d = d-0x81+52;
301                                 else goto ilseq;
302                                 if (c < 0x20) c = 178*c + d;
303                                 else c = 178*0x20 + 84*(c-0x20) + d;
304                                 c += 0xac00;
305                                 for (d=0xac00; d<=c; ) {
306                                         k = 0;
307                                         for (int i=0; i<93; i++)
308                                                 for (int j=0; j<94; j++)
309                                                         if (ksc[i][j]-d <= c-d)
310                                                                 k++;
311                                         d = c+1;
312                                         c += k;
313                                 }
314                                 break;
315                         }
316                         c = ksc[c][d];
317                         if (!c) goto ilseq;
318                         break;
319                 default:
320                         if (c < 128+type) break;
321                         c -= 128+type;
322                         c = legacy_chars[ map[c*5/4]>>2*c%8 |
323                                 map[c*5/4+1]<<8-2*c%8 & 1023 ];
324                         if (!c) c = *(unsigned char *)*in;
325                         if (c==1) goto ilseq;
326                 }
327
328                 switch (totype) {
329                 case WCHAR_T:
330                         if (*outb < sizeof(wchar_t)) goto toobig;
331                         *(wchar_t *)*out = c;
332                         *out += sizeof(wchar_t);
333                         *outb -= sizeof(wchar_t);
334                         break;
335                 case UTF_8:
336                         if (*outb < 4) {
337                                 char tmp[4];
338                                 k = wctomb_utf8(tmp, c);
339                                 if (*outb < k) goto toobig;
340                                 memcpy(*out, tmp, k);
341                         } else k = wctomb_utf8(*out, c);
342                         *out += k;
343                         *outb -= k;
344                         break;
345                 case US_ASCII:
346                         if (c > 0x7f) subst: x++, c='*';
347                 default:
348                         if (*outb < 1) goto toobig;
349                         if (c < 128+totype) {
350                         revout:
351                                 *(*out)++ = c;
352                                 *outb -= 1;
353                                 break;
354                         }
355                         d = c;
356                         for (c=0; c<128-totype; c++) {
357                                 if (d == legacy_chars[ tomap[c*5/4]>>2*c%8 |
358                                         tomap[c*5/4+1]<<8-2*c%8 & 1023 ]) {
359                                         c += 128;
360                                         goto revout;
361                                 }
362                         }
363                         goto subst;
364                 case UCS2BE:
365                 case UCS2LE:
366                 case UTF_16BE:
367                 case UTF_16LE:
368                         if (c < 0x10000 || type-UCS2BE < 2U) {
369                                 if (c >= 0x10000) c = 0xFFFD;
370                                 if (*outb < 2) goto toobig;
371                                 put_16((void *)*out, c, totype);
372                                 *out += 2;
373                                 *outb -= 2;
374                                 break;
375                         }
376                         if (*outb < 4) goto toobig;
377                         c -= 0x10000;
378                         put_16((void *)*out, (c>>10)|0xd800, totype);
379                         put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
380                         *out += 4;
381                         *outb -= 4;
382                         break;
383                 case UTF_32BE:
384                 case UTF_32LE:
385                         if (*outb < 4) goto toobig;
386                         put_32((void *)*out, c, totype);
387                         *out += 4;
388                         *outb -= 4;
389                         break;
390                 }
391         }
392         return x;
393 ilseq:
394         err = EILSEQ;
395         x = -1;
396         goto end;
397 toobig:
398         err = E2BIG;
399         x = -1;
400         goto end;
401 starved:
402         err = EINVAL;
403         x = -1;
404 end:
405         errno = err;
406         return x;
407 }