4498e40e9bdf668e6bc72e88a9b23792748860f2
[musl] / src / locale / iconv.c
1 #include <iconv.h>
2 #include <errno.h>
3 #include <wchar.h>
4 #include <string.h>
5 #include <stdlib.h>
6 #include <limits.h>
7 #include <stdint.h>
8
/* Type codes for the built-in (non-table) encodings. Each value must
 * equal the octal escape that terminates the matching alias list in
 * the charmaps string. */
enum {
        UTF_32BE = 0300,
        UTF_16LE = 0301,
        UTF_16BE = 0302,
        UTF_32LE = 0303,
        UCS2BE   = 0304,
        UCS2LE   = 0305,
        US_ASCII = 0306,
        WCHAR_T  = 0307,
        UTF_8    = 0310
};
18
19 /* FIXME: these are not implemented yet
20  * EUC:   A1-FE A1-FE
21  * GBK:   81-FE 40-7E,80-FE
22  * Big5:  A1-FE 40-7E,A1-FE
23  */
24
25 /* Definitions of charmaps. Each charmap consists of:
26  * 1. Empty-string-terminated list of null-terminated aliases.
27  * 2. Special type code or number of elided entries.
28  * 3. Character table (size determined by field 2). */
29
30 static const unsigned char charmaps[] =
31 "utf8\0\0\310"
32 "wchart\0\0\307"
33 "ucs2\0ucs2be\0\0\304"
34 "ucs2le\0\0\305"
35 "utf16\0utf16be\0\0\302"
36 "utf16le\0\0\301"
37 "ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
38 "ucs4le\0utf32le\0\0\303"
39 "ascii\0usascii\0iso646\0iso646us\0\0\306"
40 #include "codepages.h"
41 ;
42
/* Character values shared by the legacy codepage tables. iconv() indexes
 * this array with the 10-bit entries it unpacks from a charmap table; on
 * decoding it treats a resulting value of 0 as "same as the input byte"
 * and a value of 1 as an illegal sequence. */
static const unsigned short legacy_chars[] = {
#include "legacychars.h"
};
46
/* Loosely compare a caller-supplied charset name against a canonical alias.
 *
 * a: the user's name; letters are compared case-insensitively and any
 *    character that is neither an ASCII letter nor an ASCII digit is
 *    skipped before each comparison.
 * b: the canonical alias (lowercase letters and digits only).
 *
 * Returns 0 when the names match, nonzero otherwise. */
static int fuzzycmp(const unsigned char *a, const unsigned char *b)
{
        for (; *a && *b; a++, b++) {
                /* Letters span 26 values and digits 10, so the range tests
                 * must use >= to reject the first value past each range
                 * ('{' / '[' and ':'). */
                while (*a && (*a|32U)-'a'>=26 && *a-'0'>=10U) a++;
                if ((*a|32U) != *b) return 1;
        }
        return *a != *b;
}
55
56 static size_t find_charmap(const void *name)
57 {
58         const unsigned char *s;
59         for (s=charmaps; *s; ) {
60                 if (!fuzzycmp(name, s)) {
61                         for (; *s; s+=strlen((void *)s)+1);
62                         return s+1-charmaps;
63                 }
64                 s += strlen((void *)s)+1;
65                 if (!*s) {
66                         if (s[1] > 0200) s+=2;
67                         else s+=2+(128U-s[1])/4*5;
68                 }
69         }
70         return -1;
71 }
72
/* Create a conversion descriptor translating from charset `from` to
 * charset `to`. The descriptor is not allocated: it packs the source
 * charmap offset into the bits above 16 and the destination offset
 * into the low 16 bits. On an unknown charset name, sets errno to
 * EINVAL and returns (iconv_t)-1. */
iconv_t iconv_open(const char *to, const char *from)
{
        size_t dst, src;

        dst = find_charmap(to);
        if (dst == (size_t)-1)
                goto unknown;

        src = find_charmap(from);
        if (src == (size_t)-1)
                goto unknown;

        return (void *)(src << 16 | dst);

unknown:
        errno = EINVAL;
        return (iconv_t)-1;
}
84
/* Release a conversion descriptor. Nothing is done with cd here, so
 * this always succeeds and returns 0. */
int iconv_close(iconv_t cd)
{
        (void)cd;
        return 0;
}
89
/* Read a 16-bit value from s. Only bit 0 of e is used: when it is
 * clear the first byte is the high byte (big-endian), when it is set
 * the second byte is. */
static unsigned get_16(const unsigned char *s, int e)
{
        int hi = e & 1;
        int lo = hi ^ 1;

        return s[hi] << 8 | s[lo];
}
95
/* Store the low 16 bits of c into s. Only bit 0 of e is used: when it
 * is clear the high byte is written first (big-endian), when it is set
 * the low byte is written first. */
static void put_16(unsigned char *s, unsigned c, int e)
{
        unsigned char hi = c >> 8;
        unsigned char lo = c;

        if (e & 1) {
                s[1] = hi;
                s[0] = lo;
        } else {
                s[0] = hi;
                s[1] = lo;
        }
}
102
/* Read a 32-bit value from the four bytes at s. Only the low two bits
 * of e are used; byte (e&3)^k supplies the k-th most significant byte,
 * so 0 gives big-endian order and 3 gives little-endian order. */
static unsigned get_32(const unsigned char *s, int e)
{
        const int base = e & 3;
        unsigned value = 0;
        int k;

        for (k = 0; k < 4; k++)
                value = value << 8 | s[base ^ k];
        return value;
}
108
/* Store the 32-bit value c into the four bytes at s. Only the low two
 * bits of e are used; the k-th most significant byte goes to index
 * (e&3)^k, so 0 gives big-endian order and 3 gives little-endian. */
static void put_32(unsigned char *s, unsigned c, int e)
{
        const int base = e & 3;
        int k;

        for (k = 0; k < 4; k++)
                s[base ^ k] = c >> (24 - 8 * k);
}
117
/* iconv() performs its UTF-8 decoding and encoding through these two
 * names, which currently forward to the standard mbrtowc and wctomb.
 * Adapt as needed: NOTE(review) this presumably relies on those
 * functions handling UTF-8 in the active locale - confirm, or point the
 * macros at dedicated UTF-8 routines. */
#define mbrtowc_utf8 mbrtowc
#define wctomb_utf8 wctomb
121
122 #include <stdio.h>
123 size_t iconv(iconv_t cd0, char **in, size_t *inb, char **out, size_t *outb)
124 {
125         size_t x=0;
126         unsigned long cd = (unsigned long)cd0;
127         unsigned to = cd & 0xffff;
128         unsigned from = cd >> 16;
129         const unsigned char *map = charmaps+from+1;
130         const unsigned char *tomap = charmaps+to+1;
131         mbstate_t st = {0};
132         wchar_t wc;
133         unsigned c, d;
134         size_t k, l;
135         int err;
136         unsigned char type = map[-1];
137         unsigned char totype = tomap[-1];
138
139         if (!in || !*in || !*inb) return 0;
140
141         for (; *inb; *in+=l, *inb-=l) {
142                 c = *(unsigned char *)*in;
143                 l = 1;
144
145                 if (c >= 128) switch (type) {
146                 case UTF_8:
147                         l = mbrtowc_utf8(&wc, *in, *inb, &st);
148                         if (!l) l++;
149                         else if (l == (size_t)-1) goto ilseq;
150                         else if (l == (size_t)-2) goto starved;
151                         c = wc;
152                         break;
153                 case US_ASCII:
154                         goto ilseq;
155                 case WCHAR_T:
156                         l = sizeof(wchar_t);
157                         if (*inb < l) goto starved;
158                         c = *(wchar_t *)*in;
159                         if (0) {
160                 case UTF_32BE:
161                 case UTF_32LE:
162                         l = 4;
163                         if (*inb < 4) goto starved;
164                         c = get_32((void *)*in, type);
165                         }
166                         if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
167                         break;
168                 case UCS2BE:
169                 case UCS2LE:
170                 case UTF_16BE:
171                 case UTF_16LE:
172                         l = 2;
173                         if (*inb < 2) goto starved;
174                         c = get_16((void *)*in, type);
175                         if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
176                         if ((unsigned)(c-0xd800) < 0x400) {
177                                 if (type-UCS2BE < 2U) goto ilseq;
178                                 l = 4;
179                                 if (*inb < 4) goto starved;
180                                 d = get_16((void *)(*in + 2), from);
181                                 if ((unsigned)(c-0xdc00) >= 0x400) goto ilseq;
182                                 c = ((c-0xd800)<<10) | (d-0xdc00);
183                         }
184                         break;
185                 default:
186                         if (c < 128+type) break;
187                         c -= 128+type;
188                         c = legacy_chars[ map[c*5/4]>>2*c%8 |
189                                 map[c*5/4+1]<<8-2*c%8 & 1023 ];
190                         if (!c) c = *(unsigned char *)*in;
191                         if (c==1) goto ilseq;
192                 }
193
194                 switch (totype) {
195                 case WCHAR_T:
196                         if (*outb < sizeof(wchar_t)) goto toobig;
197                         *(wchar_t *)*out = c;
198                         *out += sizeof(wchar_t);
199                         *outb -= sizeof(wchar_t);
200                         break;
201                 case UTF_8:
202                         if (*outb < 4) {
203                                 char tmp[4];
204                                 k = wctomb_utf8(tmp, c);
205                                 if (*outb < k) goto toobig;
206                                 memcpy(*out, tmp, k);
207                         } else k = wctomb_utf8(*out, c);
208                         *out += k;
209                         *outb -= k;
210                         break;
211                 case US_ASCII:
212                         if (c > 0x7f) subst: x++, c='*';
213                 default:
214                         if (*outb < 1) goto toobig;
215                         if (c < 128+totype) {
216                         revout:
217                                 *(*out)++ = c;
218                                 *outb -= 1;
219                                 break;
220                         }
221                         d = c;
222                         for (c=0; c<128-totype; c++) {
223                                 if (d == legacy_chars[ map[c*5/4]>>2*c%8 |
224                                         map[c*5/4+1]<<8-2*c%8 & 1023 ]) {
225                                         c += 128;
226                                         goto revout;
227                                 }
228                         }
229                         goto subst;
230                 case UCS2BE:
231                 case UCS2LE:
232                 case UTF_16BE:
233                 case UTF_16LE:
234                         if (c < 0x10000) {
235                                 if (*outb < 2) goto toobig;
236                                 put_16((void *)*out, c, totype);
237                                 *out += 2;
238                                 *outb -= 2;
239                                 break;
240                         }
241                         if (type-UCS2BE < 2U) goto ilseq;
242                         if (*outb < 4) goto toobig;
243                         put_16((void *)*out, (c>>10)|0xd800, totype);
244                         put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
245                         *out += 4;
246                         *outb -= 4;
247                         break;
248                 case UTF_32BE:
249                 case UTF_32LE:
250                         if (*outb < 4) goto toobig;
251                         put_32((void *)*out, c, totype);
252                         *out += 4;
253                         *outb -= 4;
254                         break;
255                 }
256         }
257         return x;
258 ilseq:
259         err = EILSEQ;
260         x = -1;
261         goto end;
262 toobig:
263         err = E2BIG;
264         x = -1;
265         goto end;
266 starved:
267         err = EINVAL;
268         x = -1;
269 end:
270         errno = err;
271         return x;
272 }