adapt to latest libfirm
[cparser] / input.c
1 #include "config.h"
2
3 #include "diagnostic.h"
4 #include "input.h"
5
6 #include <ctype.h>
7
8 typedef size_t (*decode_func)(input_t *input, utf32 *buffer, size_t buffer_size);
9
/** Selects which member of input_t's source union is in use. */
typedef enum {
        INPUT_FILE,   /**< bytes are read from a FILE* */
        INPUT_STRING, /**< bytes are taken from a NUL-terminated string */
} input_kind_t;
14
15 struct input_t {
16         input_kind_t kind;
17         union {
18                 FILE *file;
19                 const char *string;
20         } in;
21         decode_func decode;
22
23         /* state for utf-8 decoder */
24         utf32  utf8_part_decoded_min_code;
25         utf32  utf8_part_decoded_char;
26         size_t utf8_part_decoded_rest_len;
27 };
28
29 static input_error_callback_func input_error;
30
31 void set_input_error_callback(input_error_callback_func new_func)
32 {
33         input_error = new_func;
34 }
35
36 static size_t read_block(input_t *input, unsigned char *const read_buf,
37                          size_t const n)
38 {
39         if (input->kind == INPUT_FILE) {
40                 FILE *file = input->in.file;
41                 size_t const s = fread(read_buf, 1, n, file);
42                 if (s == 0) {
43                         /* on OS/X ferror appears to return true on eof as well when running
44                          * the application in gdb... */
45                         if (!feof(file) && ferror(file))
46                                 input_error(0, 0, "read from input failed");
47                         return 0;
48                 }
49                 return s;
50         } else {
51                 assert(input->kind == INPUT_STRING);
52                 size_t len = strlen(input->in.string);
53                 if (len > n)
54                         len = n;
55                 memcpy(read_buf, input->in.string, len);
56                 input->in.string += len;
57                 return len;
58         }
59 }
60
61 static size_t decode_iso_8859_1(input_t *input, utf32 *buffer,
62                                 size_t buffer_size)
63 {
64         unsigned char read_buf[buffer_size];
65         size_t const s = read_block(input, read_buf, sizeof(read_buf));
66
67         unsigned char const *src = read_buf;
68         unsigned char const *end = read_buf + s;
69         utf32               *dst = buffer;
70         while (src != end)
71                 *dst++ = *src++;
72
73         return s;
74 }
75
76 static size_t decode_iso_8859_15(input_t *input, utf32 *buffer,
77                                  size_t buffer_size)
78 {
79         unsigned char read_buf[buffer_size];
80         size_t const s = read_block(input, read_buf, sizeof(read_buf));
81
82         unsigned char const *src = read_buf;
83         unsigned char const *end = read_buf + s;
84         utf32               *dst = buffer;
85         while (src != end) {
86                 utf32 tc = *src++;
87                 switch (tc) {
88                         case 0xA4: tc = 0x20AC; break; // €
89                         case 0xA6: tc = 0x0160; break; // Š
90                         case 0xA8: tc = 0x0161; break; // š
91                         case 0xB4: tc = 0x017D; break; // Ž
92                         case 0xB8: tc = 0x017E; break; // ž
93                         case 0xBC: tc = 0x0152; break; // Œ
94                         case 0xBD: tc = 0x0153; break; // œ
95                         case 0xBE: tc = 0x0178; break; // Ÿ
96                 }
97                 *dst++ = tc;
98         }
99
100         return s;
101 }
102
103 static size_t decode_utf8(input_t *input, utf32 *buffer, size_t buffer_size)
104 {
105         unsigned char read_buf[buffer_size];
106
107         for (;;) {
108                 size_t const s = read_block(input, read_buf, sizeof(read_buf));
109                 if (s == 0) {
110                         if (input->utf8_part_decoded_rest_len > 0)
111                                 input_error(0, 0, "incomplete input char at end of input");
112                         return 0;
113                 }
114
115                 unsigned char const *src = read_buf;
116                 unsigned char const *end = read_buf + s;
117                 utf32               *dst = buffer;
118                 utf32                decoded;
119                 utf32                min_code;
120
121                 if (input->utf8_part_decoded_rest_len != 0) {
122                         min_code              = input->utf8_part_decoded_min_code;
123                         decoded               = input->utf8_part_decoded_char;
124                         size_t const rest_len = input->utf8_part_decoded_rest_len;
125                         input->utf8_part_decoded_rest_len = 0;
126                         switch (rest_len) {
127                                 case 4:  goto realign;
128                                 case 3:  goto three_more;
129                                 case 2:  goto two_more;
130                                 default: goto one_more;
131                         }
132                 }
133
134                 while (src != end) {
135                         if ((*src & 0x80) == 0) {
136                                 decoded = *src++;
137                         } else if ((*src & 0xE0) == 0xC0) {
138                                 min_code = 0x80;
139                                 decoded  = *src++ & 0x1F;
140 one_more:
141                                 if (src == end) {
142                                         input->utf8_part_decoded_min_code = min_code;
143                                         input->utf8_part_decoded_char     = decoded;
144                                         input->utf8_part_decoded_rest_len = 1;
145                                         break;
146                                 }
147                                 if ((*src & 0xC0) == 0x80) {
148                                         decoded = (decoded << 6) | (*src++ & 0x3F);
149                                 } else {
150                                         goto invalid_char;
151                                 }
152                                 if (decoded < min_code                      ||
153                                                 decoded > 0x10FFFF                      ||
154                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
155                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
156                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
157                                         input_error(0, 0, "invalid byte sequence in input");
158                                 }
159                         } else if ((*src & 0xF0) == 0xE0) {
160                                 min_code = 0x800;
161                                 decoded  = *src++ & 0x0F;
162 two_more:
163                                 if (src == end) {
164                                         input->utf8_part_decoded_min_code = min_code;
165                                         input->utf8_part_decoded_char     = decoded;
166                                         input->utf8_part_decoded_rest_len = 2;
167                                         break;
168                                 }
169                                 if ((*src & 0xC0) == 0x80) {
170                                         decoded = (decoded << 6) | (*src++ & 0x3F);
171                                 } else {
172                                         goto invalid_char;
173                                 }
174                                 goto one_more;
175                         } else if ((*src & 0xF8) == 0xF0) {
176                                 min_code = 0x10000;
177                                 decoded  = *src++ & 0x07;
178 three_more:
179                                 if (src == end) {
180                                         input->utf8_part_decoded_min_code = min_code;
181                                         input->utf8_part_decoded_char     = decoded;
182                                         input->utf8_part_decoded_rest_len = 3;
183                                         break;
184                                 }
185                                 if ((*src & 0xC0) == 0x80) {
186                                         decoded = (decoded << 6) | (*src++ & 0x3F);
187                                 } else {
188                                         goto invalid_char;
189                                 }
190                                 goto two_more;
191                         } else {
192 invalid_char:
193                                 input_error(0, 0, "invalid byte sequence in input");
194 realign:
195                                 do {
196                                         ++src;
197                                         if (src == end) {
198                                                 input->utf8_part_decoded_rest_len = 4;
199                                                 break;
200                                         }
201                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
202                                 continue;
203                         }
204                         *dst++ = decoded;
205                 }
206
207                 /* we're done when we could read more than 1 char */
208                 if (buffer != dst)
209                         return dst - buffer;
210         }
211 }
212
213 static size_t decode_windows_1252(input_t *input, utf32 *buffer,
214                                   size_t buffer_size)
215 {
216         unsigned char read_buf[buffer_size];
217         size_t const s = read_block(input, read_buf, sizeof(read_buf));
218
219         unsigned char const *src = read_buf;
220         unsigned char const *end = read_buf + s;
221         utf32               *dst = buffer;
222         while (src != end) {
223                 utf32 tc = *src++;
224                 switch (tc) {
225                         case 0x80: tc = 0x20AC; break; // €
226                         case 0x82: tc = 0x201A; break; // ‚
227                         case 0x83: tc = 0x0192; break; // ƒ
228                         case 0x84: tc = 0x201E; break; // „
229                         case 0x85: tc = 0x2026; break; // …
230                         case 0x86: tc = 0x2020; break; // †
231                         case 0x87: tc = 0x2021; break; // ‡
232                         case 0x88: tc = 0x02C6; break; // ˆ
233                         case 0x89: tc = 0x2030; break; // ‰
234                         case 0x8A: tc = 0x0160; break; // Š
235                         case 0x8B: tc = 0x2039; break; // ‹
236                         case 0x8C: tc = 0x0152; break; // Œ
237                         case 0x8E: tc = 0x017D; break; // Ž
238                         case 0x91: tc = 0x2018; break; // ‘
239                         case 0x92: tc = 0x2019; break; // ’
240                         case 0x93: tc = 0x201C; break; // “
241                         case 0x94: tc = 0x201D; break; // ”
242                         case 0x95: tc = 0x2022; break; // •
243                         case 0x96: tc = 0x2013; break; // –
244                         case 0x97: tc = 0x2014; break; // —
245                         case 0x98: tc = 0x02DC; break; // ˜
246                         case 0x99: tc = 0x2122; break; // ™
247                         case 0x9A: tc = 0x0161; break; // š
248                         case 0x9B: tc = 0x203A; break; // ›
249                         case 0x9C: tc = 0x0153; break; // œ
250                         case 0x9E: tc = 0x017E; break; // ž
251                         case 0x9F: tc = 0x0178; break; // Ÿ
252                 }
253                 *dst++ = tc;
254         }
255
256         return s;
257 }
258
259 typedef struct named_decoder_t {
260         char const *name;
261         decode_func decoder;
262 } named_decoder_t;
263
264 static named_decoder_t const decoders[] = {
265         { "CP819",           decode_iso_8859_1   }, // official alias
266         { "IBM819",          decode_iso_8859_1   }, // official alias
267         { "ISO-8859-1",      decode_iso_8859_1   }, // official alias
268         { "ISO-8859-15",     decode_iso_8859_15  }, // official name
269         { "ISO8859-1",       decode_iso_8859_1   },
270         { "ISO8859-15",      decode_iso_8859_15  },
271         { "ISO_8859-1",      decode_iso_8859_1   }, // official alias
272         { "ISO_8859-15",     decode_iso_8859_15  }, // official alias
273         { "ISO_8859-1:1987", decode_iso_8859_1   }, // official name
274         { "Latin-9",         decode_iso_8859_15  }, // official alias
275         { "UTF-8",           decode_utf8         }, // official name
276         { "csISOLatin1",     decode_iso_8859_1   }, // official alias
277         { "cp1252",          decode_windows_1252 },
278         { "iso-ir-100",      decode_iso_8859_1   }, // official alias
279         { "l1",              decode_iso_8859_1   }, // official alias
280         { "latin1",          decode_iso_8859_1   }, // official alias
281         { "windows-1252",    decode_windows_1252 }, // official name
282
283         { NULL,              NULL                }
284 };
285
/**
 * Case-insensitive string comparison; strcasecmp is not part of C99 so we
 * need our own implementation here.
 *
 * The characters are converted to unsigned char before they are handed to
 * tolower(), because passing a negative char value to it is undefined.
 *
 * @param s1  first NUL-terminated string
 * @param s2  second NUL-terminated string
 * @return 0 if the strings are equal when case is ignored, otherwise the
 *         difference between the lower-cased characters at the first
 *         position where they differ (negative if @p s1 sorts first)
 */
static int my_strcasecmp(const char *s1, const char *s2)
{
        for (;; ++s1, ++s2) {
                int const c1 = tolower((unsigned char)*s1);
                int const c2 = tolower((unsigned char)*s2);
                if (c1 != c2)
                        return c1 - c2;
                /* both strings ended at the same position */
                if (c1 == 0)
                        return 0;
        }
}
295
296 static void choose_decoder(input_t *result, const char *encoding)
297 {
298         if (encoding) {
299                 for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
300                         if (my_strcasecmp(encoding, i->name) != 0)
301                                 continue;
302                         result->decode = i->decoder;
303                         return;
304                 }
305                 errorf(NULL, "input encoding \"%s\" not supported", encoding);
306         }
307         result->decode = decode_utf8;
308 }
309
310 input_t *input_from_stream(FILE *file, const char *encoding)
311 {
312         input_t *result = XMALLOCZ(input_t);
313         result->kind    = INPUT_FILE;
314         result->in.file = file;
315
316         choose_decoder(result, encoding);
317
318         return result;
319 }
320
321 input_t *input_from_string(const char *string, const char *encoding)
322 {
323         input_t *result   = XMALLOCZ(input_t);
324         result->kind      = INPUT_STRING;
325         result->in.string = string;
326
327         choose_decoder(result, encoding);
328
329         return result;
330 }
331
332 size_t decode(input_t *input, utf32 *buffer, size_t buffer_size)
333 {
334         return input->decode(input, buffer, buffer_size);
335 }
336
337 void input_free(input_t *input)
338 {
339         xfree(input);
340 }