rework input logic to allow parsing from strings
[cparser] / input.c
1 #include "config.h"
2
3 #include "input.h"
4
5 #include <ctype.h>
6 #include "lexer.h"
7 #include "diagnostic.h"
8
/**
 * Signature shared by all character set decoders in this file.
 *
 * A decoder reads raw bytes from @p input and stores the resulting code
 * points in @p buffer, which has room for @p buffer_size entries. It returns
 * the number of entries stored; the decoders in this file return 0 once the
 * input is exhausted.
 */
typedef size_t (*decode_func)(input_t *input, utf32 *buffer, size_t buffer_size);
10
/** Tells which member of the input_t::in union is in use. */
typedef enum {
        INPUT_FILE,   /**< bytes are read from in.file with fread() */
        INPUT_STRING  /**< bytes are taken from the C string in.string */
} input_kind_t;
15
/**
 * An input source together with the decoder used to turn its bytes into
 * code points.
 */
struct input_t {
        input_kind_t kind;  /* selects the active member of 'in' */
        union {
                FILE *file;          /* used when kind == INPUT_FILE */
                const char *string;  /* used when kind == INPUT_STRING; read_block()
                                      * advances it past the bytes already consumed */
        } in;
        decode_func decode;  /* decoder picked by choose_decoder() */

        /* state for utf-8 decoder: describes a multi-byte sequence that was cut
         * off by the end of a block and has to be completed with the next one */
        utf32  utf8_part_decoded_min_code;  /* smallest code point allowed for the
                                             * sequence length (detects overlong
                                             * encodings) */
        utf32  utf8_part_decoded_char;      /* bits decoded so far */
        size_t utf8_part_decoded_rest_len;  /* 0: nothing pending, 1..3: number of
                                             * continuation bytes still missing,
                                             * 4: an invalid sequence was being
                                             * skipped when the block ended */
};
29
/* Callback used by this file to report read and decoding errors. It is set
 * through set_input_error_callback() and is called without a NULL check, so it
 * has to be installed before any input is decoded. */
static input_error_callback_func input_error;
31
32 void set_input_error_callback(input_error_callback_func new_func)
33 {
34         input_error = new_func;
35 }
36
37 static size_t read_block(input_t *input, unsigned char *const read_buf,
38                          size_t const n)
39 {
40         if (input->kind == INPUT_FILE) {
41                 FILE *file = input->in.file;
42                 size_t const s = fread(read_buf, 1, n, file);
43                 if (s == 0) {
44                         /* on OS/X ferror appears to return true on eof as well when running
45                          * the application in gdb... */
46                         if (!feof(file) && ferror(file))
47                                 input_error(0, 0, "read from input failed");
48                         return 0;
49                 }
50                 return s;
51         } else {
52                 assert(input->kind == INPUT_STRING);
53                 size_t len = strlen(input->in.string);
54                 if (len > n)
55                         len = n;
56                 memcpy(read_buf, input->in.string, len);
57                 input->in.string += len;
58                 return len;
59         }
60 }
61
62 static size_t decode_iso_8859_1(input_t *input, utf32 *buffer,
63                                 size_t buffer_size)
64 {
65         unsigned char read_buf[buffer_size];
66         size_t const s = read_block(input, read_buf, sizeof(read_buf));
67
68         unsigned char const *src = read_buf;
69         unsigned char const *end = read_buf + s;
70         utf32               *dst = buffer;
71         while (src != end)
72                 *dst++ = *src++;
73
74         return s;
75 }
76
77 static size_t decode_iso_8859_15(input_t *input, utf32 *buffer,
78                                  size_t buffer_size)
79 {
80         unsigned char read_buf[buffer_size];
81         size_t const s = read_block(input, read_buf, sizeof(read_buf));
82
83         unsigned char const *src = read_buf;
84         unsigned char const *end = read_buf + s;
85         utf32               *dst = buffer;
86         while (src != end) {
87                 utf32 tc = *src++;
88                 switch (tc) {
89                         case 0xA4: tc = 0x20AC; break; // €
90                         case 0xA6: tc = 0x0160; break; // Š
91                         case 0xA8: tc = 0x0161; break; // š
92                         case 0xB4: tc = 0x017D; break; // Ž
93                         case 0xB8: tc = 0x017E; break; // ž
94                         case 0xBC: tc = 0x0152; break; // Œ
95                         case 0xBD: tc = 0x0153; break; // œ
96                         case 0xBE: tc = 0x0178; break; // Ÿ
97                 }
98                 *dst++ = tc;
99         }
100
101         return s;
102 }
103
104 static size_t decode_utf8(input_t *input, utf32 *buffer, size_t buffer_size)
105 {
106         unsigned char read_buf[buffer_size];
107
108         while (true) {
109                 size_t const s = read_block(input, read_buf, sizeof(read_buf));
110                 if (s == 0) {
111                         if (input->utf8_part_decoded_rest_len > 0)
112                                 input_error(0, 0, "incomplete input char at end of input");
113                         return 0;
114                 }
115
116                 unsigned char const *src = read_buf;
117                 unsigned char const *end = read_buf + s;
118                 utf32               *dst = buffer;
119                 utf32                decoded;
120                 utf32                min_code;
121
122                 if (input->utf8_part_decoded_rest_len != 0) {
123                         min_code              = input->utf8_part_decoded_min_code;
124                         decoded               = input->utf8_part_decoded_char;
125                         size_t const rest_len = input->utf8_part_decoded_rest_len;
126                         input->utf8_part_decoded_rest_len = 0;
127                         switch (rest_len) {
128                                 case 4:  goto realign;
129                                 case 3:  goto three_more;
130                                 case 2:  goto two_more;
131                                 default: goto one_more;
132                         }
133                 }
134
135                 while (src != end) {
136                         if ((*src & 0x80) == 0) {
137                                 decoded = *src++;
138                         } else if ((*src & 0xE0) == 0xC0) {
139                                 min_code = 0x80;
140                                 decoded  = *src++ & 0x1F;
141 one_more:
142                                 if (src == end) {
143                                         input->utf8_part_decoded_min_code = min_code;
144                                         input->utf8_part_decoded_char     = decoded;
145                                         input->utf8_part_decoded_rest_len = 1;
146                                         break;
147                                 }
148                                 if ((*src & 0xC0) == 0x80) {
149                                         decoded = (decoded << 6) | (*src++ & 0x3F);
150                                 } else {
151                                         goto invalid_char;
152                                 }
153                                 if (decoded < min_code                      ||
154                                                 decoded > 0x10FFFF                      ||
155                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
156                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
157                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
158                                         input_error(0, 0, "invalid byte sequence in input");
159                                 }
160                         } else if ((*src & 0xF0) == 0xE0) {
161                                 min_code = 0x800;
162                                 decoded  = *src++ & 0x0F;
163 two_more:
164                                 if (src == end) {
165                                         input->utf8_part_decoded_min_code = min_code;
166                                         input->utf8_part_decoded_char     = decoded;
167                                         input->utf8_part_decoded_rest_len = 2;
168                                         break;
169                                 }
170                                 if ((*src & 0xC0) == 0x80) {
171                                         decoded = (decoded << 6) | (*src++ & 0x3F);
172                                 } else {
173                                         goto invalid_char;
174                                 }
175                                 goto one_more;
176                         } else if ((*src & 0xF8) == 0xF0) {
177                                 min_code = 0x10000;
178                                 decoded  = *src++ & 0x07;
179 three_more:
180                                 if (src == end) {
181                                         input->utf8_part_decoded_min_code = min_code;
182                                         input->utf8_part_decoded_char     = decoded;
183                                         input->utf8_part_decoded_rest_len = 3;
184                                         break;
185                                 }
186                                 if ((*src & 0xC0) == 0x80) {
187                                         decoded = (decoded << 6) | (*src++ & 0x3F);
188                                 } else {
189                                         goto invalid_char;
190                                 }
191                                 goto two_more;
192                         } else {
193 invalid_char:
194                                 input_error(0, 0, "invalid byte sequence in input");
195 realign:
196                                 do {
197                                         ++src;
198                                         if (src == end) {
199                                                 input->utf8_part_decoded_rest_len = 4;
200                                                 break;
201                                         }
202                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
203                                 continue;
204                         }
205                         *dst++ = decoded;
206                 }
207
208                 /* we're done when we could read more than 1 char */
209                 if (buffer != dst)
210                         return dst - buffer;
211         }
212 }
213
214 static size_t decode_windows_1252(input_t *input, utf32 *buffer,
215                                   size_t buffer_size)
216 {
217         unsigned char read_buf[buffer_size];
218         size_t const s = read_block(input, read_buf, sizeof(read_buf));
219
220         unsigned char const *src = read_buf;
221         unsigned char const *end = read_buf + s;
222         utf32               *dst = buffer;
223         while (src != end) {
224                 utf32 tc = *src++;
225                 switch (tc) {
226                         case 0x80: tc = 0x20AC; break; // €
227                         case 0x82: tc = 0x201A; break; // ‚
228                         case 0x83: tc = 0x0192; break; // ƒ
229                         case 0x84: tc = 0x201E; break; // „
230                         case 0x85: tc = 0x2026; break; // …
231                         case 0x86: tc = 0x2020; break; // †
232                         case 0x87: tc = 0x2021; break; // ‡
233                         case 0x88: tc = 0x02C6; break; // ˆ
234                         case 0x89: tc = 0x2030; break; // ‰
235                         case 0x8A: tc = 0x0160; break; // Š
236                         case 0x8B: tc = 0x2039; break; // ‹
237                         case 0x8C: tc = 0x0152; break; // Œ
238                         case 0x8E: tc = 0x017D; break; // Ž
239                         case 0x91: tc = 0x2018; break; // ‘
240                         case 0x92: tc = 0x2019; break; // ’
241                         case 0x93: tc = 0x201C; break; // “
242                         case 0x94: tc = 0x201D; break; // ”
243                         case 0x95: tc = 0x2022; break; // •
244                         case 0x96: tc = 0x2013; break; // –
245                         case 0x97: tc = 0x2014; break; // —
246                         case 0x98: tc = 0x02DC; break; // ˜
247                         case 0x99: tc = 0x2122; break; // ™
248                         case 0x9A: tc = 0x0161; break; // š
249                         case 0x9B: tc = 0x203A; break; // ›
250                         case 0x9C: tc = 0x0153; break; // œ
251                         case 0x9E: tc = 0x017E; break; // ž
252                         case 0x9F: tc = 0x0178; break; // Ÿ
253                 }
254                 *dst++ = tc;
255         }
256
257         return s;
258 }
259
/** Associates the name of an input encoding with the function decoding it. */
typedef struct named_decoder_t {
        char const *name;    /* encoding name; NULL in the terminating table entry */
        decode_func decoder; /* decoder to use for this name */
} named_decoder_t;
264
/* Table of all supported encoding names. choose_decoder() searches it from
 * the top, comparing names without regard to case, and stops at the entry
 * whose name is NULL. */
static named_decoder_t const decoders[] = {
        { "CP819",           decode_iso_8859_1   }, // official alias
        { "IBM819",          decode_iso_8859_1   }, // official alias
        { "ISO-8859-1",      decode_iso_8859_1   }, // official alias
        { "ISO-8859-15",     decode_iso_8859_15  }, // official name
        { "ISO8859-1",       decode_iso_8859_1   },
        { "ISO8859-15",      decode_iso_8859_15  },
        { "ISO_8859-1",      decode_iso_8859_1   }, // official alias
        { "ISO_8859-15",     decode_iso_8859_15  }, // official alias
        { "ISO_8859-1:1987", decode_iso_8859_1   }, // official name
        { "Latin-9",         decode_iso_8859_15  }, // official alias
        { "UTF-8",           decode_utf8         }, // official name
        { "csISOLatin1",     decode_iso_8859_1   }, // official alias
        { "cp1252",          decode_windows_1252 },
        { "iso-ir-100",      decode_iso_8859_1   }, // official alias
        { "l1",              decode_iso_8859_1   }, // official alias
        { "latin1",          decode_iso_8859_1   }, // official alias
        { "windows-1252",    decode_windows_1252 }, // official name

        /* end marker */
        { NULL,              NULL                }
};
286
/**
 * Compares two strings without regard to case.
 * strcasecmp is not part of C99 so we need our own implementation here.
 *
 * @param s1  first NUL-terminated string
 * @param s2  second NUL-terminated string
 * @return 0 if the strings are equal ignoring case, otherwise a negative or
 *         positive value if the lowercased @p s1 sorts before or after the
 *         lowercased @p s2
 */
static int my_strcasecmp(const char *s1, const char *s2)
{
        for (;; ++s1, ++s2) {
                /* tolower() is only defined for values representable as
                 * unsigned char (and EOF), a negative plain char is not */
                int const c1 = tolower((unsigned char)*s1);
                int const c2 = tolower((unsigned char)*s2);
                /* compare the lowercased values, so the sign of the result is
                 * case-insensitive as well */
                if (c1 != c2 || c1 == '\0')
                        return c1 - c2;
        }
}
296
297 static void choose_decoder(input_t *result, const char *encoding)
298 {
299         if (encoding == NULL) {
300                 result->decode = decode_utf8;
301         } else {
302                 for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
303                         if (my_strcasecmp(encoding, i->name) != 0)
304                                 continue;
305                         result->decode = i->decoder;
306                         break;
307                 }
308                 if (result->decode == NULL) {
309                         fprintf(stderr, "error: input encoding \"%s\" not supported\n",
310                                         encoding);
311                         result->decode = decode_utf8;
312                 }
313         }
314 }
315
316 input_t *input_from_stream(FILE *file, const char *encoding)
317 {
318         input_t *result = XMALLOCZ(input_t);
319         result->kind    = INPUT_FILE;
320         result->in.file = file;
321
322         choose_decoder(result, encoding);
323
324         return result;
325 }
326
327 input_t *input_from_string(const char *string, const char *encoding)
328 {
329         input_t *result   = XMALLOCZ(input_t);
330         result->kind      = INPUT_STRING;
331         result->in.string = string;
332
333         choose_decoder(result, encoding);
334
335         return result;
336 }
337
338 size_t decode(input_t *input, utf32 *buffer, size_t buffer_size)
339 {
340         return input->decode(input, buffer, buffer_size);
341 }
342
343 void input_free(input_t *input)
344 {
345         xfree(input);
346 }