b405f6b8f7cd8589a9ef9c6fb3f513244d0dc32a
[cparser] / input.c
1 /*
2  * This file is part of cparser.
3  * Copyright (C) 2012 Matthias Braun <matze@braunis.de>
4  */
5 #include "config.h"
6
7 #include "diagnostic.h"
8 #include "input.h"
9
10 #include <ctype.h>
11
/**
 * Signature shared by the character-set decoders in this file: fetch raw bytes
 * from @p input, store at most @p buffer_size decoded code points in @p buffer
 * and return how many were stored (0 when no more input could be read).
 */
typedef size_t (*decode_func)(input_t *input, utf32 *buffer, size_t buffer_size);
13
/** Tells which member of the `in` union of an input_t holds the data source. */
typedef enum {
        /* bytes come from a FILE* (in.file) */
        INPUT_FILE,
        /* bytes come from a NUL-terminated string (in.string) */
        INPUT_STRING
} input_kind_t;
18
/** One input source together with the decoder that is applied to it. */
struct input_t {
        /* selects the valid member of `in` */
        input_kind_t kind;
        union {
                /* INPUT_FILE: stream the raw bytes are read from */
                FILE *file;
                /* INPUT_STRING: next unread byte; advanced by read_block() */
                const char *string;
        } in;
        /* decoder selected by choose_decoder() */
        decode_func decode;

        /* state for utf-8 decoder: remembers a multi-byte sequence that was cut
         * off by the end of a read block so the next block can complete it */
        /* smallest code point that is valid for the length of the sequence */
        utf32  utf8_part_decoded_min_code;
        /* payload bits collected so far */
        utf32  utf8_part_decoded_char;
        /* 0: nothing pending, 1-3: continuation bytes still missing,
         * 4: in the middle of skipping an invalid sequence */
        size_t utf8_part_decoded_rest_len;
};
32
/**
 * Callback through which read and decoding problems are reported. It is
 * installed by set_input_error_callback(); having no initializer it is a null
 * pointer until then.
 */
static input_error_callback_func input_error;
34
/**
 * Install the function used by this file to report read and decoding errors.
 *
 * The pointer is stored as given and later called without a null check, so
 * presumably a callback has to be installed before any input is decoded --
 * TODO confirm against the callers.
 */
void set_input_error_callback(input_error_callback_func new_func)
{
        input_error = new_func;
}
39
40 static size_t read_block(input_t *input, unsigned char *const read_buf,
41                          size_t const n)
42 {
43         if (input->kind == INPUT_FILE) {
44                 FILE *file = input->in.file;
45                 size_t const s = fread(read_buf, 1, n, file);
46                 if (s == 0) {
47                         /* on OS/X ferror appears to return true on eof as well when running
48                          * the application in gdb... */
49                         if (!feof(file) && ferror(file))
50                                 input_error(0, 0, "read from input failed");
51                         return 0;
52                 }
53                 return s;
54         } else {
55                 assert(input->kind == INPUT_STRING);
56                 size_t len = strlen(input->in.string);
57                 if (len > n)
58                         len = n;
59                 memcpy(read_buf, input->in.string, len);
60                 input->in.string += len;
61                 return len;
62         }
63 }
64
65 static size_t decode_iso_8859_1(input_t *input, utf32 *buffer,
66                                 size_t buffer_size)
67 {
68         unsigned char read_buf[buffer_size];
69         size_t const s = read_block(input, read_buf, sizeof(read_buf));
70
71         unsigned char const *src = read_buf;
72         unsigned char const *end = read_buf + s;
73         utf32               *dst = buffer;
74         while (src != end)
75                 *dst++ = *src++;
76
77         return s;
78 }
79
80 static size_t decode_iso_8859_15(input_t *input, utf32 *buffer,
81                                  size_t buffer_size)
82 {
83         unsigned char read_buf[buffer_size];
84         size_t const s = read_block(input, read_buf, sizeof(read_buf));
85
86         unsigned char const *src = read_buf;
87         unsigned char const *end = read_buf + s;
88         utf32               *dst = buffer;
89         while (src != end) {
90                 utf32 tc = *src++;
91                 switch (tc) {
92                         case 0xA4: tc = 0x20AC; break; // €
93                         case 0xA6: tc = 0x0160; break; // Š
94                         case 0xA8: tc = 0x0161; break; // š
95                         case 0xB4: tc = 0x017D; break; // Ž
96                         case 0xB8: tc = 0x017E; break; // ž
97                         case 0xBC: tc = 0x0152; break; // Œ
98                         case 0xBD: tc = 0x0153; break; // œ
99                         case 0xBE: tc = 0x0178; break; // Ÿ
100                 }
101                 *dst++ = tc;
102         }
103
104         return s;
105 }
106
107 static size_t decode_utf8(input_t *input, utf32 *buffer, size_t buffer_size)
108 {
109         unsigned char read_buf[buffer_size];
110
111         for (;;) {
112                 size_t const s = read_block(input, read_buf, sizeof(read_buf));
113                 if (s == 0) {
114                         if (input->utf8_part_decoded_rest_len > 0)
115                                 input_error(0, 0, "incomplete input char at end of input");
116                         return 0;
117                 }
118
119                 unsigned char const *src = read_buf;
120                 unsigned char const *end = read_buf + s;
121                 utf32               *dst = buffer;
122                 utf32                decoded;
123                 utf32                min_code;
124
125                 if (input->utf8_part_decoded_rest_len != 0) {
126                         min_code              = input->utf8_part_decoded_min_code;
127                         decoded               = input->utf8_part_decoded_char;
128                         size_t const rest_len = input->utf8_part_decoded_rest_len;
129                         input->utf8_part_decoded_rest_len = 0;
130                         switch (rest_len) {
131                                 case 4:  goto realign;
132                                 case 3:  goto three_more;
133                                 case 2:  goto two_more;
134                                 default: goto one_more;
135                         }
136                 }
137
138                 while (src != end) {
139                         if ((*src & 0x80) == 0) {
140                                 decoded = *src++;
141                         } else if ((*src & 0xE0) == 0xC0) {
142                                 min_code = 0x80;
143                                 decoded  = *src++ & 0x1F;
144 one_more:
145                                 if (src == end) {
146                                         input->utf8_part_decoded_min_code = min_code;
147                                         input->utf8_part_decoded_char     = decoded;
148                                         input->utf8_part_decoded_rest_len = 1;
149                                         break;
150                                 }
151                                 if ((*src & 0xC0) == 0x80) {
152                                         decoded = (decoded << 6) | (*src++ & 0x3F);
153                                 } else {
154                                         goto invalid_char;
155                                 }
156                                 if (decoded < min_code                      ||
157                                                 decoded > 0x10FFFF                      ||
158                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
159                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
160                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
161                                         input_error(0, 0, "invalid byte sequence in input");
162                                 }
163                         } else if ((*src & 0xF0) == 0xE0) {
164                                 min_code = 0x800;
165                                 decoded  = *src++ & 0x0F;
166 two_more:
167                                 if (src == end) {
168                                         input->utf8_part_decoded_min_code = min_code;
169                                         input->utf8_part_decoded_char     = decoded;
170                                         input->utf8_part_decoded_rest_len = 2;
171                                         break;
172                                 }
173                                 if ((*src & 0xC0) == 0x80) {
174                                         decoded = (decoded << 6) | (*src++ & 0x3F);
175                                 } else {
176                                         goto invalid_char;
177                                 }
178                                 goto one_more;
179                         } else if ((*src & 0xF8) == 0xF0) {
180                                 min_code = 0x10000;
181                                 decoded  = *src++ & 0x07;
182 three_more:
183                                 if (src == end) {
184                                         input->utf8_part_decoded_min_code = min_code;
185                                         input->utf8_part_decoded_char     = decoded;
186                                         input->utf8_part_decoded_rest_len = 3;
187                                         break;
188                                 }
189                                 if ((*src & 0xC0) == 0x80) {
190                                         decoded = (decoded << 6) | (*src++ & 0x3F);
191                                 } else {
192                                         goto invalid_char;
193                                 }
194                                 goto two_more;
195                         } else {
196 invalid_char:
197                                 input_error(0, 0, "invalid byte sequence in input");
198 realign:
199                                 do {
200                                         ++src;
201                                         if (src == end) {
202                                                 input->utf8_part_decoded_rest_len = 4;
203                                                 break;
204                                         }
205                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
206                                 continue;
207                         }
208                         *dst++ = decoded;
209                 }
210
211                 /* we're done when we could read more than 1 char */
212                 if (buffer != dst)
213                         return dst - buffer;
214         }
215 }
216
217 static size_t decode_windows_1252(input_t *input, utf32 *buffer,
218                                   size_t buffer_size)
219 {
220         unsigned char read_buf[buffer_size];
221         size_t const s = read_block(input, read_buf, sizeof(read_buf));
222
223         unsigned char const *src = read_buf;
224         unsigned char const *end = read_buf + s;
225         utf32               *dst = buffer;
226         while (src != end) {
227                 utf32 tc = *src++;
228                 switch (tc) {
229                         case 0x80: tc = 0x20AC; break; // €
230                         case 0x82: tc = 0x201A; break; // ‚
231                         case 0x83: tc = 0x0192; break; // ƒ
232                         case 0x84: tc = 0x201E; break; // „
233                         case 0x85: tc = 0x2026; break; // …
234                         case 0x86: tc = 0x2020; break; // †
235                         case 0x87: tc = 0x2021; break; // ‡
236                         case 0x88: tc = 0x02C6; break; // ˆ
237                         case 0x89: tc = 0x2030; break; // ‰
238                         case 0x8A: tc = 0x0160; break; // Š
239                         case 0x8B: tc = 0x2039; break; // ‹
240                         case 0x8C: tc = 0x0152; break; // Œ
241                         case 0x8E: tc = 0x017D; break; // Ž
242                         case 0x91: tc = 0x2018; break; // ‘
243                         case 0x92: tc = 0x2019; break; // ’
244                         case 0x93: tc = 0x201C; break; // “
245                         case 0x94: tc = 0x201D; break; // ”
246                         case 0x95: tc = 0x2022; break; // •
247                         case 0x96: tc = 0x2013; break; // –
248                         case 0x97: tc = 0x2014; break; // —
249                         case 0x98: tc = 0x02DC; break; // ˜
250                         case 0x99: tc = 0x2122; break; // ™
251                         case 0x9A: tc = 0x0161; break; // š
252                         case 0x9B: tc = 0x203A; break; // ›
253                         case 0x9C: tc = 0x0153; break; // œ
254                         case 0x9E: tc = 0x017E; break; // ž
255                         case 0x9F: tc = 0x0178; break; // Ÿ
256                 }
257                 *dst++ = tc;
258         }
259
260         return s;
261 }
262
/** Associates an encoding name with the decoder implementing it. */
typedef struct named_decoder_t {
        /* encoding name as accepted by choose_decoder(); NULL in the table's
         * terminating entry */
        char const *name;
        /* decoder to use for that encoding */
        decode_func decoder;
} named_decoder_t;
267
/**
 * Supported encoding names and their decoders. choose_decoder() searches the
 * table linearly, comparing names case-insensitively, and stops at the entry
 * whose name is NULL.
 */
static named_decoder_t const decoders[] = {
        { "CP819",           decode_iso_8859_1   }, // official alias
        { "IBM819",          decode_iso_8859_1   }, // official alias
        { "ISO-8859-1",      decode_iso_8859_1   }, // official alias
        { "ISO-8859-15",     decode_iso_8859_15  }, // official name
        { "ISO8859-1",       decode_iso_8859_1   },
        { "ISO8859-15",      decode_iso_8859_15  },
        { "ISO_8859-1",      decode_iso_8859_1   }, // official alias
        { "ISO_8859-15",     decode_iso_8859_15  }, // official alias
        { "ISO_8859-1:1987", decode_iso_8859_1   }, // official name
        { "Latin-9",         decode_iso_8859_15  }, // official alias
        { "UTF-8",           decode_utf8         }, // official name
        { "csISOLatin1",     decode_iso_8859_1   }, // official alias
        { "cp1252",          decode_windows_1252 },
        { "iso-ir-100",      decode_iso_8859_1   }, // official alias
        { "l1",              decode_iso_8859_1   }, // official alias
        { "latin1",          decode_iso_8859_1   }, // official alias
        { "windows-1252",    decode_windows_1252 }, // official name

        /* end marker */
        { NULL,              NULL                }
};
289
/**
 * strcasecmp is not part of C99 so we need our own implementation here.
 *
 * Compares @p s1 and @p s2 byte by byte after folding each byte to lower case
 * with tolower().
 *
 * @return 0 if the strings are equal ignoring case, otherwise the difference
 *         of the first pair of lower-cased bytes that differ (negative if
 *         @p s1 sorts before @p s2, positive if it sorts after)
 */
static int my_strcasecmp(const char *s1, const char *s2)
{
        for ( ; ; ++s1, ++s2) {
                /* tolower() is only defined for EOF and values representable as
                 * unsigned char, so plain (possibly negative) chars are converted
                 * first */
                int const c1 = tolower((unsigned char)*s1);
                int const c2 = tolower((unsigned char)*s2);
                if (c1 != c2 || c1 == '\0')
                        return c1 - c2;
        }
}
299
300 static void choose_decoder(input_t *result, const char *encoding)
301 {
302         if (encoding) {
303                 for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
304                         if (my_strcasecmp(encoding, i->name) != 0)
305                                 continue;
306                         result->decode = i->decoder;
307                         return;
308                 }
309                 errorf(NULL, "input encoding \"%s\" not supported", encoding);
310         }
311         result->decode = decode_utf8;
312 }
313
314 input_t *input_from_stream(FILE *file, const char *encoding)
315 {
316         input_t *result = XMALLOCZ(input_t);
317         result->kind    = INPUT_FILE;
318         result->in.file = file;
319
320         choose_decoder(result, encoding);
321
322         return result;
323 }
324
325 input_t *input_from_string(const char *string, const char *encoding)
326 {
327         input_t *result   = XMALLOCZ(input_t);
328         result->kind      = INPUT_STRING;
329         result->in.string = string;
330
331         choose_decoder(result, encoding);
332
333         return result;
334 }
335
336 size_t decode(input_t *input, utf32 *buffer, size_t buffer_size)
337 {
338         return input->decode(input, buffer, buffer_size);
339 }
340
/**
 * Release an input created by input_from_stream() or input_from_string().
 *
 * Only the input_t structure itself is handed to xfree(); the stream or the
 * string it refers to is neither closed nor freed here.
 */
void input_free(input_t *input)
{
        xfree(input);
}