Fix calculation of the value of multi-character character constants in signed char...
[cparser] / lexer.c
1 /*
2  * This file is part of cparser.
3  * Copyright (C) 2007-2008 Matthias Braun <matze@braunis.de>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License
7  * as published by the Free Software Foundation; either version 2
8  * of the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
18  * 02111-1307, USA.
19  */
20 #include <config.h>
21
22 #include "diagnostic.h"
23 #include "lexer.h"
24 #include "symbol_t.h"
25 #include "token_t.h"
26 #include "symbol_table_t.h"
27 #include "adt/error.h"
28 #include "adt/strset.h"
29 #include "adt/util.h"
30 #include "types.h"
31 #include "type_t.h"
32 #include "target_architecture.h"
33 #include "parser.h"
34 #include "warning.h"
35 #include "lang_features.h"
36
37 #include <assert.h>
38 #include <errno.h>
39 #include <string.h>
40 #include <strings.h>
41 #include <stdbool.h>
42 #include <ctype.h>
43
/* Uncomment to print every character that is read or put back. */
//#define DEBUG_CHARS
/* Number of slots kept free in front of the decoded characters in buf;
 * the decoders start writing at buf + MAX_PUTBACK so that put_back() can
 * move bufpos backwards. */
#define MAX_PUTBACK 3
/* Size of the byte buffer the decoders hand to read_block(). */
#define BUF_SIZE    1024

#if defined(_WIN32) || defined(__CYGWIN__)
/* No strtold on windows and no replacement yet */
#define strtold(s, e) strtod(s, e)
#endif
52
/* A single decoded input character (code point). */
typedef unsigned int utf32;

/* The current input character; updated by next_real_char()/next_char(). */
static utf32        c;
/* The token that the parse_* functions fill in. */
token_t             lexer_token;
/* NOTE(review): not referenced in this part of the file; presumably the
 * symbol for "L" (wide literal prefix) - confirm at its use site. */
symbol_t           *symbol_L;
/* The stream read_block() reads from; NULL makes next_real_char() yield EOF
 * once the buffer is exhausted. */
static FILE        *input;
/* Decoded characters, preceded by MAX_PUTBACK slots of put-back room. */
static utf32        buf[BUF_SIZE + MAX_PUTBACK];
/* One past the last valid character in buf. */
static const utf32 *bufend;
/* The next character in buf that will be handed out. */
static const utf32 *bufpos;
/* NOTE(review): not referenced in this part of the file. */
static strset_t     stringset;
/* If true, '$' is accepted as part of a symbol (see SYMBOL_CHARS). */
bool                allow_dollar_in_symbol = true;
64
/**
 * Reports a parse error through errorf(), using the source position stored
 * in the current lexer token.
 *
 * @param msg   the error message; it is passed as a "%s" argument, so it is
 *              printed verbatim and never interpreted as a format string
 */
static void parse_error(const char *msg)
{
        errorf(&lexer_token.source_position, "%s", msg);
}
74
/**
 * Reports an internal error through internal_errorf(), using the source
 * position stored in the current lexer token. The function is declared
 * NORETURN.
 *
 * @param msg   the error message; it is passed as a "%s" argument, so it is
 *              printed verbatim and never interpreted as a format string
 */
static NORETURN internal_error(const char *msg)
{
        internal_errorf(&lexer_token.source_position, "%s", msg);
}
84
85 static size_t read_block(unsigned char *const read_buf, size_t const n)
86 {
87         size_t const s = fread(read_buf, 1, n, input);
88         if (s == 0) {
89                 if (ferror(input))
90                         parse_error("read from input failed");
91                 buf[MAX_PUTBACK] = EOF;
92                 bufpos           = buf + MAX_PUTBACK;
93                 bufend           = buf + MAX_PUTBACK + 1;
94         }
95         return s;
96 }
97
98 static void decode_iso_8859_1(void)
99 {
100         unsigned char read_buf[BUF_SIZE];
101         size_t const s = read_block(read_buf, sizeof(read_buf));
102         if (s == 0)
103                 return;
104
105         unsigned char const *src = read_buf;
106         unsigned char const *end = read_buf + s;
107         utf32               *dst = buf + MAX_PUTBACK;
108         while (src != end)
109                 *dst++ = *src++;
110
111         bufpos = buf + MAX_PUTBACK;
112         bufend = dst;
113 }
114
115 static void decode_iso_8859_15(void)
116 {
117         unsigned char read_buf[BUF_SIZE];
118         size_t const s = read_block(read_buf, sizeof(read_buf));
119         if (s == 0)
120                 return;
121
122         unsigned char const *src = read_buf;
123         unsigned char const *end = read_buf + s;
124         utf32               *dst = buf + MAX_PUTBACK;
125         while (src != end) {
126                 utf32 tc = *src++;
127                 switch (tc) {
128                         case 0xA4: tc = 0x20AC; break; // €
129                         case 0xA6: tc = 0x0160; break; // Š
130                         case 0xA8: tc = 0x0161; break; // š
131                         case 0xB4: tc = 0x017D; break; // Ž
132                         case 0xB8: tc = 0x017E; break; // ž
133                         case 0xBC: tc = 0x0152; break; // Œ
134                         case 0xBD: tc = 0x0153; break; // œ
135                         case 0xBE: tc = 0x0178; break; // Ÿ
136                 }
137                 *dst++ = tc;
138         }
139
140         bufpos = buf + MAX_PUTBACK;
141         bufend = dst;
142 }
143
/**
 * Decoder for UTF-8 input: reads one block of bytes via read_block() and
 * stores the decoded code points into buf, starting at buf + MAX_PUTBACK.
 *
 * A multi-byte sequence may be cut off by the end of a block. The static
 * variables below remember the partially decoded state, so that the next
 * block can jump straight back into the decoding step that was interrupted.
 */
static void decode_utf8(void)
{
        /* smallest code point the pending sequence may encode */
        static utf32  part_decoded_min_code;
        /* payload bits decoded so far for the pending sequence */
        static utf32  part_decoded_char;
        /* 0: nothing pending; 1-3: continuation bytes still missing;
         * 4: the previous block ended while skipping an invalid sequence */
        static size_t part_decoded_rest_len;

        do {
                unsigned char read_buf[BUF_SIZE];
                size_t const s = read_block(read_buf, sizeof(read_buf));
                if (s == 0) {
                        if (part_decoded_rest_len > 0)
                                parse_error("incomplete input char at end of input");
                        return;
                }

                unsigned char const *src = read_buf;
                unsigned char const *end = read_buf + s;
                utf32               *dst = buf + MAX_PUTBACK;
                utf32                decoded;
                utf32                min_code;

                /* resume a sequence that was interrupted by the end of the
                 * previous block */
                if (part_decoded_rest_len != 0) {
                        min_code              = part_decoded_min_code;
                        decoded               = part_decoded_char;
                        size_t const rest_len = part_decoded_rest_len;
                        part_decoded_rest_len = 0;
                        /* NOTE(review): resuming at realign increments src
                         * before looking at it, so the first byte of this
                         * block is skipped unexamined - confirm this is
                         * intended. */
                        switch (rest_len) {
                                case 4:  goto realign;
                                case 3:  goto three_more;
                                case 2:  goto two_more;
                                default: goto one_more;
                        }
                }

                while (src != end) {
                        if ((*src & 0x80) == 0) {
                                /* single byte (ASCII) */
                                decoded = *src++;
                        } else if ((*src & 0xE0) == 0xC0) {
                                /* lead byte of a two byte sequence */
                                min_code = 0x80;
                                decoded  = *src++ & 0x1F;
one_more:
                                if (src == end) {
                                        part_decoded_min_code = min_code;
                                        part_decoded_char     = decoded;
                                        part_decoded_rest_len = 1;
                                        break;
                                }
                                if ((*src & 0xC0) == 0x80) {
                                        decoded = (decoded << 6) | (*src++ & 0x3F);
                                } else {
                                        goto invalid_char;
                                }
                                /* The sequence is complete: reject overlong
                                 * encodings and code points that are not
                                 * characters. The error is only reported; the
                                 * value is still stored below. */
                                if (decoded < min_code                      ||
                                                decoded > 0x10FFFF                      ||
                                                (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
                                                (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
                                                (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
                                        parse_error("invalid byte sequence in input");
                                }
                        } else if ((*src & 0xF0) == 0xE0) {
                                /* lead byte of a three byte sequence */
                                min_code = 0x800;
                                decoded  = *src++ & 0x0F;
two_more:
                                if (src == end) {
                                        part_decoded_min_code = min_code;
                                        part_decoded_char     = decoded;
                                        part_decoded_rest_len = 2;
                                        break;
                                }
                                if ((*src & 0xC0) == 0x80) {
                                        decoded = (decoded << 6) | (*src++ & 0x3F);
                                } else {
                                        goto invalid_char;
                                }
                                goto one_more;
                        } else if ((*src & 0xF8) == 0xF0) {
                                /* lead byte of a four byte sequence */
                                min_code = 0x10000;
                                decoded  = *src++ & 0x07;
three_more:
                                if (src == end) {
                                        part_decoded_min_code = min_code;
                                        part_decoded_char     = decoded;
                                        part_decoded_rest_len = 3;
                                        break;
                                }
                                if ((*src & 0xC0) == 0x80) {
                                        decoded = (decoded << 6) | (*src++ & 0x3F);
                                } else {
                                        goto invalid_char;
                                }
                                goto two_more;
                        } else {
invalid_char:
                                parse_error("invalid byte sequence in input");
realign:
                                /* skip the offending byte and everything that
                                 * cannot start a sequence handled above */
                                do {
                                        ++src;
                                        if (src == end) {
                                                part_decoded_rest_len = 4;
                                                break;
                                        }
                                } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
                                /* nothing was decoded, so nothing is stored */
                                continue;
                        }
                        *dst++ = decoded;
                }

                bufpos = buf + MAX_PUTBACK;
                bufend = dst;
        /* read another block if this one did not yield a single character */
        } while (bufpos == bufend);
}
255
/* A decoder refills buf with decoded characters and resets bufpos/bufend. */
typedef void (*decoder_t)(void);

/* The decoder called by next_real_char(); UTF-8 unless
 * select_input_encoding() picks another one. */
static decoder_t decoder = decode_utf8;

/* Associates an encoding name with the decoder implementing it. */
typedef struct named_decoder_t {
        char const *name;     /* encoding name, compared case-insensitively */
        decoder_t   decoder;  /* decoder to use for that encoding */
} named_decoder_t;
264
/* Supported input encodings, searched linearly by select_input_encoding().
 * The entry with a NULL name terminates the table. */
static named_decoder_t const decoders[] = {
        { "CP819",           decode_iso_8859_1  }, // official alias
        { "IBM819",          decode_iso_8859_1  }, // official alias
        { "ISO-8859-1",      decode_iso_8859_1  }, // official alias
        { "ISO-8859-15",     decode_iso_8859_15 }, // official name
        { "ISO8859-1",       decode_iso_8859_1  },
        { "ISO8859-15",      decode_iso_8859_15 },
        { "ISO_8859-1",      decode_iso_8859_1  }, // official alias
        { "ISO_8859-15",     decode_iso_8859_15 }, // official alias
        { "ISO_8859-1:1987", decode_iso_8859_1  }, // official name
        { "Latin-9",         decode_iso_8859_15 }, // official alias
        { "UTF-8",           decode_utf8        }, // official name
        { "csISOLatin1",     decode_iso_8859_1  }, // official alias
        { "iso-ir-100",      decode_iso_8859_1  }, // official alias
        { "l1",              decode_iso_8859_1  }, // official alias
        { "latin1",          decode_iso_8859_1  }, // official alias

        { NULL,              NULL               }
};
284
285 void select_input_encoding(char const* const encoding)
286 {
287         for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
288                 if (strcasecmp(encoding, i->name) != 0)
289                         continue;
290                 decoder = i->decoder;
291                 return;
292         }
293         fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
294 }
295
296 static inline void next_real_char(void)
297 {
298         assert(bufpos <= bufend);
299         if (bufpos >= bufend) {
300                 if (input == NULL) {
301                         c = EOF;
302                         return;
303                 }
304                 decoder();
305         }
306         c = *bufpos++;
307 }
308
309 /**
310  * Put a character back into the buffer.
311  *
312  * @param pc  the character to put back
313  */
314 static inline void put_back(utf32 const pc)
315 {
316         assert(bufpos > buf);
317         *(--bufpos - buf + buf) = pc;
318
319 #ifdef DEBUG_CHARS
320         printf("putback '%lc'\n", pc);
321 #endif
322 }
323
/* Declared here because maybe_concat_lines() and the macros below use it
 * before its definition. */
static inline void next_char(void);

/*
 * Expands to the two case labels '\r' and '\n' for use inside a switch on c.
 * Either case consumes the newline ("\r\n" counts as one newline),
 * increments the line number of the current token position and then
 * executes `code`. `code` has to leave the switch itself (e.g. return or
 * break), otherwise the '\r' case falls through into the '\n' case.
 */
#define MATCH_NEWLINE(code)                   \
        case '\r':                                \
                next_char();                          \
                if(c == '\n') {                       \
                        next_char();                      \
                }                                     \
                lexer_token.source_position.linenr++; \
                code                                  \
        case '\n':                                \
                next_char();                          \
                lexer_token.source_position.linenr++; \
                code

/* Asserts that the current character is c_type and advances to the next. */
#define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
340
341 static void maybe_concat_lines(void)
342 {
343         eat('\\');
344
345         switch(c) {
346         MATCH_NEWLINE(return;)
347
348         default:
349                 break;
350         }
351
352         put_back(c);
353         c = '\\';
354 }
355
356 /**
357  * Set c to the next input character, ie.
358  * after expanding trigraphs.
359  */
360 static inline void next_char(void)
361 {
362         next_real_char();
363
364         /* filter trigraphs */
365         if(UNLIKELY(c == '\\')) {
366                 maybe_concat_lines();
367                 goto end_of_next_char;
368         }
369
370         if(LIKELY(c != '?'))
371                 goto end_of_next_char;
372
373         next_real_char();
374         if(LIKELY(c != '?')) {
375                 put_back(c);
376                 c = '?';
377                 goto end_of_next_char;
378         }
379
380         next_real_char();
381         switch(c) {
382         case '=': c = '#'; break;
383         case '(': c = '['; break;
384         case '/': c = '\\'; maybe_concat_lines(); break;
385         case ')': c = ']'; break;
386         case '\'': c = '^'; break;
387         case '<': c = '{'; break;
388         case '!': c = '|'; break;
389         case '>': c = '}'; break;
390         case '-': c = '~'; break;
391         default:
392                 put_back(c);
393                 put_back('?');
394                 c = '?';
395                 break;
396         }
397
398 end_of_next_char:;
399 #ifdef DEBUG_CHARS
400         printf("nchar '%c'\n", c);
401 #endif
402 }
403
/*
 * Expands to the case labels of all characters that may appear in a symbol:
 * ASCII letters, '_' and '$'. The '$' case jumps to a label named
 * dollar_sign when allow_dollar_in_symbol is false, so every switch using
 * this macro has to provide that label; otherwise it falls through to the
 * letter cases.
 */
#define SYMBOL_CHARS  \
        case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
        case 'a':         \
        case 'b':         \
        case 'c':         \
        case 'd':         \
        case 'e':         \
        case 'f':         \
        case 'g':         \
        case 'h':         \
        case 'i':         \
        case 'j':         \
        case 'k':         \
        case 'l':         \
        case 'm':         \
        case 'n':         \
        case 'o':         \
        case 'p':         \
        case 'q':         \
        case 'r':         \
        case 's':         \
        case 't':         \
        case 'u':         \
        case 'v':         \
        case 'w':         \
        case 'x':         \
        case 'y':         \
        case 'z':         \
        case 'A':         \
        case 'B':         \
        case 'C':         \
        case 'D':         \
        case 'E':         \
        case 'F':         \
        case 'G':         \
        case 'H':         \
        case 'I':         \
        case 'J':         \
        case 'K':         \
        case 'L':         \
        case 'M':         \
        case 'N':         \
        case 'O':         \
        case 'P':         \
        case 'Q':         \
        case 'R':         \
        case 'S':         \
        case 'T':         \
        case 'U':         \
        case 'V':         \
        case 'W':         \
        case 'X':         \
        case 'Y':         \
        case 'Z':         \
        case '_':
459
/* Expands to the case labels of the ten decimal digits. */
#define DIGITS        \
        case '0':         \
        case '1':         \
        case '2':         \
        case '3':         \
        case '4':         \
        case '5':         \
        case '6':         \
        case '7':         \
        case '8':         \
        case '9':
471
472 /**
473  * Read a symbol from the input and build
474  * the lexer_token.
475  */
476 static void parse_symbol(void)
477 {
478         symbol_t *symbol;
479         char     *string;
480
481         obstack_1grow(&symbol_obstack, (char) c);
482         next_char();
483
484         while(1) {
485                 switch(c) {
486                 DIGITS
487                 SYMBOL_CHARS
488                         obstack_1grow(&symbol_obstack, (char) c);
489                         next_char();
490                         break;
491
492                 default:
493 dollar_sign:
494                         goto end_symbol;
495                 }
496         }
497
498 end_symbol:
499         obstack_1grow(&symbol_obstack, '\0');
500
501         string = obstack_finish(&symbol_obstack);
502         symbol = symbol_table_insert(string);
503
504         lexer_token.type     = symbol->ID;
505         lexer_token.v.symbol = symbol;
506
507         if(symbol->string != string) {
508                 obstack_free(&symbol_obstack, string);
509         }
510 }
511
512 static void parse_integer_suffix(bool is_oct_hex)
513 {
514         bool is_unsigned     = false;
515         bool min_long        = false;
516         bool min_longlong    = false;
517         bool not_traditional = false;
518         int  pos             = 0;
519         char suffix[4];
520
521         if (c == 'U' || c == 'u') {
522                 not_traditional = true;
523                 suffix[pos++]   = toupper(c);
524                 is_unsigned     = true;
525                 next_char();
526                 if (c == 'L' || c == 'l') {
527                         suffix[pos++] = toupper(c);
528                         min_long = true;
529                         next_char();
530                         if (c == 'L' || c == 'l') {
531                                 suffix[pos++] = toupper(c);
532                                 min_longlong = true;
533                                 next_char();
534                         }
535                 }
536         } else if (c == 'l' || c == 'L') {
537                 suffix[pos++] = toupper(c);
538                 min_long = true;
539                 next_char();
540                 if (c == 'l' || c == 'L') {
541                         not_traditional = true;
542                         suffix[pos++]   = toupper(c);
543                         min_longlong    = true;
544                         next_char();
545                         if (c == 'u' || c == 'U') {
546                                 suffix[pos++] = toupper(c);
547                                 is_unsigned   = true;
548                                 next_char();
549                         }
550                 } else if (c == 'u' || c == 'U') {
551                         not_traditional = true;
552                         suffix[pos++]   = toupper(c);
553                         is_unsigned     = true;
554                         next_char();
555                         lexer_token.datatype = type_unsigned_long;
556                 }
557         }
558
559         if (warning.traditional && not_traditional) {
560                 suffix[pos] = '\0';
561                 warningf(&lexer_token.source_position,
562                         "traditional C rejects the '%s' suffix", suffix);
563         }
564         if (!is_unsigned) {
565                 long long v = lexer_token.v.intvalue;
566                 if (!min_long) {
567                         if (v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
568                                 lexer_token.datatype = type_int;
569                                 return;
570                         } else if (is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
571                                 lexer_token.datatype = type_unsigned_int;
572                                 return;
573                         }
574                 }
575                 if (!min_longlong) {
576                         if (v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
577                                 lexer_token.datatype = type_long;
578                                 return;
579                         } else if (is_oct_hex && v >= 0 && (unsigned long long)v <= (unsigned long long)TARGET_ULONG_MAX) {
580                                 lexer_token.datatype = type_unsigned_long;
581                                 return;
582                         }
583                 }
584                 unsigned long long uv = (unsigned long long) v;
585                 if (is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
586                         lexer_token.datatype = type_unsigned_long_long;
587                         return;
588                 }
589
590                 lexer_token.datatype = type_long_long;
591         } else {
592                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
593                 if (!min_long && v <= TARGET_UINT_MAX) {
594                         lexer_token.datatype = type_unsigned_int;
595                         return;
596                 }
597                 if (!min_longlong && v <= TARGET_ULONG_MAX) {
598                         lexer_token.datatype = type_unsigned_long;
599                         return;
600                 }
601                 lexer_token.datatype = type_unsigned_long_long;
602         }
603 }
604
605 static void parse_floating_suffix(void)
606 {
607         switch(c) {
608         /* TODO: do something useful with the suffixes... */
609         case 'f':
610         case 'F':
611                 if (warning.traditional) {
612                         warningf(&lexer_token.source_position,
613                                 "traditional C rejects the 'F' suffix");
614                 }
615                 next_char();
616                 lexer_token.datatype = type_float;
617                 break;
618         case 'l':
619         case 'L':
620                 if (warning.traditional) {
621                         warningf(&lexer_token.source_position,
622                                 "traditional C rejects the 'F' suffix");
623                 }
624                 next_char();
625                 lexer_token.datatype = type_long_double;
626                 break;
627         default:
628                 lexer_token.datatype = type_double;
629                 break;
630         }
631 }
632
/**
 * A replacement for strtoull. Only those parts needed for
 * our parser are implemented.
 *
 * Digits are accumulated until either a character that is not a digit of
 * the given base is found, or until adding the next digit would exceed the
 * range of unsigned long long. In both cases *endptr is set to the first
 * character that was not consumed, so a caller detects overflow (or
 * garbage) by checking that *endptr is not the terminating '\0'.
 *
 * @param s       the digit string, without any prefix, sign or whitespace
 * @param endptr  receives the position of the first unconsumed character
 * @param base    8, 10 or 16; any other base triggers an assertion and
 *                consumes nothing
 * @return the value of the consumed digits
 */
static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
        unsigned long long const max = ~0ULL;
        unsigned long long       v   = 0;

        if (base != 8 && base != 10 && base != 16) {
                assert(0);
                *endptr = s;
                return 0;
        }

        unsigned const radix = (unsigned) base;
        for (;; ++s) {
                char const ch = *s;
                unsigned   digit;

                if (ch >= '0' && ch <= '9') {
                        digit = (unsigned) (ch - '0');
                } else if (ch >= 'a' && ch <= 'f') {
                        digit = (unsigned) (ch - 'a') + 10;
                } else if (ch >= 'A' && ch <= 'F') {
                        digit = (unsigned) (ch - 'A') + 10;
                } else {
                        break;
                }
                if (digit >= radix)
                        break;

                /* Stop before v * radix + digit would wrap around. The digit
                 * has to be part of the check: for base 10 the largest
                 * prefix that survives the multiplication still overflows
                 * when followed by 6..9. */
                if (v > (max - digit) / radix)
                        break;

                v = v * radix + digit;
        }

        *endptr = s;
        return v;
}
716
717 /**
718  * Parses a hex number including hex floats and set the
719  * lexer_token.
720  */
721 static void parse_number_hex(void)
722 {
723         bool is_float = false;
724         assert(c == 'x' || c == 'X');
725         next_char();
726
727         obstack_1grow(&symbol_obstack, '0');
728         obstack_1grow(&symbol_obstack, 'x');
729
730         while(isxdigit(c)) {
731                 obstack_1grow(&symbol_obstack, (char) c);
732                 next_char();
733         }
734
735         if (c == '.') {
736                 obstack_1grow(&symbol_obstack, (char) c);
737                 next_char();
738
739                 while (isxdigit(c)) {
740                         obstack_1grow(&symbol_obstack, (char) c);
741                         next_char();
742                 }
743                 is_float = true;
744         }
745         if (c == 'p' || c == 'P') {
746                 obstack_1grow(&symbol_obstack, (char) c);
747                 next_char();
748
749                 if (c == '-' || c == '+') {
750                         obstack_1grow(&symbol_obstack, (char) c);
751                         next_char();
752                 }
753
754                 while (isxdigit(c)) {
755                         obstack_1grow(&symbol_obstack, (char) c);
756                         next_char();
757                 }
758                 is_float = true;
759         }
760
761         obstack_1grow(&symbol_obstack, '\0');
762         char *string = obstack_finish(&symbol_obstack);
763         if(*string == '\0') {
764                 parse_error("invalid hex number");
765                 lexer_token.type = T_ERROR;
766                 obstack_free(&symbol_obstack, string);
767                 return;
768         }
769
770         if (is_float) {
771                 char *endptr;
772                 lexer_token.type         = T_FLOATINGPOINT;
773                 lexer_token.v.floatvalue = strtold(string, &endptr);
774
775                 if(*endptr != '\0') {
776                         parse_error("invalid hex float literal");
777                 }
778
779                 parse_floating_suffix();
780         } else {
781                 const char *endptr;
782                 lexer_token.type       = T_INTEGER;
783                 lexer_token.v.intvalue = parse_int_string(string + 2, &endptr, 16);
784                 if(*endptr != '\0') {
785                         parse_error("hex number literal too long");
786                 }
787                 parse_integer_suffix(true);
788         }
789
790         obstack_free(&symbol_obstack, string);
791 }
792
793 /**
794  * Returns true if the given char is a octal digit.
795  *
796  * @param char  the character to check
797  */
798 static inline bool is_octal_digit(utf32 chr)
799 {
800         switch(chr) {
801         case '0':
802         case '1':
803         case '2':
804         case '3':
805         case '4':
806         case '5':
807         case '6':
808         case '7':
809                 return true;
810         default:
811                 return false;
812         }
813 }
814
815 /**
816  * Parses a octal number and set the lexer_token.
817  */
818 static void parse_number_oct(void)
819 {
820         while(is_octal_digit(c)) {
821                 obstack_1grow(&symbol_obstack, (char) c);
822                 next_char();
823         }
824         obstack_1grow(&symbol_obstack, '\0');
825         char *string = obstack_finish(&symbol_obstack);
826
827         const char *endptr;
828         lexer_token.type       = T_INTEGER;
829         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
830         if(*endptr != '\0') {
831                 parse_error("octal number literal too long");
832         }
833
834         obstack_free(&symbol_obstack, string);
835         parse_integer_suffix(true);
836 }
837
838 /**
839  * Parses a decimal including float number and set the
840  * lexer_token.
841  */
842 static void parse_number_dec(void)
843 {
844         bool is_float = false;
845         while (isdigit(c)) {
846                 obstack_1grow(&symbol_obstack, (char) c);
847                 next_char();
848         }
849
850         if (c == '.') {
851                 obstack_1grow(&symbol_obstack, '.');
852                 next_char();
853
854                 while (isdigit(c)) {
855                         obstack_1grow(&symbol_obstack, (char) c);
856                         next_char();
857                 }
858                 is_float = true;
859         }
860         if(c == 'e' || c == 'E') {
861                 obstack_1grow(&symbol_obstack, (char) c);
862                 next_char();
863
864                 if(c == '-' || c == '+') {
865                         obstack_1grow(&symbol_obstack, (char) c);
866                         next_char();
867                 }
868
869                 while(isdigit(c)) {
870                         obstack_1grow(&symbol_obstack, (char) c);
871                         next_char();
872                 }
873                 is_float = true;
874         }
875
876         obstack_1grow(&symbol_obstack, '\0');
877         char *string = obstack_finish(&symbol_obstack);
878
879         if(is_float) {
880                 char *endptr;
881                 lexer_token.type         = T_FLOATINGPOINT;
882                 lexer_token.v.floatvalue = strtold(string, &endptr);
883
884                 if(*endptr != '\0') {
885                         parse_error("invalid number literal");
886                 }
887
888                 parse_floating_suffix();
889         } else {
890                 const char *endptr;
891                 lexer_token.type       = T_INTEGER;
892                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
893
894                 if(*endptr != '\0') {
895                         parse_error("invalid number literal");
896                 }
897
898                 parse_integer_suffix(false);
899         }
900         obstack_free(&symbol_obstack, string);
901 }
902
903 /**
904  * Parses a number and sets the lexer_token.
905  */
906 static void parse_number(void)
907 {
908         if (c == '0') {
909                 next_char();
910                 switch (c) {
911                         case 'X':
912                         case 'x':
913                                 parse_number_hex();
914                                 break;
915                         case '0':
916                         case '1':
917                         case '2':
918                         case '3':
919                         case '4':
920                         case '5':
921                         case '6':
922                         case '7':
923                                 parse_number_oct();
924                                 break;
925                         case '8':
926                         case '9':
927                                 next_char();
928                                 parse_error("invalid octal number");
929                                 lexer_token.type = T_ERROR;
930                                 return;
931                         case '.':
932                         case 'e':
933                         case 'E':
934                         default:
935                                 obstack_1grow(&symbol_obstack, '0');
936                                 parse_number_dec();
937                                 return;
938                 }
939         } else {
940                 parse_number_dec();
941         }
942 }
943
944 /**
945  * Returns the value of a digit.
946  * The only portable way to do it ...
947  */
948 static int digit_value(utf32 const digit)
949 {
950         switch (digit) {
951         case '0': return 0;
952         case '1': return 1;
953         case '2': return 2;
954         case '3': return 3;
955         case '4': return 4;
956         case '5': return 5;
957         case '6': return 6;
958         case '7': return 7;
959         case '8': return 8;
960         case '9': return 9;
961         case 'a':
962         case 'A': return 10;
963         case 'b':
964         case 'B': return 11;
965         case 'c':
966         case 'C': return 12;
967         case 'd':
968         case 'D': return 13;
969         case 'e':
970         case 'E': return 14;
971         case 'f':
972         case 'F': return 15;
973         default:
974                 internal_error("wrong character given");
975         }
976 }
977
978 /**
979  * Parses an octal character sequence.
980  *
981  * @param first_digit  the already read first digit
982  */
983 static utf32 parse_octal_sequence(utf32 const first_digit)
984 {
985         assert(is_octal_digit(first_digit));
986         utf32 value = digit_value(first_digit);
987         if (!is_octal_digit(c)) return value;
988         value = 8 * value + digit_value(c);
989         next_char();
990         if (!is_octal_digit(c)) return value;
991         value = 8 * value + digit_value(c);
992         next_char();
993         return value;
994 }
995
996 /**
997  * Parses a hex character sequence.
998  */
999 static utf32 parse_hex_sequence(void)
1000 {
1001         utf32 value = 0;
1002         while(isxdigit(c)) {
1003                 value = 16 * value + digit_value(c);
1004                 next_char();
1005         }
1006         return value;
1007 }
1008
1009 /**
1010  * Parse an escape sequence.
1011  */
1012 static utf32 parse_escape_sequence(void)
1013 {
1014         eat('\\');
1015
1016         utf32 const ec = c;
1017         next_char();
1018
1019         switch (ec) {
1020         case '"':  return '"';
1021         case '\'': return '\'';
1022         case '\\': return '\\';
1023         case '?': return '\?';
1024         case 'a': return '\a';
1025         case 'b': return '\b';
1026         case 'f': return '\f';
1027         case 'n': return '\n';
1028         case 'r': return '\r';
1029         case 't': return '\t';
1030         case 'v': return '\v';
1031         case 'x':
1032                 return parse_hex_sequence();
1033         case '0':
1034         case '1':
1035         case '2':
1036         case '3':
1037         case '4':
1038         case '5':
1039         case '6':
1040         case '7':
1041                 return parse_octal_sequence(ec);
1042         case EOF:
1043                 parse_error("reached end of file while parsing escape sequence");
1044                 return EOF;
1045         /* \E is not documented, but handled, by GCC.  It is acceptable according
1046          * to §6.11.4, whereas \e is not. */
1047         case 'E':
1048         case 'e':
1049                 if (c_mode & _GNUC)
1050                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
1051                 /* FALLTHROUGH */
1052         default:
1053                 /* §6.4.4.4:8 footnote 64 */
1054                 parse_error("unknown escape sequence");
1055                 return EOF;
1056         }
1057 }
1058
1059 /**
1060  * Concatenate two strings.
1061  */
1062 string_t concat_strings(const string_t *const s1, const string_t *const s2)
1063 {
1064         const size_t len1 = s1->size - 1;
1065         const size_t len2 = s2->size - 1;
1066
1067         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
1068         memcpy(concat, s1->begin, len1);
1069         memcpy(concat + len1, s2->begin, len2 + 1);
1070
1071         if (warning.traditional) {
1072                 warningf(&lexer_token.source_position,
1073                         "traditional C rejects string constant concatenation");
1074         }
1075 #if 0 /* TODO hash */
1076         const char *result = strset_insert(&stringset, concat);
1077         if(result != concat) {
1078                 obstack_free(&symbol_obstack, concat);
1079         }
1080
1081         return result;
1082 #else
1083         return (string_t){ concat, len1 + len2 + 1 };
1084 #endif
1085 }
1086
1087 /**
1088  * Concatenate a string and a wide string.
1089  */
1090 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
1091 {
1092         const size_t len1 = s1->size - 1;
1093         const size_t len2 = s2->size - 1;
1094
1095         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1096         const char *const src = s1->begin;
1097         for (size_t i = 0; i != len1; ++i) {
1098                 concat[i] = src[i];
1099         }
1100         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1101         if (warning.traditional) {
1102                 warningf(&lexer_token.source_position,
1103                         "traditional C rejects string constant concatenation");
1104         }
1105
1106         return (wide_string_t){ concat, len1 + len2 + 1 };
1107 }
1108
1109 /**
1110  * Concatenate two wide strings.
1111  */
1112 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
1113 {
1114         const size_t len1 = s1->size - 1;
1115         const size_t len2 = s2->size - 1;
1116
1117         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1118         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
1119         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1120         if (warning.traditional) {
1121                 warningf(&lexer_token.source_position,
1122                         "traditional C rejects string constant concatenation");
1123         }
1124
1125         return (wide_string_t){ concat, len1 + len2 + 1 };
1126 }
1127
1128 /**
1129  * Concatenate a wide string and a string.
1130  */
1131 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
1132 {
1133         const size_t len1 = s1->size - 1;
1134         const size_t len2 = s2->size - 1;
1135
1136         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1137         memcpy(concat, s1->begin, len1 * sizeof(*concat));
1138         const char  *const src = s2->begin;
1139         wchar_rep_t *const dst = concat + len1;
1140         for (size_t i = 0; i != len2 + 1; ++i) {
1141                 dst[i] = src[i];
1142         }
1143         if (warning.traditional) {
1144                 warningf(&lexer_token.source_position,
1145                         "traditional C rejects string constant concatenation");
1146         }
1147
1148         return (wide_string_t){ concat, len1 + len2 + 1 };
1149 }
1150
1151 static void grow_symbol(utf32 const tc)
1152 {
1153         struct obstack *const o  = &symbol_obstack;
1154         if (tc < 0x80U) {
1155                 obstack_1grow(o, tc);
1156         } else if (tc < 0x800) {
1157                 obstack_1grow(o, 0xC0 | (tc >> 6));
1158                 obstack_1grow(o, 0x80 | (tc & 0x3F));
1159         } else if (tc < 0x10000) {
1160                 obstack_1grow(o, 0xE0 | ( tc >> 12));
1161                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1162                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1163         } else {
1164                 obstack_1grow(o, 0xF0 | ( tc >> 18));
1165                 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
1166                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1167                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1168         }
1169 }
1170
1171 /**
1172  * Parse a string literal and set lexer_token.
1173  */
1174 static void parse_string_literal(void)
1175 {
1176         const unsigned start_linenr = lexer_token.source_position.linenr;
1177
1178         eat('"');
1179
1180         while(1) {
1181                 switch(c) {
1182                 case '\\': {
1183                         utf32 const tc = parse_escape_sequence();
1184                         if (tc >= 0x100) {
1185                                 warningf(&lexer_token.source_position,
1186                                                 "escape sequence out of range");
1187                         }
1188                         obstack_1grow(&symbol_obstack, tc);
1189                         break;
1190                 }
1191
1192                 case EOF: {
1193                         source_position_t source_position;
1194                         source_position.input_name = lexer_token.source_position.input_name;
1195                         source_position.linenr     = start_linenr;
1196                         errorf(&source_position, "string has no end");
1197                         lexer_token.type = T_ERROR;
1198                         return;
1199                 }
1200
1201                 case '"':
1202                         next_char();
1203                         goto end_of_string;
1204
1205                 default:
1206                         grow_symbol(c);
1207                         next_char();
1208                         break;
1209                 }
1210         }
1211
1212 end_of_string:
1213
1214         /* TODO: concatenate multiple strings separated by whitespace... */
1215
1216         /* add finishing 0 to the string */
1217         obstack_1grow(&symbol_obstack, '\0');
1218         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1219         const char *const string = obstack_finish(&symbol_obstack);
1220
1221 #if 0 /* TODO hash */
1222         /* check if there is already a copy of the string */
1223         result = strset_insert(&stringset, string);
1224         if(result != string) {
1225                 obstack_free(&symbol_obstack, string);
1226         }
1227 #else
1228         const char *const result = string;
1229 #endif
1230
1231         lexer_token.type           = T_STRING_LITERAL;
1232         lexer_token.v.string.begin = result;
1233         lexer_token.v.string.size  = size;
1234 }
1235
1236 /**
1237  * Parse a wide character constant and set lexer_token.
1238  */
1239 static void parse_wide_character_constant(void)
1240 {
1241         const unsigned start_linenr = lexer_token.source_position.linenr;
1242
1243         eat('\'');
1244
1245         while(1) {
1246                 switch(c) {
1247                 case '\\': {
1248                         wchar_rep_t tc = parse_escape_sequence();
1249                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1250                         break;
1251                 }
1252
1253                 MATCH_NEWLINE(
1254                         parse_error("newline while parsing character constant");
1255                         break;
1256                 )
1257
1258                 case '\'':
1259                         next_char();
1260                         goto end_of_wide_char_constant;
1261
1262                 case EOF: {
1263                         source_position_t source_position = lexer_token.source_position;
1264                         source_position.linenr = start_linenr;
1265                         errorf(&source_position, "EOF while parsing character constant");
1266                         lexer_token.type = T_ERROR;
1267                         return;
1268                 }
1269
1270                 default: {
1271                         wchar_rep_t tc = (wchar_rep_t) c;
1272                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1273                         next_char();
1274                         break;
1275                 }
1276                 }
1277         }
1278
1279 end_of_wide_char_constant:;
1280         size_t             size   = (size_t) obstack_object_size(&symbol_obstack);
1281         assert(size % sizeof(wchar_rep_t) == 0);
1282         size /= sizeof(wchar_rep_t);
1283
1284         const wchar_rep_t *string = obstack_finish(&symbol_obstack);
1285
1286         lexer_token.type                = T_WIDE_CHARACTER_CONSTANT;
1287         lexer_token.v.wide_string.begin = string;
1288         lexer_token.v.wide_string.size  = size;
1289         lexer_token.datatype            = type_wchar_t;
1290 }
1291
1292 /**
1293  * Parse a wide string literal and set lexer_token.
1294  */
1295 static void parse_wide_string_literal(void)
1296 {
1297         const unsigned start_linenr = lexer_token.source_position.linenr;
1298
1299         assert(c == '"');
1300         next_char();
1301
1302         while(1) {
1303                 switch(c) {
1304                 case '\\': {
1305                         wchar_rep_t tc = parse_escape_sequence();
1306                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1307                         break;
1308                 }
1309
1310                 case EOF: {
1311                         source_position_t source_position;
1312                         source_position.input_name = lexer_token.source_position.input_name;
1313                         source_position.linenr     = start_linenr;
1314                         errorf(&source_position, "string has no end");
1315                         lexer_token.type = T_ERROR;
1316                         return;
1317                 }
1318
1319                 case '"':
1320                         next_char();
1321                         goto end_of_string;
1322
1323                 default: {
1324                         wchar_rep_t tc = c;
1325                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1326                         next_char();
1327                         break;
1328                 }
1329                 }
1330         }
1331
1332 end_of_string:;
1333
1334         /* TODO: concatenate multiple strings separated by whitespace... */
1335
1336         /* add finishing 0 to the string */
1337         wchar_rep_t nul = L'\0';
1338         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
1339         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
1340         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1341
1342 #if 0 /* TODO hash */
1343         /* check if there is already a copy of the string */
1344         const wchar_rep_t *const result = strset_insert(&stringset, string);
1345         if(result != string) {
1346                 obstack_free(&symbol_obstack, string);
1347         }
1348 #else
1349         const wchar_rep_t *const result = string;
1350 #endif
1351
1352         lexer_token.type                = T_WIDE_STRING_LITERAL;
1353         lexer_token.v.wide_string.begin = result;
1354         lexer_token.v.wide_string.size  = size;
1355 }
1356
1357 /**
1358  * Parse a character constant and set lexer_token.
1359  */
1360 static void parse_character_constant(void)
1361 {
1362         const unsigned start_linenr = lexer_token.source_position.linenr;
1363
1364         eat('\'');
1365
1366         while(1) {
1367                 switch(c) {
1368                 case '\\': {
1369                         utf32 const tc = parse_escape_sequence();
1370                         if (tc >= 0x100) {
1371                                 warningf(&lexer_token.source_position,
1372                                                 "escape sequence out of range");
1373                         }
1374                         obstack_1grow(&symbol_obstack, tc);
1375                         break;
1376                 }
1377
1378                 MATCH_NEWLINE(
1379                         parse_error("newline while parsing character constant");
1380                         break;
1381                 )
1382
1383                 case '\'':
1384                         next_char();
1385                         goto end_of_char_constant;
1386
1387                 case EOF: {
1388                         source_position_t source_position;
1389                         source_position.input_name = lexer_token.source_position.input_name;
1390                         source_position.linenr     = start_linenr;
1391                         errorf(&source_position, "EOF while parsing character constant");
1392                         lexer_token.type = T_ERROR;
1393                         return;
1394                 }
1395
1396                 default:
1397                         grow_symbol(c);
1398                         next_char();
1399                         break;
1400
1401                 }
1402         }
1403
1404 end_of_char_constant:;
1405         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1406         const char *const string = obstack_finish(&symbol_obstack);
1407
1408         lexer_token.type           = T_CHARACTER_CONSTANT;
1409         lexer_token.v.string.begin = string;
1410         lexer_token.v.string.size  = size;
1411         lexer_token.datatype       = c_mode & _CXX && size == 1 ? type_char : type_int;
1412 }
1413
1414 /**
1415  * Skip a multiline comment.
1416  */
1417 static void skip_multiline_comment(void)
1418 {
1419         unsigned start_linenr = lexer_token.source_position.linenr;
1420
1421         while(1) {
1422                 switch(c) {
1423                 case '/':
1424                         next_char();
1425                         if (c == '*') {
1426                                 /* nested comment, warn here */
1427                                 if (warning.comment) {
1428                                         warningf(&lexer_token.source_position, "'/*' within comment");
1429                                 }
1430                         }
1431                         break;
1432                 case '*':
1433                         next_char();
1434                         if(c == '/') {
1435                                 next_char();
1436                                 return;
1437                         }
1438                         break;
1439
1440                 MATCH_NEWLINE(break;)
1441
1442                 case EOF: {
1443                         source_position_t source_position;
1444                         source_position.input_name = lexer_token.source_position.input_name;
1445                         source_position.linenr     = start_linenr;
1446                         errorf(&source_position, "at end of file while looking for comment end");
1447                         return;
1448                 }
1449
1450                 default:
1451                         next_char();
1452                         break;
1453                 }
1454         }
1455 }
1456
1457 /**
1458  * Skip a single line comment.
1459  */
1460 static void skip_line_comment(void)
1461 {
1462         while(1) {
1463                 switch(c) {
1464                 case EOF:
1465                         return;
1466
1467                 case '\n':
1468                 case '\r':
1469                         return;
1470
1471                 case '\\':
1472                         next_char();
1473                         if (c == '\n' || c == '\r') {
1474                                 if (warning.comment)
1475                                         warningf(&lexer_token.source_position, "multi-line comment");
1476                                 return;
1477                         }
1478                         break;
1479
1480                 default:
1481                         next_char();
1482                         break;
1483                 }
1484         }
1485 }
1486
/**
 * The current preprocessor token: a copy of lexer_token taken by
 * next_pp_token() after each preprocessing token is read.
 */
static token_t pp_token;
1489
/**
 * Read the next preprocessor token.
 *
 * Lets the lexer produce the next preprocessing token in lexer_token and
 * stores a copy of it in pp_token.
 */
static inline void next_pp_token(void)
{
        lexer_next_preprocessing_token();
        pp_token = lexer_token;
}
1498
1499 /**
1500  * Eat all preprocessor tokens until newline.
1501  */
1502 static void eat_until_newline(void)
1503 {
1504         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1505                 next_pp_token();
1506         }
1507 }
1508
1509 /**
1510  * Handle the define directive.
1511  */
1512 static void define_directive(void)
1513 {
1514         lexer_next_preprocessing_token();
1515         if(lexer_token.type != T_IDENTIFIER) {
1516                 parse_error("expected identifier after #define\n");
1517                 eat_until_newline();
1518         }
1519 }
1520
/**
 * Handles the ifdef and ifndef directives.
 *
 * Currently a stub: it only reads the next preprocessing token and does not
 * evaluate the condition.
 *
 * @param is_ifndef  non-zero for #ifndef, zero for #ifdef (unused so far)
 */
static void ifdef_directive(int is_ifndef)
{
        /* TODO: expect_identifier(); expect_newline(); */
        lexer_next_preprocessing_token();
        (void) is_ifndef;
}
1531
/**
 * Handle the endif directive.
 *
 * Currently a stub that does nothing.
 */
static void endif_directive(void)
{
        // TODO: expect_newline();
}
1539
1540 /**
1541  * Parse the line directive.
1542  */
1543 static void parse_line_directive(void)
1544 {
1545         if(pp_token.type != T_INTEGER) {
1546                 parse_error("expected integer");
1547         } else {
1548                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1549                 next_pp_token();
1550         }
1551         if(pp_token.type == T_STRING_LITERAL) {
1552                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1553                 next_pp_token();
1554         }
1555
1556         eat_until_newline();
1557 }
1558
/**
 * The kinds of "#pragma STDC" directives recognized by parse_pragma().
 */
typedef enum stdc_pragma_kind_t {
        STDC_UNKNOWN = 0,      /**< not one of the pragmas below */
        STDC_FP_CONTRACT,      /**< #pragma STDC FP_CONTRACT */
        STDC_FENV_ACCESS,      /**< #pragma STDC FENV_ACCESS */
        STDC_CX_LIMITED_RANGE  /**< #pragma STDC CX_LIMITED_RANGE */
} stdc_pragma_kind_t;
1568
/**
 * The argument values of a "#pragma STDC" directive recognized by
 * parse_pragma().
 */
typedef enum stdc_pragma_value_kind_t {
        STDC_VALUE_UNKNOWN = 0,  /**< not one of the values below */
        STDC_VALUE_ON,           /**< ON */
        STDC_VALUE_OFF,          /**< OFF */
        STDC_VALUE_DEFAULT       /**< DEFAULT */
} stdc_pragma_value_kind_t;
1578
1579 /**
1580  * Parse a pragma directive.
1581  */
1582 static void parse_pragma(void) {
1583         bool unknown_pragma = true;
1584
1585         next_pp_token();
1586         if (pp_token.v.symbol->pp_ID == TP_STDC) {
1587                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1588                 /* a STDC pragma */
1589                 if (c_mode & _C99) {
1590                         next_pp_token();
1591
1592                         switch (pp_token.v.symbol->pp_ID) {
1593                         case TP_FP_CONTRACT:
1594                                 kind = STDC_FP_CONTRACT;
1595                                 break;
1596                         case TP_FENV_ACCESS:
1597                                 kind = STDC_FENV_ACCESS;
1598                                 break;
1599                         case TP_CX_LIMITED_RANGE:
1600                                 kind = STDC_CX_LIMITED_RANGE;
1601                                 break;
1602                         default:
1603                                 break;
1604                         }
1605                         if (kind != STDC_UNKNOWN) {
1606                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1607                                 next_pp_token();
1608                                 switch (pp_token.v.symbol->pp_ID) {
1609                                 case TP_ON:
1610                                         value = STDC_VALUE_ON;
1611                                         break;
1612                                 case TP_OFF:
1613                                         value = STDC_VALUE_OFF;
1614                                         break;
1615                                 case TP_DEFAULT:
1616                                         value = STDC_VALUE_DEFAULT;
1617                                         break;
1618                                 default:
1619                                         break;
1620                                 }
1621                                 if (value != STDC_VALUE_UNKNOWN) {
1622                                         unknown_pragma = false;
1623                                 } else {
1624                                         errorf(&pp_token.source_position, "bad STDC pragma argument");
1625                                 }
1626                         }
1627                 }
1628         } else {
1629                 unknown_pragma = true;
1630         }
1631         eat_until_newline();
1632         if (unknown_pragma && warning.unknown_pragmas) {
1633                 warningf(&pp_token.source_position, "encountered unknown #pragma");
1634         }
1635 }
1636
1637 /**
1638  * Parse a preprocessor non-null directive.
1639  */
1640 static void parse_preprocessor_identifier(void)
1641 {
1642         assert(pp_token.type == T_IDENTIFIER);
1643         symbol_t *symbol = pp_token.v.symbol;
1644
1645         switch(symbol->pp_ID) {
1646         case TP_include:
1647                 printf("include - enable header name parsing!\n");
1648                 break;
1649         case TP_define:
1650                 define_directive();
1651                 break;
1652         case TP_ifdef:
1653                 ifdef_directive(0);
1654                 break;
1655         case TP_ifndef:
1656                 ifdef_directive(1);
1657                 break;
1658         case TP_endif:
1659                 endif_directive();
1660                 break;
1661         case TP_line:
1662                 next_pp_token();
1663                 parse_line_directive();
1664                 break;
1665         case TP_if:
1666         case TP_else:
1667         case TP_elif:
1668         case TP_undef:
1669         case TP_error:
1670                 /* TODO; output the rest of the line */
1671                 parse_error("#error directive: ");
1672                 break;
1673         case TP_pragma:
1674                 parse_pragma();
1675                 break;
1676         }
1677 }
1678
1679 /**
1680  * Parse a preprocessor directive.
1681  */
1682 static void parse_preprocessor_directive(void)
1683 {
1684         next_pp_token();
1685
1686         switch(pp_token.type) {
1687         case T_IDENTIFIER:
1688                 parse_preprocessor_identifier();
1689                 break;
1690         case T_INTEGER:
1691                 parse_line_directive();
1692                 break;
1693         case '\n':
1694                 /* NULL directive, see § 6.10.7 */
1695                 break;
1696         default:
1697                 parse_error("invalid preprocessor directive");
1698                 eat_until_newline();
1699                 break;
1700         }
1701 }
1702
/*
 * Helper macros for lexing multi-character punctuators inside a
 * "switch (c)" case. They only work together, in this order:
 *
 *   MAYBE_PROLOG  MAYBE(...)*  ELSE(...) or ELSE_CODE(...)
 */

/* Consumes the current character and opens a loop with a switch over the
 * character that follows it. */
#define MAYBE_PROLOG                                       \
                        next_char();                                   \
                        while(1) {                                     \
                                switch(c) {

/* If the following character is chr: consumes it, sets the token type to
 * token_type and returns from the enclosing function. */
#define MAYBE(chr, token_type)                             \
                                case chr:                                  \
                                        next_char();                           \
                                        lexer_token.type = token_type;         \
                                        return;

/* Runs body for every other character, then closes the switch and the loop
 * opened by MAYBE_PROLOG; the final break leaves the enclosing case. */
#define ELSE_CODE(body)                                    \
                                default:                                   \
                                        body                                   \
                                }                                          \
                        } /* end of while(1) */                        \
                        break;

/* For every other character: sets the token type to token_type and returns
 * from the enclosing function without consuming the character. */
#define ELSE(token_type)                                   \
                ELSE_CODE(                                         \
                        lexer_token.type = token_type;                 \
                        return;                                        \
                )
1726
/**
 * Lexes the next preprocessing token, starting at the current character c,
 * and stores its kind in lexer_token.type.
 *
 * Spaces and tabs are skipped. Newlines are reported as '\n' tokens.
 * Identifiers, numbers, string literals and character constants are delegated
 * to the parse_*() helpers. Punctuators are matched with the
 * MAYBE_PROLOG / MAYBE / ELSE macro family, including the digraph spellings
 * <: :> <% %> %: and %:%:. Comments are skipped and lexing continues with
 * whatever follows them. End of input yields T_EOF; any other character is
 * reported through errorf() and yields T_ERROR.
 */
1727 void lexer_next_preprocessing_token(void)
1728 {
1729         while(1) {
1730                 switch(c) {
                     /* horizontal whitespace: drop it and dispatch again */
1731                 case ' ':
1732                 case '\t':
1733                         next_char();
1734                         break;
1735
                     /* MATCH_NEWLINE is defined earlier in the file; presumably it
                      * matches a line ending and then runs the code passed to it */
1736                 MATCH_NEWLINE(
1737                         lexer_token.type = '\n';
1738                         return;
1739                 )
1740
1741                 SYMBOL_CHARS
1742                         parse_symbol();
1743                         /* might be a wide string ( L"string" ) */
1744                         if(lexer_token.type == T_IDENTIFIER &&
1745                             lexer_token.v.symbol == symbol_L) {
1746                             if(c == '"') {
1747                                         parse_wide_string_literal();
1748                                 } else if(c == '\'') {
1749                                         parse_wide_character_constant();
1750                                 }
1751                         }
1752                         return;
1753
1754                 DIGITS
1755                         parse_number();
1756                         return;
1757
1758                 case '"':
1759                         parse_string_literal();
1760                         return;
1761
1762                 case '\'':
1763                         parse_character_constant();
1764                         return;
1765
1766                 case '.':
1767                         MAYBE_PROLOG
                                     /* '.' followed by a digit: make '.' the current character
                                      * again (put_back() presumably un-reads the digit) and let
                                      * parse_number_dec() lex the number */
1768                                 DIGITS
1769                                         put_back(c);
1770                                         c = '.';
1771                                         parse_number_dec();
1772                                         return;
1773
                                     /* ".." is only a token as part of "..."; otherwise a single
                                      * '.' is returned and the second '.' stays the current
                                      * character for the next call */
1774                                 case '.':
1775                                         MAYBE_PROLOG
1776                                         MAYBE('.', T_DOTDOTDOT)
1777                                         ELSE_CODE(
1778                                                 put_back(c);
1779                                                 c = '.';
1780                                                 lexer_token.type = '.';
1781                                                 return;
1782                                         )
1783                         ELSE('.')
1784                 case '&':
1785                         MAYBE_PROLOG
1786                         MAYBE('&', T_ANDAND)
1787                         MAYBE('=', T_ANDEQUAL)
1788                         ELSE('&')
1789                 case '*':
1790                         MAYBE_PROLOG
1791                         MAYBE('=', T_ASTERISKEQUAL)
1792                         ELSE('*')
1793                 case '+':
1794                         MAYBE_PROLOG
1795                         MAYBE('+', T_PLUSPLUS)
1796                         MAYBE('=', T_PLUSEQUAL)
1797                         ELSE('+')
1798                 case '-':
1799                         MAYBE_PROLOG
1800                         MAYBE('>', T_MINUSGREATER)
1801                         MAYBE('-', T_MINUSMINUS)
1802                         MAYBE('=', T_MINUSEQUAL)
1803                         ELSE('-')
1804                 case '!':
1805                         MAYBE_PROLOG
1806                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1807                         ELSE('!')
1808                 case '/':
1809                         MAYBE_PROLOG
1810                         MAYBE('=', T_SLASHEQUAL)
                                     /* comments produce no token: skip the comment, then lex
                                      * the token that follows it via a recursive call */
1811                                 case '*':
1812                                         next_char();
1813                                         skip_multiline_comment();
1814                                         lexer_next_preprocessing_token();
1815                                         return;
1816                                 case '/':
1817                                         next_char();
1818                                         skip_line_comment();
1819                                         lexer_next_preprocessing_token();
1820                                         return;
1821                         ELSE('/')
1822                 case '%':
1823                         MAYBE_PROLOG
1824                         MAYBE('>', '}')
1825                         MAYBE('=', T_PERCENTEQUAL)
                                     /* "%:" is the digraph for '#' and "%:%:" the one for "##";
                                      * after "%:%" without a final ':' a '#' is returned and
                                      * the second '%' becomes the current character again */
1826                                 case ':':
1827                                         MAYBE_PROLOG
1828                                                 case '%':
1829                                                         MAYBE_PROLOG
1830                                                         MAYBE(':', T_HASHHASH)
1831                                                         ELSE_CODE(
1832                                                                 put_back(c);
1833                                                                 c = '%';
1834                                                                 lexer_token.type = '#';
1835                                                                 return;
1836                                                         )
1837                                         ELSE('#')
1838                         ELSE('%')
1839                 case '<':
1840                         MAYBE_PROLOG
1841                         MAYBE(':', '[')
1842                         MAYBE('%', '{')
1843                         MAYBE('=', T_LESSEQUAL)
1844                                 case '<':
1845                                         MAYBE_PROLOG
1846                                         MAYBE('=', T_LESSLESSEQUAL)
1847                                         ELSE(T_LESSLESS)
1848                         ELSE('<')
1849                 case '>':
1850                         MAYBE_PROLOG
1851                         MAYBE('=', T_GREATEREQUAL)
1852                                 case '>':
1853                                         MAYBE_PROLOG
1854                                         MAYBE('=', T_GREATERGREATEREQUAL)
1855                                         ELSE(T_GREATERGREATER)
1856                         ELSE('>')
1857                 case '^':
1858                         MAYBE_PROLOG
1859                         MAYBE('=', T_CARETEQUAL)
1860                         ELSE('^')
1861                 case '|':
1862                         MAYBE_PROLOG
1863                         MAYBE('=', T_PIPEEQUAL)
1864                         MAYBE('|', T_PIPEPIPE)
1865                         ELSE('|')
1866                 case ':':
1867                         MAYBE_PROLOG
1868                         MAYBE('>', ']')
1869                         ELSE(':')
1870                 case '=':
1871                         MAYBE_PROLOG
1872                         MAYBE('=', T_EQUALEQUAL)
1873                         ELSE('=')
1874                 case '#':
1875                         MAYBE_PROLOG
1876                         MAYBE('#', T_HASHHASH)
1877                         ELSE('#')
1878
                     /* single-character tokens: the character itself is the token
                      * type; it must be stored before next_char() replaces c */
1879                 case '?':
1880                 case '[':
1881                 case ']':
1882                 case '(':
1883                 case ')':
1884                 case '{':
1885                 case '}':
1886                 case '~':
1887                 case ';':
1888                 case ',':
1889                 case '\\':
1890                         lexer_token.type = c;
1891                         next_char();
1892                         return;
1893
                     /* end of input is not consumed */
1894                 case EOF:
1895                         lexer_token.type = T_EOF;
1896                         return;
1897
1898                 default:
     /* NOTE(review): no goto to this label is visible in this function;
      * presumably one of the case macros (e.g. SYMBOL_CHARS) jumps here to
      * reject a '$' -- confirm against the macro definitions */
1899 dollar_sign:
1900                         errorf(&lexer_token.source_position, "unknown character '%c' found", c);
1901                         next_char();
1902                         lexer_token.type = T_ERROR;
1903                         return;
1904                 }
1905         }
1906 }
1907
1908 void lexer_next_token(void)
1909 {
1910         lexer_next_preprocessing_token();
1911
1912         while (lexer_token.type == '\n') {
1913 newline_found:
1914                 lexer_next_preprocessing_token();
1915         }
1916
1917         if (lexer_token.type == '#') {
1918                 parse_preprocessor_directive();
1919                 goto newline_found;
1920         }
1921 }
1922
1923 void init_lexer(void)
1924 {
1925         strset_init(&stringset);
1926         symbol_L = symbol_table_insert("L");
1927 }
1928
1929 void lexer_open_stream(FILE *stream, const char *input_name)
1930 {
1931         input                                  = stream;
1932         lexer_token.source_position.linenr     = 0;
1933         lexer_token.source_position.input_name = input_name;
1934
1935         bufpos = NULL;
1936         bufend = NULL;
1937
1938         /* place a virtual \n at the beginning so the lexer knows that we're
1939          * at the beginning of a line */
1940         c = '\n';
1941 }
1942
1943 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
1944 {
1945         input                                  = NULL;
1946         lexer_token.source_position.linenr     = 0;
1947         lexer_token.source_position.input_name = input_name;
1948
1949 #if 0 // TODO
1950         bufpos = buffer;
1951         bufend = buffer + len;
1952 #else
1953         (void)buffer;
1954         (void)len;
1955         panic("builtin lexing not done yet");
1956 #endif
1957
1958         /* place a virtual \n at the beginning so the lexer knows that we're
1959          * at the beginning of a line */
1960         c = '\n';
1961 }
1962
1963 void exit_lexer(void)
1964 {
1965         strset_destroy(&stringset);
1966 }
1967
1968 static __attribute__((unused))
1969 void dbg_pos(const source_position_t source_position)
1970 {
1971         fprintf(stdout, "%s:%u\n", source_position.input_name,
1972                 source_position.linenr);
1973         fflush(stdout);
1974 }