Add eat_token() for more clarity in the preprocessor.
[cparser] / lexer.c
1 /*
2  * This file is part of cparser.
3  * Copyright (C) 2007-2009 Matthias Braun <matze@braunis.de>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License
7  * as published by the Free Software Foundation; either version 2
8  * of the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
18  * 02111-1307, USA.
19  */
20 #include <config.h>
21
22 #include "adt/strutil.h"
23 #include "input.h"
24 #include "diagnostic.h"
25 #include "lexer.h"
26 #include "symbol_t.h"
27 #include "token_t.h"
28 #include "symbol_table_t.h"
29 #include "adt/error.h"
30 #include "adt/strset.h"
31 #include "adt/util.h"
32 #include "types.h"
33 #include "type_t.h"
34 #include "parser.h"
35 #include "warning.h"
36 #include "lang_features.h"
37
38 #include <assert.h>
39 #include <errno.h>
40 #include <string.h>
41 #include <stdbool.h>
42 #include <ctype.h>
43
44 #ifndef _WIN32
45 #include <strings.h>
46 #endif
47
48 #define MAX_PUTBACK 16    // 3 would be enough, but 16 gives a nicer alignment
49 #define BUF_SIZE    1024
50
51 static input_t           *input;
52 static utf32              input_buf[BUF_SIZE + MAX_PUTBACK];
53 static const utf32       *bufpos;
54 static const utf32       *bufend;
55 static utf32              c;
56 static source_position_t  lexer_pos;
57 token_t                   lexer_token;
58 static symbol_t          *symbol_L;
59 static strset_t           stringset;
60 bool                      allow_dollar_in_symbol = true;
61
62 /**
63  * Prints a parse error message at the current token.
64  *
65  * @param msg   the error message
66  */
67 static void parse_error(const char *msg)
68 {
69         errorf(&lexer_pos, "%s", msg);
70 }
71
72 /**
73  * Prints an internal error message at the current token.
74  *
75  * @param msg   the error message
76  */
77 static NORETURN internal_error(const char *msg)
78 {
79         internal_errorf(&lexer_pos, "%s", msg);
80 }
81
82 static inline void next_real_char(void)
83 {
84         assert(bufpos <= bufend);
85         if (bufpos >= bufend) {
86                 size_t n = decode(input, input_buf+MAX_PUTBACK, BUF_SIZE);
87                 if (n == 0) {
88                         c = EOF;
89                         return;
90                 }
91                 bufpos = input_buf + MAX_PUTBACK;
92                 bufend = bufpos + n;
93         }
94         c = *bufpos++;
95         ++lexer_pos.colno;
96 }
97
98 /**
99  * Put a character back into the buffer.
100  *
101  * @param pc  the character to put back
102  */
103 static inline void put_back(utf32 const pc)
104 {
105         *(--bufpos - input_buf + input_buf) = pc;
106         --lexer_pos.colno;
107 }
108
static inline void next_char(void);

/**
 * Case label fragment, to be written as `case NEWLINE:` inside a switch
 * over c.  It matches '\r', '\r' followed by '\n', and '\n', consumes the
 * line terminator, increments the line number and resets the column to 1.
 * The statements written after `case NEWLINE:` attach to the label
 * `newline` that ends the expansion; since that label is function-wide,
 * the macro can be used at most once per function.
 */
#define NEWLINE                                 \
        '\r':                                   \
                next_char();                    \
                if (c == '\n') {                \
        case '\n':                              \
                        next_char();            \
                }                               \
                lexer_pos.lineno++;             \
                lexer_pos.colno = 1;            \
                goto newline;                   \
                newline /* Let it look like an ordinary case label. */

/**
 * Assert that the current character is c_type, then advance to the next
 * character.
 */
#define eat(c_type) (assert(c == c_type), next_char())
124
125 static void maybe_concat_lines(void)
126 {
127         eat('\\');
128
129         switch (c) {
130         case NEWLINE:
131                 return;
132
133         default:
134                 break;
135         }
136
137         put_back(c);
138         c = '\\';
139 }
140
141 /**
142  * Set c to the next input character, ie.
143  * after expanding trigraphs.
144  */
145 static inline void next_char(void)
146 {
147         next_real_char();
148
149         /* filter trigraphs */
150         if (UNLIKELY(c == '\\')) {
151                 maybe_concat_lines();
152                 return;
153         }
154
155         if (LIKELY(c != '?'))
156                 return;
157
158         next_real_char();
159         if (LIKELY(c != '?')) {
160                 put_back(c);
161                 c = '?';
162                 return;
163         }
164
165         next_real_char();
166         switch (c) {
167         case '=': c = '#'; break;
168         case '(': c = '['; break;
169         case '/': c = '\\'; maybe_concat_lines(); break;
170         case ')': c = ']'; break;
171         case '\'': c = '^'; break;
172         case '<': c = '{'; break;
173         case '!': c = '|'; break;
174         case '>': c = '}'; break;
175         case '-': c = '~'; break;
176         default:
177                 put_back(c);
178                 put_back('?');
179                 c = '?';
180                 break;
181         }
182 }
183
/*
 * Case label lists, to be written as `case SYMBOL_CHARS:` etc. inside a
 * switch.  The first label of each list is completed by the `case` keyword
 * at the place of use.
 */

/* Letters except e/E/p/P, the underscore and '$'.  The '$' label jumps to
 * a label named dollar_sign, which the using function must provide, when
 * allow_dollar_in_symbol is false. */
#define SYMBOL_CHARS_WITHOUT_E_P \
             '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
        case 'a': case 'b': case 'c': case 'd': case 'f': case 'g': \
        case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': \
        case 'n': case 'o': case 'q': case 'r': case 's': case 't': \
        case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': \
        case 'A': case 'B': case 'C': case 'D': case 'F': case 'G': \
        case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': \
        case 'N': case 'O': case 'Q': case 'R': case 'S': case 'T': \
        case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': \
        case '_'

/* The letters that get special treatment in parse_pp_number(), where they
 * may be followed by a sign. */
#define SYMBOL_CHARS_E_P \
             'E': case 'P': case 'e': case 'p'

/* All characters handled as symbol characters. */
#define SYMBOL_CHARS \
             SYMBOL_CHARS_WITHOUT_E_P: \
        case SYMBOL_CHARS_E_P

/* The decimal digits. */
#define DIGITS \
             '0': case '1': case '2': case '3': case '4': \
        case '5': case '6': case '7': case '8': case '9'
257
/**
 * Check whether a value may be designated by a universal character name.
 *
 * @param v  the value given by the hexadecimal digits of the name
 * @return   false for values below 0xA0 other than 0x24 ($), 0x40 (@) and
 *           0x60 (`), for the surrogate range 0xD800-0xDFFF and for values
 *           above 0x10FFFF; true otherwise
 */
static bool is_universal_char_valid(utf32 const v)
{
        /* C11 §6.4.3:2 */
        if (v < 0xA0U && v != 0x24 && v != 0x40 && v != 0x60)
                return false;
        if (0xD800 <= v && v <= 0xDFFF)
                return false;
        /* \UXXXXXXXX can spell values beyond the last code point of
         * ISO/IEC 10646; these do not designate any character. */
        if (v > 0x10FFFF)
                return false;
        return true;
}
267
268 static int digit_value(utf32 digit);
269
270 static utf32 parse_universal_char(unsigned const n_digits)
271 {
272         utf32 v = 0;
273         for (unsigned k = n_digits; k != 0; --k) {
274                 if (isxdigit(c)) {
275                         v = 16 * v + digit_value(c);
276                         next_char();
277                 } else {
278                         errorf(&lexer_pos, "short universal character name, expected %u more digits", k);
279                         break;
280                 }
281         }
282         if (!is_universal_char_valid(v)) {
283                 errorf(&lexer_pos, "\\%c%0*X is not a valid universal character name", n_digits == 4 ? 'u' : 'U', (int)n_digits, v);
284         }
285         return v;
286 }
287
/**
 * Check whether a universal character may appear in an identifier.
 *
 * @param v  the character value
 * @return   true if v lies in one of the ranges listed below
 */
static bool is_universal_char_valid_identifier(utf32 const v)
{
        /* C11 Annex D.1: inclusive ranges */
        static const struct {
                utf32 first;
                utf32 last;
        } ranges[] = {
                { 0x000A8, 0x000A8 }, { 0x000AA, 0x000AA },
                { 0x000AD, 0x000AD }, { 0x000AF, 0x000AF },
                { 0x000B2, 0x000B5 }, { 0x000B7, 0x000BA },
                { 0x000BC, 0x000BE }, { 0x000C0, 0x000D6 },
                { 0x000D8, 0x000F6 }, { 0x000F8, 0x000FF },
                { 0x00100, 0x0167F }, { 0x01681, 0x0180D },
                { 0x0180F, 0x01FFF }, { 0x0200B, 0x0200D },
                { 0x0202A, 0x0202E }, { 0x0203F, 0x02040 },
                { 0x02054, 0x02054 }, { 0x02060, 0x0206F },
                { 0x02070, 0x0218F }, { 0x02460, 0x024FF },
                { 0x02776, 0x02793 }, { 0x02C00, 0x02DFF },
                { 0x02E80, 0x02FFF }, { 0x03004, 0x03007 },
                { 0x03021, 0x0302F }, { 0x03031, 0x0303F },
                { 0x03040, 0x0D7FF }, { 0x0F900, 0x0FD3D },
                { 0x0FD40, 0x0FDCF }, { 0x0FDF0, 0x0FE44 },
                { 0x0FE47, 0x0FFFD }, { 0x10000, 0x1FFFD },
                { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD },
                { 0x40000, 0x4FFFD }, { 0x50000, 0x5FFFD },
                { 0x60000, 0x6FFFD }, { 0x70000, 0x7FFFD },
                { 0x80000, 0x8FFFD }, { 0x90000, 0x9FFFD },
                { 0xA0000, 0xAFFFD }, { 0xB0000, 0xBFFFD },
                { 0xC0000, 0xCFFFD }, { 0xD0000, 0xDFFFD },
                { 0xE0000, 0xEFFFD },
        };

        for (size_t i = 0; i != sizeof(ranges) / sizeof(ranges[0]); ++i) {
                if (ranges[i].first <= v && v <= ranges[i].last)
                        return true;
        }
        return false;
}
338
/**
 * Check whether a universal character may be the first character of an
 * identifier.
 *
 * @param v  the character value
 * @return   false if v lies in one of the ranges excluded below, true
 *           otherwise
 */
static bool is_universal_char_valid_identifier_start(utf32 const v)
{
        /* C11 Annex D.2 */
        bool const excluded =
                (0x0300 <= v && v <= 0x036F) ||
                (0x1DC0 <= v && v <= 0x1DFF) ||
                (0x20D0 <= v && v <= 0x20FF) ||
                (0xFE20 <= v && v <= 0xFE2F);
        return !excluded;
}
348
349 /**
350  * Read a symbol from the input and build
351  * the lexer_token.
352  */
353 static void parse_symbol(void)
354 {
355         while (true) {
356                 switch (c) {
357                 case DIGITS:
358                 case SYMBOL_CHARS:
359                         obstack_1grow(&symbol_obstack, (char) c);
360                         next_char();
361                         break;
362
363                 case '\\':
364                         next_char();
365                         switch (c) {
366                         {
367                                 unsigned n;
368                         case 'U': n = 8; goto universal;
369                         case 'u': n = 4; goto universal;
370 universal:
371                                 next_char();
372                                 utf32 const v = parse_universal_char(n);
373                                 if (!is_universal_char_valid_identifier(v)) {
374                                         if (is_universal_char_valid(v)) {
375                                                 errorf(&lexer_pos, "universal character \\%c%0*X is not valid in an identifier", n == 4 ? 'u' : 'U', (int)n, v);
376                                         }
377                                 } else if (obstack_object_size(&symbol_obstack) == 0 && !is_universal_char_valid_identifier_start(v)) {
378                                         errorf(&lexer_pos, "universal character \\%c%0*X is not valid as start of an identifier", n == 4 ? 'u' : 'U', (int)n, v);
379                                 } else {
380                                         obstack_grow_utf8(&symbol_obstack, v);
381                                 }
382                                 break;
383                         }
384
385                         default:
386                                 put_back(c);
387                                 c = '\\';
388                                 goto end_symbol;
389                         }
390
391                 default:
392 dollar_sign:
393                         goto end_symbol;
394                 }
395         }
396
397 end_symbol:
398         obstack_1grow(&symbol_obstack, '\0');
399
400         char     *string = obstack_finish(&symbol_obstack);
401         symbol_t *symbol = symbol_table_insert(string);
402
403         lexer_token.kind        = symbol->ID;
404         lexer_token.base.symbol = symbol;
405
406         if (symbol->string != string) {
407                 obstack_free(&symbol_obstack, string);
408         }
409 }
410
411 static string_t sym_make_string(string_encoding_t const enc)
412 {
413         obstack_1grow(&symbol_obstack, '\0');
414         size_t const len    = obstack_object_size(&symbol_obstack) - 1;
415         char  *const string = obstack_finish(&symbol_obstack);
416
417         /* TODO hash */
418 #if 0
419         const char *result = strset_insert(&stringset, concat);
420         if (result != concat) {
421                 obstack_free(&symbol_obstack, concat);
422         }
423 #else
424         const char *result = string;
425 #endif
426         return (string_t){ result, len, enc };
427 }
428
429 static void parse_pp_number(void)
430 {
431         for (;;) {
432                 switch (c) {
433                 case SYMBOL_CHARS_E_P:
434                         obstack_1grow(&symbol_obstack, (char)c);
435                         next_char();
436                         if (c == '+' || c == '-') {
437                 case '.':
438                 case DIGITS:
439                 case SYMBOL_CHARS_WITHOUT_E_P:
440                                 obstack_1grow(&symbol_obstack, (char)c);
441                                 next_char();
442                         }
443                         break;
444
445                 default:
446 dollar_sign:
447                         lexer_token.kind           = T_NUMBER;
448                         lexer_token.literal.string = sym_make_string(STRING_ENCODING_CHAR);
449                         return;
450                 }
451         }
452 }
453
/**
 * Returns true if the given character is an octal digit ('0' to '7').
 *
 * @param chr  the character to check
 */
static bool is_octal_digit(utf32 chr)
{
        if (chr < '0')
                return false;
        return chr <= '7';
}
463
464 /**
465  * Returns the value of a digit.
466  * The only portable way to do it ...
467  */
468 static int digit_value(utf32 const digit)
469 {
470         switch (digit) {
471         case '0': return 0;
472         case '1': return 1;
473         case '2': return 2;
474         case '3': return 3;
475         case '4': return 4;
476         case '5': return 5;
477         case '6': return 6;
478         case '7': return 7;
479         case '8': return 8;
480         case '9': return 9;
481         case 'a':
482         case 'A': return 10;
483         case 'b':
484         case 'B': return 11;
485         case 'c':
486         case 'C': return 12;
487         case 'd':
488         case 'D': return 13;
489         case 'e':
490         case 'E': return 14;
491         case 'f':
492         case 'F': return 15;
493         default:
494                 internal_error("wrong character given");
495         }
496 }
497
498 /**
499  * Parses an octal character sequence.
500  *
501  * @param first_digit  the already read first digit
502  */
503 static utf32 parse_octal_sequence(utf32 const first_digit)
504 {
505         assert(is_octal_digit(first_digit));
506         utf32 value = digit_value(first_digit);
507         if (!is_octal_digit(c)) return value;
508         value = 8 * value + digit_value(c);
509         next_char();
510         if (!is_octal_digit(c)) return value;
511         value = 8 * value + digit_value(c);
512         next_char();
513         return value;
514 }
515
516 /**
517  * Parses a hex character sequence.
518  */
519 static utf32 parse_hex_sequence(void)
520 {
521         utf32 value = 0;
522         while (isxdigit(c)) {
523                 value = 16 * value + digit_value(c);
524                 next_char();
525         }
526         return value;
527 }
528
529 /**
530  * Parse an escape sequence.
531  */
532 static utf32 parse_escape_sequence(void)
533 {
534         eat('\\');
535
536         utf32 const ec = c;
537         next_char();
538
539         switch (ec) {
540         case '"':  return '"';
541         case '\'': return '\'';
542         case '\\': return '\\';
543         case '?': return '\?';
544         case 'a': return '\a';
545         case 'b': return '\b';
546         case 'f': return '\f';
547         case 'n': return '\n';
548         case 'r': return '\r';
549         case 't': return '\t';
550         case 'v': return '\v';
551         case 'x':
552                 return parse_hex_sequence();
553         case '0':
554         case '1':
555         case '2':
556         case '3':
557         case '4':
558         case '5':
559         case '6':
560         case '7':
561                 return parse_octal_sequence(ec);
562         case EOF:
563                 parse_error("reached end of file while parsing escape sequence");
564                 return EOF;
565         /* \E is not documented, but handled, by GCC.  It is acceptable according
566          * to Â§6.11.4, whereas \e is not. */
567         case 'E':
568         case 'e':
569                 if (c_mode & _GNUC)
570                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
571                 break;
572
573         case 'U': return parse_universal_char(8);
574         case 'u': return parse_universal_char(4);
575
576         default:
577                 break;
578         }
579         /* Â§6.4.4.4:8 footnote 64 */
580         parse_error("unknown escape sequence");
581         return EOF;
582 }
583
584 string_t make_string(const char *string)
585 {
586         obstack_grow(&symbol_obstack, string, strlen(string));
587         return sym_make_string(STRING_ENCODING_CHAR);
588 }
589
590 static void parse_string(utf32 const delim, token_kind_t const kind, string_encoding_t const enc, char const *const context)
591 {
592         eat(delim);
593
594         while (true) {
595                 switch (c) {
596                 case '\\': {
597                         utf32 const tc = parse_escape_sequence();
598                         if (enc == STRING_ENCODING_CHAR) {
599                                 if (tc >= 0x100) {
600                                         warningf(WARN_OTHER, &lexer_pos, "escape sequence out of range");
601                                 }
602                                 obstack_1grow(&symbol_obstack, tc);
603                         } else {
604                                 obstack_grow_utf8(&symbol_obstack, tc);
605                         }
606                         break;
607                 }
608
609                 case NEWLINE:
610                         errorf(&lexer_pos, "newline while parsing %s", context);
611                         break;
612
613                 case EOF:
614                         errorf(&lexer_token.base.source_position, "EOF while parsing %s", context);
615                         goto end_of_string;
616
617                 default:
618                         if (c == delim) {
619                                 next_char();
620                                 goto end_of_string;
621                         } else {
622                                 obstack_grow_utf8(&symbol_obstack, c);
623                                 next_char();
624                                 break;
625                         }
626                 }
627         }
628
629 end_of_string:
630         lexer_token.kind           = kind;
631         lexer_token.literal.string = sym_make_string(enc);
632 }
633
634 /**
635  * Parse a string literal and set lexer_token.
636  */
637 static void parse_string_literal(string_encoding_t const enc)
638 {
639         parse_string('"', T_STRING_LITERAL, enc, "string literal");
640 }
641
642 /**
643  * Parse a character constant and set lexer_token.
644  */
645 static void parse_character_constant(string_encoding_t const enc)
646 {
647         parse_string('\'', T_CHARACTER_CONSTANT, enc, "character constant");
648         if (lexer_token.literal.string.size == 0) {
649                 errorf(&lexer_token.base.source_position, "empty character constant");
650         }
651 }
652
653 /**
654  * Skip a multiline comment.
655  */
656 static void skip_multiline_comment(void)
657 {
658         while (true) {
659                 switch (c) {
660                 case '/':
661                         next_char();
662                         if (c == '*') {
663                                 /* nested comment, warn here */
664                                 warningf(WARN_COMMENT, &lexer_pos, "'/*' within comment");
665                         }
666                         break;
667                 case '*':
668                         next_char();
669                         if (c == '/') {
670                                 next_char();
671                                 return;
672                         }
673                         break;
674
675                 case NEWLINE:
676                         break;
677
678                 case EOF: {
679                         errorf(&lexer_token.base.source_position,
680                                "at end of file while looking for comment end");
681                         return;
682                 }
683
684                 default:
685                         next_char();
686                         break;
687                 }
688         }
689 }
690
691 /**
692  * Skip a single line comment.
693  */
694 static void skip_line_comment(void)
695 {
696         while (true) {
697                 switch (c) {
698                 case EOF:
699                         return;
700
701                 case '\n':
702                 case '\r':
703                         return;
704
705                 case '\\':
706                         next_char();
707                         if (c == '\n' || c == '\r') {
708                                 warningf(WARN_COMMENT, &lexer_pos, "multi-line comment");
709                                 return;
710                         }
711                         break;
712
713                 default:
714                         next_char();
715                         break;
716                 }
717         }
718 }
719
720 /** The current preprocessor token. */
721 static token_t pp_token;
722
723 /**
724  * Read the next preprocessor token.
725  */
726 static inline void next_pp_token(void)
727 {
728         lexer_next_preprocessing_token();
729         pp_token = lexer_token;
730 }
731
732 /**
733  * Eat all preprocessor tokens until newline.
734  */
735 static void eat_until_newline(void)
736 {
737         while (pp_token.kind != '\n' && pp_token.kind != T_EOF) {
738                 next_pp_token();
739         }
740 }
741
742 /**
743  * Parse the line directive.
744  */
745 static void parse_line_directive(void)
746 {
747         if (pp_token.kind != T_NUMBER) {
748                 parse_error("expected integer");
749         } else {
750                 /* use offset -1 as this is about the next line */
751                 char      *end;
752                 long const line = strtol(pp_token.literal.string.begin, &end, 0);
753                 if (*end == '\0') {
754                         lexer_pos.lineno = line - 1;
755                 } else {
756                         errorf(&lexer_pos, "'%S' is not a valid line number", &pp_token.literal.string);
757                 }
758                 next_pp_token();
759         }
760         if (pp_token.kind == T_STRING_LITERAL && pp_token.literal.string.encoding == STRING_ENCODING_CHAR) {
761                 lexer_pos.input_name       = pp_token.literal.string.begin;
762                 lexer_pos.is_system_header = false;
763                 next_pp_token();
764
765                 /* attempt to parse numeric flags as outputted by gcc preprocessor */
766                 while (pp_token.kind == T_NUMBER) {
767                         /* flags:
768                          * 1 - indicates start of a new file
769                          * 2 - indicates return from a file
770                          * 3 - indicates system header
771                          * 4 - indicates implicit extern "C" in C++ mode
772                          *
773                          * currently we're only interested in "3"
774                          */
775                         if (streq(pp_token.literal.string.begin, "3")) {
776                                 lexer_pos.is_system_header = true;
777                         }
778                         next_pp_token();
779                 }
780         }
781
782         eat_until_newline();
783 }
784
/**
 * STDC pragmas, as recognized by parse_pragma().
 */
typedef enum stdc_pragma_kind_t {
        STDC_UNKNOWN = 0,      /* not one of the pragmas below */
        STDC_FP_CONTRACT,      /* selected by TP_FP_CONTRACT */
        STDC_FENV_ACCESS,      /* selected by TP_FENV_ACCESS */
        STDC_CX_LIMITED_RANGE  /* selected by TP_CX_LIMITED_RANGE */
} stdc_pragma_kind_t;
794
/**
 * STDC pragma values, i.e. the argument of a STDC pragma as recognized by
 * parse_pragma().
 */
typedef enum stdc_pragma_value_kind_t {
        STDC_VALUE_UNKNOWN = 0,  /* none of the values below */
        STDC_VALUE_ON,           /* selected by TP_ON */
        STDC_VALUE_OFF,          /* selected by TP_OFF */
        STDC_VALUE_DEFAULT       /* selected by TP_DEFAULT */
} stdc_pragma_value_kind_t;
804
805 /**
806  * Parse a pragma directive.
807  */
808 static void parse_pragma(void)
809 {
810         next_pp_token();
811         if (pp_token.kind != T_IDENTIFIER) {
812                 warningf(WARN_UNKNOWN_PRAGMAS, &pp_token.base.source_position,
813                          "expected identifier after #pragma");
814                 eat_until_newline();
815                 return;
816         }
817
818         stdc_pragma_kind_t kind = STDC_UNKNOWN;
819         if (pp_token.base.symbol->pp_ID == TP_STDC && c_mode & _C99) {
820                 /* a STDC pragma */
821                 next_pp_token();
822
823                 switch (pp_token.base.symbol->pp_ID) {
824                 case TP_FP_CONTRACT:      kind = STDC_FP_CONTRACT;      break;
825                 case TP_FENV_ACCESS:      kind = STDC_FENV_ACCESS;      break;
826                 case TP_CX_LIMITED_RANGE: kind = STDC_CX_LIMITED_RANGE; break;
827                 default:                  break;
828                 }
829                 if (kind != STDC_UNKNOWN) {
830                         next_pp_token();
831                         stdc_pragma_value_kind_t value;
832                         switch (pp_token.base.symbol->pp_ID) {
833                         case TP_ON:      value = STDC_VALUE_ON;      break;
834                         case TP_OFF:     value = STDC_VALUE_OFF;     break;
835                         case TP_DEFAULT: value = STDC_VALUE_DEFAULT; break;
836                         default:         value = STDC_VALUE_UNKNOWN; break;
837                         }
838                         if (value == STDC_VALUE_UNKNOWN) {
839                                 kind = STDC_UNKNOWN;
840                                 errorf(&pp_token.base.source_position, "bad STDC pragma argument");
841                         }
842                 }
843         }
844         eat_until_newline();
845         if (kind == STDC_UNKNOWN) {
846                 warningf(WARN_UNKNOWN_PRAGMAS, &pp_token.base.source_position,
847                          "encountered unknown #pragma");
848         }
849 }
850
851 /**
852  * Parse a preprocessor non-null directive.
853  */
854 static void parse_preprocessor_identifier(void)
855 {
856         assert(pp_token.kind == T_IDENTIFIER);
857         switch (pp_token.base.symbol->pp_ID) {
858         case TP_line:
859                 next_pp_token();
860                 parse_line_directive();
861                 break;
862         case TP_pragma:
863                 parse_pragma();
864                 break;
865         case TP_error:
866                 /* TODO; output the rest of the line */
867                 parse_error("#error directive");
868                 break;
869         }
870 }
871
872 /**
873  * Parse a preprocessor directive.
874  */
875 static void parse_preprocessor_directive(void)
876 {
877         next_pp_token();
878
879         switch (pp_token.kind) {
880         case T_IDENTIFIER:
881                 parse_preprocessor_identifier();
882                 break;
883         case T_NUMBER:
884                 parse_line_directive();
885                 break;
886         case '\n':
887                 /* NULL directive, see Â§6.10.7 */
888                 break;
889         default:
890                 parse_error("invalid preprocessor directive");
891                 eat_until_newline();
892                 break;
893         }
894 }
895
/*
 * Building blocks for code that decides on a token kind after looking at
 * one more character.  Expanded in sequence they form
 *     next_char(); while (true) { switch (c) { case ...: default: ... } }
 * and every branch returns from the enclosing function.
 */

/* Consume the current character and open the loop and the switch. */
#define MAYBE_PROLOG    \
        next_char();    \
        while (true) {  \
                switch (c) {

/* If the next character is ch: consume it, set the token kind, return. */
#define MAYBE(ch, set_type)                   \
                case ch:                      \
                        next_char();          \
                        lexer_token.kind = set_type; \
                        return;

/* Like MAYBE, but only when c_mode has a bit of mode set; otherwise control
 * falls through into whatever follows. */
/* must use this as last thing */
#define MAYBE_MODE(ch, set_type, mode)        \
                case ch:                      \
                        if (c_mode & mode) {  \
                                next_char();  \
                                lexer_token.kind = set_type; \
                                return;       \
                        }                     \
                        /* fallthrough */

/* Default branch: run code, return, and close the switch and the loop. */
#define ELSE_CODE(code)                       \
                default:                      \
                        code                  \
                        return;               \
                }                             \
        } /* end of while (true) */

/* Default branch that only sets the token kind. */
#define ELSE(set_type)                        \
        ELSE_CODE(                            \
                lexer_token.kind = set_type;  \
        )
928
929 void lexer_next_preprocessing_token(void)
930 {
931         while (true) {
932                 lexer_token.base.source_position = lexer_pos;
933                 lexer_token.base.symbol          = NULL;
934
935                 switch (c) {
936                 case ' ':
937                 case '\t':
938                         next_char();
939                         break;
940
941                 case NEWLINE:
942                         lexer_token.kind = '\n';
943                         return;
944
945                 case SYMBOL_CHARS: {
946                         parse_symbol();
947                         /* might be a wide string ( L"string" ) */
948                         string_encoding_t const enc = STRING_ENCODING_WIDE;
949                         if (lexer_token.base.symbol == symbol_L) {
950                                 switch (c) {
951                                 case '"':  parse_string_literal(enc);     break;
952                                 case '\'': parse_character_constant(enc); break;
953                                 }
954                         }
955                         return;
956                 }
957
958                 case DIGITS:
959                         parse_pp_number();
960                         return;
961
962                 case '"':
963                         parse_string_literal(STRING_ENCODING_CHAR);
964                         return;
965
966                 case '\'':
967                         parse_character_constant(STRING_ENCODING_CHAR);
968                         return;
969
970                 case '.':
971                         MAYBE_PROLOG
972                                 case DIGITS:
973                                         put_back(c);
974                                         c = '.';
975                                         parse_pp_number();
976                                         return;
977
978                                 case '.':
979                                         MAYBE_PROLOG
980                                         MAYBE('.', T_DOTDOTDOT)
981                                         ELSE_CODE(
982                                                 put_back(c);
983                                                 c = '.';
984                                                 lexer_token.kind = '.';
985                                         )
986                         ELSE('.')
987                 case '&':
988                         MAYBE_PROLOG
989                         MAYBE('&', T_ANDAND)
990                         MAYBE('=', T_ANDEQUAL)
991                         ELSE('&')
992                 case '*':
993                         MAYBE_PROLOG
994                         MAYBE('=', T_ASTERISKEQUAL)
995                         ELSE('*')
996                 case '+':
997                         MAYBE_PROLOG
998                         MAYBE('+', T_PLUSPLUS)
999                         MAYBE('=', T_PLUSEQUAL)
1000                         ELSE('+')
1001                 case '-':
1002                         MAYBE_PROLOG
1003                         MAYBE('>', T_MINUSGREATER)
1004                         MAYBE('-', T_MINUSMINUS)
1005                         MAYBE('=', T_MINUSEQUAL)
1006                         ELSE('-')
1007                 case '!':
1008                         MAYBE_PROLOG
1009                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1010                         ELSE('!')
1011                 case '/':
1012                         MAYBE_PROLOG
1013                         MAYBE('=', T_SLASHEQUAL)
1014                                 case '*':
1015                                         next_char();
1016                                         skip_multiline_comment();
1017                                         lexer_next_preprocessing_token();
1018                                         return;
1019                                 case '/':
1020                                         next_char();
1021                                         skip_line_comment();
1022                                         lexer_next_preprocessing_token();
1023                                         return;
1024                         ELSE('/')
1025                 case '%':
1026                         MAYBE_PROLOG
1027                         MAYBE('>', '}')
1028                         MAYBE('=', T_PERCENTEQUAL)
1029                                 case ':':
1030                                         MAYBE_PROLOG
1031                                                 case '%':
1032                                                         MAYBE_PROLOG
1033                                                         MAYBE(':', T_HASHHASH)
1034                                                         ELSE_CODE(
1035                                                                 put_back(c);
1036                                                                 c = '%';
1037                                                                 lexer_token.kind = '#';
1038                                                         )
1039                                         ELSE('#')
1040                         ELSE('%')
1041                 case '<':
1042                         MAYBE_PROLOG
1043                         MAYBE(':', '[')
1044                         MAYBE('%', '{')
1045                         MAYBE('=', T_LESSEQUAL)
1046                                 case '<':
1047                                         MAYBE_PROLOG
1048                                         MAYBE('=', T_LESSLESSEQUAL)
1049                                         ELSE(T_LESSLESS)
1050                         ELSE('<')
1051                 case '>':
1052                         MAYBE_PROLOG
1053                         MAYBE('=', T_GREATEREQUAL)
1054                                 case '>':
1055                                         MAYBE_PROLOG
1056                                         MAYBE('=', T_GREATERGREATEREQUAL)
1057                                         ELSE(T_GREATERGREATER)
1058                         ELSE('>')
1059                 case '^':
1060                         MAYBE_PROLOG
1061                         MAYBE('=', T_CARETEQUAL)
1062                         ELSE('^')
1063                 case '|':
1064                         MAYBE_PROLOG
1065                         MAYBE('=', T_PIPEEQUAL)
1066                         MAYBE('|', T_PIPEPIPE)
1067                         ELSE('|')
1068                 case ':':
1069                         MAYBE_PROLOG
1070                         MAYBE('>', ']')
1071                         MAYBE_MODE(':', T_COLONCOLON, _CXX)
1072                         ELSE(':')
1073                 case '=':
1074                         MAYBE_PROLOG
1075                         MAYBE('=', T_EQUALEQUAL)
1076                         ELSE('=')
1077                 case '#':
1078                         MAYBE_PROLOG
1079                         MAYBE('#', T_HASHHASH)
1080                         ELSE('#')
1081
1082                 case '\\':
1083                         next_char();
1084                         if (c == 'U' || c == 'u') {
1085                                 put_back(c);
1086                                 c = '\\';
1087                                 parse_symbol();
1088                         } else {
1089                                 lexer_token.kind = '\\';
1090                         }
1091                         return;
1092
1093                 case '?':
1094                 case '[':
1095                 case ']':
1096                 case '(':
1097                 case ')':
1098                 case '{':
1099                 case '}':
1100                 case '~':
1101                 case ';':
1102                 case ',':
1103                         lexer_token.kind = c;
1104                         next_char();
1105                         return;
1106
1107                 case EOF:
1108                         lexer_token.kind = T_EOF;
1109                         return;
1110
1111                 default:
1112 dollar_sign:
1113                         errorf(&lexer_pos, "unknown character '%lc' found", c);
1114                         next_char();
1115                         break;
1116                 }
1117         }
1118 }
1119
1120 void lexer_next_token(void)
1121 {
1122         lexer_next_preprocessing_token();
1123
1124         while (lexer_token.kind == '\n') {
1125 newline_found:
1126                 lexer_next_preprocessing_token();
1127         }
1128
1129         if (lexer_token.kind == '#') {
1130                 parse_preprocessor_directive();
1131                 goto newline_found;
1132         }
1133 }
1134
1135 void init_lexer(void)
1136 {
1137         strset_init(&stringset);
1138         symbol_L = symbol_table_insert("L");
1139 }
1140
1141 static void input_error(unsigned delta_lines, unsigned delta_cols,
1142                         const char *message)
1143 {
1144         lexer_pos.lineno += delta_lines;
1145         lexer_pos.colno  += delta_cols;
1146         errorf(&lexer_pos, "%s", message);
1147 }
1148
1149 void lexer_switch_input(input_t *new_input, const char *input_name)
1150 {
1151         lexer_pos.lineno     = 0;
1152         lexer_pos.colno      = 0;
1153         lexer_pos.input_name = input_name;
1154
1155         set_input_error_callback(input_error);
1156         input  = new_input;
1157         bufpos = NULL;
1158         bufend = NULL;
1159
1160         /* place a virtual \n at the beginning so the lexer knows that we're
1161          * at the beginning of a line */
1162         c = '\n';
1163 }
1164
1165 void exit_lexer(void)
1166 {
1167         strset_destroy(&stringset);
1168 }
1169
1170 static __attribute__((unused))
1171 void dbg_pos(const source_position_t source_position)
1172 {
1173         fprintf(stdout, "%s:%u:%u\n", source_position.input_name,
1174                 source_position.lineno, (unsigned)source_position.colno);
1175         fflush(stdout);
1176 }