0aa07ab670e203bbc422b2b590dccc271254aaee
[cparser] / lexer.c
1 /*
2  * This file is part of cparser.
3  * Copyright (C) 2007-2009 Matthias Braun <matze@braunis.de>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License
7  * as published by the Free Software Foundation; either version 2
8  * of the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
18  * 02111-1307, USA.
19  */
20 #include <config.h>
21
22 #include "adt/strutil.h"
23 #include "input.h"
24 #include "diagnostic.h"
25 #include "lexer.h"
26 #include "symbol_t.h"
27 #include "token_t.h"
28 #include "symbol_table_t.h"
29 #include "adt/error.h"
30 #include "adt/strset.h"
31 #include "adt/util.h"
32 #include "types.h"
33 #include "type_t.h"
34 #include "parser.h"
35 #include "warning.h"
36 #include "lang_features.h"
37
38 #include <assert.h>
39 #include <errno.h>
40 #include <string.h>
41 #include <stdbool.h>
42 #include <ctype.h>
43
44 #ifndef _WIN32
45 #include <strings.h>
46 #endif
47
48 #define MAX_PUTBACK 16    // 3 would be enough, but 16 gives a nicer alignment
49 #define BUF_SIZE    1024
50
51 static input_t           *input;
52 static utf32              input_buf[BUF_SIZE + MAX_PUTBACK];
53 static const utf32       *bufpos;
54 static const utf32       *bufend;
55 static utf32              c;
56 static source_position_t  lexer_pos;
57 token_t                   lexer_token;
58 static symbol_t          *symbol_L;
59 static strset_t           stringset;
60 bool                      allow_dollar_in_symbol = true;
61
62 /**
63  * Prints a parse error message at the current token.
64  *
65  * @param msg   the error message
66  */
67 static void parse_error(const char *msg)
68 {
69         errorf(&lexer_pos, "%s", msg);
70 }
71
72 /**
73  * Prints an internal error message at the current token.
74  *
75  * @param msg   the error message
76  */
77 static NORETURN internal_error(const char *msg)
78 {
79         internal_errorf(&lexer_pos, "%s", msg);
80 }
81
82 static inline void next_real_char(void)
83 {
84         assert(bufpos <= bufend);
85         if (bufpos >= bufend) {
86                 size_t n = decode(input, input_buf+MAX_PUTBACK, BUF_SIZE);
87                 if (n == 0) {
88                         c = EOF;
89                         return;
90                 }
91                 bufpos = input_buf + MAX_PUTBACK;
92                 bufend = bufpos + n;
93         }
94         c = *bufpos++;
95         ++lexer_pos.colno;
96 }
97
98 /**
99  * Put a character back into the buffer.
100  *
101  * @param pc  the character to put back
102  */
103 static inline void put_back(utf32 const pc)
104 {
105         *(--bufpos - input_buf + input_buf) = pc;
106         --lexer_pos.colno;
107 }
108
static inline void next_char(void);

/**
 * Expands to the case labels for '\r' and '\n' and must therefore be used
 * inside a switch over c.  It consumes one line ending ("\r", "\n" or
 * "\r\n"), advances lexer_pos to column 1 of the following line and then
 * executes the statements given as code.
 *
 * The '\n' label sits inside the if-body of the '\r' case on purpose: a lone
 * '\n' jumps straight to the second next_char() call.
 */
#define MATCH_NEWLINE(code)  \
        case '\r':               \
                next_char();         \
                if (c == '\n') {     \
        case '\n':               \
                        next_char();     \
                }                    \
                lexer_pos.lineno++;  \
                lexer_pos.colno = 1; \
                code

/**
 * Asserts that the current character is c_type and advances to the next one.
 * The argument is parenthesized so that any expression can be passed safely.
 */
#define eat(c_type) (assert(c == (c_type)), next_char())
123
124 static void maybe_concat_lines(void)
125 {
126         eat('\\');
127
128         switch (c) {
129         MATCH_NEWLINE(return;)
130
131         default:
132                 break;
133         }
134
135         put_back(c);
136         c = '\\';
137 }
138
139 /**
140  * Set c to the next input character, ie.
141  * after expanding trigraphs.
142  */
143 static inline void next_char(void)
144 {
145         next_real_char();
146
147         /* filter trigraphs */
148         if (UNLIKELY(c == '\\')) {
149                 maybe_concat_lines();
150                 return;
151         }
152
153         if (LIKELY(c != '?'))
154                 return;
155
156         next_real_char();
157         if (LIKELY(c != '?')) {
158                 put_back(c);
159                 c = '?';
160                 return;
161         }
162
163         next_real_char();
164         switch (c) {
165         case '=': c = '#'; break;
166         case '(': c = '['; break;
167         case '/': c = '\\'; maybe_concat_lines(); break;
168         case ')': c = ']'; break;
169         case '\'': c = '^'; break;
170         case '<': c = '{'; break;
171         case '!': c = '|'; break;
172         case '>': c = '}'; break;
173         case '-': c = '~'; break;
174         default:
175                 put_back(c);
176                 put_back('?');
177                 c = '?';
178                 break;
179         }
180 }
181
182 #define SYMBOL_CHARS  \
183         case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
184         case 'a':         \
185         case 'b':         \
186         case 'c':         \
187         case 'd':         \
188         case 'e':         \
189         case 'f':         \
190         case 'g':         \
191         case 'h':         \
192         case 'i':         \
193         case 'j':         \
194         case 'k':         \
195         case 'l':         \
196         case 'm':         \
197         case 'n':         \
198         case 'o':         \
199         case 'p':         \
200         case 'q':         \
201         case 'r':         \
202         case 's':         \
203         case 't':         \
204         case 'u':         \
205         case 'v':         \
206         case 'w':         \
207         case 'x':         \
208         case 'y':         \
209         case 'z':         \
210         case 'A':         \
211         case 'B':         \
212         case 'C':         \
213         case 'D':         \
214         case 'E':         \
215         case 'F':         \
216         case 'G':         \
217         case 'H':         \
218         case 'I':         \
219         case 'J':         \
220         case 'K':         \
221         case 'L':         \
222         case 'M':         \
223         case 'N':         \
224         case 'O':         \
225         case 'P':         \
226         case 'Q':         \
227         case 'R':         \
228         case 'S':         \
229         case 'T':         \
230         case 'U':         \
231         case 'V':         \
232         case 'W':         \
233         case 'X':         \
234         case 'Y':         \
235         case 'Z':         \
236         case '_':
237
238 #define DIGITS        \
239         case '0':         \
240         case '1':         \
241         case '2':         \
242         case '3':         \
243         case '4':         \
244         case '5':         \
245         case '6':         \
246         case '7':         \
247         case '8':         \
248         case '9':
249
250 /**
251  * Read a symbol from the input and build
252  * the lexer_token.
253  */
254 static void parse_symbol(void)
255 {
256         obstack_1grow(&symbol_obstack, (char) c);
257         next_char();
258
259         while (true) {
260                 switch (c) {
261                 DIGITS
262                 SYMBOL_CHARS
263                         obstack_1grow(&symbol_obstack, (char) c);
264                         next_char();
265                         break;
266
267                 default:
268 dollar_sign:
269                         goto end_symbol;
270                 }
271         }
272
273 end_symbol:
274         obstack_1grow(&symbol_obstack, '\0');
275
276         char     *string = obstack_finish(&symbol_obstack);
277         symbol_t *symbol = symbol_table_insert(string);
278
279         lexer_token.kind              = symbol->ID;
280         lexer_token.identifier.symbol = symbol;
281
282         if (symbol->string != string) {
283                 obstack_free(&symbol_obstack, string);
284         }
285 }
286
287 static string_t identify_string(char *string, size_t len)
288 {
289         /* TODO hash */
290 #if 0
291         const char *result = strset_insert(&stringset, concat);
292         if (result != concat) {
293                 obstack_free(&symbol_obstack, concat);
294         }
295 #else
296         const char *result = string;
297 #endif
298         return (string_t) {result, len};
299 }
300
301 /**
302  * parse suffixes like 'LU' or 'f' after numbers
303  */
304 static void parse_number_suffix(void)
305 {
306         assert(obstack_object_size(&symbol_obstack) == 0);
307         while (true) {
308                 switch (c) {
309                 SYMBOL_CHARS
310                         obstack_1grow(&symbol_obstack, (char) c);
311                         next_char();
312                         break;
313                 default:
314                 dollar_sign:
315                         goto finish_suffix;
316                 }
317         }
318 finish_suffix:
319         if (obstack_object_size(&symbol_obstack) == 0) {
320                 lexer_token.number.suffix.begin = NULL;
321                 lexer_token.number.suffix.size  = 0;
322                 return;
323         }
324
325         obstack_1grow(&symbol_obstack, '\0');
326         size_t size   = obstack_object_size(&symbol_obstack) - 1;
327         char  *string = obstack_finish(&symbol_obstack);
328
329         lexer_token.number.suffix = identify_string(string, size);
330 }
331
332 static void parse_exponent(void)
333 {
334         if (c == '-' || c == '+') {
335                 obstack_1grow(&symbol_obstack, (char)c);
336                 next_char();
337         }
338
339         if (isdigit(c)) {
340                 do {
341                         obstack_1grow(&symbol_obstack, (char)c);
342                         next_char();
343                 } while (isdigit(c));
344         } else {
345                 errorf(&lexer_token.base.source_position, "exponent has no digits");
346         }
347 }
348
349 /**
350  * Parses a hex number including hex floats and set the
351  * lexer_token.
352  */
353 static void parse_number_hex(void)
354 {
355         bool is_float   = false;
356         bool has_digits = false;
357
358         while (isxdigit(c)) {
359                 has_digits = true;
360                 obstack_1grow(&symbol_obstack, (char) c);
361                 next_char();
362         }
363
364         if (c == '.') {
365                 is_float = true;
366                 obstack_1grow(&symbol_obstack, (char) c);
367                 next_char();
368
369                 while (isxdigit(c)) {
370                         has_digits = true;
371                         obstack_1grow(&symbol_obstack, (char) c);
372                         next_char();
373                 }
374         }
375         if (c == 'p' || c == 'P') {
376                 is_float = true;
377                 obstack_1grow(&symbol_obstack, (char) c);
378                 next_char();
379                 parse_exponent();
380         } else if (is_float) {
381                 errorf(&lexer_token.base.source_position,
382                        "hexadecimal floatingpoint constant requires an exponent");
383         }
384         obstack_1grow(&symbol_obstack, '\0');
385
386         size_t  size   = obstack_object_size(&symbol_obstack) - 1;
387         char   *string = obstack_finish(&symbol_obstack);
388         lexer_token.number.number = identify_string(string, size);
389
390         lexer_token.kind    =
391                 is_float ? T_FLOATINGPOINT_HEXADECIMAL : T_INTEGER;
392
393         if (!has_digits) {
394                 errorf(&lexer_token.base.source_position, "invalid number literal '%S'", &lexer_token.number.number);
395                 lexer_token.number.number.begin = "0";
396                 lexer_token.number.number.size  = 1;
397         }
398
399         parse_number_suffix();
400 }
401
402 /**
403  * Returns true if the given char is a octal digit.
404  *
405  * @param char  the character to check
406  */
407 static bool is_octal_digit(utf32 chr)
408 {
409         return '0' <= chr && chr <= '7';
410 }
411
412 /**
413  * Parses a number and sets the lexer_token.
414  */
415 static void parse_number(void)
416 {
417         bool is_float   = false;
418         bool has_digits = false;
419
420         assert(obstack_object_size(&symbol_obstack) == 0);
421         if (c == '0') {
422                 obstack_1grow(&symbol_obstack, (char)c);
423                 next_char();
424                 if (c == 'x' || c == 'X') {
425                         obstack_1grow(&symbol_obstack, (char)c);
426                         next_char();
427                         parse_number_hex();
428                         return;
429                 } else {
430                         has_digits = true;
431                 }
432         }
433
434         while (isdigit(c)) {
435                 has_digits = true;
436                 obstack_1grow(&symbol_obstack, (char) c);
437                 next_char();
438         }
439
440         if (c == '.') {
441                 is_float = true;
442                 obstack_1grow(&symbol_obstack, '.');
443                 next_char();
444
445                 while (isdigit(c)) {
446                         has_digits = true;
447                         obstack_1grow(&symbol_obstack, (char) c);
448                         next_char();
449                 }
450         }
451         if (c == 'e' || c == 'E') {
452                 is_float = true;
453                 obstack_1grow(&symbol_obstack, 'e');
454                 next_char();
455                 parse_exponent();
456         }
457
458         obstack_1grow(&symbol_obstack, '\0');
459         size_t  size   = obstack_object_size(&symbol_obstack) - 1;
460         char   *string = obstack_finish(&symbol_obstack);
461         lexer_token.number.number = identify_string(string, size);
462
463         if (is_float) {
464                 lexer_token.kind = T_FLOATINGPOINT;
465         } else {
466                 lexer_token.kind = T_INTEGER;
467
468                 if (string[0] == '0') {
469                         /* check for invalid octal digits */
470                         for (size_t i= 0; i < size; ++i) {
471                                 char t = string[i];
472                                 if (t >= '8')
473                                         errorf(&lexer_token.base.source_position, "invalid digit '%c' in octal number", t);
474                         }
475                 }
476         }
477
478         if (!has_digits) {
479                 errorf(&lexer_token.base.source_position, "invalid number literal '%S'",
480                        &lexer_token.number.number);
481         }
482
483         parse_number_suffix();
484 }
485
486 /**
487  * Returns the value of a digit.
488  * The only portable way to do it ...
489  */
490 static int digit_value(utf32 const digit)
491 {
492         switch (digit) {
493         case '0': return 0;
494         case '1': return 1;
495         case '2': return 2;
496         case '3': return 3;
497         case '4': return 4;
498         case '5': return 5;
499         case '6': return 6;
500         case '7': return 7;
501         case '8': return 8;
502         case '9': return 9;
503         case 'a':
504         case 'A': return 10;
505         case 'b':
506         case 'B': return 11;
507         case 'c':
508         case 'C': return 12;
509         case 'd':
510         case 'D': return 13;
511         case 'e':
512         case 'E': return 14;
513         case 'f':
514         case 'F': return 15;
515         default:
516                 internal_error("wrong character given");
517         }
518 }
519
520 /**
521  * Parses an octal character sequence.
522  *
523  * @param first_digit  the already read first digit
524  */
525 static utf32 parse_octal_sequence(utf32 const first_digit)
526 {
527         assert(is_octal_digit(first_digit));
528         utf32 value = digit_value(first_digit);
529         if (!is_octal_digit(c)) return value;
530         value = 8 * value + digit_value(c);
531         next_char();
532         if (!is_octal_digit(c)) return value;
533         value = 8 * value + digit_value(c);
534         next_char();
535         return value;
536 }
537
538 /**
539  * Parses a hex character sequence.
540  */
541 static utf32 parse_hex_sequence(void)
542 {
543         utf32 value = 0;
544         while (isxdigit(c)) {
545                 value = 16 * value + digit_value(c);
546                 next_char();
547         }
548         return value;
549 }
550
551 /**
552  * Parse an escape sequence.
553  */
554 static utf32 parse_escape_sequence(void)
555 {
556         eat('\\');
557
558         utf32 const ec = c;
559         next_char();
560
561         switch (ec) {
562         case '"':  return '"';
563         case '\'': return '\'';
564         case '\\': return '\\';
565         case '?': return '\?';
566         case 'a': return '\a';
567         case 'b': return '\b';
568         case 'f': return '\f';
569         case 'n': return '\n';
570         case 'r': return '\r';
571         case 't': return '\t';
572         case 'v': return '\v';
573         case 'x':
574                 return parse_hex_sequence();
575         case '0':
576         case '1':
577         case '2':
578         case '3':
579         case '4':
580         case '5':
581         case '6':
582         case '7':
583                 return parse_octal_sequence(ec);
584         case EOF:
585                 parse_error("reached end of file while parsing escape sequence");
586                 return EOF;
587         /* \E is not documented, but handled, by GCC.  It is acceptable according
588          * to Â§6.11.4, whereas \e is not. */
589         case 'E':
590         case 'e':
591                 if (c_mode & _GNUC)
592                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
593                 break;
594         case 'u':
595         case 'U':
596                 parse_error("universal character parsing not implemented yet");
597                 return EOF;
598         default:
599                 break;
600         }
601         /* Â§6.4.4.4:8 footnote 64 */
602         parse_error("unknown escape sequence");
603         return EOF;
604 }
605
606 /**
607  * Concatenate two strings.
608  */
609 string_t concat_strings(const string_t *const s1, const string_t *const s2)
610 {
611         const size_t len1 = s1->size - 1;
612         const size_t len2 = s2->size - 1;
613
614         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
615         memcpy(concat, s1->begin, len1);
616         memcpy(concat + len1, s2->begin, len2 + 1);
617
618         return identify_string(concat, len1 + len2 + 1);
619 }
620
621 string_t make_string(const char *string)
622 {
623         size_t      len   = strlen(string) + 1;
624         char *const space = obstack_alloc(&symbol_obstack, len);
625         memcpy(space, string, len);
626
627         return identify_string(space, len);
628 }
629
630 /**
631  * Parse a string literal and set lexer_token.
632  */
633 static void parse_string_literal(void)
634 {
635         eat('"');
636
637         while (true) {
638                 switch (c) {
639                 case '\\': {
640                         utf32 const tc = parse_escape_sequence();
641                         if (tc >= 0x100) {
642                                 warningf(WARN_OTHER, &lexer_pos, "escape sequence out of range");
643                         }
644                         obstack_1grow(&symbol_obstack, tc);
645                         break;
646                 }
647
648                 case EOF:
649                         errorf(&lexer_token.base.source_position, "string has no end");
650                         goto end_of_string;
651
652                 case '"':
653                         next_char();
654                         goto end_of_string;
655
656                 default:
657                         obstack_grow_symbol(&symbol_obstack, c);
658                         next_char();
659                         break;
660                 }
661         }
662
663 end_of_string:
664
665         /* TODO: concatenate multiple strings separated by whitespace... */
666
667         /* add finishing 0 to the string */
668         obstack_1grow(&symbol_obstack, '\0');
669         const size_t  size   = (size_t)obstack_object_size(&symbol_obstack);
670         char         *string = obstack_finish(&symbol_obstack);
671
672         lexer_token.kind          = T_STRING_LITERAL;
673         lexer_token.string.string = identify_string(string, size);
674 }
675
676 /**
677  * Parse a wide character constant and set lexer_token.
678  */
679 static void parse_wide_character_constant(void)
680 {
681         eat('\'');
682
683         while (true) {
684                 switch (c) {
685                 case '\\': {
686                         const utf32 tc = parse_escape_sequence();
687                         obstack_grow_symbol(&symbol_obstack, tc);
688                         break;
689                 }
690
691                 MATCH_NEWLINE(
692                         parse_error("newline while parsing character constant");
693                         break;
694                 )
695
696                 case '\'':
697                         next_char();
698                         goto end_of_wide_char_constant;
699
700                 case EOF:
701                         errorf(&lexer_token.base.source_position, "EOF while parsing character constant");
702                         goto end_of_wide_char_constant;
703
704                 default:
705                         obstack_grow_symbol(&symbol_obstack, c);
706                         next_char();
707                         break;
708                 }
709         }
710
711 end_of_wide_char_constant:;
712         obstack_1grow(&symbol_obstack, '\0');
713         size_t  size   = (size_t) obstack_object_size(&symbol_obstack) - 1;
714         char   *string = obstack_finish(&symbol_obstack);
715
716         lexer_token.kind          = T_WIDE_CHARACTER_CONSTANT;
717         lexer_token.string.string = identify_string(string, size);
718
719         if (size == 0) {
720                 errorf(&lexer_token.base.source_position, "empty character constant");
721         }
722 }
723
724 /**
725  * Parse a wide string literal and set lexer_token.
726  */
727 static void parse_wide_string_literal(void)
728 {
729         parse_string_literal();
730         if (lexer_token.kind == T_STRING_LITERAL)
731                 lexer_token.kind = T_WIDE_STRING_LITERAL;
732 }
733
734 /**
735  * Parse a character constant and set lexer_token.
736  */
737 static void parse_character_constant(void)
738 {
739         eat('\'');
740
741         while (true) {
742                 switch (c) {
743                 case '\\': {
744                         utf32 const tc = parse_escape_sequence();
745                         if (tc >= 0x100) {
746                                 warningf(WARN_OTHER, &lexer_pos, "escape sequence out of range");
747                         }
748                         obstack_1grow(&symbol_obstack, tc);
749                         break;
750                 }
751
752                 MATCH_NEWLINE(
753                         parse_error("newline while parsing character constant");
754                         break;
755                 )
756
757                 case '\'':
758                         next_char();
759                         goto end_of_char_constant;
760
761                 case EOF:
762                         errorf(&lexer_token.base.source_position, "EOF while parsing character constant");
763                         goto end_of_char_constant;
764
765                 default:
766                         obstack_grow_symbol(&symbol_obstack, c);
767                         next_char();
768                         break;
769
770                 }
771         }
772
773 end_of_char_constant:;
774         obstack_1grow(&symbol_obstack, '\0');
775         const size_t        size   = (size_t)obstack_object_size(&symbol_obstack)-1;
776         char         *const string = obstack_finish(&symbol_obstack);
777
778         lexer_token.kind          = T_CHARACTER_CONSTANT;
779         lexer_token.string.string = identify_string(string, size);
780
781         if (size == 0) {
782                 errorf(&lexer_token.base.source_position, "empty character constant");
783         }
784 }
785
786 /**
787  * Skip a multiline comment.
788  */
789 static void skip_multiline_comment(void)
790 {
791         while (true) {
792                 switch (c) {
793                 case '/':
794                         next_char();
795                         if (c == '*') {
796                                 /* nested comment, warn here */
797                                 warningf(WARN_COMMENT, &lexer_pos, "'/*' within comment");
798                         }
799                         break;
800                 case '*':
801                         next_char();
802                         if (c == '/') {
803                                 next_char();
804                                 return;
805                         }
806                         break;
807
808                 MATCH_NEWLINE(break;)
809
810                 case EOF: {
811                         errorf(&lexer_token.base.source_position,
812                                "at end of file while looking for comment end");
813                         return;
814                 }
815
816                 default:
817                         next_char();
818                         break;
819                 }
820         }
821 }
822
823 /**
824  * Skip a single line comment.
825  */
826 static void skip_line_comment(void)
827 {
828         while (true) {
829                 switch (c) {
830                 case EOF:
831                         return;
832
833                 case '\n':
834                 case '\r':
835                         return;
836
837                 case '\\':
838                         next_char();
839                         if (c == '\n' || c == '\r') {
840                                 warningf(WARN_COMMENT, &lexer_pos, "multi-line comment");
841                                 return;
842                         }
843                         break;
844
845                 default:
846                         next_char();
847                         break;
848                 }
849         }
850 }
851
852 /** The current preprocessor token. */
853 static token_t pp_token;
854
855 /**
856  * Read the next preprocessor token.
857  */
858 static inline void next_pp_token(void)
859 {
860         lexer_next_preprocessing_token();
861         pp_token = lexer_token;
862 }
863
864 /**
865  * Eat all preprocessor tokens until newline.
866  */
867 static void eat_until_newline(void)
868 {
869         while (pp_token.kind != '\n' && pp_token.kind != T_EOF) {
870                 next_pp_token();
871         }
872 }
873
874 /**
875  * Parse the line directive.
876  */
877 static void parse_line_directive(void)
878 {
879         if (pp_token.kind != T_INTEGER) {
880                 parse_error("expected integer");
881         } else {
882                 /* use offset -1 as this is about the next line */
883                 lexer_pos.lineno = atoi(pp_token.number.number.begin) - 1;
884                 next_pp_token();
885         }
886         if (pp_token.kind == T_STRING_LITERAL) {
887                 lexer_pos.input_name = pp_token.string.string.begin;
888                 lexer_pos.is_system_header = false;
889                 next_pp_token();
890
891                 /* attempt to parse numeric flags as outputted by gcc preprocessor */
892                 while (pp_token.kind == T_INTEGER) {
893                         /* flags:
894                          * 1 - indicates start of a new file
895                          * 2 - indicates return from a file
896                          * 3 - indicates system header
897                          * 4 - indicates implicit extern "C" in C++ mode
898                          *
899                          * currently we're only interested in "3"
900                          */
901                         if (streq(pp_token.number.number.begin, "3")) {
902                                 lexer_pos.is_system_header = true;
903                         }
904                         next_pp_token();
905                 }
906         }
907
908         eat_until_newline();
909 }
910
/**
 * The STDC pragmas recognized by parse_pragma().
 */
typedef enum stdc_pragma_kind_t {
        STDC_UNKNOWN,           /* none of the pragmas below */
        STDC_FP_CONTRACT,       /* #pragma STDC FP_CONTRACT */
        STDC_FENV_ACCESS,       /* #pragma STDC FENV_ACCESS */
        STDC_CX_LIMITED_RANGE   /* #pragma STDC CX_LIMITED_RANGE */
} stdc_pragma_kind_t;

/**
 * The arguments an STDC pragma can take.
 */
typedef enum stdc_pragma_value_kind_t {
        STDC_VALUE_UNKNOWN,     /* none of the arguments below */
        STDC_VALUE_ON,          /* ON */
        STDC_VALUE_OFF,         /* OFF */
        STDC_VALUE_DEFAULT      /* DEFAULT */
} stdc_pragma_value_kind_t;
930
931 /**
932  * Parse a pragma directive.
933  */
934 static void parse_pragma(void)
935 {
936         bool unknown_pragma = true;
937
938         next_pp_token();
939         if (pp_token.kind != T_IDENTIFIER) {
940                 warningf(WARN_UNKNOWN_PRAGMAS, &pp_token.base.source_position,
941                          "expected identifier after #pragma");
942                 eat_until_newline();
943                 return;
944         }
945
946         symbol_t *symbol = pp_token.identifier.symbol;
947         if (symbol->pp_ID == TP_STDC) {
948                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
949                 /* a STDC pragma */
950                 if (c_mode & _C99) {
951                         next_pp_token();
952
953                         switch (pp_token.identifier.symbol->pp_ID) {
954                         case TP_FP_CONTRACT:
955                                 kind = STDC_FP_CONTRACT;
956                                 break;
957                         case TP_FENV_ACCESS:
958                                 kind = STDC_FENV_ACCESS;
959                                 break;
960                         case TP_CX_LIMITED_RANGE:
961                                 kind = STDC_CX_LIMITED_RANGE;
962                                 break;
963                         default:
964                                 break;
965                         }
966                         if (kind != STDC_UNKNOWN) {
967                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
968                                 next_pp_token();
969                                 switch (pp_token.identifier.symbol->pp_ID) {
970                                 case TP_ON:
971                                         value = STDC_VALUE_ON;
972                                         break;
973                                 case TP_OFF:
974                                         value = STDC_VALUE_OFF;
975                                         break;
976                                 case TP_DEFAULT:
977                                         value = STDC_VALUE_DEFAULT;
978                                         break;
979                                 default:
980                                         break;
981                                 }
982                                 if (value != STDC_VALUE_UNKNOWN) {
983                                         unknown_pragma = false;
984                                 } else {
985                                         errorf(&pp_token.base.source_position,
986                                                "bad STDC pragma argument");
987                                 }
988                         }
989                 }
990         } else {
991                 unknown_pragma = true;
992         }
993         eat_until_newline();
994         if (unknown_pragma) {
995                 warningf(WARN_UNKNOWN_PRAGMAS, &pp_token.base.source_position,
996                          "encountered unknown #pragma");
997         }
998 }
999
1000 /**
1001  * Parse a preprocessor non-null directive.
1002  */
1003 static void parse_preprocessor_identifier(void)
1004 {
1005         assert(pp_token.kind == T_IDENTIFIER);
1006         symbol_t *symbol = pp_token.identifier.symbol;
1007
1008         switch (symbol->pp_ID) {
1009         case TP_line:
1010                 next_pp_token();
1011                 parse_line_directive();
1012                 break;
1013         case TP_pragma:
1014                 parse_pragma();
1015                 break;
1016         case TP_error:
1017                 /* TODO; output the rest of the line */
1018                 parse_error("#error directive");
1019                 break;
1020         }
1021 }
1022
1023 /**
1024  * Parse a preprocessor directive.
1025  */
1026 static void parse_preprocessor_directive(void)
1027 {
1028         next_pp_token();
1029
1030         switch (pp_token.kind) {
1031         case T_IDENTIFIER:
1032                 parse_preprocessor_identifier();
1033                 break;
1034         case T_INTEGER:
1035                 parse_line_directive();
1036                 break;
1037         case '\n':
1038                 /* NULL directive, see Â§6.10.7 */
1039                 break;
1040         default:
1041                 parse_error("invalid preprocessor directive");
1042                 eat_until_newline();
1043                 break;
1044         }
1045 }
1046
/**
 * Opens a lookahead block for a multi-character punctuator: consumes the
 * current character with next_char() and starts a "while (true) { switch (c) {"
 * on the character that follows. The braces opened here are closed by
 * ELSE_CODE() (and therefore by ELSE()), so every MAYBE_PROLOG has to be
 * paired with one of them.
 */
#define MAYBE_PROLOG                                       \
                        next_char();                                   \
                        while (true) {                                 \
                                switch (c) {
1051
/**
 * Case label for use inside a MAYBE_PROLOG block: if the lookahead character
 * is ch, it is consumed, lexer_token.kind is set to set_type and the
 * enclosing function returns.
 */
#define MAYBE(ch, set_type)                                \
                                case ch:                                   \
                                        next_char();                           \
                                        lexer_token.kind = set_type;           \
                                        return;
1057
/**
 * Like MAYBE(), but the token is only recognised when (c_mode & mode) is
 * non-zero. Otherwise control falls through to whatever case label follows
 * without consuming ch; this is why it must be the last entry before
 * ELSE()/ELSE_CODE(), so that the fallthrough lands in the default branch.
 */
#define MAYBE_MODE(ch, set_type, mode)                     \
                                case ch:                                   \
                                        if (c_mode & mode) {                   \
                                                next_char();                       \
                                                lexer_token.kind = set_type;       \
                                                return;                            \
                                        }                                      \
                                        /* fallthrough */
1067
/**
 * Closes a MAYBE_PROLOG block: emits the default branch, which runs the given
 * code and returns from the enclosing function, followed by the closing
 * braces of the switch and of the while (true) loop. The lookahead character
 * is not consumed here.
 */
#define ELSE_CODE(code)                                    \
                                default:                                   \
                                        code                                   \
                                        return;                                \
                                }                                          \
                        } /* end of while (true) */                    \
1074
/**
 * Common form of ELSE_CODE(): when no longer punctuator matched, the token
 * kind is set to set_type and the enclosing function returns.
 */
#define ELSE(set_type)                                     \
                ELSE_CODE(                                         \
                        lexer_token.kind = set_type;                   \
                )
1079
/**
 * Read the next preprocessing token from the input into lexer_token.
 *
 * The token's source position is copied from lexer_pos before its first
 * character is examined. Spaces and tabs are skipped, a newline is reported
 * as a token of kind '\n' and the end of input as T_EOF. After the start of a
 * comment the matching skip function is called and lexing restarts. Unknown
 * characters are reported with errorf() and skipped.
 *
 * Multi-character punctuators are recognised with the MAYBE_PROLOG / MAYBE /
 * MAYBE_MODE / ELSE / ELSE_CODE macros; each of their branches returns from
 * this function once the token kind is known.
 */
void lexer_next_preprocessing_token(void)
{
        while (true) {
                lexer_token.base.source_position = lexer_pos;

                switch (c) {
                /* horizontal whitespace: skip it and look at the next character */
                case ' ':
                case '\t':
                        next_char();
                        break;

                /* MATCH_NEWLINE is defined outside this view; presumably it
                 * expands to the newline case labels and runs the given code */
                MATCH_NEWLINE(
                        lexer_token.kind = '\n';
                        return;
                )

                SYMBOL_CHARS
                        parse_symbol();
                        /* might be a wide string ( L"string" ) */
                        if (lexer_token.identifier.symbol == symbol_L) {
                                switch (c) {
                                        case '"':  parse_wide_string_literal();     break;
                                        case '\'': parse_wide_character_constant(); break;
                                }
                        }
                        return;

                DIGITS
                        parse_number();
                        return;

                case '"':
                        parse_string_literal();
                        return;

                case '\'':
                        parse_character_constant();
                        return;

                case '.':
                        MAYBE_PROLOG
                                /* '.' followed by a digit is a number: push the
                                 * digit back and restore '.' as the current
                                 * character so parse_number() starts at the dot */
                                DIGITS
                                        put_back(c);
                                        c = '.';
                                        parse_number();
                                        return;

                                /* ".." is only a token when a third '.' follows;
                                 * otherwise a single '.' is emitted and the second
                                 * '.' is made current again for the next call */
                                case '.':
                                        MAYBE_PROLOG
                                        MAYBE('.', T_DOTDOTDOT)
                                        ELSE_CODE(
                                                put_back(c);
                                                c = '.';
                                                lexer_token.kind = '.';
                                        )
                        ELSE('.')
                case '&':
                        MAYBE_PROLOG
                        MAYBE('&', T_ANDAND)
                        MAYBE('=', T_ANDEQUAL)
                        ELSE('&')
                case '*':
                        MAYBE_PROLOG
                        MAYBE('=', T_ASTERISKEQUAL)
                        ELSE('*')
                case '+':
                        MAYBE_PROLOG
                        MAYBE('+', T_PLUSPLUS)
                        MAYBE('=', T_PLUSEQUAL)
                        ELSE('+')
                case '-':
                        MAYBE_PROLOG
                        MAYBE('>', T_MINUSGREATER)
                        MAYBE('-', T_MINUSMINUS)
                        MAYBE('=', T_MINUSEQUAL)
                        ELSE('-')
                case '!':
                        MAYBE_PROLOG
                        MAYBE('=', T_EXCLAMATIONMARKEQUAL)
                        ELSE('!')
                case '/':
                        MAYBE_PROLOG
                        MAYBE('=', T_SLASHEQUAL)
                                /* start of a comment: skip it, then let a nested
                                 * call produce the token that follows */
                                case '*':
                                        next_char();
                                        skip_multiline_comment();
                                        lexer_next_preprocessing_token();
                                        return;
                                case '/':
                                        next_char();
                                        skip_line_comment();
                                        lexer_next_preprocessing_token();
                                        return;
                        ELSE('/')
                /* '%' also starts the digraphs "%>" for '}', "%:" for '#' and
                 * "%:%:" for T_HASHHASH */
                case '%':
                        MAYBE_PROLOG
                        MAYBE('>', '}')
                        MAYBE('=', T_PERCENTEQUAL)
                                case ':':
                                        MAYBE_PROLOG
                                                case '%':
                                                        MAYBE_PROLOG
                                                        MAYBE(':', T_HASHHASH)
                                                        /* "%:%" without a closing ':'
                                                         * emits '#' for the "%:" and
                                                         * makes the second '%' current
                                                         * again */
                                                        ELSE_CODE(
                                                                put_back(c);
                                                                c = '%';
                                                                lexer_token.kind = '#';
                                                        )
                                        ELSE('#')
                        ELSE('%')
                /* '<' also starts the digraphs "<:" for '[' and "<%" for '{' */
                case '<':
                        MAYBE_PROLOG
                        MAYBE(':', '[')
                        MAYBE('%', '{')
                        MAYBE('=', T_LESSEQUAL)
                                case '<':
                                        MAYBE_PROLOG
                                        MAYBE('=', T_LESSLESSEQUAL)
                                        ELSE(T_LESSLESS)
                        ELSE('<')
                case '>':
                        MAYBE_PROLOG
                        MAYBE('=', T_GREATEREQUAL)
                                case '>':
                                        MAYBE_PROLOG
                                        MAYBE('=', T_GREATERGREATEREQUAL)
                                        ELSE(T_GREATERGREATER)
                        ELSE('>')
                case '^':
                        MAYBE_PROLOG
                        MAYBE('=', T_CARETEQUAL)
                        ELSE('^')
                case '|':
                        MAYBE_PROLOG
                        MAYBE('=', T_PIPEEQUAL)
                        MAYBE('|', T_PIPEPIPE)
                        ELSE('|')
                /* ":>" is the digraph for ']'; "::" is only a token when
                 * c_mode has _CXX set, otherwise a single ':' is emitted and the
                 * second ':' stays current */
                case ':':
                        MAYBE_PROLOG
                        MAYBE('>', ']')
                        MAYBE_MODE(':', T_COLONCOLON, _CXX)
                        ELSE(':')
                case '=':
                        MAYBE_PROLOG
                        MAYBE('=', T_EQUALEQUAL)
                        ELSE('=')
                case '#':
                        MAYBE_PROLOG
                        MAYBE('#', T_HASHHASH)
                        ELSE('#')

                /* single-character tokens: the token kind is the character */
                case '?':
                case '[':
                case ']':
                case '(':
                case ')':
                case '{':
                case '}':
                case '~':
                case ';':
                case ',':
                case '\\':
                        lexer_token.kind = c;
                        next_char();
                        return;

                /* end of input is reported without consuming anything */
                case EOF:
                        lexer_token.kind = T_EOF;
                        return;

                /* anything else is reported and skipped, then lexing continues.
                 * NOTE(review): no goto to dollar_sign is visible in this
                 * function body; presumably it is targeted from the
                 * SYMBOL_CHARS macro - confirm */
                default:
dollar_sign:
                        errorf(&lexer_pos, "unknown character '%c' found", c);
                        next_char();
                        break;
                }
        }
}
1258
1259 void lexer_next_token(void)
1260 {
1261         lexer_next_preprocessing_token();
1262
1263         while (lexer_token.kind == '\n') {
1264 newline_found:
1265                 lexer_next_preprocessing_token();
1266         }
1267
1268         if (lexer_token.kind == '#') {
1269                 parse_preprocessor_directive();
1270                 goto newline_found;
1271         }
1272 }
1273
/**
 * Initialise the lexer's global state: sets up the string set and looks up
 * the symbol "L".
 */
void init_lexer(void)
{
        strset_init(&stringset);
        /* kept so that the L prefix of wide string literals and wide character
         * constants can be recognised by comparing symbols */
        symbol_L = symbol_table_insert("L");
}
1279
1280 static void input_error(unsigned delta_lines, unsigned delta_cols,
1281                         const char *message)
1282 {
1283         lexer_pos.lineno += delta_lines;
1284         lexer_pos.colno  += delta_cols;
1285         errorf(&lexer_pos, "%s", message);
1286 }
1287
1288 void lexer_switch_input(input_t *new_input, const char *input_name)
1289 {
1290         lexer_pos.lineno     = 0;
1291         lexer_pos.colno      = 0;
1292         lexer_pos.input_name = input_name;
1293
1294         set_input_error_callback(input_error);
1295         input  = new_input;
1296         bufpos = NULL;
1297         bufend = NULL;
1298
1299         /* place a virtual \n at the beginning so the lexer knows that we're
1300          * at the beginning of a line */
1301         c = '\n';
1302 }
1303
/**
 * Release the lexer's global state: destroys the string set that
 * init_lexer() initialised.
 */
void exit_lexer(void)
{
        strset_destroy(&stringset);
}
1308
1309 static __attribute__((unused))
1310 void dbg_pos(const source_position_t source_position)
1311 {
1312         fprintf(stdout, "%s:%u:%u\n", source_position.input_name,
1313                 source_position.lineno, (unsigned)source_position.colno);
1314         fflush(stdout);
1315 }