add license comments
[cparser] / lexer.c
1 /*
2  * This file is part of cparser.
3  * Copyright (C) 2007-2008 Matthias Braun <matze@braunis.de>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License
7  * as published by the Free Software Foundation; either version 2
8  * of the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
18  * 02111-1307, USA.
19  */
20 #include <config.h>
21
22 #include "diagnostic.h"
23 #include "lexer.h"
24 #include "token_t.h"
25 #include "symbol_table_t.h"
26 #include "adt/error.h"
27 #include "adt/strset.h"
28 #include "adt/util.h"
29 #include "types.h"
30 #include "type_t.h"
31 #include "target_architecture.h"
32 #include "parser.h"
33 #include "warning.h"
34 #include "lang_features.h"
35
36 #include <assert.h>
37 #include <errno.h>
38 #include <string.h>
39 #include <stdbool.h>
40 #include <ctype.h>
41
42 //#define DEBUG_CHARS
43 #define MAX_PUTBACK 3
44
45 #ifdef _WIN32
46 /* No strtold on windows and no replacement yet */
47 #define strtold(s, e) strtod(s, e)
48 #endif
49
50 static int         c;
51 token_t            lexer_token;
52 symbol_t          *symbol_L;
53 static FILE       *input;
54 static char        buf[1024 + MAX_PUTBACK];
55 static const char *bufend;
56 static const char *bufpos;
57 static strset_t    stringset;
58
59 /**
60  * Prints a parse error message at the current token.
61  *
62  * @param msg   the error message
63  */
64 static void parse_error(const char *msg)
65 {
66         errorf(lexer_token.source_position,  "%s", msg);
67 }
68
69 static inline void next_real_char(void)
70 {
71         assert(bufpos <= bufend);
72         if (bufpos >= bufend) {
73                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
74                                  input);
75                 if(s == 0) {
76                         c = EOF;
77                         return;
78                 }
79                 bufpos = buf + MAX_PUTBACK;
80                 bufend = buf + MAX_PUTBACK + s;
81         }
82         c = *bufpos++;
83 }
84
85 /**
86  * Put a character back into the buffer.
87  *
88  * @param pc  the character to put back
89  */
90 static inline void put_back(int pc)
91 {
92         assert(bufpos > buf);
93         *(--bufpos - buf + buf) = (char) pc;
94
95 #ifdef DEBUG_CHARS
96         printf("putback '%c'\n", pc);
97 #endif
98 }
99
100 static inline void next_char(void);
101
102 #define MATCH_NEWLINE(code)                   \
103         case '\r':                                \
104                 next_char();                          \
105                 if(c == '\n') {                       \
106                         next_char();                      \
107                 }                                     \
108                 lexer_token.source_position.linenr++; \
109                 code                                  \
110         case '\n':                                \
111                 next_char();                          \
112                 lexer_token.source_position.linenr++; \
113                 code
114
115 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
116
117 static void maybe_concat_lines(void)
118 {
119         eat('\\');
120
121         switch(c) {
122         MATCH_NEWLINE(return;)
123
124         default:
125                 break;
126         }
127
128         put_back(c);
129         c = '\\';
130 }
131
132 /**
133  * Set c to the next input character, ie.
134  * after expanding trigraphs.
135  */
136 static inline void next_char(void)
137 {
138         next_real_char();
139
140         /* filter trigraphs */
141         if(UNLIKELY(c == '\\')) {
142                 maybe_concat_lines();
143                 goto end_of_next_char;
144         }
145
146         if(LIKELY(c != '?'))
147                 goto end_of_next_char;
148
149         next_real_char();
150         if(LIKELY(c != '?')) {
151                 put_back(c);
152                 c = '?';
153                 goto end_of_next_char;
154         }
155
156         next_real_char();
157         switch(c) {
158         case '=': c = '#'; break;
159         case '(': c = '['; break;
160         case '/': c = '\\'; maybe_concat_lines(); break;
161         case ')': c = ']'; break;
162         case '\'': c = '^'; break;
163         case '<': c = '{'; break;
164         case '!': c = '|'; break;
165         case '>': c = '}'; break;
166         case '-': c = '~'; break;
167         default:
168                 put_back(c);
169                 put_back('?');
170                 c = '?';
171                 break;
172         }
173
174 end_of_next_char:;
175 #ifdef DEBUG_CHARS
176         printf("nchar '%c'\n", c);
177 #endif
178 }
179
180 #define SYMBOL_CHARS  \
181         case 'a':         \
182         case 'b':         \
183         case 'c':         \
184         case 'd':         \
185         case 'e':         \
186         case 'f':         \
187         case 'g':         \
188         case 'h':         \
189         case 'i':         \
190         case 'j':         \
191         case 'k':         \
192         case 'l':         \
193         case 'm':         \
194         case 'n':         \
195         case 'o':         \
196         case 'p':         \
197         case 'q':         \
198         case 'r':         \
199         case 's':         \
200         case 't':         \
201         case 'u':         \
202         case 'v':         \
203         case 'w':         \
204         case 'x':         \
205         case 'y':         \
206         case 'z':         \
207         case 'A':         \
208         case 'B':         \
209         case 'C':         \
210         case 'D':         \
211         case 'E':         \
212         case 'F':         \
213         case 'G':         \
214         case 'H':         \
215         case 'I':         \
216         case 'J':         \
217         case 'K':         \
218         case 'L':         \
219         case 'M':         \
220         case 'N':         \
221         case 'O':         \
222         case 'P':         \
223         case 'Q':         \
224         case 'R':         \
225         case 'S':         \
226         case 'T':         \
227         case 'U':         \
228         case 'V':         \
229         case 'W':         \
230         case 'X':         \
231         case 'Y':         \
232         case 'Z':         \
233         case '_':
234
235 #define DIGITS        \
236         case '0':         \
237         case '1':         \
238         case '2':         \
239         case '3':         \
240         case '4':         \
241         case '5':         \
242         case '6':         \
243         case '7':         \
244         case '8':         \
245         case '9':
246
247 /**
248  * Read a symbol from the input and build
249  * the lexer_token.
250  */
251 static void parse_symbol(void)
252 {
253         symbol_t *symbol;
254         char     *string;
255
256         obstack_1grow(&symbol_obstack, (char) c);
257         next_char();
258
259         while(1) {
260                 switch(c) {
261                 DIGITS
262                 SYMBOL_CHARS
263                         obstack_1grow(&symbol_obstack, (char) c);
264                         next_char();
265                         break;
266
267                 default:
268                         goto end_symbol;
269                 }
270         }
271
272 end_symbol:
273         obstack_1grow(&symbol_obstack, '\0');
274
275         string = obstack_finish(&symbol_obstack);
276         symbol = symbol_table_insert(string);
277
278         lexer_token.type     = symbol->ID;
279         lexer_token.v.symbol = symbol;
280
281         if(symbol->string != string) {
282                 obstack_free(&symbol_obstack, string);
283         }
284 }
285
286 static void parse_integer_suffix(bool is_oct_hex)
287 {
288         bool is_unsigned  = false;
289         bool min_long     = false;
290         bool min_longlong = false;
291
292         if(c == 'U' || c == 'u') {
293                 is_unsigned = true;
294                 next_char();
295                 if(c == 'L' || c == 'l') {
296                         min_long = true;
297                         next_char();
298                         if(c == 'L' || c == 'l') {
299                                 min_longlong = true;
300                                 next_char();
301                         }
302                 }
303         } else if(c == 'l' || c == 'L') {
304                 min_long = true;
305                 next_char();
306                 if(c == 'l' || c == 'L') {
307                         min_longlong = true;
308                         next_char();
309                         if(c == 'u' || c == 'U') {
310                                 is_unsigned = true;
311                                 next_char();
312                         }
313                 } else if(c == 'u' || c == 'U') {
314                         is_unsigned = true;
315                         next_char();
316                         lexer_token.datatype = type_unsigned_long;
317                 }
318         }
319
320         if(!is_unsigned) {
321                 long long v = lexer_token.v.intvalue;
322                 if(!min_long) {
323                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
324                                 lexer_token.datatype = type_int;
325                                 return;
326                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
327                                 lexer_token.datatype = type_unsigned_int;
328                                 return;
329                         }
330                 }
331                 if(!min_longlong) {
332                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
333                                 lexer_token.datatype = type_long;
334                                 return;
335                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
336                                 lexer_token.datatype = type_unsigned_long;
337                                 return;
338                         }
339                 }
340                 unsigned long long uv = (unsigned long long) v;
341                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
342                         lexer_token.datatype = type_unsigned_long_long;
343                         return;
344                 }
345
346                 lexer_token.datatype = type_long_long;
347         } else {
348                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
349                 if(!min_long && v <= TARGET_UINT_MAX) {
350                         lexer_token.datatype = type_unsigned_int;
351                         return;
352                 }
353                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
354                         lexer_token.datatype = type_unsigned_long;
355                         return;
356                 }
357                 lexer_token.datatype = type_unsigned_long_long;
358         }
359 }
360
361 static void parse_floating_suffix(void)
362 {
363         switch(c) {
364         /* TODO: do something useful with the suffixes... */
365         case 'f':
366         case 'F':
367                 next_char();
368                 lexer_token.datatype = type_float;
369                 break;
370         case 'l':
371         case 'L':
372                 next_char();
373                 lexer_token.datatype = type_long_double;
374                 break;
375         default:
376                 lexer_token.datatype = type_double;
377                 break;
378         }
379 }
380
381 /**
382  * A replacement for strtoull. Only those parts needed for
383  * our parser are implemented.
384  */
385 static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
386         unsigned long long v = 0;
387
388         switch (base) {
389         case 16:
390                 for (;; ++s) {
391                         /* check for overrun */
392                         if (v >= 0x1000000000000000ULL)
393                                 break;
394                         switch (tolower(*s)) {
395                         case '0': v <<= 4; break;
396                         case '1': v <<= 4; v |= 0x1; break;
397                         case '2': v <<= 4; v |= 0x2; break;
398                         case '3': v <<= 4; v |= 0x3; break;
399                         case '4': v <<= 4; v |= 0x4; break;
400                         case '5': v <<= 4; v |= 0x5; break;
401                         case '6': v <<= 4; v |= 0x6; break;
402                         case '7': v <<= 4; v |= 0x7; break;
403                         case '8': v <<= 4; v |= 0x8; break;
404                         case '9': v <<= 4; v |= 0x9; break;
405                         case 'a': v <<= 4; v |= 0xa; break;
406                         case 'b': v <<= 4; v |= 0xb; break;
407                         case 'c': v <<= 4; v |= 0xc; break;
408                         case 'd': v <<= 4; v |= 0xd; break;
409                         case 'e': v <<= 4; v |= 0xe; break;
410                         case 'f': v <<= 4; v |= 0xf; break;
411                         default:
412                                 goto end;
413                         }
414                 }
415                 break;
416         case 8:
417                 for (;; ++s) {
418                         /* check for overrun */
419                         if (v >= 0x2000000000000000ULL)
420                                 break;
421                         switch (tolower(*s)) {
422                         case '0': v <<= 3; break;
423                         case '1': v <<= 3; v |= 1; break;
424                         case '2': v <<= 3; v |= 2; break;
425                         case '3': v <<= 3; v |= 3; break;
426                         case '4': v <<= 3; v |= 4; break;
427                         case '5': v <<= 3; v |= 5; break;
428                         case '6': v <<= 3; v |= 6; break;
429                         case '7': v <<= 3; v |= 7; break;
430                         default:
431                                 goto end;
432                         }
433                 }
434                 break;
435         case 10:
436                 for (;; ++s) {
437                         /* check for overrun */
438                         if (v > 0x1999999999999999ULL)
439                                 break;
440                         switch (tolower(*s)) {
441                         case '0': v *= 10; break;
442                         case '1': v *= 10; v += 1; break;
443                         case '2': v *= 10; v += 2; break;
444                         case '3': v *= 10; v += 3; break;
445                         case '4': v *= 10; v += 4; break;
446                         case '5': v *= 10; v += 5; break;
447                         case '6': v *= 10; v += 6; break;
448                         case '7': v *= 10; v += 7; break;
449                         case '8': v *= 10; v += 8; break;
450                         case '9': v *= 10; v += 9; break;
451                         default:
452                                 goto end;
453                         }
454                 }
455                 break;
456         default:
457                 assert(0);
458                 break;
459         }
460 end:
461         *endptr = s;
462         return v;
463 }
464
465 /**
466  * Parses a hex number including hex floats and set the
467  * lexer_token.
468  */
469 static void parse_number_hex(void)
470 {
471         assert(c == 'x' || c == 'X');
472         next_char();
473
474         while(isxdigit(c)) {
475                 obstack_1grow(&symbol_obstack, (char) c);
476                 next_char();
477         }
478         obstack_1grow(&symbol_obstack, '\0');
479         char *string = obstack_finish(&symbol_obstack);
480
481         if(c == '.' || c == 'p' || c == 'P') {
482                 next_char();
483                 panic("Hex floating point numbers not implemented yet");
484         }
485         if(*string == '\0') {
486                 parse_error("invalid hex number");
487                 lexer_token.type = T_ERROR;
488         }
489
490         const char *endptr;
491         lexer_token.type       = T_INTEGER;
492         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
493         if(*endptr != '\0') {
494                 parse_error("hex number literal too long");
495         }
496
497         obstack_free(&symbol_obstack, string);
498         parse_integer_suffix(true);
499 }
500
501 /**
502  * Returns true if the given char is a octal digit.
503  *
504  * @param char  the character to check
505  */
506 static inline bool is_octal_digit(int chr)
507 {
508         switch(chr) {
509         case '0':
510         case '1':
511         case '2':
512         case '3':
513         case '4':
514         case '5':
515         case '6':
516         case '7':
517                 return true;
518         default:
519                 return false;
520         }
521 }
522
523 /**
524  * Parses a octal number and set the lexer_token.
525  */
526 static void parse_number_oct(void)
527 {
528         while(is_octal_digit(c)) {
529                 obstack_1grow(&symbol_obstack, (char) c);
530                 next_char();
531         }
532         obstack_1grow(&symbol_obstack, '\0');
533         char *string = obstack_finish(&symbol_obstack);
534
535         const char *endptr;
536         lexer_token.type       = T_INTEGER;
537         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
538         if(*endptr != '\0') {
539                 parse_error("octal number literal too long");
540         }
541
542         obstack_free(&symbol_obstack, string);
543         parse_integer_suffix(true);
544 }
545
546 /**
547  * Parses a decimal including float number and set the
548  * lexer_token.
549  */
550 static void parse_number_dec(void)
551 {
552         bool is_float = false;
553         while(isdigit(c)) {
554                 obstack_1grow(&symbol_obstack, (char) c);
555                 next_char();
556         }
557
558         if(c == '.') {
559                 obstack_1grow(&symbol_obstack, '.');
560                 next_char();
561
562                 while(isdigit(c)) {
563                         obstack_1grow(&symbol_obstack, (char) c);
564                         next_char();
565                 }
566                 is_float = true;
567         }
568         if(c == 'e' || c == 'E') {
569                 obstack_1grow(&symbol_obstack, 'e');
570                 next_char();
571
572                 if(c == '-' || c == '+') {
573                         obstack_1grow(&symbol_obstack, (char) c);
574                         next_char();
575                 }
576
577                 while(isdigit(c)) {
578                         obstack_1grow(&symbol_obstack, (char) c);
579                         next_char();
580                 }
581                 is_float = true;
582         }
583
584         obstack_1grow(&symbol_obstack, '\0');
585         char *string = obstack_finish(&symbol_obstack);
586
587         if(is_float) {
588                 char *endptr;
589                 lexer_token.type         = T_FLOATINGPOINT;
590                 lexer_token.v.floatvalue = strtold(string, &endptr);
591
592                 if(*endptr != '\0') {
593                         parse_error("invalid number literal");
594                 }
595
596                 parse_floating_suffix();
597         } else {
598                 const char *endptr;
599                 lexer_token.type       = T_INTEGER;
600                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
601
602                 if(*endptr != '\0') {
603                         parse_error("invalid number literal");
604                 }
605
606                 parse_integer_suffix(false);
607         }
608         obstack_free(&symbol_obstack, string);
609 }
610
611 /**
612  * Parses a number and sets the lexer_token.
613  */
614 static void parse_number(void)
615 {
616         if (c == '0') {
617                 next_char();
618                 switch (c) {
619                         case 'X':
620                         case 'x':
621                                 parse_number_hex();
622                                 break;
623                         case '0':
624                         case '1':
625                         case '2':
626                         case '3':
627                         case '4':
628                         case '5':
629                         case '6':
630                         case '7':
631                                 parse_number_oct();
632                                 break;
633                         case '8':
634                         case '9':
635                                 next_char();
636                                 parse_error("invalid octal number");
637                                 lexer_token.type = T_ERROR;
638                                 return;
639                         case '.':
640                         case 'e':
641                         case 'E':
642                         default:
643                                 obstack_1grow(&symbol_obstack, '0');
644                                 parse_number_dec();
645                                 return;
646                 }
647         } else {
648                 parse_number_dec();
649         }
650 }
651
652 /**
653  * Returns the value of a digit.
654  * The only portable way to do it ...
655  */
656 static int digit_value(int digit) {
657         switch (digit) {
658         case '0': return 0;
659         case '1': return 1;
660         case '2': return 2;
661         case '3': return 3;
662         case '4': return 4;
663         case '5': return 5;
664         case '6': return 6;
665         case '7': return 7;
666         case '8': return 8;
667         case '9': return 9;
668         case 'a':
669         case 'A': return 10;
670         case 'b':
671         case 'B': return 11;
672         case 'c':
673         case 'C': return 12;
674         case 'd':
675         case 'D': return 13;
676         case 'e':
677         case 'E': return 14;
678         case 'f':
679         case 'F': return 15;
680         default:
681                 panic("wrong character given");
682         }
683 }
684
685 /**
686  * Parses an octal character sequence.
687  *
688  * @param first_digit  the already read first digit
689  */
690 static int parse_octal_sequence(const int first_digit)
691 {
692         assert(is_octal_digit(first_digit));
693         int value = digit_value(first_digit);
694         if (!is_octal_digit(c)) return value;
695         value = 8 * value + digit_value(c);
696         next_char();
697         if (!is_octal_digit(c)) return value;
698         value = 8 * value + digit_value(c);
699         next_char();
700
701         if(char_is_signed) {
702                 return (signed char) value;
703         } else {
704                 return (unsigned char) value;
705         }
706 }
707
708 /**
709  * Parses a hex character sequence.
710  */
711 static int parse_hex_sequence(void)
712 {
713         int value = 0;
714         while(isxdigit(c)) {
715                 value = 16 * value + digit_value(c);
716                 next_char();
717         }
718
719         if(char_is_signed) {
720                 return (signed char) value;
721         } else {
722                 return (unsigned char) value;
723         }
724 }
725
726 /**
727  * Parse an escape sequence.
728  */
729 static int parse_escape_sequence(void)
730 {
731         eat('\\');
732
733         int ec = c;
734         next_char();
735
736         switch(ec) {
737         case '"':  return '"';
738         case '\'': return '\'';
739         case '\\': return '\\';
740         case '?': return '\?';
741         case 'a': return '\a';
742         case 'b': return '\b';
743         case 'f': return '\f';
744         case 'n': return '\n';
745         case 'r': return '\r';
746         case 't': return '\t';
747         case 'v': return '\v';
748         case 'x':
749                 return parse_hex_sequence();
750         case '0':
751         case '1':
752         case '2':
753         case '3':
754         case '4':
755         case '5':
756         case '6':
757         case '7':
758                 return parse_octal_sequence(ec);
759         case EOF:
760                 parse_error("reached end of file while parsing escape sequence");
761                 return EOF;
762         default:
763                 parse_error("unknown escape sequence");
764                 return EOF;
765         }
766 }
767
768 /**
769  * Concatenate two strings.
770  */
771 string_t concat_strings(const string_t *const s1, const string_t *const s2)
772 {
773         const size_t len1 = s1->size - 1;
774         const size_t len2 = s2->size - 1;
775
776         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
777         memcpy(concat, s1->begin, len1);
778         memcpy(concat + len1, s2->begin, len2 + 1);
779
780 #if 0 /* TODO hash */
781         const char *result = strset_insert(&stringset, concat);
782         if(result != concat) {
783                 obstack_free(&symbol_obstack, concat);
784         }
785
786         return result;
787 #else
788         return (string_t){ concat, len1 + len2 + 1 };
789 #endif
790 }
791
792 /**
793  * Concatenate a string and a wide string.
794  */
795 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
796 {
797         const size_t len1 = s1->size - 1;
798         const size_t len2 = s2->size - 1;
799
800         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
801         const char *const src = s1->begin;
802         for (size_t i = 0; i != len1; ++i) {
803                 concat[i] = src[i];
804         }
805         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
806
807         return (wide_string_t){ concat, len1 + len2 + 1 };
808 }
809
810 /**
811  * Concatenate two wide strings.
812  */
813 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
814 {
815         const size_t len1 = s1->size - 1;
816         const size_t len2 = s2->size - 1;
817
818         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
819         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
820         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
821
822         return (wide_string_t){ concat, len1 + len2 + 1 };
823 }
824
825 /**
826  * Concatenate a wide string and a string.
827  */
828 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
829 {
830         const size_t len1 = s1->size - 1;
831         const size_t len2 = s2->size - 1;
832
833         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
834         memcpy(concat, s1->begin, len1 * sizeof(*concat));
835         const char *const src = s2->begin;
836         for (size_t i = 0; i != len2 + 1; ++i) {
837                 concat[i] = src[i];
838         }
839
840         return (wide_string_t){ concat, len1 + len2 + 1 };
841 }
842
843 /**
844  * Parse a string literal and set lexer_token.
845  */
846 static void parse_string_literal(void)
847 {
848         const unsigned start_linenr = lexer_token.source_position.linenr;
849
850         eat('"');
851
852         int tc;
853         while(1) {
854                 switch(c) {
855                 case '\\':
856                         tc = parse_escape_sequence();
857                         obstack_1grow(&symbol_obstack, (char) tc);
858                         break;
859
860                 case EOF: {
861                         source_position_t source_position;
862                         source_position.input_name = lexer_token.source_position.input_name;
863                         source_position.linenr     = start_linenr;
864                         errorf(source_position, "string has no end");
865                         lexer_token.type = T_ERROR;
866                         return;
867                 }
868
869                 case '"':
870                         next_char();
871                         goto end_of_string;
872
873                 default:
874                         obstack_1grow(&symbol_obstack, (char) c);
875                         next_char();
876                         break;
877                 }
878         }
879
880 end_of_string:
881
882         /* TODO: concatenate multiple strings separated by whitespace... */
883
884         /* add finishing 0 to the string */
885         obstack_1grow(&symbol_obstack, '\0');
886         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
887         const char *const string = obstack_finish(&symbol_obstack);
888
889 #if 0 /* TODO hash */
890         /* check if there is already a copy of the string */
891         result = strset_insert(&stringset, string);
892         if(result != string) {
893                 obstack_free(&symbol_obstack, string);
894         }
895 #else
896         const char *const result = string;
897 #endif
898
899         lexer_token.type           = T_STRING_LITERAL;
900         lexer_token.v.string.begin = result;
901         lexer_token.v.string.size  = size;
902 }
903
904 /**
905  * Parse a wide character constant and set lexer_token.
906  */
907 static void parse_wide_character_constant(void)
908 {
909         eat('\'');
910
911         int found_char = 0;
912         while(1) {
913                 switch(c) {
914                 case '\\':
915                         found_char = parse_escape_sequence();
916                         break;
917
918                 MATCH_NEWLINE(
919                         parse_error("newline while parsing character constant");
920                         break;
921                 )
922
923                 case '\'':
924                         next_char();
925                         goto end_of_wide_char_constant;
926
927                 case EOF:
928                         parse_error("EOF while parsing character constant");
929                         lexer_token.type = T_ERROR;
930                         return;
931
932                 default:
933                         if(found_char != 0) {
934                                 parse_error("more than 1 characters in character "
935                                             "constant");
936                                 goto end_of_wide_char_constant;
937                         } else {
938                                 found_char = c;
939                                 next_char();
940                         }
941                         break;
942                 }
943         }
944
945 end_of_wide_char_constant:
946         lexer_token.type       = T_INTEGER;
947         lexer_token.v.intvalue = found_char;
948         lexer_token.datatype   = type_wchar_t;
949 }
950
951 /**
952  * Parse a wide string literal and set lexer_token.
953  */
954 static void parse_wide_string_literal(void)
955 {
956         const unsigned start_linenr = lexer_token.source_position.linenr;
957
958         assert(c == '"');
959         next_char();
960
961         while(1) {
962                 switch(c) {
963                 case '\\': {
964                         wchar_rep_t tc = parse_escape_sequence();
965                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
966                         break;
967                 }
968
969                 case EOF: {
970                         source_position_t source_position;
971                         source_position.input_name = lexer_token.source_position.input_name;
972                         source_position.linenr     = start_linenr;
973                         errorf(source_position, "string has no end");
974                         lexer_token.type = T_ERROR;
975                         return;
976                 }
977
978                 case '"':
979                         next_char();
980                         goto end_of_string;
981
982                 default: {
983                         wchar_rep_t tc = c;
984                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
985                         next_char();
986                         break;
987                 }
988                 }
989         }
990
991 end_of_string:;
992
993         /* TODO: concatenate multiple strings separated by whitespace... */
994
995         /* add finishing 0 to the string */
996         wchar_rep_t nul = L'\0';
997         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
998         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
999         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1000
1001 #if 0 /* TODO hash */
1002         /* check if there is already a copy of the string */
1003         const wchar_rep_t *const result = strset_insert(&stringset, string);
1004         if(result != string) {
1005                 obstack_free(&symbol_obstack, string);
1006         }
1007 #else
1008         const wchar_rep_t *const result = string;
1009 #endif
1010
1011         lexer_token.type                = T_WIDE_STRING_LITERAL;
1012         lexer_token.v.wide_string.begin = result;
1013         lexer_token.v.wide_string.size  = size;
1014 }
1015
1016 /**
1017  * Parse a character constant and set lexer_token.
1018  */
1019 static void parse_character_constant(void)
1020 {
1021         const unsigned start_linenr = lexer_token.source_position.linenr;
1022
1023         eat('\'');
1024
1025         int tc;
1026         while(1) {
1027                 switch(c) {
1028                 case '\\':
1029                         tc = parse_escape_sequence();
1030                         obstack_1grow(&symbol_obstack, (char) tc);
1031                         break;
1032
1033                 MATCH_NEWLINE(
1034                         parse_error("newline while parsing character constant");
1035                         break;
1036                 )
1037
1038                 case EOF: {
1039                         source_position_t source_position;
1040                         source_position.input_name = lexer_token.source_position.input_name;
1041                         source_position.linenr     = start_linenr;
1042                         errorf(source_position, "EOF while parsing character constant");
1043                         lexer_token.type = T_ERROR;
1044                         return;
1045                 }
1046
1047                 case '\'':
1048                         next_char();
1049                         goto end_of_char_constant;
1050
1051                 default:
1052                         obstack_1grow(&symbol_obstack, (char) c);
1053                         next_char();
1054                         break;
1055
1056                 }
1057         }
1058
1059 end_of_char_constant:;
1060         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1061         const char *const string = obstack_finish(&symbol_obstack);
1062
1063         lexer_token.type           = T_CHARS;
1064         lexer_token.v.string.begin = string;
1065         lexer_token.v.string.size  = size;
1066         lexer_token.datatype       = type_int;
1067 }
1068
1069 /**
1070  * Skip a multiline comment.
1071  */
1072 static void skip_multiline_comment(void)
1073 {
1074         unsigned start_linenr = lexer_token.source_position.linenr;
1075
1076         while(1) {
1077                 switch(c) {
1078                 case '/':
1079                         next_char();
1080                         if (c == '*') {
1081                                 /* TODO: nested comment, warn here */
1082                         }
1083                         break;
1084                 case '*':
1085                         next_char();
1086                         if(c == '/') {
1087                                 next_char();
1088                                 return;
1089                         }
1090                         break;
1091
1092                 MATCH_NEWLINE(break;)
1093
1094                 case EOF: {
1095                         source_position_t source_position;
1096                         source_position.input_name = lexer_token.source_position.input_name;
1097                         source_position.linenr     = start_linenr;
1098                         errorf(source_position, "at end of file while looking for comment end");
1099                         return;
1100                 }
1101
1102                 default:
1103                         next_char();
1104                         break;
1105                 }
1106         }
1107 }
1108
1109 /**
1110  * Skip a single line comment.
1111  */
1112 static void skip_line_comment(void)
1113 {
1114         while(1) {
1115                 switch(c) {
1116                 case EOF:
1117                         return;
1118
1119                 case '\n':
1120                 case '\r':
1121                         return;
1122
1123                 default:
1124                         next_char();
1125                         break;
1126                 }
1127         }
1128 }
1129
1130 /** The current preprocessor token. */
1131 static token_t pp_token;
1132
1133 /**
1134  * Read the next preprocessor token.
1135  */
1136 static inline void next_pp_token(void)
1137 {
1138         lexer_next_preprocessing_token();
1139         pp_token = lexer_token;
1140 }
1141
1142 /**
1143  * Eat all preprocessor tokens until newline.
1144  */
1145 static void eat_until_newline(void)
1146 {
1147         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1148                 next_pp_token();
1149         }
1150 }
1151
1152 /**
1153  * Handle the define directive.
1154  */
1155 static void define_directive(void)
1156 {
1157         lexer_next_preprocessing_token();
1158         if(lexer_token.type != T_IDENTIFIER) {
1159                 parse_error("expected identifier after #define\n");
1160                 eat_until_newline();
1161         }
1162 }
1163
1164 /**
1165  * Handle the ifdef directive.
1166  */
1167 static void ifdef_directive(int is_ifndef)
1168 {
1169         (void) is_ifndef;
1170         lexer_next_preprocessing_token();
1171         //expect_identifier();
1172         //extect_newline();
1173 }
1174
1175 /**
1176  * Handle the endif directive.
1177  */
1178 static void endif_directive(void)
1179 {
1180         //expect_newline();
1181 }
1182
1183 /**
1184  * Parse the line directive.
1185  */
1186 static void parse_line_directive(void)
1187 {
1188         if(pp_token.type != T_INTEGER) {
1189                 parse_error("expected integer");
1190         } else {
1191                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1192                 next_pp_token();
1193         }
1194         if(pp_token.type == T_STRING_LITERAL) {
1195                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1196                 next_pp_token();
1197         }
1198
1199         eat_until_newline();
1200 }
1201
1202 /**
1203  * STDC pragmas.
1204  */
1205 typedef enum {
1206         STDC_UNKNOWN,
1207         STDC_FP_CONTRACT,
1208         STDC_FENV_ACCESS,
1209         STDC_CX_LIMITED_RANGE
1210 } stdc_pragma_kind_t;
1211
1212 /**
1213  * STDC pragma values.
1214  */
1215 typedef enum {
1216         STDC_VALUE_UNKNOWN,
1217         STDC_VALUE_ON,
1218         STDC_VALUE_OFF,
1219         STDC_VALUE_DEFAULT
1220 } stdc_pragma_value_kind_t;
1221
1222 /**
1223  * Parse a pragma directive.
1224  */
1225 static void parse_pragma(void) {
1226         bool unknown_pragma = true;
1227
1228         next_pp_token();
1229         if (pp_token.v.symbol->pp_ID == TP_STDC) {
1230                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1231                 /* a STDC pragma */
1232                 if (c_mode & _C99) {
1233                         next_pp_token();
1234
1235                         switch (pp_token.v.symbol->pp_ID) {
1236                         case TP_FP_CONTRACT:
1237                                 kind = STDC_FP_CONTRACT;
1238                                 break;
1239                         case TP_FENV_ACCESS:
1240                                 kind = STDC_FENV_ACCESS;
1241                                 break;
1242                         case TP_CX_LIMITED_RANGE:
1243                                 kind = STDC_CX_LIMITED_RANGE;
1244                                 break;
1245                         default:
1246                                 break;
1247                         }
1248                         if (kind != STDC_UNKNOWN) {
1249                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1250                                 next_pp_token();
1251                                 switch (pp_token.v.symbol->pp_ID) {
1252                                 case TP_ON:
1253                                         value = STDC_VALUE_ON;
1254                                         break;
1255                                 case TP_OFF:
1256                                         value = STDC_VALUE_OFF;
1257                                         break;
1258                                 case TP_DEFAULT:
1259                                         value = STDC_VALUE_DEFAULT;
1260                                         break;
1261                                 default:
1262                                         break;
1263                                 }
1264                                 if (value != STDC_VALUE_UNKNOWN) {
1265                                         unknown_pragma = false;
1266                                 } else {
1267                                         errorf(pp_token.source_position, "bad STDC pragma argument");
1268                                 }
1269                         }
1270                 }
1271         } else {
1272                 unknown_pragma = true;
1273         }
1274         eat_until_newline();
1275         if (unknown_pragma && warning.unknown_pragmas) {
1276                 warningf(pp_token.source_position, "encountered unknown #pragma");
1277         }
1278 }
1279
1280 /**
1281  * Parse a preprocessor non-null directive.
1282  */
1283 static void parse_preprocessor_identifier(void)
1284 {
1285         assert(pp_token.type == T_IDENTIFIER);
1286         symbol_t *symbol = pp_token.v.symbol;
1287
1288         switch(symbol->pp_ID) {
1289         case TP_include:
1290                 printf("include - enable header name parsing!\n");
1291                 break;
1292         case TP_define:
1293                 define_directive();
1294                 break;
1295         case TP_ifdef:
1296                 ifdef_directive(0);
1297                 break;
1298         case TP_ifndef:
1299                 ifdef_directive(1);
1300                 break;
1301         case TP_endif:
1302                 endif_directive();
1303                 break;
1304         case TP_line:
1305                 next_pp_token();
1306                 parse_line_directive();
1307                 break;
1308         case TP_if:
1309         case TP_else:
1310         case TP_elif:
1311         case TP_undef:
1312         case TP_error:
1313                 /* TODO; output the rest of the line */
1314                 parse_error("#error directive: ");
1315                 break;
1316         case TP_pragma:
1317                 parse_pragma();
1318                 break;
1319         }
1320 }
1321
1322 /**
1323  * Parse a preprocessor directive.
1324  */
1325 static void parse_preprocessor_directive(void)
1326 {
1327         next_pp_token();
1328
1329         switch(pp_token.type) {
1330         case T_IDENTIFIER:
1331                 parse_preprocessor_identifier();
1332                 break;
1333         case T_INTEGER:
1334                 parse_line_directive();
1335                 break;
1336         case '\n':
1337                 /* NULL directive, see Â§ 6.10.7 */
1338                 break;
1339         default:
1340                 parse_error("invalid preprocessor directive");
1341                 eat_until_newline();
1342                 break;
1343         }
1344 }
1345
1346 #define MAYBE_PROLOG                                       \
1347                         next_char();                                   \
1348                         while(1) {                                     \
1349                                 switch(c) {
1350
1351 #define MAYBE(ch, set_type)                                \
1352                                 case ch:                                   \
1353                                         next_char();                           \
1354                                         lexer_token.type = set_type;           \
1355                                         return;
1356
1357 #define ELSE_CODE(code)                                    \
1358                                 default:                                   \
1359                                         code;                                  \
1360                                 }                                          \
1361                         } /* end of while(1) */                        \
1362                         break;
1363
1364 #define ELSE(set_type)                                     \
1365                 ELSE_CODE(                                         \
1366                         lexer_token.type = set_type;                   \
1367                         return;                                        \
1368                 )
1369
1370 void lexer_next_preprocessing_token(void)
1371 {
1372         while(1) {
1373                 switch(c) {
1374                 case ' ':
1375                 case '\t':
1376                         next_char();
1377                         break;
1378
1379                 MATCH_NEWLINE(
1380                         lexer_token.type = '\n';
1381                         return;
1382                 )
1383
1384                 SYMBOL_CHARS
1385                         parse_symbol();
1386                         /* might be a wide string ( L"string" ) */
1387                         if(lexer_token.type == T_IDENTIFIER &&
1388                             lexer_token.v.symbol == symbol_L) {
1389                             if(c == '"') {
1390                                         parse_wide_string_literal();
1391                                 } else if(c == '\'') {
1392                                         parse_wide_character_constant();
1393                                 }
1394                         }
1395                         return;
1396
1397                 DIGITS
1398                         parse_number();
1399                         return;
1400
1401                 case '"':
1402                         parse_string_literal();
1403                         return;
1404
1405                 case '\'':
1406                         parse_character_constant();
1407                         return;
1408
1409                 case '.':
1410                         MAYBE_PROLOG
1411                                 case '0':
1412                                 case '1':
1413                                 case '2':
1414                                 case '3':
1415                                 case '4':
1416                                 case '5':
1417                                 case '6':
1418                                 case '7':
1419                                 case '8':
1420                                 case '9':
1421                                         put_back(c);
1422                                         c = '.';
1423                                         parse_number_dec();
1424                                         return;
1425
1426                                 case '.':
1427                                         MAYBE_PROLOG
1428                                         MAYBE('.', T_DOTDOTDOT)
1429                                         ELSE_CODE(
1430                                                 put_back(c);
1431                                                 c = '.';
1432                                                 lexer_token.type = '.';
1433                                                 return;
1434                                         )
1435                         ELSE('.')
1436                 case '&':
1437                         MAYBE_PROLOG
1438                         MAYBE('&', T_ANDAND)
1439                         MAYBE('=', T_ANDEQUAL)
1440                         ELSE('&')
1441                 case '*':
1442                         MAYBE_PROLOG
1443                         MAYBE('=', T_ASTERISKEQUAL)
1444                         ELSE('*')
1445                 case '+':
1446                         MAYBE_PROLOG
1447                         MAYBE('+', T_PLUSPLUS)
1448                         MAYBE('=', T_PLUSEQUAL)
1449                         ELSE('+')
1450                 case '-':
1451                         MAYBE_PROLOG
1452                         MAYBE('>', T_MINUSGREATER)
1453                         MAYBE('-', T_MINUSMINUS)
1454                         MAYBE('=', T_MINUSEQUAL)
1455                         ELSE('-')
1456                 case '!':
1457                         MAYBE_PROLOG
1458                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1459                         ELSE('!')
1460                 case '/':
1461                         MAYBE_PROLOG
1462                         MAYBE('=', T_SLASHEQUAL)
1463                                 case '*':
1464                                         next_char();
1465                                         skip_multiline_comment();
1466                                         lexer_next_preprocessing_token();
1467                                         return;
1468                                 case '/':
1469                                         next_char();
1470                                         skip_line_comment();
1471                                         lexer_next_preprocessing_token();
1472                                         return;
1473                         ELSE('/')
1474                 case '%':
1475                         MAYBE_PROLOG
1476                         MAYBE('>', T_PERCENTGREATER)
1477                         MAYBE('=', T_PERCENTEQUAL)
1478                                 case ':':
1479                                         MAYBE_PROLOG
1480                                                 case '%':
1481                                                         MAYBE_PROLOG
1482                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1483                                                         ELSE_CODE(
1484                                                                 put_back(c);
1485                                                                 c = '%';
1486                                                                 lexer_token.type = T_PERCENTCOLON;
1487                                                                 return;
1488                                                         )
1489                                         ELSE(T_PERCENTCOLON)
1490                         ELSE('%')
1491                 case '<':
1492                         MAYBE_PROLOG
1493                         MAYBE(':', T_LESSCOLON)
1494                         MAYBE('%', T_LESSPERCENT)
1495                         MAYBE('=', T_LESSEQUAL)
1496                                 case '<':
1497                                         MAYBE_PROLOG
1498                                         MAYBE('=', T_LESSLESSEQUAL)
1499                                         ELSE(T_LESSLESS)
1500                         ELSE('<')
1501                 case '>':
1502                         MAYBE_PROLOG
1503                         MAYBE('=', T_GREATEREQUAL)
1504                                 case '>':
1505                                         MAYBE_PROLOG
1506                                         MAYBE('=', T_GREATERGREATEREQUAL)
1507                                         ELSE(T_GREATERGREATER)
1508                         ELSE('>')
1509                 case '^':
1510                         MAYBE_PROLOG
1511                         MAYBE('=', T_CARETEQUAL)
1512                         ELSE('^')
1513                 case '|':
1514                         MAYBE_PROLOG
1515                         MAYBE('=', T_PIPEEQUAL)
1516                         MAYBE('|', T_PIPEPIPE)
1517                         ELSE('|')
1518                 case ':':
1519                         MAYBE_PROLOG
1520                         MAYBE('>', T_COLONGREATER)
1521                         ELSE(':')
1522                 case '=':
1523                         MAYBE_PROLOG
1524                         MAYBE('=', T_EQUALEQUAL)
1525                         ELSE('=')
1526                 case '#':
1527                         MAYBE_PROLOG
1528                         MAYBE('#', T_HASHHASH)
1529                         ELSE('#')
1530
1531                 case '?':
1532                 case '[':
1533                 case ']':
1534                 case '(':
1535                 case ')':
1536                 case '{':
1537                 case '}':
1538                 case '~':
1539                 case ';':
1540                 case ',':
1541                 case '\\':
1542                         lexer_token.type = c;
1543                         next_char();
1544                         return;
1545
1546                 case EOF:
1547                         lexer_token.type = T_EOF;
1548                         return;
1549
1550                 default:
1551                         next_char();
1552                         errorf(lexer_token.source_position, "unknown character '%c' found\n", c);
1553                         lexer_token.type = T_ERROR;
1554                         return;
1555                 }
1556         }
1557 }
1558
1559 void lexer_next_token(void)
1560 {
1561         lexer_next_preprocessing_token();
1562         if(lexer_token.type != '\n')
1563                 return;
1564
1565 newline_found:
1566         do {
1567                 lexer_next_preprocessing_token();
1568         } while(lexer_token.type == '\n');
1569
1570         if(lexer_token.type == '#') {
1571                 parse_preprocessor_directive();
1572                 goto newline_found;
1573         }
1574 }
1575
1576 void init_lexer(void)
1577 {
1578         strset_init(&stringset);
1579 }
1580
1581 void lexer_open_stream(FILE *stream, const char *input_name)
1582 {
1583         input                                  = stream;
1584         lexer_token.source_position.linenr     = 0;
1585         lexer_token.source_position.input_name = input_name;
1586
1587         symbol_L = symbol_table_insert("L");
1588         bufpos = NULL;
1589         bufend = NULL;
1590
1591         /* place a virtual \n at the beginning so the lexer knows that we're
1592          * at the beginning of a line */
1593         c = '\n';
1594 }
1595
1596 void exit_lexer(void)
1597 {
1598         strset_destroy(&stringset);
1599 }
1600
1601 static __attribute__((unused))
1602 void dbg_pos(const source_position_t source_position)
1603 {
1604         fprintf(stdout, "%s:%u\n", source_position.input_name,
1605                 source_position.linenr);
1606         fflush(stdout);
1607 }