const.
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7 #include "adt/strset.h"
8 #include "adt/util.h"
9 #include "type_t.h"
10 #include "target_architecture.h"
11 #include "parser.h"
12
13 #include <assert.h>
14 #include <errno.h>
15 #include <string.h>
16 #include <stdbool.h>
17 #include <ctype.h>
18
19 //#define DEBUG_CHARS
20 #define MAX_PUTBACK 3
21
22 #ifdef _WIN32
23 /* No strtold on windows and no replacement yet */
24 #define strtold(s, e) strtod(s, e)
25 #endif
26
27 #if defined HAS_SIGNED_CHAR
28 typedef signed char char_type;
29 #elif defined HAS_UNSIGNED_CHAR
30 typedef unsigned char char_type;
31 #else
32 #       error signedness of char not determined
33 #endif
34
35 static int         c;
36 token_t            lexer_token;
37 symbol_t          *symbol_L;
38 static FILE       *input;
39 static char        buf[1024 + MAX_PUTBACK];
40 static const char *bufend;
41 static const char *bufpos;
42 static strset_t    stringset;
43
44 static type_t     *type_int        = NULL;
45 static type_t     *type_uint       = NULL;
46 static type_t     *type_long       = NULL;
47 static type_t     *type_ulong      = NULL;
48 static type_t     *type_longlong   = NULL;
49 static type_t     *type_ulonglong  = NULL;
50 static type_t     *type_float      = NULL;
51 static type_t     *type_double     = NULL;
52 static type_t     *type_longdouble = NULL;
53
/**
 * Writes the "<input_name>:<linenr>: Error: " prefix to stderr. No newline is
 * emitted, so the caller is expected to print the actual message afterwards.
 */
static void error_prefix_at(const char *input_name, unsigned linenr)
{
	FILE *const out = stderr;

	fprintf(out, "%s:%u: Error: ", input_name, linenr);
}
58
59 static void error_prefix(void)
60 {
61         error_prefix_at(lexer_token.source_position.input_name,
62                         lexer_token.source_position.linenr);
63 }
64
/**
 * Reports a lexer error at the current source position.
 *
 * @param msg  message text; a newline is appended when printing
 */
static void parse_error(const char *msg)
{
	error_prefix();
	fputs(msg, stderr);
	fputc('\n', stderr);
}
70
71 static inline void next_real_char(void)
72 {
73         bufpos++;
74         if(bufpos >= bufend) {
75                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
76                                  input);
77                 if(s == 0) {
78                         c = EOF;
79                         return;
80                 }
81                 bufpos = buf + MAX_PUTBACK;
82                 bufend = buf + MAX_PUTBACK + s;
83         }
84         c = *(bufpos);
85 }
86
87 static inline void put_back(int pc)
88 {
89         assert(bufpos >= buf);
90         //assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
91
92         char *p = buf + (bufpos - buf);
93         *p = (char) pc;
94
95         /* going backwards in the buffer is legal as long as it's not more often
96          * than MAX_PUTBACK */
97         bufpos--;
98
99 #ifdef DEBUG_CHARS
100         printf("putback '%c'\n", pc);
101 #endif
102 }
103
static inline void next_char(void);

/*
 * Expands to the `case '\r':` and `case '\n':` labels of a switch over c.
 * Either label consumes the line ending ("\r", "\r\n" or "\n"), increments
 * the line number of the current source position and then runs `code`.
 * `code` is pasted once per label and should leave the switch (break,
 * return, ...), because the '\r' label is directly followed by the '\n' one.
 */
#define MATCH_NEWLINE(code) \
	case '\r': \
		next_char(); \
		if (c == '\n') \
			next_char(); \
		lexer_token.source_position.linenr++; \
		code; \
	case '\n': \
		next_char(); \
		lexer_token.source_position.linenr++; \
		code;

/* Asserts that the current character is `expected`, then skips it. */
#define eat(expected)  do { assert(c == (expected)); next_char(); } while (0)
120
121 static void maybe_concat_lines(void)
122 {
123         eat('\\');
124
125         switch(c) {
126         MATCH_NEWLINE(return;)
127
128         default:
129                 break;
130         }
131
132         put_back(c);
133         c = '\\';
134 }
135
136 static inline void next_char(void)
137 {
138         next_real_char();
139
140         /* filter trigraphs */
141         if(UNLIKELY(c == '\\')) {
142                 maybe_concat_lines();
143                 goto end_of_next_char;
144         }
145
146         if(LIKELY(c != '?'))
147                 goto end_of_next_char;
148
149         next_real_char();
150         if(LIKELY(c != '?')) {
151                 put_back(c);
152                 c = '?';
153                 goto end_of_next_char;
154         }
155
156         next_real_char();
157         switch(c) {
158         case '=': c = '#'; break;
159         case '(': c = '['; break;
160         case '/': c = '\\'; maybe_concat_lines(); break;
161         case ')': c = ']'; break;
162         case '\'': c = '^'; break;
163         case '<': c = '{'; break;
164         case '!': c = '|'; break;
165         case '>': c = '}'; break;
166         case '-': c = '~'; break;
167         default:
168                 put_back('?');
169                 put_back(c);
170                 c = '?';
171                 break;
172         }
173
174 end_of_next_char:;
175 #ifdef DEBUG_CHARS
176         printf("nchar '%c'\n", c);
177 #endif
178 }
179
/* Case labels for every character that may start an identifier:
 * the letters a-z and A-Z plus the underscore. */
#define SYMBOL_CHARS \
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': \
	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': \
	case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': \
	case 'v': case 'w': case 'x': case 'y': case 'z': \
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': \
	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': \
	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': \
	case 'V': case 'W': case 'X': case 'Y': case 'Z': \
	case '_':

/* Case labels for the decimal digits 0-9. */
#define DIGITS \
	case '0': case '1': case '2': case '3': case '4': \
	case '5': case '6': case '7': case '8': case '9':
246
247 static void parse_symbol(void)
248 {
249         symbol_t *symbol;
250         char     *string;
251
252         obstack_1grow(&symbol_obstack, (char) c);
253         next_char();
254
255         while(1) {
256                 switch(c) {
257                 DIGITS
258                 SYMBOL_CHARS
259                         obstack_1grow(&symbol_obstack, (char) c);
260                         next_char();
261                         break;
262
263                 default:
264                         goto end_symbol;
265                 }
266         }
267
268 end_symbol:
269         obstack_1grow(&symbol_obstack, '\0');
270
271         string = obstack_finish(&symbol_obstack);
272         symbol = symbol_table_insert(string);
273
274         lexer_token.type     = symbol->ID;
275         lexer_token.v.symbol = symbol;
276
277         if(symbol->string != string) {
278                 obstack_free(&symbol_obstack, string);
279         }
280 }
281
282 static void parse_integer_suffix(bool is_oct_hex)
283 {
284         bool is_unsigned  = false;
285         bool min_long     = false;
286         bool min_longlong = false;
287
288         if(c == 'U' || c == 'u') {
289                 is_unsigned = true;
290                 next_char();
291                 if(c == 'L' || c == 'l') {
292                         min_long = true;
293                         next_char();
294                         if(c == 'L' || c == 'l') {
295                                 min_longlong = true;
296                                 next_char();
297                         }
298                 }
299         } else if(c == 'l' || c == 'L') {
300                 min_long = true;
301                 next_char();
302                 if(c == 'l' || c == 'L') {
303                         min_longlong = true;
304                         next_char();
305                         if(c == 'u' || c == 'U') {
306                                 is_unsigned = true;
307                                 next_char();
308                         }
309                 } else if(c == 'u' || c == 'U') {
310                         is_unsigned = true;
311                         next_char();
312                         lexer_token.datatype = type_ulong;
313                 }
314         }
315
316         if(!is_unsigned) {
317                 long long v = lexer_token.v.intvalue;
318                 if(!min_long) {
319                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
320                                 lexer_token.datatype = type_int;
321                                 return;
322                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
323                                 lexer_token.datatype = type_uint;
324                                 return;
325                         }
326                 }
327                 if(!min_longlong) {
328                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
329                                 lexer_token.datatype = type_long;
330                                 return;
331                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
332                                 lexer_token.datatype = type_ulong;
333                                 return;
334                         }
335                 }
336                 unsigned long long uv = (unsigned long long) v;
337                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
338                         lexer_token.datatype = type_ulonglong;
339                         return;
340                 }
341
342                 lexer_token.datatype = type_longlong;
343         } else {
344                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
345                 if(!min_long && v <= TARGET_UINT_MAX) {
346                         lexer_token.datatype = type_uint;
347                         return;
348                 }
349                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
350                         lexer_token.datatype = type_ulong;
351                         return;
352                 }
353                 lexer_token.datatype = type_ulonglong;
354         }
355 }
356
357 static void parse_floating_suffix(void)
358 {
359         switch(c) {
360         /* TODO: do something usefull with the suffixes... */
361         case 'f':
362         case 'F':
363                 next_char();
364                 lexer_token.datatype = type_float;
365                 break;
366         case 'l':
367         case 'L':
368                 next_char();
369                 lexer_token.datatype = type_longdouble;
370                 break;
371         default:
372                 lexer_token.datatype = type_double;
373                 break;
374         }
375 }
376
/**
 * A replacement for strtoull. Only those parts needed for
 * our parser are implemented: no sign, no whitespace, no base prefix.
 *
 * Digits are consumed as long as they are valid in the given base and the
 * accumulated value still fits into an unsigned long long. Conversion stops
 * at the first character that would overflow the value, so a literal that is
 * too large is recognisable by *endptr not pointing at the terminating NUL.
 *
 * @param s       digit string to convert
 * @param endptr  receives the address of the first unconsumed character
 * @param base    8, 10 or 16; any other value trips an assertion and
 *                converts nothing
 * @return the value of the consumed digits
 */
static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
	const unsigned long long max_value = ~0ULL;
	unsigned long long       v         = 0;

	if (base != 8 && base != 10 && base != 16) {
		assert(0);
		goto end;
	}

	for (;; ++s) {
		/* go through unsigned char so bytes >= 0x80 never turn negative */
		const int ch = (unsigned char) *s;
		unsigned  digit;

		if (ch >= '0' && ch <= '9') {
			digit = (unsigned) (ch - '0');
		} else if (ch >= 'a' && ch <= 'f') {
			digit = (unsigned) (ch - 'a') + 10;
		} else if (ch >= 'A' && ch <= 'F') {
			digit = (unsigned) (ch - 'A') + 10;
		} else {
			break;
		}
		if (digit >= (unsigned) base)
			break;

		/* check for overrun: does v * base + digit still fit? Testing the
		 * digit as well matters for base 10, where the largest value is
		 * not of the form base^k - 1. */
		if (v > (max_value - digit) / (unsigned) base)
			break;

		v = v * (unsigned) base + digit;
	}

end:
	*endptr = s;
	return v;
}
460
461 static void parse_number_hex(void)
462 {
463         assert(c == 'x' || c == 'X');
464         next_char();
465
466         while(isxdigit(c)) {
467                 obstack_1grow(&symbol_obstack, (char) c);
468                 next_char();
469         }
470         obstack_1grow(&symbol_obstack, '\0');
471         char *string = obstack_finish(&symbol_obstack);
472
473         if(c == '.' || c == 'p' || c == 'P') {
474                 next_char();
475                 panic("Hex floating point numbers not implemented yet");
476         }
477         if(*string == '\0') {
478                 parse_error("invalid hex number");
479                 lexer_token.type = T_ERROR;
480         }
481
482         const char *endptr;
483         lexer_token.type       = T_INTEGER;
484         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
485         if(*endptr != '\0') {
486                 parse_error("hex number literal too long");
487         }
488
489         obstack_free(&symbol_obstack, string);
490         parse_integer_suffix(true);
491 }
492
/**
 * Tells whether chr is one of the characters '0' to '7'.
 */
static inline bool is_octal_digit(int chr)
{
	if (chr < '0')
		return false;
	return chr <= '7';
}
497
498 static void parse_number_oct(void)
499 {
500         while(is_octal_digit(c)) {
501                 obstack_1grow(&symbol_obstack, (char) c);
502                 next_char();
503         }
504         obstack_1grow(&symbol_obstack, '\0');
505         char *string = obstack_finish(&symbol_obstack);
506
507         const char *endptr;
508         lexer_token.type       = T_INTEGER;
509         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
510         if(*endptr != '\0') {
511                 parse_error("octal number literal too long");
512         }
513
514         obstack_free(&symbol_obstack, string);
515         parse_integer_suffix(true);
516 }
517
518 static void parse_number_dec(void)
519 {
520         bool is_float = false;
521         while(isdigit(c)) {
522                 obstack_1grow(&symbol_obstack, (char) c);
523                 next_char();
524         }
525
526         if(c == '.') {
527                 obstack_1grow(&symbol_obstack, '.');
528                 next_char();
529
530                 while(isdigit(c)) {
531                         obstack_1grow(&symbol_obstack, (char) c);
532                         next_char();
533                 }
534                 is_float = true;
535         }
536         if(c == 'e' || c == 'E') {
537                 obstack_1grow(&symbol_obstack, 'e');
538                 next_char();
539
540                 if(c == '-' || c == '+') {
541                         obstack_1grow(&symbol_obstack, (char) c);
542                         next_char();
543                 }
544
545                 while(isdigit(c)) {
546                         obstack_1grow(&symbol_obstack, (char) c);
547                         next_char();
548                 }
549                 is_float = true;
550         }
551
552         obstack_1grow(&symbol_obstack, '\0');
553         char *string = obstack_finish(&symbol_obstack);
554
555         if(is_float) {
556                 char *endptr;
557                 lexer_token.type         = T_FLOATINGPOINT;
558                 lexer_token.v.floatvalue = strtold(string, &endptr);
559
560                 if(*endptr != '\0') {
561                         parse_error("invalid number literal");
562                 }
563
564                 parse_floating_suffix();
565         } else {
566                 const char *endptr;
567                 lexer_token.type       = T_INTEGER;
568                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
569
570                 if(*endptr != '\0') {
571                         parse_error("invalid number literal");
572                 }
573
574                 parse_integer_suffix(false);
575         }
576         obstack_free(&symbol_obstack, string);
577 }
578
579 static void parse_number(void)
580 {
581         if (c == '0') {
582                 next_char();
583                 switch (c) {
584                         case 'X':
585                         case 'x':
586                                 parse_number_hex();
587                                 break;
588                         case '0':
589                         case '1':
590                         case '2':
591                         case '3':
592                         case '4':
593                         case '5':
594                         case '6':
595                         case '7':
596                                 parse_number_oct();
597                                 break;
598                         case '8':
599                         case '9':
600                                 next_char();
601                                 parse_error("invalid octal number");
602                                 lexer_token.type = T_ERROR;
603                                 return;
604                         case '.':
605                         case 'e':
606                         case 'E':
607                         default:
608                                 obstack_1grow(&symbol_obstack, '0');
609                                 parse_number_dec();
610                                 return;
611                 }
612         } else {
613                 parse_number_dec();
614         }
615 }
616
617 static int parse_octal_sequence(const int first_digit)
618 {
619         assert(is_octal_digit(first_digit));
620         int value = first_digit - '0';
621         if (!is_octal_digit(c)) return value;
622         value = 8 * value + c - '0';
623         next_char();
624         if (!is_octal_digit(c)) return value;
625         value = 8 * value + c - '0';
626         next_char();
627         return (char_type)value;
628 }
629
630 static int parse_hex_sequence(void)
631 {
632         int value = 0;
633         while(1) {
634                 if (c >= '0' && c <= '9') {
635                         value = 16 * value + c - '0';
636                 } else if ('A' <= c && c <= 'F') {
637                         value = 16 * value + c - 'A' + 10;
638                 } else if ('a' <= c && c <= 'f') {
639                         value = 16 * value + c - 'a' + 10;
640                 } else {
641                         break;
642                 }
643                 next_char();
644         }
645
646         return (char_type)value;
647 }
648
649 static int parse_escape_sequence(void)
650 {
651         eat('\\');
652
653         int ec = c;
654         next_char();
655
656         switch(ec) {
657         case '"':  return '"';
658         case '\'': return '\'';
659         case '\\': return '\\';
660         case '?': return '\?';
661         case 'a': return '\a';
662         case 'b': return '\b';
663         case 'f': return '\f';
664         case 'n': return '\n';
665         case 'r': return '\r';
666         case 't': return '\t';
667         case 'v': return '\v';
668         case 'x':
669                 return parse_hex_sequence();
670         case '0':
671         case '1':
672         case '2':
673         case '3':
674         case '4':
675         case '5':
676         case '6':
677         case '7':
678                 return parse_octal_sequence(ec);
679         case EOF:
680                 parse_error("reached end of file while parsing escape sequence");
681                 return EOF;
682         default:
683                 parse_error("unknown escape sequence");
684                 return EOF;
685         }
686 }
687
688 const char *concat_strings(const char *s1, const char *s2)
689 {
690         size_t  len1   = strlen(s1);
691         size_t  len2   = strlen(s2);
692
693         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
694         memcpy(concat, s1, len1);
695         memcpy(concat + len1, s2, len2 + 1);
696
697         const char *result = strset_insert(&stringset, concat);
698         if(result != concat) {
699                 obstack_free(&symbol_obstack, concat);
700         }
701
702         return result;
703 }
704
705 static void parse_string_literal(void)
706 {
707         unsigned    start_linenr = lexer_token.source_position.linenr;
708         char       *string;
709         const char *result;
710
711         assert(c == '"');
712         next_char();
713
714         int tc;
715         while(1) {
716                 switch(c) {
717                 case '\\':
718                         tc = parse_escape_sequence();
719                         obstack_1grow(&symbol_obstack, (char) tc);
720                         break;
721
722                 case EOF:
723                         error_prefix_at(lexer_token.source_position.input_name,
724                                         start_linenr);
725                         fprintf(stderr, "string has no end\n");
726                         lexer_token.type = T_ERROR;
727                         return;
728
729                 case '"':
730                         next_char();
731                         goto end_of_string;
732
733                 default:
734                         obstack_1grow(&symbol_obstack, (char) c);
735                         next_char();
736                         break;
737                 }
738         }
739
740 end_of_string:
741
742         /* TODO: concatenate multiple strings separated by whitespace... */
743
744         /* add finishing 0 to the string */
745         obstack_1grow(&symbol_obstack, '\0');
746         string = obstack_finish(&symbol_obstack);
747
748         /* check if there is already a copy of the string */
749         result = strset_insert(&stringset, string);
750         if(result != string) {
751                 obstack_free(&symbol_obstack, string);
752         }
753
754         lexer_token.type     = T_STRING_LITERAL;
755         lexer_token.v.string = result;
756 }
757
758 static void parse_wide_character_constant(void)
759 {
760         eat('\'');
761
762         int found_char = 0;
763         while(1) {
764                 switch(c) {
765                 case '\\':
766                         found_char = parse_escape_sequence();
767                         break;
768
769                 MATCH_NEWLINE(
770                         parse_error("newline while parsing character constant");
771                         break;
772                 )
773
774                 case '\'':
775                         next_char();
776                         goto end_of_wide_char_constant;
777
778                 case EOF:
779                         parse_error("EOF while parsing character constant");
780                         lexer_token.type = T_ERROR;
781                         return;
782
783                 default:
784                         if(found_char != 0) {
785                                 parse_error("more than 1 characters in character "
786                                             "constant");
787                                 goto end_of_wide_char_constant;
788                         } else {
789                                 found_char = c;
790                                 next_char();
791                         }
792                         break;
793                 }
794         }
795
796 end_of_wide_char_constant:
797         lexer_token.type       = T_INTEGER;
798         lexer_token.v.intvalue = found_char;
799         lexer_token.datatype   = type_wchar_t;
800 }
801
802 static void parse_wide_string_literal(void)
803 {
804         const unsigned start_linenr = lexer_token.source_position.linenr;
805
806         assert(c == '"');
807         next_char();
808
809         while(1) {
810                 switch(c) {
811                         case '\\': {
812                                 wchar_rep_t tc = parse_escape_sequence();
813                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
814                                 break;
815                         }
816
817                         case EOF:
818                                 error_prefix_at(lexer_token.source_position.input_name,
819                                                 start_linenr);
820                                 fprintf(stderr, "string has no end\n");
821                                 lexer_token.type = T_ERROR;
822                                 return;
823
824                         case '"':
825                                 next_char();
826                                 goto end_of_string;
827
828                         default: {
829                                 wchar_rep_t tc = c;
830                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
831                                 next_char();
832                                 break;
833                         }
834                 }
835         }
836
837 end_of_string:;
838
839         /* TODO: concatenate multiple strings separated by whitespace... */
840
841         /* add finishing 0 to the string */
842         wchar_rep_t nul = L'\0';
843         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
844         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
845         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
846
847 #if 0 /* TODO hash */
848         /* check if there is already a copy of the string */
849         const wchar_rep_t *const result = strset_insert(&stringset, string);
850         if(result != string) {
851                 obstack_free(&symbol_obstack, string);
852         }
853 #else
854         const wchar_rep_t *const result = string;
855 #endif
856
857         lexer_token.type                = T_WIDE_STRING_LITERAL;
858         lexer_token.v.wide_string.begin = result;
859         lexer_token.v.wide_string.size  = size;
860 }
861
862 static void parse_character_constant(void)
863 {
864         eat('\'');
865
866         int found_char = 0;
867         while(1) {
868                 switch(c) {
869                 case '\\':
870                         found_char = parse_escape_sequence();
871                         break;
872
873                 MATCH_NEWLINE(
874                         parse_error("newline while parsing character constant");
875                         break;
876                 )
877
878                 case '\'':
879                         next_char();
880                         goto end_of_char_constant;
881
882                 case EOF:
883                         parse_error("EOF while parsing character constant");
884                         lexer_token.type = T_ERROR;
885                         return;
886
887                 default:
888                         if(found_char != 0) {
889                                 parse_error("more than 1 characters in character "
890                                             "constant");
891                                 goto end_of_char_constant;
892                         } else {
893                                 found_char = c;
894                                 next_char();
895                         }
896                         break;
897                 }
898         }
899
900 end_of_char_constant:
901         lexer_token.type       = T_INTEGER;
902         lexer_token.v.intvalue = found_char;
903         lexer_token.datatype   = type_int;
904 }
905
906 static void skip_multiline_comment(void)
907 {
908         unsigned start_linenr = lexer_token.source_position.linenr;
909
910         while(1) {
911                 switch(c) {
912                 case '*':
913                         next_char();
914                         if(c == '/') {
915                                 next_char();
916                                 return;
917                         }
918                         break;
919
920                 MATCH_NEWLINE(break;)
921
922                 case EOF:
923                         error_prefix_at(lexer_token.source_position.input_name,
924                                         start_linenr);
925                         fprintf(stderr, "at end of file while looking for comment end\n");
926                         return;
927
928                 default:
929                         next_char();
930                         break;
931                 }
932         }
933 }
934
935 static void skip_line_comment(void)
936 {
937         while(1) {
938                 switch(c) {
939                 case EOF:
940                         return;
941
942                 case '\n':
943                 case '\r':
944                         return;
945
946                 default:
947                         next_char();
948                         break;
949                 }
950         }
951 }
952
953 static token_t pp_token;
954
955 static inline void next_pp_token(void)
956 {
957         lexer_next_preprocessing_token();
958         pp_token = lexer_token;
959 }
960
961 static void eat_until_newline(void)
962 {
963         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
964                 next_pp_token();
965         }
966 }
967
/**
 * Handles a directive that is reported as an error (#error, and the
 * directives that parse_preprocessor_identifier() routes here).
 *
 * Prints the error prefix and a fixed message to stderr, then discards the
 * remaining preprocessing tokens of the directive line so that they are not
 * handed to the parser as ordinary tokens.
 */
static void error_directive(void)
{
        error_prefix();
        fprintf(stderr, "#error directive: \n");

        /* consume pp-tokens until new-line (or end of input) */
        eat_until_newline();
}
975
976 static void define_directive(void)
977 {
978         lexer_next_preprocessing_token();
979         if(lexer_token.type != T_IDENTIFIER) {
980                 parse_error("expected identifier after #define\n");
981                 eat_until_newline();
982         }
983 }
984
/**
 * Placeholder for #ifdef / #ifndef handling.
 *
 * Currently only reads one preprocessing token after the directive name;
 * no condition is evaluated.
 *
 * @param is_ifndef  non-zero for #ifndef, zero for #ifdef (unused so far)
 */
static void ifdef_directive(int is_ifndef)
{
        /* TODO: expect an identifier here, followed by a newline */
        lexer_next_preprocessing_token();

        (void) is_ifndef;
}
992
/**
 * Placeholder for #endif handling; does nothing yet.
 */
static void endif_directive(void)
{
        /* TODO: expect a newline after the directive name */
}
997
998 static void parse_line_directive(void)
999 {
1000         if(pp_token.type != T_INTEGER) {
1001                 parse_error("expected integer");
1002         } else {
1003                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1004                 next_pp_token();
1005         }
1006         if(pp_token.type == T_STRING_LITERAL) {
1007                 lexer_token.source_position.input_name = pp_token.v.string;
1008                 next_pp_token();
1009         }
1010
1011         eat_until_newline();
1012 }
1013
1014 static void parse_preprocessor_identifier(void)
1015 {
1016         assert(pp_token.type == T_IDENTIFIER);
1017         symbol_t *symbol = pp_token.v.symbol;
1018
1019         switch(symbol->pp_ID) {
1020         case TP_include:
1021                 printf("include - enable header name parsing!\n");
1022                 break;
1023         case TP_define:
1024                 define_directive();
1025                 break;
1026         case TP_ifdef:
1027                 ifdef_directive(0);
1028                 break;
1029         case TP_ifndef:
1030                 ifdef_directive(1);
1031                 break;
1032         case TP_endif:
1033                 endif_directive();
1034                 break;
1035         case TP_line:
1036                 next_pp_token();
1037                 parse_line_directive();
1038                 break;
1039         case TP_if:
1040         case TP_else:
1041         case TP_elif:
1042         case TP_undef:
1043         case TP_error:
1044                 error_directive();
1045                 break;
1046         case TP_pragma:
1047                 break;
1048         }
1049 }
1050
1051 static void parse_preprocessor_directive(void)
1052 {
1053         next_pp_token();
1054
1055         switch(pp_token.type) {
1056         case T_IDENTIFIER:
1057                 parse_preprocessor_identifier();
1058                 break;
1059         case T_INTEGER:
1060                 parse_line_directive();
1061                 break;
1062         default:
1063                 parse_error("invalid preprocessor directive");
1064                 eat_until_newline();
1065                 break;
1066         }
1067 }
1068
/*
 * Building blocks for lexing punctuators that share a prefix. They are meant
 * to be used together inside a switch case of a function returning void:
 *
 *     case '+':
 *             MAYBE_PROLOG
 *             MAYBE('+', T_PLUSPLUS)
 *             MAYBE('=', T_PLUSEQUAL)
 *             ELSE('+')
 *
 * MAYBE_PROLOG opens a while/switch pair that ELSE_CODE (or ELSE) closes, so
 * the macros are not balanced on their own.
 */

/* Consumes the current character and opens a switch on the following one. */
#define MAYBE_PROLOG    \
        next_char();    \
        while(1) {      \
                switch(c) {

/* If the character is ch: consume it, set the token type, and return. */
#define MAYBE(ch, set_type)                          \
                case ch:                             \
                        next_char();                 \
                        lexer_token.type = set_type; \
                        return;

/* Runs code for any other character and closes what MAYBE_PROLOG opened. */
#define ELSE_CODE(code)         \
                default:        \
                        code;   \
                }               \
        } /* end of while(1) */ \
        break;

/* Any other character: set the token type and return without consuming. */
#define ELSE(set_type) \
        ELSE_CODE(lexer_token.type = set_type; return;)
1092
1093 void lexer_next_preprocessing_token(void)
1094 {
1095         while(1) {
1096                 switch(c) {
1097                 case ' ':
1098                 case '\t':
1099                         next_char();
1100                         break;
1101
1102                 MATCH_NEWLINE(
1103                         lexer_token.type = '\n';
1104                         return;
1105                 )
1106
1107                 SYMBOL_CHARS
1108                         parse_symbol();
1109                         /* might be a wide string ( L"string" ) */
1110                         if(lexer_token.type == T_IDENTIFIER &&
1111                             lexer_token.v.symbol == symbol_L) {
1112                             if(c == '"') {
1113                                         parse_wide_string_literal();
1114                                 } else if(c == '\'') {
1115                                         parse_wide_character_constant();
1116                                 }
1117                         }
1118                         return;
1119
1120                 DIGITS
1121                         parse_number();
1122                         return;
1123
1124                 case '"':
1125                         parse_string_literal();
1126                         return;
1127
1128                 case '\'':
1129                         parse_character_constant();
1130                         return;
1131
1132                 case '.':
1133                         MAYBE_PROLOG
1134                                 case '.':
1135                                         MAYBE_PROLOG
1136                                         MAYBE('.', T_DOTDOTDOT)
1137                                         ELSE_CODE(
1138                                                 put_back(c);
1139                                                 c = '.';
1140                                                 lexer_token.type = '.';
1141                                                 return;
1142                                         )
1143                         ELSE('.')
1144                 case '&':
1145                         MAYBE_PROLOG
1146                         MAYBE('&', T_ANDAND)
1147                         MAYBE('=', T_ANDEQUAL)
1148                         ELSE('&')
1149                 case '*':
1150                         MAYBE_PROLOG
1151                         MAYBE('=', T_ASTERISKEQUAL)
1152                         ELSE('*')
1153                 case '+':
1154                         MAYBE_PROLOG
1155                         MAYBE('+', T_PLUSPLUS)
1156                         MAYBE('=', T_PLUSEQUAL)
1157                         ELSE('+')
1158                 case '-':
1159                         MAYBE_PROLOG
1160                         MAYBE('>', T_MINUSGREATER)
1161                         MAYBE('-', T_MINUSMINUS)
1162                         MAYBE('=', T_MINUSEQUAL)
1163                         ELSE('-')
1164                 case '!':
1165                         MAYBE_PROLOG
1166                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1167                         ELSE('!')
1168                 case '/':
1169                         MAYBE_PROLOG
1170                         MAYBE('=', T_SLASHEQUAL)
1171                                 case '*':
1172                                         next_char();
1173                                         skip_multiline_comment();
1174                                         lexer_next_preprocessing_token();
1175                                         return;
1176                                 case '/':
1177                                         next_char();
1178                                         skip_line_comment();
1179                                         lexer_next_preprocessing_token();
1180                                         return;
1181                         ELSE('/')
1182                 case '%':
1183                         MAYBE_PROLOG
1184                         MAYBE('>', T_PERCENTGREATER)
1185                         MAYBE('=', T_PERCENTEQUAL)
1186                                 case ':':
1187                                         MAYBE_PROLOG
1188                                                 case '%':
1189                                                         MAYBE_PROLOG
1190                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1191                                                         ELSE_CODE(
1192                                                                 put_back(c);
1193                                                                 c = '%';
1194                                                                 lexer_token.type = T_PERCENTCOLON;
1195                                                                 return;
1196                                                         )
1197                                         ELSE(T_PERCENTCOLON)
1198                         ELSE('%')
1199                 case '<':
1200                         MAYBE_PROLOG
1201                         MAYBE(':', T_LESSCOLON)
1202                         MAYBE('%', T_LESSPERCENT)
1203                         MAYBE('=', T_LESSEQUAL)
1204                                 case '<':
1205                                         MAYBE_PROLOG
1206                                         MAYBE('=', T_LESSLESSEQUAL)
1207                                         ELSE(T_LESSLESS)
1208                         ELSE('<')
1209                 case '>':
1210                         MAYBE_PROLOG
1211                         MAYBE('=', T_GREATEREQUAL)
1212                                 case '>':
1213                                         MAYBE_PROLOG
1214                                         MAYBE('=', T_GREATERGREATEREQUAL)
1215                                         ELSE(T_GREATERGREATER)
1216                         ELSE('>')
1217                 case '^':
1218                         MAYBE_PROLOG
1219                         MAYBE('=', T_CARETEQUAL)
1220                         ELSE('^')
1221                 case '|':
1222                         MAYBE_PROLOG
1223                         MAYBE('=', T_PIPEEQUAL)
1224                         MAYBE('|', T_PIPEPIPE)
1225                         ELSE('|')
1226                 case ':':
1227                         MAYBE_PROLOG
1228                         MAYBE('>', T_COLONGREATER)
1229                         ELSE(':')
1230                 case '=':
1231                         MAYBE_PROLOG
1232                         MAYBE('=', T_EQUALEQUAL)
1233                         ELSE('=')
1234                 case '#':
1235                         MAYBE_PROLOG
1236                         MAYBE('#', T_HASHHASH)
1237                         ELSE('#')
1238
1239                 case '?':
1240                 case '[':
1241                 case ']':
1242                 case '(':
1243                 case ')':
1244                 case '{':
1245                 case '}':
1246                 case '~':
1247                 case ';':
1248                 case ',':
1249                 case '\\':
1250                         lexer_token.type = c;
1251                         next_char();
1252                         return;
1253
1254                 case EOF:
1255                         lexer_token.type = T_EOF;
1256                         return;
1257
1258                 default:
1259                         next_char();
1260                         error_prefix();
1261                         fprintf(stderr, "unknown character '%c' found\n", c);
1262                         lexer_token.type = T_ERROR;
1263                         return;
1264                 }
1265         }
1266 }
1267
1268 void lexer_next_token(void)
1269 {
1270         lexer_next_preprocessing_token();
1271         if(lexer_token.type != '\n')
1272                 return;
1273
1274 newline_found:
1275         do {
1276                 lexer_next_preprocessing_token();
1277         } while(lexer_token.type == '\n');
1278
1279         if(lexer_token.type == '#') {
1280                 parse_preprocessor_directive();
1281                 goto newline_found;
1282         }
1283 }
1284
1285 void init_lexer(void)
1286 {
1287         strset_init(&stringset);
1288
1289         type_int       = make_atomic_type(ATOMIC_TYPE_INT, TYPE_QUALIFIER_NONE);
1290         type_uint      = make_atomic_type(ATOMIC_TYPE_UINT, TYPE_QUALIFIER_NONE);
1291         type_long      = make_atomic_type(ATOMIC_TYPE_LONG, TYPE_QUALIFIER_NONE);
1292         type_ulong     = make_atomic_type(ATOMIC_TYPE_ULONG, TYPE_QUALIFIER_NONE);
1293         type_longlong  = make_atomic_type(ATOMIC_TYPE_LONGLONG,
1294                                           TYPE_QUALIFIER_NONE);
1295         type_ulonglong = make_atomic_type(ATOMIC_TYPE_ULONGLONG,
1296                                           TYPE_QUALIFIER_NONE);
1297
1298         type_float      = make_atomic_type(ATOMIC_TYPE_FLOAT, TYPE_QUALIFIER_CONST);
1299         type_double     = make_atomic_type(ATOMIC_TYPE_DOUBLE,
1300                                            TYPE_QUALIFIER_CONST);
1301         type_longdouble = make_atomic_type(ATOMIC_TYPE_LONG_DOUBLE,
1302                                            TYPE_QUALIFIER_CONST);
1303 }
1304
1305 void lexer_open_stream(FILE *stream, const char *input_name)
1306 {
1307         input                                  = stream;
1308         lexer_token.source_position.linenr     = 0;
1309         lexer_token.source_position.input_name = input_name;
1310
1311         symbol_L = symbol_table_insert("L");
1312
1313         /* place a virtual \n at the beginning so the lexer knows that we're
1314          * at the beginning of a line */
1315         c = '\n';
1316 }
1317
1318 void exit_lexer(void)
1319 {
1320         strset_destroy(&stringset);
1321 }
1322
1323 static __attribute__((unused))
1324 void dbg_pos(const source_position_t source_position)
1325 {
1326         fprintf(stdout, "%s:%u\n", source_position.input_name,
1327                 source_position.linenr);
1328         fflush(stdout);
1329 }