fix some wrong tokens
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "diagnostic.h"
4 #include "lexer.h"
5 #include "token_t.h"
6 #include "symbol_table_t.h"
7 #include "adt/error.h"
8 #include "adt/strset.h"
9 #include "adt/util.h"
10 #include "types.h"
11 #include "type_t.h"
12 #include "target_architecture.h"
13 #include "parser.h"
14 #include "warning.h"
15
16 #include <assert.h>
17 #include <errno.h>
18 #include <string.h>
19 #include <stdbool.h>
20 #include <ctype.h>
21
22 //#define DEBUG_CHARS
23 #define MAX_PUTBACK 3
24
25 #ifdef _WIN32
26 /* No strtold on windows and no replacement yet */
27 #define strtold(s, e) strtod(s, e)
28 #endif
29
30 #if defined HAS_SIGNED_CHAR
31 typedef signed char char_type;
32 #elif defined HAS_UNSIGNED_CHAR
33 typedef unsigned char char_type;
34 #else
35 #       error signedness of char not determined
36 #endif
37
38 static int         c;
39 token_t            lexer_token;
40 symbol_t          *symbol_L;
41 static FILE       *input;
42 static char        buf[1024 + MAX_PUTBACK];
43 static const char *bufend;
44 static const char *bufpos;
45 static strset_t    stringset;
46
47 /**
48  * Prints a parse error message at the current token.
49  *
50  * @param msg   the error message
51  */
52 static void parse_error(const char *msg)
53 {
54         errorf(lexer_token.source_position,  "%s", msg);
55 }
56
57 static inline void next_real_char(void)
58 {
59         assert(bufpos <= bufend);
60         if (bufpos >= bufend) {
61                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
62                                  input);
63                 if(s == 0) {
64                         c = EOF;
65                         return;
66                 }
67                 bufpos = buf + MAX_PUTBACK;
68                 bufend = buf + MAX_PUTBACK + s;
69         }
70         c = *bufpos++;
71 }
72
73 static inline void put_back(int pc)
74 {
75         assert(bufpos > buf);
76         *(--bufpos - buf + buf) = (char) pc;
77
78 #ifdef DEBUG_CHARS
79         printf("putback '%c'\n", pc);
80 #endif
81 }
82
83 static inline void next_char(void);
84
85 #define MATCH_NEWLINE(code)                   \
86         case '\r':                                \
87                 next_char();                          \
88                 if(c == '\n') {                       \
89                         next_char();                      \
90                 }                                     \
91                 lexer_token.source_position.linenr++; \
92                 code                                  \
93         case '\n':                                \
94                 next_char();                          \
95                 lexer_token.source_position.linenr++; \
96                 code
97
98 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
99
100 static void maybe_concat_lines(void)
101 {
102         eat('\\');
103
104         switch(c) {
105         MATCH_NEWLINE(return;)
106
107         default:
108                 break;
109         }
110
111         put_back(c);
112         c = '\\';
113 }
114
115 static inline void next_char(void)
116 {
117         next_real_char();
118
119         /* filter trigraphs */
120         if(UNLIKELY(c == '\\')) {
121                 maybe_concat_lines();
122                 goto end_of_next_char;
123         }
124
125         if(LIKELY(c != '?'))
126                 goto end_of_next_char;
127
128         next_real_char();
129         if(LIKELY(c != '?')) {
130                 put_back(c);
131                 c = '?';
132                 goto end_of_next_char;
133         }
134
135         next_real_char();
136         switch(c) {
137         case '=': c = '#'; break;
138         case '(': c = '['; break;
139         case '/': c = '\\'; maybe_concat_lines(); break;
140         case ')': c = ']'; break;
141         case '\'': c = '^'; break;
142         case '<': c = '{'; break;
143         case '!': c = '|'; break;
144         case '>': c = '}'; break;
145         case '-': c = '~'; break;
146         default:
147                 put_back(c);
148                 put_back('?');
149                 c = '?';
150                 break;
151         }
152
153 end_of_next_char:;
154 #ifdef DEBUG_CHARS
155         printf("nchar '%c'\n", c);
156 #endif
157 }
158
159 #define SYMBOL_CHARS  \
160         case 'a':         \
161         case 'b':         \
162         case 'c':         \
163         case 'd':         \
164         case 'e':         \
165         case 'f':         \
166         case 'g':         \
167         case 'h':         \
168         case 'i':         \
169         case 'j':         \
170         case 'k':         \
171         case 'l':         \
172         case 'm':         \
173         case 'n':         \
174         case 'o':         \
175         case 'p':         \
176         case 'q':         \
177         case 'r':         \
178         case 's':         \
179         case 't':         \
180         case 'u':         \
181         case 'v':         \
182         case 'w':         \
183         case 'x':         \
184         case 'y':         \
185         case 'z':         \
186         case 'A':         \
187         case 'B':         \
188         case 'C':         \
189         case 'D':         \
190         case 'E':         \
191         case 'F':         \
192         case 'G':         \
193         case 'H':         \
194         case 'I':         \
195         case 'J':         \
196         case 'K':         \
197         case 'L':         \
198         case 'M':         \
199         case 'N':         \
200         case 'O':         \
201         case 'P':         \
202         case 'Q':         \
203         case 'R':         \
204         case 'S':         \
205         case 'T':         \
206         case 'U':         \
207         case 'V':         \
208         case 'W':         \
209         case 'X':         \
210         case 'Y':         \
211         case 'Z':         \
212         case '_':
213
214 #define DIGITS        \
215         case '0':         \
216         case '1':         \
217         case '2':         \
218         case '3':         \
219         case '4':         \
220         case '5':         \
221         case '6':         \
222         case '7':         \
223         case '8':         \
224         case '9':
225
226 static void parse_symbol(void)
227 {
228         symbol_t *symbol;
229         char     *string;
230
231         obstack_1grow(&symbol_obstack, (char) c);
232         next_char();
233
234         while(1) {
235                 switch(c) {
236                 DIGITS
237                 SYMBOL_CHARS
238                         obstack_1grow(&symbol_obstack, (char) c);
239                         next_char();
240                         break;
241
242                 default:
243                         goto end_symbol;
244                 }
245         }
246
247 end_symbol:
248         obstack_1grow(&symbol_obstack, '\0');
249
250         string = obstack_finish(&symbol_obstack);
251         symbol = symbol_table_insert(string);
252
253         lexer_token.type     = symbol->ID;
254         lexer_token.v.symbol = symbol;
255
256         if(symbol->string != string) {
257                 obstack_free(&symbol_obstack, string);
258         }
259 }
260
261 static void parse_integer_suffix(bool is_oct_hex)
262 {
263         bool is_unsigned  = false;
264         bool min_long     = false;
265         bool min_longlong = false;
266
267         if(c == 'U' || c == 'u') {
268                 is_unsigned = true;
269                 next_char();
270                 if(c == 'L' || c == 'l') {
271                         min_long = true;
272                         next_char();
273                         if(c == 'L' || c == 'l') {
274                                 min_longlong = true;
275                                 next_char();
276                         }
277                 }
278         } else if(c == 'l' || c == 'L') {
279                 min_long = true;
280                 next_char();
281                 if(c == 'l' || c == 'L') {
282                         min_longlong = true;
283                         next_char();
284                         if(c == 'u' || c == 'U') {
285                                 is_unsigned = true;
286                                 next_char();
287                         }
288                 } else if(c == 'u' || c == 'U') {
289                         is_unsigned = true;
290                         next_char();
291                         lexer_token.datatype = type_unsigned_long;
292                 }
293         }
294
295         if(!is_unsigned) {
296                 long long v = lexer_token.v.intvalue;
297                 if(!min_long) {
298                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
299                                 lexer_token.datatype = type_int;
300                                 return;
301                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
302                                 lexer_token.datatype = type_unsigned_int;
303                                 return;
304                         }
305                 }
306                 if(!min_longlong) {
307                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
308                                 lexer_token.datatype = type_long;
309                                 return;
310                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
311                                 lexer_token.datatype = type_unsigned_long;
312                                 return;
313                         }
314                 }
315                 unsigned long long uv = (unsigned long long) v;
316                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
317                         lexer_token.datatype = type_unsigned_long_long;
318                         return;
319                 }
320
321                 lexer_token.datatype = type_long_long;
322         } else {
323                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
324                 if(!min_long && v <= TARGET_UINT_MAX) {
325                         lexer_token.datatype = type_unsigned_int;
326                         return;
327                 }
328                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
329                         lexer_token.datatype = type_unsigned_long;
330                         return;
331                 }
332                 lexer_token.datatype = type_unsigned_long_long;
333         }
334 }
335
336 static void parse_floating_suffix(void)
337 {
338         switch(c) {
339         /* TODO: do something usefull with the suffixes... */
340         case 'f':
341         case 'F':
342                 next_char();
343                 lexer_token.datatype = type_float;
344                 break;
345         case 'l':
346         case 'L':
347                 next_char();
348                 lexer_token.datatype = type_long_double;
349                 break;
350         default:
351                 lexer_token.datatype = type_double;
352                 break;
353         }
354 }
355
356 /**
357  * A replacement for strtoull. Only those parts needed for
358  * our parser are implemented.
359  */
360 static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
361         unsigned long long v = 0;
362
363         switch (base) {
364         case 16:
365                 for (;; ++s) {
366                         /* check for overrun */
367                         if (v >= 0x1000000000000000ULL)
368                                 break;
369                         switch (tolower(*s)) {
370                         case '0': v <<= 4; break;
371                         case '1': v <<= 4; v |= 0x1; break;
372                         case '2': v <<= 4; v |= 0x2; break;
373                         case '3': v <<= 4; v |= 0x3; break;
374                         case '4': v <<= 4; v |= 0x4; break;
375                         case '5': v <<= 4; v |= 0x5; break;
376                         case '6': v <<= 4; v |= 0x6; break;
377                         case '7': v <<= 4; v |= 0x7; break;
378                         case '8': v <<= 4; v |= 0x8; break;
379                         case '9': v <<= 4; v |= 0x9; break;
380                         case 'a': v <<= 4; v |= 0xa; break;
381                         case 'b': v <<= 4; v |= 0xb; break;
382                         case 'c': v <<= 4; v |= 0xc; break;
383                         case 'd': v <<= 4; v |= 0xd; break;
384                         case 'e': v <<= 4; v |= 0xe; break;
385                         case 'f': v <<= 4; v |= 0xf; break;
386                         default:
387                                 goto end;
388                         }
389                 }
390                 break;
391         case 8:
392                 for (;; ++s) {
393                         /* check for overrun */
394                         if (v >= 0x2000000000000000ULL)
395                                 break;
396                         switch (tolower(*s)) {
397                         case '0': v <<= 3; break;
398                         case '1': v <<= 3; v |= 1; break;
399                         case '2': v <<= 3; v |= 2; break;
400                         case '3': v <<= 3; v |= 3; break;
401                         case '4': v <<= 3; v |= 4; break;
402                         case '5': v <<= 3; v |= 5; break;
403                         case '6': v <<= 3; v |= 6; break;
404                         case '7': v <<= 3; v |= 7; break;
405                         default:
406                                 goto end;
407                         }
408                 }
409                 break;
410         case 10:
411                 for (;; ++s) {
412                         /* check for overrun */
413                         if (v > 0x1999999999999999ULL)
414                                 break;
415                         switch (tolower(*s)) {
416                         case '0': v *= 10; break;
417                         case '1': v *= 10; v += 1; break;
418                         case '2': v *= 10; v += 2; break;
419                         case '3': v *= 10; v += 3; break;
420                         case '4': v *= 10; v += 4; break;
421                         case '5': v *= 10; v += 5; break;
422                         case '6': v *= 10; v += 6; break;
423                         case '7': v *= 10; v += 7; break;
424                         case '8': v *= 10; v += 8; break;
425                         case '9': v *= 10; v += 9; break;
426                         default:
427                                 goto end;
428                         }
429                 }
430                 break;
431         default:
432                 assert(0);
433                 break;
434         }
435 end:
436         *endptr = s;
437         return v;
438 }
439
440 static void parse_number_hex(void)
441 {
442         assert(c == 'x' || c == 'X');
443         next_char();
444
445         while(isxdigit(c)) {
446                 obstack_1grow(&symbol_obstack, (char) c);
447                 next_char();
448         }
449         obstack_1grow(&symbol_obstack, '\0');
450         char *string = obstack_finish(&symbol_obstack);
451
452         if(c == '.' || c == 'p' || c == 'P') {
453                 next_char();
454                 panic("Hex floating point numbers not implemented yet");
455         }
456         if(*string == '\0') {
457                 parse_error("invalid hex number");
458                 lexer_token.type = T_ERROR;
459         }
460
461         const char *endptr;
462         lexer_token.type       = T_INTEGER;
463         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
464         if(*endptr != '\0') {
465                 parse_error("hex number literal too long");
466         }
467
468         obstack_free(&symbol_obstack, string);
469         parse_integer_suffix(true);
470 }
471
472 static inline bool is_octal_digit(int chr)
473 {
474         return '0' <= chr && chr <= '7';
475 }
476
477 static void parse_number_oct(void)
478 {
479         while(is_octal_digit(c)) {
480                 obstack_1grow(&symbol_obstack, (char) c);
481                 next_char();
482         }
483         obstack_1grow(&symbol_obstack, '\0');
484         char *string = obstack_finish(&symbol_obstack);
485
486         const char *endptr;
487         lexer_token.type       = T_INTEGER;
488         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
489         if(*endptr != '\0') {
490                 parse_error("octal number literal too long");
491         }
492
493         obstack_free(&symbol_obstack, string);
494         parse_integer_suffix(true);
495 }
496
497 static void parse_number_dec(void)
498 {
499         bool is_float = false;
500         while(isdigit(c)) {
501                 obstack_1grow(&symbol_obstack, (char) c);
502                 next_char();
503         }
504
505         if(c == '.') {
506                 obstack_1grow(&symbol_obstack, '.');
507                 next_char();
508
509                 while(isdigit(c)) {
510                         obstack_1grow(&symbol_obstack, (char) c);
511                         next_char();
512                 }
513                 is_float = true;
514         }
515         if(c == 'e' || c == 'E') {
516                 obstack_1grow(&symbol_obstack, 'e');
517                 next_char();
518
519                 if(c == '-' || c == '+') {
520                         obstack_1grow(&symbol_obstack, (char) c);
521                         next_char();
522                 }
523
524                 while(isdigit(c)) {
525                         obstack_1grow(&symbol_obstack, (char) c);
526                         next_char();
527                 }
528                 is_float = true;
529         }
530
531         obstack_1grow(&symbol_obstack, '\0');
532         char *string = obstack_finish(&symbol_obstack);
533
534         if(is_float) {
535                 char *endptr;
536                 lexer_token.type         = T_FLOATINGPOINT;
537                 lexer_token.v.floatvalue = strtold(string, &endptr);
538
539                 if(*endptr != '\0') {
540                         parse_error("invalid number literal");
541                 }
542
543                 parse_floating_suffix();
544         } else {
545                 const char *endptr;
546                 lexer_token.type       = T_INTEGER;
547                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
548
549                 if(*endptr != '\0') {
550                         parse_error("invalid number literal");
551                 }
552
553                 parse_integer_suffix(false);
554         }
555         obstack_free(&symbol_obstack, string);
556 }
557
558 static void parse_number(void)
559 {
560         if (c == '0') {
561                 next_char();
562                 switch (c) {
563                         case 'X':
564                         case 'x':
565                                 parse_number_hex();
566                                 break;
567                         case '0':
568                         case '1':
569                         case '2':
570                         case '3':
571                         case '4':
572                         case '5':
573                         case '6':
574                         case '7':
575                                 parse_number_oct();
576                                 break;
577                         case '8':
578                         case '9':
579                                 next_char();
580                                 parse_error("invalid octal number");
581                                 lexer_token.type = T_ERROR;
582                                 return;
583                         case '.':
584                         case 'e':
585                         case 'E':
586                         default:
587                                 obstack_1grow(&symbol_obstack, '0');
588                                 parse_number_dec();
589                                 return;
590                 }
591         } else {
592                 parse_number_dec();
593         }
594 }
595
596 static int parse_octal_sequence(const int first_digit)
597 {
598         assert(is_octal_digit(first_digit));
599         int value = first_digit - '0';
600         if (!is_octal_digit(c)) return value;
601         value = 8 * value + c - '0';
602         next_char();
603         if (!is_octal_digit(c)) return value;
604         value = 8 * value + c - '0';
605         next_char();
606         return (char_type)value;
607 }
608
609 static int parse_hex_sequence(void)
610 {
611         int value = 0;
612         while(1) {
613                 if (c >= '0' && c <= '9') {
614                         value = 16 * value + c - '0';
615                 } else if ('A' <= c && c <= 'F') {
616                         value = 16 * value + c - 'A' + 10;
617                 } else if ('a' <= c && c <= 'f') {
618                         value = 16 * value + c - 'a' + 10;
619                 } else {
620                         break;
621                 }
622                 next_char();
623         }
624
625         return (char_type)value;
626 }
627
628 static int parse_escape_sequence(void)
629 {
630         eat('\\');
631
632         int ec = c;
633         next_char();
634
635         switch(ec) {
636         case '"':  return '"';
637         case '\'': return '\'';
638         case '\\': return '\\';
639         case '?': return '\?';
640         case 'a': return '\a';
641         case 'b': return '\b';
642         case 'f': return '\f';
643         case 'n': return '\n';
644         case 'r': return '\r';
645         case 't': return '\t';
646         case 'v': return '\v';
647         case 'x':
648                 return parse_hex_sequence();
649         case '0':
650         case '1':
651         case '2':
652         case '3':
653         case '4':
654         case '5':
655         case '6':
656         case '7':
657                 return parse_octal_sequence(ec);
658         case EOF:
659                 parse_error("reached end of file while parsing escape sequence");
660                 return EOF;
661         default:
662                 parse_error("unknown escape sequence");
663                 return EOF;
664         }
665 }
666
667 string_t concat_strings(const string_t *const s1, const string_t *const s2)
668 {
669         const size_t len1 = s1->size - 1;
670         const size_t len2 = s2->size - 1;
671
672         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
673         memcpy(concat, s1->begin, len1);
674         memcpy(concat + len1, s2->begin, len2 + 1);
675
676 #if 0 /* TODO hash */
677         const char *result = strset_insert(&stringset, concat);
678         if(result != concat) {
679                 obstack_free(&symbol_obstack, concat);
680         }
681
682         return result;
683 #else
684         return (string_t){ concat, len1 + len2 + 1 };
685 #endif
686 }
687
688 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
689 {
690         const size_t len1 = s1->size - 1;
691         const size_t len2 = s2->size - 1;
692
693         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
694         const char *const src = s1->begin;
695         for (size_t i = 0; i != len1; ++i) {
696                 concat[i] = src[i];
697         }
698         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
699
700         return (wide_string_t){ concat, len1 + len2 + 1 };
701 }
702
703 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
704 {
705         const size_t len1 = s1->size - 1;
706         const size_t len2 = s2->size - 1;
707
708         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
709         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
710         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
711
712         return (wide_string_t){ concat, len1 + len2 + 1 };
713 }
714
715 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
716 {
717         const size_t len1 = s1->size - 1;
718         const size_t len2 = s2->size - 1;
719
720         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
721         memcpy(concat, s1->begin, len1 * sizeof(*concat));
722         const char *const src = s2->begin;
723         for (size_t i = 0; i != len2 + 1; ++i) {
724                 concat[i] = src[i];
725         }
726
727         return (wide_string_t){ concat, len1 + len2 + 1 };
728 }
729
730 static void parse_string_literal(void)
731 {
732         const unsigned start_linenr = lexer_token.source_position.linenr;
733
734         eat('"');
735
736         int tc;
737         while(1) {
738                 switch(c) {
739                 case '\\':
740                         tc = parse_escape_sequence();
741                         obstack_1grow(&symbol_obstack, (char) tc);
742                         break;
743
744                 case EOF: {
745                         source_position_t source_position;
746                         source_position.input_name = lexer_token.source_position.input_name;
747                         source_position.linenr     = start_linenr;
748                         errorf(source_position, "string has no end");
749                         lexer_token.type = T_ERROR;
750                         return;
751                 }
752
753                 case '"':
754                         next_char();
755                         goto end_of_string;
756
757                 default:
758                         obstack_1grow(&symbol_obstack, (char) c);
759                         next_char();
760                         break;
761                 }
762         }
763
764 end_of_string:
765
766         /* TODO: concatenate multiple strings separated by whitespace... */
767
768         /* add finishing 0 to the string */
769         obstack_1grow(&symbol_obstack, '\0');
770         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
771         const char *const string = obstack_finish(&symbol_obstack);
772
773 #if 0 /* TODO hash */
774         /* check if there is already a copy of the string */
775         result = strset_insert(&stringset, string);
776         if(result != string) {
777                 obstack_free(&symbol_obstack, string);
778         }
779 #else
780         const char *const result = string;
781 #endif
782
783         lexer_token.type           = T_STRING_LITERAL;
784         lexer_token.v.string.begin = result;
785         lexer_token.v.string.size  = size;
786 }
787
788 static void parse_wide_character_constant(void)
789 {
790         eat('\'');
791
792         int found_char = 0;
793         while(1) {
794                 switch(c) {
795                 case '\\':
796                         found_char = parse_escape_sequence();
797                         break;
798
799                 MATCH_NEWLINE(
800                         parse_error("newline while parsing character constant");
801                         break;
802                 )
803
804                 case '\'':
805                         next_char();
806                         goto end_of_wide_char_constant;
807
808                 case EOF:
809                         parse_error("EOF while parsing character constant");
810                         lexer_token.type = T_ERROR;
811                         return;
812
813                 default:
814                         if(found_char != 0) {
815                                 parse_error("more than 1 characters in character "
816                                             "constant");
817                                 goto end_of_wide_char_constant;
818                         } else {
819                                 found_char = c;
820                                 next_char();
821                         }
822                         break;
823                 }
824         }
825
826 end_of_wide_char_constant:
827         lexer_token.type       = T_INTEGER;
828         lexer_token.v.intvalue = found_char;
829         lexer_token.datatype   = type_wchar_t;
830 }
831
832 static void parse_wide_string_literal(void)
833 {
834         const unsigned start_linenr = lexer_token.source_position.linenr;
835
836         assert(c == '"');
837         next_char();
838
839         while(1) {
840                 switch(c) {
841                 case '\\': {
842                         wchar_rep_t tc = parse_escape_sequence();
843                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
844                         break;
845                 }
846
847                 case EOF: {
848                         source_position_t source_position;
849                         source_position.input_name = lexer_token.source_position.input_name;
850                         source_position.linenr     = start_linenr;
851                         errorf(source_position, "string has no end");
852                         lexer_token.type = T_ERROR;
853                         return;
854                 }
855
856                 case '"':
857                         next_char();
858                         goto end_of_string;
859
860                 default: {
861                         wchar_rep_t tc = c;
862                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
863                         next_char();
864                         break;
865                 }
866                 }
867         }
868
869 end_of_string:;
870
871         /* TODO: concatenate multiple strings separated by whitespace... */
872
873         /* add finishing 0 to the string */
874         wchar_rep_t nul = L'\0';
875         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
876         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
877         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
878
879 #if 0 /* TODO hash */
880         /* check if there is already a copy of the string */
881         const wchar_rep_t *const result = strset_insert(&stringset, string);
882         if(result != string) {
883                 obstack_free(&symbol_obstack, string);
884         }
885 #else
886         const wchar_rep_t *const result = string;
887 #endif
888
889         lexer_token.type                = T_WIDE_STRING_LITERAL;
890         lexer_token.v.wide_string.begin = result;
891         lexer_token.v.wide_string.size  = size;
892 }
893
894 static void parse_character_constant(void)
895 {
896         const unsigned start_linenr = lexer_token.source_position.linenr;
897
898         eat('\'');
899
900         int tc;
901         while(1) {
902                 switch(c) {
903                 case '\\':
904                         tc = parse_escape_sequence();
905                         obstack_1grow(&symbol_obstack, (char) tc);
906                         break;
907
908                 MATCH_NEWLINE(
909                         parse_error("newline while parsing character constant");
910                         break;
911                 )
912
913                 case EOF: {
914                         source_position_t source_position;
915                         source_position.input_name = lexer_token.source_position.input_name;
916                         source_position.linenr     = start_linenr;
917                         errorf(source_position, "EOF while parsing character constant");
918                         lexer_token.type = T_ERROR;
919                         return;
920                 }
921
922                 case '\'':
923                         next_char();
924                         goto end_of_char_constant;
925
926                 default:
927                         obstack_1grow(&symbol_obstack, (char) c);
928                         next_char();
929                         break;
930
931                 }
932         }
933
934 end_of_char_constant:;
935         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
936         const char *const string = obstack_finish(&symbol_obstack);
937
938         lexer_token.type           = T_CHARS;
939         lexer_token.v.string.begin = string;
940         lexer_token.v.string.size  = size;
941         lexer_token.datatype       = type_int;
942 }
943
944 static void skip_multiline_comment(void)
945 {
946         unsigned start_linenr = lexer_token.source_position.linenr;
947
948         while(1) {
949                 switch(c) {
950                 case '/':
951                         next_char();
952                         if (c == '*') {
953                                 /* TODO: nested comment, warn here */
954                         }
955                         break;
956                 case '*':
957                         next_char();
958                         if(c == '/') {
959                                 next_char();
960                                 return;
961                         }
962                         break;
963
964                 MATCH_NEWLINE(break;)
965
966                 case EOF: {
967                         source_position_t source_position;
968                         source_position.input_name = lexer_token.source_position.input_name;
969                         source_position.linenr     = start_linenr;
970                         errorf(source_position, "at end of file while looking for comment end");
971                         return;
972                 }
973
974                 default:
975                         next_char();
976                         break;
977                 }
978         }
979 }
980
981 static void skip_line_comment(void)
982 {
983         while(1) {
984                 switch(c) {
985                 case EOF:
986                         return;
987
988                 case '\n':
989                 case '\r':
990                         return;
991
992                 default:
993                         next_char();
994                         break;
995                 }
996         }
997 }
998
999 static token_t pp_token;
1000
1001 static inline void next_pp_token(void)
1002 {
1003         lexer_next_preprocessing_token();
1004         pp_token = lexer_token;
1005 }
1006
1007 static void eat_until_newline(void)
1008 {
1009         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1010                 next_pp_token();
1011         }
1012 }
1013
1014 static void define_directive(void)
1015 {
1016         lexer_next_preprocessing_token();
1017         if(lexer_token.type != T_IDENTIFIER) {
1018                 parse_error("expected identifier after #define\n");
1019                 eat_until_newline();
1020         }
1021 }
1022
1023 static void ifdef_directive(int is_ifndef)
1024 {
1025         (void) is_ifndef;
1026         lexer_next_preprocessing_token();
1027         //expect_identifier();
1028         //extect_newline();
1029 }
1030
1031 static void endif_directive(void)
1032 {
1033         //expect_newline();
1034 }
1035
1036 static void parse_line_directive(void)
1037 {
1038         if(pp_token.type != T_INTEGER) {
1039                 parse_error("expected integer");
1040         } else {
1041                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1042                 next_pp_token();
1043         }
1044         if(pp_token.type == T_STRING_LITERAL) {
1045                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1046                 next_pp_token();
1047         }
1048
1049         eat_until_newline();
1050 }
1051
1052 static void parse_preprocessor_identifier(void)
1053 {
1054         assert(pp_token.type == T_IDENTIFIER);
1055         symbol_t *symbol = pp_token.v.symbol;
1056
1057         switch(symbol->pp_ID) {
1058         case TP_include:
1059                 printf("include - enable header name parsing!\n");
1060                 break;
1061         case TP_define:
1062                 define_directive();
1063                 break;
1064         case TP_ifdef:
1065                 ifdef_directive(0);
1066                 break;
1067         case TP_ifndef:
1068                 ifdef_directive(1);
1069                 break;
1070         case TP_endif:
1071                 endif_directive();
1072                 break;
1073         case TP_line:
1074                 next_pp_token();
1075                 parse_line_directive();
1076                 break;
1077         case TP_if:
1078         case TP_else:
1079         case TP_elif:
1080         case TP_undef:
1081         case TP_error:
1082                 /* TODO; output the rest of the line */
1083                 parse_error("#error directive: ");
1084                 break;
1085         case TP_pragma:
1086                 if (warning.unknown_pragmas) {
1087                         warningf(lexer_token.source_position, "encountered unknown #pragma");
1088                 }
1089                 eat_until_newline();
1090                 break;
1091         }
1092 }
1093
1094 static void parse_preprocessor_directive(void)
1095 {
1096         next_pp_token();
1097
1098         switch(pp_token.type) {
1099         case T_IDENTIFIER:
1100                 parse_preprocessor_identifier();
1101                 break;
1102         case T_INTEGER:
1103                 parse_line_directive();
1104                 break;
1105         default:
1106                 parse_error("invalid preprocessor directive");
1107                 eat_until_newline();
1108                 break;
1109         }
1110 }
1111
1112 #define MAYBE_PROLOG                                       \
1113                         next_char();                                   \
1114                         while(1) {                                     \
1115                                 switch(c) {
1116
1117 #define MAYBE(ch, set_type)                                \
1118                                 case ch:                                   \
1119                                         next_char();                           \
1120                                         lexer_token.type = set_type;           \
1121                                         return;
1122
1123 #define ELSE_CODE(code)                                    \
1124                                 default:                                   \
1125                                         code;                                  \
1126                                 }                                          \
1127                         } /* end of while(1) */                        \
1128                         break;
1129
1130 #define ELSE(set_type)                                     \
1131                 ELSE_CODE(                                         \
1132                         lexer_token.type = set_type;                   \
1133                         return;                                        \
1134                 )
1135
1136 void lexer_next_preprocessing_token(void)
1137 {
1138         while(1) {
1139                 switch(c) {
1140                 case ' ':
1141                 case '\t':
1142                         next_char();
1143                         break;
1144
1145                 MATCH_NEWLINE(
1146                         lexer_token.type = '\n';
1147                         return;
1148                 )
1149
1150                 SYMBOL_CHARS
1151                         parse_symbol();
1152                         /* might be a wide string ( L"string" ) */
1153                         if(lexer_token.type == T_IDENTIFIER &&
1154                             lexer_token.v.symbol == symbol_L) {
1155                             if(c == '"') {
1156                                         parse_wide_string_literal();
1157                                 } else if(c == '\'') {
1158                                         parse_wide_character_constant();
1159                                 }
1160                         }
1161                         return;
1162
1163                 DIGITS
1164                         parse_number();
1165                         return;
1166
1167                 case '"':
1168                         parse_string_literal();
1169                         return;
1170
1171                 case '\'':
1172                         parse_character_constant();
1173                         return;
1174
1175                 case '.':
1176                         MAYBE_PROLOG
1177                                 case '0':
1178                                 case '1':
1179                                 case '2':
1180                                 case '3':
1181                                 case '4':
1182                                 case '5':
1183                                 case '6':
1184                                 case '7':
1185                                 case '8':
1186                                 case '9':
1187                                         put_back(c);
1188                                         c = '.';
1189                                         parse_number_dec();
1190                                         return;
1191
1192                                 case '.':
1193                                         MAYBE_PROLOG
1194                                         MAYBE('.', T_DOTDOTDOT)
1195                                         ELSE_CODE(
1196                                                 put_back(c);
1197                                                 c = '.';
1198                                                 lexer_token.type = '.';
1199                                                 return;
1200                                         )
1201                         ELSE('.')
1202                 case '&':
1203                         MAYBE_PROLOG
1204                         MAYBE('&', T_ANDAND)
1205                         MAYBE('=', T_ANDEQUAL)
1206                         ELSE('&')
1207                 case '*':
1208                         MAYBE_PROLOG
1209                         MAYBE('=', T_ASTERISKEQUAL)
1210                         ELSE('*')
1211                 case '+':
1212                         MAYBE_PROLOG
1213                         MAYBE('+', T_PLUSPLUS)
1214                         MAYBE('=', T_PLUSEQUAL)
1215                         ELSE('+')
1216                 case '-':
1217                         MAYBE_PROLOG
1218                         MAYBE('>', T_MINUSGREATER)
1219                         MAYBE('-', T_MINUSMINUS)
1220                         MAYBE('=', T_MINUSEQUAL)
1221                         ELSE('-')
1222                 case '!':
1223                         MAYBE_PROLOG
1224                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1225                         ELSE('!')
1226                 case '/':
1227                         MAYBE_PROLOG
1228                         MAYBE('=', T_SLASHEQUAL)
1229                                 case '*':
1230                                         next_char();
1231                                         skip_multiline_comment();
1232                                         lexer_next_preprocessing_token();
1233                                         return;
1234                                 case '/':
1235                                         next_char();
1236                                         skip_line_comment();
1237                                         lexer_next_preprocessing_token();
1238                                         return;
1239                         ELSE('/')
1240                 case '%':
1241                         MAYBE_PROLOG
1242                         MAYBE('>', T_PERCENTGREATER)
1243                         MAYBE('=', T_PERCENTEQUAL)
1244                                 case ':':
1245                                         MAYBE_PROLOG
1246                                                 case '%':
1247                                                         MAYBE_PROLOG
1248                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1249                                                         ELSE_CODE(
1250                                                                 put_back(c);
1251                                                                 c = '%';
1252                                                                 lexer_token.type = T_PERCENTCOLON;
1253                                                                 return;
1254                                                         )
1255                                         ELSE(T_PERCENTCOLON)
1256                         ELSE('%')
1257                 case '<':
1258                         MAYBE_PROLOG
1259                         MAYBE(':', T_LESSCOLON)
1260                         MAYBE('%', T_LESSPERCENT)
1261                         MAYBE('=', T_LESSEQUAL)
1262                                 case '<':
1263                                         MAYBE_PROLOG
1264                                         MAYBE('=', T_LESSLESSEQUAL)
1265                                         ELSE(T_LESSLESS)
1266                         ELSE('<')
1267                 case '>':
1268                         MAYBE_PROLOG
1269                         MAYBE('=', T_GREATEREQUAL)
1270                                 case '>':
1271                                         MAYBE_PROLOG
1272                                         MAYBE('=', T_GREATERGREATEREQUAL)
1273                                         ELSE(T_GREATERGREATER)
1274                         ELSE('>')
1275                 case '^':
1276                         MAYBE_PROLOG
1277                         MAYBE('=', T_CARETEQUAL)
1278                         ELSE('^')
1279                 case '|':
1280                         MAYBE_PROLOG
1281                         MAYBE('=', T_PIPEEQUAL)
1282                         MAYBE('|', T_PIPEPIPE)
1283                         ELSE('|')
1284                 case ':':
1285                         MAYBE_PROLOG
1286                         MAYBE('>', T_COLONGREATER)
1287                         ELSE(':')
1288                 case '=':
1289                         MAYBE_PROLOG
1290                         MAYBE('=', T_EQUALEQUAL)
1291                         ELSE('=')
1292                 case '#':
1293                         MAYBE_PROLOG
1294                         MAYBE('#', T_HASHHASH)
1295                         ELSE('#')
1296
1297                 case '?':
1298                 case '[':
1299                 case ']':
1300                 case '(':
1301                 case ')':
1302                 case '{':
1303                 case '}':
1304                 case '~':
1305                 case ';':
1306                 case ',':
1307                 case '\\':
1308                         lexer_token.type = c;
1309                         next_char();
1310                         return;
1311
1312                 case EOF:
1313                         lexer_token.type = T_EOF;
1314                         return;
1315
1316                 default:
1317                         next_char();
1318                         errorf(lexer_token.source_position, "unknown character '%c' found\n", c);
1319                         lexer_token.type = T_ERROR;
1320                         return;
1321                 }
1322         }
1323 }
1324
1325 void lexer_next_token(void)
1326 {
1327         lexer_next_preprocessing_token();
1328         if(lexer_token.type != '\n')
1329                 return;
1330
1331 newline_found:
1332         do {
1333                 lexer_next_preprocessing_token();
1334         } while(lexer_token.type == '\n');
1335
1336         if(lexer_token.type == '#') {
1337                 parse_preprocessor_directive();
1338                 goto newline_found;
1339         }
1340 }
1341
1342 void init_lexer(void)
1343 {
1344         strset_init(&stringset);
1345 }
1346
1347 void lexer_open_stream(FILE *stream, const char *input_name)
1348 {
1349         input                                  = stream;
1350         lexer_token.source_position.linenr     = 0;
1351         lexer_token.source_position.input_name = input_name;
1352
1353         symbol_L = symbol_table_insert("L");
1354         bufpos = NULL;
1355         bufend = NULL;
1356
1357         /* place a virtual \n at the beginning so the lexer knows that we're
1358          * at the beginning of a line */
1359         c = '\n';
1360 }
1361
1362 void exit_lexer(void)
1363 {
1364         strset_destroy(&stringset);
1365 }
1366
1367 static __attribute__((unused))
1368 void dbg_pos(const source_position_t source_position)
1369 {
1370         fprintf(stdout, "%s:%u\n", source_position.input_name,
1371                 source_position.linenr);
1372         fflush(stdout);
1373 }