add option -l and -L, reorganize option handler
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "diagnostic.h"
4 #include "lexer.h"
5 #include "token_t.h"
6 #include "symbol_table_t.h"
7 #include "adt/error.h"
8 #include "adt/strset.h"
9 #include "adt/util.h"
10 #include "types.h"
11 #include "type_t.h"
12 #include "target_architecture.h"
13 #include "parser.h"
14 #include "warning.h"
15
16 #include <assert.h>
17 #include <errno.h>
18 #include <string.h>
19 #include <stdbool.h>
20 #include <ctype.h>
21
22 //#define DEBUG_CHARS
23 #define MAX_PUTBACK 3
24
25 #ifdef _WIN32
26 /* No strtold on windows and no replacement yet */
27 #define strtold(s, e) strtod(s, e)
28 #endif
29
30 #if defined HAS_SIGNED_CHAR
31 typedef signed char char_type;
32 #elif defined HAS_UNSIGNED_CHAR
33 typedef unsigned char char_type;
34 #else
35 #       error signedness of char not determined
36 #endif
37
38 static int         c;
39 token_t            lexer_token;
40 symbol_t          *symbol_L;
41 static FILE       *input;
42 static char        buf[1024 + MAX_PUTBACK];
43 static const char *bufend;
44 static const char *bufpos;
45 static strset_t    stringset;
46
/**
 * Writes the "<input_name>:<linenr>: Error: " prefix of a diagnostic to
 * stderr. No newline is emitted; the caller prints the message itself.
 */
static void error_prefix_at(const char *input_name, unsigned linenr)
{
        fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
}
51
52 static void error_prefix(void)
53 {
54         error_prefix_at(lexer_token.source_position.input_name,
55                         lexer_token.source_position.linenr);
56 }
57
/**
 * Reports a lexer error at the current token position: prefix, then msg,
 * then a newline, all on stderr.
 */
static void parse_error(const char *msg)
{
        error_prefix();
        fputs(msg, stderr);
        fputc('\n', stderr);
}
63
64 static inline void next_real_char(void)
65 {
66         assert(bufpos <= bufend);
67         if (bufpos >= bufend) {
68                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
69                                  input);
70                 if(s == 0) {
71                         c = EOF;
72                         return;
73                 }
74                 bufpos = buf + MAX_PUTBACK;
75                 bufend = buf + MAX_PUTBACK + s;
76         }
77         c = *bufpos++;
78 }
79
80 static inline void put_back(int pc)
81 {
82         assert(bufpos > buf);
83         *(--bufpos - buf + buf) = (char) pc;
84
85 #ifdef DEBUG_CHARS
86         printf("putback '%c'\n", pc);
87 #endif
88 }
89
static inline void next_char(void);

/**
 * Expands to the `case` labels for "\r", "\r\n" and "\n" inside a switch on
 * c. Each label consumes the line ending, increments the line number of
 * lexer_token and then runs `code`. `code` has to leave the case itself
 * (break, return, ...); otherwise the '\r' case falls through into '\n'.
 */
#define MATCH_NEWLINE(code)                           \
        case '\r':                                    \
                next_char();                          \
                if (c == '\n') {                      \
                        next_char();                  \
                }                                     \
                lexer_token.source_position.linenr++; \
                code                                  \
        case '\n':                                    \
                next_char();                          \
                lexer_token.source_position.linenr++; \
                code
104
/**
 * Asserts that the current character equals c_type and advances to the next
 * character. The argument is parenthesized so that any expression can be
 * passed without changing the meaning of the comparison.
 */
#define eat(c_type)  do { assert(c == (c_type)); next_char(); } while (0)
106
107 static void maybe_concat_lines(void)
108 {
109         eat('\\');
110
111         switch(c) {
112         MATCH_NEWLINE(return;)
113
114         default:
115                 break;
116         }
117
118         put_back(c);
119         c = '\\';
120 }
121
122 static inline void next_char(void)
123 {
124         next_real_char();
125
126         /* filter trigraphs */
127         if(UNLIKELY(c == '\\')) {
128                 maybe_concat_lines();
129                 goto end_of_next_char;
130         }
131
132         if(LIKELY(c != '?'))
133                 goto end_of_next_char;
134
135         next_real_char();
136         if(LIKELY(c != '?')) {
137                 put_back(c);
138                 c = '?';
139                 goto end_of_next_char;
140         }
141
142         next_real_char();
143         switch(c) {
144         case '=': c = '#'; break;
145         case '(': c = '['; break;
146         case '/': c = '\\'; maybe_concat_lines(); break;
147         case ')': c = ']'; break;
148         case '\'': c = '^'; break;
149         case '<': c = '{'; break;
150         case '!': c = '|'; break;
151         case '>': c = '}'; break;
152         case '-': c = '~'; break;
153         default:
154                 put_back(c);
155                 put_back('?');
156                 c = '?';
157                 break;
158         }
159
160 end_of_next_char:;
161 #ifdef DEBUG_CHARS
162         printf("nchar '%c'\n", c);
163 #endif
164 }
165
/**
 * `case` labels for every character that may start an identifier:
 * the letters a-z and A-Z and the underscore.
 */
#define SYMBOL_CHARS                                                      \
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':       \
        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':       \
        case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':       \
        case 's': case 't': case 'u': case 'v': case 'w': case 'x':       \
        case 'y': case 'z':                                               \
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':       \
        case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':       \
        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':       \
        case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':       \
        case 'Y': case 'Z':                                               \
        case '_':
220
/** `case` labels for the decimal digits 0-9. */
#define DIGITS                                                  \
        case '0': case '1': case '2': case '3': case '4':       \
        case '5': case '6': case '7': case '8': case '9':
232
233 static void parse_symbol(void)
234 {
235         symbol_t *symbol;
236         char     *string;
237
238         obstack_1grow(&symbol_obstack, (char) c);
239         next_char();
240
241         while(1) {
242                 switch(c) {
243                 DIGITS
244                 SYMBOL_CHARS
245                         obstack_1grow(&symbol_obstack, (char) c);
246                         next_char();
247                         break;
248
249                 default:
250                         goto end_symbol;
251                 }
252         }
253
254 end_symbol:
255         obstack_1grow(&symbol_obstack, '\0');
256
257         string = obstack_finish(&symbol_obstack);
258         symbol = symbol_table_insert(string);
259
260         lexer_token.type     = symbol->ID;
261         lexer_token.v.symbol = symbol;
262
263         if(symbol->string != string) {
264                 obstack_free(&symbol_obstack, string);
265         }
266 }
267
268 static void parse_integer_suffix(bool is_oct_hex)
269 {
270         bool is_unsigned  = false;
271         bool min_long     = false;
272         bool min_longlong = false;
273
274         if(c == 'U' || c == 'u') {
275                 is_unsigned = true;
276                 next_char();
277                 if(c == 'L' || c == 'l') {
278                         min_long = true;
279                         next_char();
280                         if(c == 'L' || c == 'l') {
281                                 min_longlong = true;
282                                 next_char();
283                         }
284                 }
285         } else if(c == 'l' || c == 'L') {
286                 min_long = true;
287                 next_char();
288                 if(c == 'l' || c == 'L') {
289                         min_longlong = true;
290                         next_char();
291                         if(c == 'u' || c == 'U') {
292                                 is_unsigned = true;
293                                 next_char();
294                         }
295                 } else if(c == 'u' || c == 'U') {
296                         is_unsigned = true;
297                         next_char();
298                         lexer_token.datatype = type_unsigned_long;
299                 }
300         }
301
302         if(!is_unsigned) {
303                 long long v = lexer_token.v.intvalue;
304                 if(!min_long) {
305                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
306                                 lexer_token.datatype = type_int;
307                                 return;
308                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
309                                 lexer_token.datatype = type_unsigned_int;
310                                 return;
311                         }
312                 }
313                 if(!min_longlong) {
314                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
315                                 lexer_token.datatype = type_long;
316                                 return;
317                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
318                                 lexer_token.datatype = type_unsigned_long;
319                                 return;
320                         }
321                 }
322                 unsigned long long uv = (unsigned long long) v;
323                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
324                         lexer_token.datatype = type_unsigned_long_long;
325                         return;
326                 }
327
328                 lexer_token.datatype = type_long_long;
329         } else {
330                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
331                 if(!min_long && v <= TARGET_UINT_MAX) {
332                         lexer_token.datatype = type_unsigned_int;
333                         return;
334                 }
335                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
336                         lexer_token.datatype = type_unsigned_long;
337                         return;
338                 }
339                 lexer_token.datatype = type_unsigned_long_long;
340         }
341 }
342
343 static void parse_floating_suffix(void)
344 {
345         switch(c) {
346         /* TODO: do something usefull with the suffixes... */
347         case 'f':
348         case 'F':
349                 next_char();
350                 lexer_token.datatype = type_float;
351                 break;
352         case 'l':
353         case 'L':
354                 next_char();
355                 lexer_token.datatype = type_long_double;
356                 break;
357         default:
358                 lexer_token.datatype = type_double;
359                 break;
360         }
361 }
362
/**
 * A replacement for strtoull. Only those parts needed for
 * our parser are implemented.
 *
 * Converts the leading digits of s in the given base. Conversion stops at
 * the first character that is not a digit of that base, or at the first
 * digit that would push the value beyond the range of unsigned long long.
 *
 * @param s       NUL-terminated digit string (no sign, no prefix)
 * @param endptr  receives the position where conversion stopped; it points
 *                at the terminating NUL iff the whole string was converted
 * @param base    8, 10 or 16
 * @return the value of the digits that were consumed
 */
static unsigned long long parse_int_string(const char *s, const char **endptr, int base)
{
        if (base != 8 && base != 10 && base != 16) {
                assert(0);
                *endptr = s;
                return 0;
        }

        const unsigned long long ubase = (unsigned long long) base;
        const unsigned long long max   = ~0ULL;
        unsigned long long       v     = 0;

        for (;; ++s) {
                const char ch = *s;
                unsigned   digit;

                if (ch >= '0' && ch <= '9') {
                        digit = (unsigned) (ch - '0');
                } else if (ch >= 'a' && ch <= 'f') {
                        digit = (unsigned) (ch - 'a') + 10;
                } else if (ch >= 'A' && ch <= 'F') {
                        digit = (unsigned) (ch - 'A') + 10;
                } else {
                        break;
                }
                if (digit >= ubase)
                        break;

                /* check for overrun: stop before v * base + digit wraps,
                 * taking the digit itself into account */
                if (v > (max - digit) / ubase)
                        break;

                v = v * ubase + digit;
        }

        *endptr = s;
        return v;
}
446
447 static void parse_number_hex(void)
448 {
449         assert(c == 'x' || c == 'X');
450         next_char();
451
452         while(isxdigit(c)) {
453                 obstack_1grow(&symbol_obstack, (char) c);
454                 next_char();
455         }
456         obstack_1grow(&symbol_obstack, '\0');
457         char *string = obstack_finish(&symbol_obstack);
458
459         if(c == '.' || c == 'p' || c == 'P') {
460                 next_char();
461                 panic("Hex floating point numbers not implemented yet");
462         }
463         if(*string == '\0') {
464                 parse_error("invalid hex number");
465                 lexer_token.type = T_ERROR;
466         }
467
468         const char *endptr;
469         lexer_token.type       = T_INTEGER;
470         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
471         if(*endptr != '\0') {
472                 parse_error("hex number literal too long");
473         }
474
475         obstack_free(&symbol_obstack, string);
476         parse_integer_suffix(true);
477 }
478
/** Returns true iff chr is one of the characters '0' to '7'. */
static inline bool is_octal_digit(int chr)
{
        if (chr < '0')
                return false;
        return chr <= '7';
}
483
484 static void parse_number_oct(void)
485 {
486         while(is_octal_digit(c)) {
487                 obstack_1grow(&symbol_obstack, (char) c);
488                 next_char();
489         }
490         obstack_1grow(&symbol_obstack, '\0');
491         char *string = obstack_finish(&symbol_obstack);
492
493         const char *endptr;
494         lexer_token.type       = T_INTEGER;
495         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
496         if(*endptr != '\0') {
497                 parse_error("octal number literal too long");
498         }
499
500         obstack_free(&symbol_obstack, string);
501         parse_integer_suffix(true);
502 }
503
504 static void parse_number_dec(void)
505 {
506         bool is_float = false;
507         while(isdigit(c)) {
508                 obstack_1grow(&symbol_obstack, (char) c);
509                 next_char();
510         }
511
512         if(c == '.') {
513                 obstack_1grow(&symbol_obstack, '.');
514                 next_char();
515
516                 while(isdigit(c)) {
517                         obstack_1grow(&symbol_obstack, (char) c);
518                         next_char();
519                 }
520                 is_float = true;
521         }
522         if(c == 'e' || c == 'E') {
523                 obstack_1grow(&symbol_obstack, 'e');
524                 next_char();
525
526                 if(c == '-' || c == '+') {
527                         obstack_1grow(&symbol_obstack, (char) c);
528                         next_char();
529                 }
530
531                 while(isdigit(c)) {
532                         obstack_1grow(&symbol_obstack, (char) c);
533                         next_char();
534                 }
535                 is_float = true;
536         }
537
538         obstack_1grow(&symbol_obstack, '\0');
539         char *string = obstack_finish(&symbol_obstack);
540
541         if(is_float) {
542                 char *endptr;
543                 lexer_token.type         = T_FLOATINGPOINT;
544                 lexer_token.v.floatvalue = strtold(string, &endptr);
545
546                 if(*endptr != '\0') {
547                         parse_error("invalid number literal");
548                 }
549
550                 parse_floating_suffix();
551         } else {
552                 const char *endptr;
553                 lexer_token.type       = T_INTEGER;
554                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
555
556                 if(*endptr != '\0') {
557                         parse_error("invalid number literal");
558                 }
559
560                 parse_integer_suffix(false);
561         }
562         obstack_free(&symbol_obstack, string);
563 }
564
565 static void parse_number(void)
566 {
567         if (c == '0') {
568                 next_char();
569                 switch (c) {
570                         case 'X':
571                         case 'x':
572                                 parse_number_hex();
573                                 break;
574                         case '0':
575                         case '1':
576                         case '2':
577                         case '3':
578                         case '4':
579                         case '5':
580                         case '6':
581                         case '7':
582                                 parse_number_oct();
583                                 break;
584                         case '8':
585                         case '9':
586                                 next_char();
587                                 parse_error("invalid octal number");
588                                 lexer_token.type = T_ERROR;
589                                 return;
590                         case '.':
591                         case 'e':
592                         case 'E':
593                         default:
594                                 obstack_1grow(&symbol_obstack, '0');
595                                 parse_number_dec();
596                                 return;
597                 }
598         } else {
599                 parse_number_dec();
600         }
601 }
602
603 static int parse_octal_sequence(const int first_digit)
604 {
605         assert(is_octal_digit(first_digit));
606         int value = first_digit - '0';
607         if (!is_octal_digit(c)) return value;
608         value = 8 * value + c - '0';
609         next_char();
610         if (!is_octal_digit(c)) return value;
611         value = 8 * value + c - '0';
612         next_char();
613         return (char_type)value;
614 }
615
616 static int parse_hex_sequence(void)
617 {
618         int value = 0;
619         while(1) {
620                 if (c >= '0' && c <= '9') {
621                         value = 16 * value + c - '0';
622                 } else if ('A' <= c && c <= 'F') {
623                         value = 16 * value + c - 'A' + 10;
624                 } else if ('a' <= c && c <= 'f') {
625                         value = 16 * value + c - 'a' + 10;
626                 } else {
627                         break;
628                 }
629                 next_char();
630         }
631
632         return (char_type)value;
633 }
634
635 static int parse_escape_sequence(void)
636 {
637         eat('\\');
638
639         int ec = c;
640         next_char();
641
642         switch(ec) {
643         case '"':  return '"';
644         case '\'': return '\'';
645         case '\\': return '\\';
646         case '?': return '\?';
647         case 'a': return '\a';
648         case 'b': return '\b';
649         case 'f': return '\f';
650         case 'n': return '\n';
651         case 'r': return '\r';
652         case 't': return '\t';
653         case 'v': return '\v';
654         case 'x':
655                 return parse_hex_sequence();
656         case '0':
657         case '1':
658         case '2':
659         case '3':
660         case '4':
661         case '5':
662         case '6':
663         case '7':
664                 return parse_octal_sequence(ec);
665         case EOF:
666                 parse_error("reached end of file while parsing escape sequence");
667                 return EOF;
668         default:
669                 parse_error("unknown escape sequence");
670                 return EOF;
671         }
672 }
673
674 string_t concat_strings(const string_t *const s1, const string_t *const s2)
675 {
676         const size_t len1 = s1->size - 1;
677         const size_t len2 = s2->size - 1;
678
679         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
680         memcpy(concat, s1->begin, len1);
681         memcpy(concat + len1, s2->begin, len2 + 1);
682
683 #if 0 /* TODO hash */
684         const char *result = strset_insert(&stringset, concat);
685         if(result != concat) {
686                 obstack_free(&symbol_obstack, concat);
687         }
688
689         return result;
690 #else
691         return (string_t){ concat, len1 + len2 + 1 };
692 #endif
693 }
694
695 static void parse_string_literal(void)
696 {
697         const unsigned start_linenr = lexer_token.source_position.linenr;
698
699         assert(c == '"');
700         next_char();
701
702         int tc;
703         while(1) {
704                 switch(c) {
705                 case '\\':
706                         tc = parse_escape_sequence();
707                         obstack_1grow(&symbol_obstack, (char) tc);
708                         break;
709
710                 case EOF:
711                         error_prefix_at(lexer_token.source_position.input_name,
712                                         start_linenr);
713                         fprintf(stderr, "string has no end\n");
714                         lexer_token.type = T_ERROR;
715                         return;
716
717                 case '"':
718                         next_char();
719                         goto end_of_string;
720
721                 default:
722                         obstack_1grow(&symbol_obstack, (char) c);
723                         next_char();
724                         break;
725                 }
726         }
727
728 end_of_string:
729
730         /* TODO: concatenate multiple strings separated by whitespace... */
731
732         /* add finishing 0 to the string */
733         obstack_1grow(&symbol_obstack, '\0');
734         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
735         const char *const string = obstack_finish(&symbol_obstack);
736
737 #if 0 /* TODO hash */
738         /* check if there is already a copy of the string */
739         result = strset_insert(&stringset, string);
740         if(result != string) {
741                 obstack_free(&symbol_obstack, string);
742         }
743 #else
744         const char *const result = string;
745 #endif
746
747         lexer_token.type           = T_STRING_LITERAL;
748         lexer_token.v.string.begin = result;
749         lexer_token.v.string.size  = size;
750 }
751
752 static void parse_wide_character_constant(void)
753 {
754         eat('\'');
755
756         int found_char = 0;
757         while(1) {
758                 switch(c) {
759                 case '\\':
760                         found_char = parse_escape_sequence();
761                         break;
762
763                 MATCH_NEWLINE(
764                         parse_error("newline while parsing character constant");
765                         break;
766                 )
767
768                 case '\'':
769                         next_char();
770                         goto end_of_wide_char_constant;
771
772                 case EOF:
773                         parse_error("EOF while parsing character constant");
774                         lexer_token.type = T_ERROR;
775                         return;
776
777                 default:
778                         if(found_char != 0) {
779                                 parse_error("more than 1 characters in character "
780                                             "constant");
781                                 goto end_of_wide_char_constant;
782                         } else {
783                                 found_char = c;
784                                 next_char();
785                         }
786                         break;
787                 }
788         }
789
790 end_of_wide_char_constant:
791         lexer_token.type       = T_INTEGER;
792         lexer_token.v.intvalue = found_char;
793         lexer_token.datatype   = type_wchar_t;
794 }
795
796 static void parse_wide_string_literal(void)
797 {
798         const unsigned start_linenr = lexer_token.source_position.linenr;
799
800         assert(c == '"');
801         next_char();
802
803         while(1) {
804                 switch(c) {
805                         case '\\': {
806                                 wchar_rep_t tc = parse_escape_sequence();
807                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
808                                 break;
809                         }
810
811                         case EOF:
812                                 error_prefix_at(lexer_token.source_position.input_name,
813                                                 start_linenr);
814                                 fprintf(stderr, "string has no end\n");
815                                 lexer_token.type = T_ERROR;
816                                 return;
817
818                         case '"':
819                                 next_char();
820                                 goto end_of_string;
821
822                         default: {
823                                 wchar_rep_t tc = c;
824                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
825                                 next_char();
826                                 break;
827                         }
828                 }
829         }
830
831 end_of_string:;
832
833         /* TODO: concatenate multiple strings separated by whitespace... */
834
835         /* add finishing 0 to the string */
836         wchar_rep_t nul = L'\0';
837         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
838         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
839         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
840
841 #if 0 /* TODO hash */
842         /* check if there is already a copy of the string */
843         const wchar_rep_t *const result = strset_insert(&stringset, string);
844         if(result != string) {
845                 obstack_free(&symbol_obstack, string);
846         }
847 #else
848         const wchar_rep_t *const result = string;
849 #endif
850
851         lexer_token.type                = T_WIDE_STRING_LITERAL;
852         lexer_token.v.wide_string.begin = result;
853         lexer_token.v.wide_string.size  = size;
854 }
855
856 static void parse_character_constant(void)
857 {
858         eat('\'');
859
860         int found_char = 0;
861         while(1) {
862                 switch(c) {
863                 case '\\':
864                         found_char = parse_escape_sequence();
865                         break;
866
867                 MATCH_NEWLINE(
868                         parse_error("newline while parsing character constant");
869                         break;
870                 )
871
872                 case '\'':
873                         next_char();
874                         goto end_of_char_constant;
875
876                 case EOF:
877                         parse_error("EOF while parsing character constant");
878                         lexer_token.type = T_ERROR;
879                         return;
880
881                 default:
882                         if(found_char != 0) {
883                                 parse_error("more than 1 characters in character "
884                                             "constant");
885                                 goto end_of_char_constant;
886                         } else {
887                                 found_char = c;
888                                 next_char();
889                         }
890                         break;
891                 }
892         }
893
894 end_of_char_constant:
895         lexer_token.type       = T_INTEGER;
896         lexer_token.v.intvalue = found_char;
897         lexer_token.datatype   = type_int;
898 }
899
900 static void skip_multiline_comment(void)
901 {
902         unsigned start_linenr = lexer_token.source_position.linenr;
903
904         while(1) {
905                 switch(c) {
906                 case '*':
907                         next_char();
908                         if(c == '/') {
909                                 next_char();
910                                 return;
911                         }
912                         break;
913
914                 MATCH_NEWLINE(break;)
915
916                 case EOF:
917                         error_prefix_at(lexer_token.source_position.input_name,
918                                         start_linenr);
919                         fprintf(stderr, "at end of file while looking for comment end\n");
920                         return;
921
922                 default:
923                         next_char();
924                         break;
925                 }
926         }
927 }
928
929 static void skip_line_comment(void)
930 {
931         while(1) {
932                 switch(c) {
933                 case EOF:
934                         return;
935
936                 case '\n':
937                 case '\r':
938                         return;
939
940                 default:
941                         next_char();
942                         break;
943                 }
944         }
945 }
946
947 static token_t pp_token;
948
949 static inline void next_pp_token(void)
950 {
951         lexer_next_preprocessing_token();
952         pp_token = lexer_token;
953 }
954
955 static void eat_until_newline(void)
956 {
957         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
958                 next_pp_token();
959         }
960 }
961
/**
 * Reports a directive that the lexer cannot process and discards the rest
 * of its line.
 *
 * The remaining pp-tokens of the directive are skipped up to the new-line
 * so that they are not handed to the parser as if they were ordinary
 * program tokens.
 */
static void error_directive(void)
{
	error_prefix();
	fprintf(stderr, "#error directive: \n");

	/* skip pp-tokens until new-line; pp_token still holds the directive
	 * name here, so this consumes exactly the rest of the directive line */
	eat_until_newline();
}
969
970 static void define_directive(void)
971 {
972         lexer_next_preprocessing_token();
973         if(lexer_token.type != T_IDENTIFIER) {
974                 parse_error("expected identifier after #define\n");
975                 eat_until_newline();
976         }
977 }
978
/**
 * Placeholder for #ifdef / #ifndef handling.
 *
 * Only reads the token that follows the directive name; no condition is
 * evaluated.
 *
 * @param is_ifndef  non-zero for #ifndef, zero for #ifdef (currently unused)
 */
static void ifdef_directive(int is_ifndef)
{
	lexer_next_preprocessing_token();

	/* TODO: expect an identifier, then expect a newline */
	(void) is_ifndef;
}
986
/**
 * Placeholder for #endif handling; currently does nothing.
 */
static void endif_directive(void)
{
	/* TODO: expect a newline */
}
991
992 static void parse_line_directive(void)
993 {
994         if(pp_token.type != T_INTEGER) {
995                 parse_error("expected integer");
996         } else {
997                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
998                 next_pp_token();
999         }
1000         if(pp_token.type == T_STRING_LITERAL) {
1001                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1002                 next_pp_token();
1003         }
1004
1005         eat_until_newline();
1006 }
1007
1008 static void parse_preprocessor_identifier(void)
1009 {
1010         assert(pp_token.type == T_IDENTIFIER);
1011         symbol_t *symbol = pp_token.v.symbol;
1012
1013         switch(symbol->pp_ID) {
1014         case TP_include:
1015                 printf("include - enable header name parsing!\n");
1016                 break;
1017         case TP_define:
1018                 define_directive();
1019                 break;
1020         case TP_ifdef:
1021                 ifdef_directive(0);
1022                 break;
1023         case TP_ifndef:
1024                 ifdef_directive(1);
1025                 break;
1026         case TP_endif:
1027                 endif_directive();
1028                 break;
1029         case TP_line:
1030                 next_pp_token();
1031                 parse_line_directive();
1032                 break;
1033         case TP_if:
1034         case TP_else:
1035         case TP_elif:
1036         case TP_undef:
1037         case TP_error:
1038                 error_directive();
1039                 break;
1040         case TP_pragma:
1041                 if (warning.unknown_pragmas) {
1042                         warningf(lexer_token.source_position, "encountered unknown #pragma");
1043                 }
1044                 eat_until_newline();
1045                 break;
1046         }
1047 }
1048
1049 static void parse_preprocessor_directive(void)
1050 {
1051         next_pp_token();
1052
1053         switch(pp_token.type) {
1054         case T_IDENTIFIER:
1055                 parse_preprocessor_identifier();
1056                 break;
1057         case T_INTEGER:
1058                 parse_line_directive();
1059                 break;
1060         default:
1061                 parse_error("invalid preprocessor directive");
1062                 eat_until_newline();
1063                 break;
1064         }
1065 }
1066
/*
 * Helper macros for punctuators that consist of more than one character.
 * They are meant to be used inside a case of a switch over c, in a
 * function returning void:
 *
 *     case '&':
 *         MAYBE_PROLOG
 *         MAYBE('&', T_ANDAND)
 *         MAYBE('=', T_ANDEQUAL)
 *         ELSE('&')
 *
 * MAYBE_PROLOG consumes the first character and opens a nested
 * loop/switch over the following one.
 */
#define MAYBE_PROLOG \
	next_char(); \
	while(1) { \
		switch(c) {

/* If the next character is ch: consume it, emit set_type and return. */
#define MAYBE(ch, set_type) \
		case ch: \
			next_char(); \
			lexer_token.type = set_type; \
			return;

/*
 * Closes what MAYBE_PROLOG opened: code runs for every character that no
 * MAYBE matched. The trailing break leaves the enclosing switch.
 */
#define ELSE_CODE(code) \
		default: \
			code; \
		} \
	} /* end of while(1) */ \
	break;

/* Closing form that emits set_type without consuming the next character. */
#define ELSE(set_type) \
	ELSE_CODE( \
		lexer_token.type = set_type; \
		return; \
	)
1090
1091 void lexer_next_preprocessing_token(void)
1092 {
1093         while(1) {
1094                 switch(c) {
1095                 case ' ':
1096                 case '\t':
1097                         next_char();
1098                         break;
1099
1100                 MATCH_NEWLINE(
1101                         lexer_token.type = '\n';
1102                         return;
1103                 )
1104
1105                 SYMBOL_CHARS
1106                         parse_symbol();
1107                         /* might be a wide string ( L"string" ) */
1108                         if(lexer_token.type == T_IDENTIFIER &&
1109                             lexer_token.v.symbol == symbol_L) {
1110                             if(c == '"') {
1111                                         parse_wide_string_literal();
1112                                 } else if(c == '\'') {
1113                                         parse_wide_character_constant();
1114                                 }
1115                         }
1116                         return;
1117
1118                 DIGITS
1119                         parse_number();
1120                         return;
1121
1122                 case '"':
1123                         parse_string_literal();
1124                         return;
1125
1126                 case '\'':
1127                         parse_character_constant();
1128                         return;
1129
1130                 case '.':
1131                         MAYBE_PROLOG
1132                                 case '0':
1133                                 case '1':
1134                                 case '2':
1135                                 case '3':
1136                                 case '4':
1137                                 case '5':
1138                                 case '6':
1139                                 case '7':
1140                                 case '8':
1141                                 case '9':
1142                                         put_back(c);
1143                                         c = '.';
1144                                         parse_number_dec();
1145                                         return;
1146
1147                                 case '.':
1148                                         MAYBE_PROLOG
1149                                         MAYBE('.', T_DOTDOTDOT)
1150                                         ELSE_CODE(
1151                                                 put_back(c);
1152                                                 c = '.';
1153                                                 lexer_token.type = '.';
1154                                                 return;
1155                                         )
1156                         ELSE('.')
1157                 case '&':
1158                         MAYBE_PROLOG
1159                         MAYBE('&', T_ANDAND)
1160                         MAYBE('=', T_ANDEQUAL)
1161                         ELSE('&')
1162                 case '*':
1163                         MAYBE_PROLOG
1164                         MAYBE('=', T_ASTERISKEQUAL)
1165                         ELSE('*')
1166                 case '+':
1167                         MAYBE_PROLOG
1168                         MAYBE('+', T_PLUSPLUS)
1169                         MAYBE('=', T_PLUSEQUAL)
1170                         ELSE('+')
1171                 case '-':
1172                         MAYBE_PROLOG
1173                         MAYBE('>', T_MINUSGREATER)
1174                         MAYBE('-', T_MINUSMINUS)
1175                         MAYBE('=', T_MINUSEQUAL)
1176                         ELSE('-')
1177                 case '!':
1178                         MAYBE_PROLOG
1179                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1180                         ELSE('!')
1181                 case '/':
1182                         MAYBE_PROLOG
1183                         MAYBE('=', T_SLASHEQUAL)
1184                                 case '*':
1185                                         next_char();
1186                                         skip_multiline_comment();
1187                                         lexer_next_preprocessing_token();
1188                                         return;
1189                                 case '/':
1190                                         next_char();
1191                                         skip_line_comment();
1192                                         lexer_next_preprocessing_token();
1193                                         return;
1194                         ELSE('/')
1195                 case '%':
1196                         MAYBE_PROLOG
1197                         MAYBE('>', T_PERCENTGREATER)
1198                         MAYBE('=', T_PERCENTEQUAL)
1199                                 case ':':
1200                                         MAYBE_PROLOG
1201                                                 case '%':
1202                                                         MAYBE_PROLOG
1203                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1204                                                         ELSE_CODE(
1205                                                                 put_back(c);
1206                                                                 c = '%';
1207                                                                 lexer_token.type = T_PERCENTCOLON;
1208                                                                 return;
1209                                                         )
1210                                         ELSE(T_PERCENTCOLON)
1211                         ELSE('%')
1212                 case '<':
1213                         MAYBE_PROLOG
1214                         MAYBE(':', T_LESSCOLON)
1215                         MAYBE('%', T_LESSPERCENT)
1216                         MAYBE('=', T_LESSEQUAL)
1217                                 case '<':
1218                                         MAYBE_PROLOG
1219                                         MAYBE('=', T_LESSLESSEQUAL)
1220                                         ELSE(T_LESSLESS)
1221                         ELSE('<')
1222                 case '>':
1223                         MAYBE_PROLOG
1224                         MAYBE('=', T_GREATEREQUAL)
1225                                 case '>':
1226                                         MAYBE_PROLOG
1227                                         MAYBE('=', T_GREATERGREATEREQUAL)
1228                                         ELSE(T_GREATERGREATER)
1229                         ELSE('>')
1230                 case '^':
1231                         MAYBE_PROLOG
1232                         MAYBE('=', T_CARETEQUAL)
1233                         ELSE('^')
1234                 case '|':
1235                         MAYBE_PROLOG
1236                         MAYBE('=', T_PIPEEQUAL)
1237                         MAYBE('|', T_PIPEPIPE)
1238                         ELSE('|')
1239                 case ':':
1240                         MAYBE_PROLOG
1241                         MAYBE('>', T_COLONGREATER)
1242                         ELSE(':')
1243                 case '=':
1244                         MAYBE_PROLOG
1245                         MAYBE('=', T_EQUALEQUAL)
1246                         ELSE('=')
1247                 case '#':
1248                         MAYBE_PROLOG
1249                         MAYBE('#', T_HASHHASH)
1250                         ELSE('#')
1251
1252                 case '?':
1253                 case '[':
1254                 case ']':
1255                 case '(':
1256                 case ')':
1257                 case '{':
1258                 case '}':
1259                 case '~':
1260                 case ';':
1261                 case ',':
1262                 case '\\':
1263                         lexer_token.type = c;
1264                         next_char();
1265                         return;
1266
1267                 case EOF:
1268                         lexer_token.type = T_EOF;
1269                         return;
1270
1271                 default:
1272                         next_char();
1273                         error_prefix();
1274                         fprintf(stderr, "unknown character '%c' found\n", c);
1275                         lexer_token.type = T_ERROR;
1276                         return;
1277                 }
1278         }
1279 }
1280
1281 void lexer_next_token(void)
1282 {
1283         lexer_next_preprocessing_token();
1284         if(lexer_token.type != '\n')
1285                 return;
1286
1287 newline_found:
1288         do {
1289                 lexer_next_preprocessing_token();
1290         } while(lexer_token.type == '\n');
1291
1292         if(lexer_token.type == '#') {
1293                 parse_preprocessor_directive();
1294                 goto newline_found;
1295         }
1296 }
1297
1298 void init_lexer(void)
1299 {
1300         strset_init(&stringset);
1301 }
1302
1303 void lexer_open_stream(FILE *stream, const char *input_name)
1304 {
1305         input                                  = stream;
1306         lexer_token.source_position.linenr     = 0;
1307         lexer_token.source_position.input_name = input_name;
1308
1309         symbol_L = symbol_table_insert("L");
1310         bufpos = NULL;
1311         bufend = NULL;
1312
1313         /* place a virtual \n at the beginning so the lexer knows that we're
1314          * at the beginning of a line */
1315         c = '\n';
1316 }
1317
1318 void exit_lexer(void)
1319 {
1320         strset_destroy(&stringset);
1321 }
1322
1323 static __attribute__((unused))
1324 void dbg_pos(const source_position_t source_position)
1325 {
1326         fprintf(stdout, "%s:%u\n", source_position.input_name,
1327                 source_position.linenr);
1328         fflush(stdout);
1329 }