d7b280978c3aaead5661de210be1dccf536a7d93
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "diagnostic.h"
4 #include "lexer.h"
5 #include "token_t.h"
6 #include "symbol_table_t.h"
7 #include "adt/error.h"
8 #include "adt/strset.h"
9 #include "adt/util.h"
10 #include "types.h"
11 #include "type_t.h"
12 #include "target_architecture.h"
13 #include "parser.h"
14 #include "warning.h"
15
16 #include <assert.h>
17 #include <errno.h>
18 #include <string.h>
19 #include <stdbool.h>
20 #include <ctype.h>
21
22 //#define DEBUG_CHARS
23 #define MAX_PUTBACK 3
24
25 #ifdef _WIN32
26 /* No strtold on windows and no replacement yet */
27 #define strtold(s, e) strtod(s, e)
28 #endif
29
30 #if defined HAS_SIGNED_CHAR
31 typedef signed char char_type;
32 #elif defined HAS_UNSIGNED_CHAR
33 typedef unsigned char char_type;
34 #else
35 #       error signedness of char not determined
36 #endif
37
38 static int         c;
39 token_t            lexer_token;
40 symbol_t          *symbol_L;
41 static FILE       *input;
42 static char        buf[1024 + MAX_PUTBACK];
43 static const char *bufend;
44 static const char *bufpos;
45 static strset_t    stringset;
46
/**
 * Writes the "<file>:<line>: Error: " prefix of a diagnostic to stderr.
 * No newline is written, the caller appends the actual message.
 *
 * @param input_name   name of the input file the diagnostic refers to
 * @param linenr       line number the diagnostic refers to
 */
static void error_prefix_at(const char *input_name, unsigned linenr)
{
        FILE *const out = stderr;

        fprintf(out, "%s:%u: Error: ", input_name, linenr);
}
57
58 /**
59  * Print an error prefix at the current token coordinates.
60  */
61 static void error_prefix(void)
62 {
63         error_prefix_at(lexer_token.source_position.input_name,
64                         lexer_token.source_position.linenr);
65 }
66
/**
 * Reports a lexer error on stderr: the prefix for the position of the
 * current token, followed by the message and a newline.
 *
 * @param msg   the error message (without trailing newline)
 */
static void parse_error(const char *msg)
{
        FILE *const out = stderr;

        error_prefix();
        fprintf(out, "%s\n", msg);
}
77
78 static inline void next_real_char(void)
79 {
80         assert(bufpos <= bufend);
81         if (bufpos >= bufend) {
82                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
83                                  input);
84                 if(s == 0) {
85                         c = EOF;
86                         return;
87                 }
88                 bufpos = buf + MAX_PUTBACK;
89                 bufend = buf + MAX_PUTBACK + s;
90         }
91         c = *bufpos++;
92 }
93
94 static inline void put_back(int pc)
95 {
96         assert(bufpos > buf);
97         *(--bufpos - buf + buf) = (char) pc;
98
99 #ifdef DEBUG_CHARS
100         printf("putback '%c'\n", pc);
101 #endif
102 }
103
104 static inline void next_char(void);
105
/**
 * Expands to the case labels for '\r' (optionally followed by '\n') and
 * '\n' inside a switch over c. Each label consumes the line ending,
 * increments the line number of the current token and then runs code.
 * code has to leave the switch (break, return, ...), otherwise the '\r'
 * label falls through into the '\n' label.
 */
#define MATCH_NEWLINE(code)                           \
        case '\r':                                    \
                next_char();                          \
                if (c == '\n')                        \
                        next_char();                  \
                ++lexer_token.source_position.linenr; \
                code                                  \
        case '\n':                                    \
                next_char();                          \
                ++lexer_token.source_position.linenr; \
                code
118
/**
 * Asserts that the current character is c_type and advances to the next
 * character. The argument is parenthesized so that any expression can be
 * passed without changing the meaning of the comparison.
 */
#define eat(c_type)  do { assert(c == (c_type)); next_char(); } while(0)
120
121 static void maybe_concat_lines(void)
122 {
123         eat('\\');
124
125         switch(c) {
126         MATCH_NEWLINE(return;)
127
128         default:
129                 break;
130         }
131
132         put_back(c);
133         c = '\\';
134 }
135
136 static inline void next_char(void)
137 {
138         next_real_char();
139
140         /* filter trigraphs */
141         if(UNLIKELY(c == '\\')) {
142                 maybe_concat_lines();
143                 goto end_of_next_char;
144         }
145
146         if(LIKELY(c != '?'))
147                 goto end_of_next_char;
148
149         next_real_char();
150         if(LIKELY(c != '?')) {
151                 put_back(c);
152                 c = '?';
153                 goto end_of_next_char;
154         }
155
156         next_real_char();
157         switch(c) {
158         case '=': c = '#'; break;
159         case '(': c = '['; break;
160         case '/': c = '\\'; maybe_concat_lines(); break;
161         case ')': c = ']'; break;
162         case '\'': c = '^'; break;
163         case '<': c = '{'; break;
164         case '!': c = '|'; break;
165         case '>': c = '}'; break;
166         case '-': c = '~'; break;
167         default:
168                 put_back(c);
169                 put_back('?');
170                 c = '?';
171                 break;
172         }
173
174 end_of_next_char:;
175 #ifdef DEBUG_CHARS
176         printf("nchar '%c'\n", c);
177 #endif
178 }
179
/**
 * Case labels for all characters that may start an identifier:
 * the letters a-z and A-Z and the underscore.
 */
#define SYMBOL_CHARS                                              \
        case 'a': case 'b': case 'c': case 'd': case 'e':         \
        case 'f': case 'g': case 'h': case 'i': case 'j':         \
        case 'k': case 'l': case 'm': case 'n': case 'o':         \
        case 'p': case 'q': case 'r': case 's': case 't':         \
        case 'u': case 'v': case 'w': case 'x': case 'y':         \
        case 'z':                                                 \
        case 'A': case 'B': case 'C': case 'D': case 'E':         \
        case 'F': case 'G': case 'H': case 'I': case 'J':         \
        case 'K': case 'L': case 'M': case 'N': case 'O':         \
        case 'P': case 'Q': case 'R': case 'S': case 'T':         \
        case 'U': case 'V': case 'W': case 'X': case 'Y':         \
        case 'Z':                                                 \
        case '_':
234
/** Case labels for the decimal digits 0-9. */
#define DIGITS                                                    \
        case '0': case '1': case '2': case '3': case '4':         \
        case '5': case '6': case '7': case '8': case '9':
246
247 static void parse_symbol(void)
248 {
249         symbol_t *symbol;
250         char     *string;
251
252         obstack_1grow(&symbol_obstack, (char) c);
253         next_char();
254
255         while(1) {
256                 switch(c) {
257                 DIGITS
258                 SYMBOL_CHARS
259                         obstack_1grow(&symbol_obstack, (char) c);
260                         next_char();
261                         break;
262
263                 default:
264                         goto end_symbol;
265                 }
266         }
267
268 end_symbol:
269         obstack_1grow(&symbol_obstack, '\0');
270
271         string = obstack_finish(&symbol_obstack);
272         symbol = symbol_table_insert(string);
273
274         lexer_token.type     = symbol->ID;
275         lexer_token.v.symbol = symbol;
276
277         if(symbol->string != string) {
278                 obstack_free(&symbol_obstack, string);
279         }
280 }
281
282 static void parse_integer_suffix(bool is_oct_hex)
283 {
284         bool is_unsigned  = false;
285         bool min_long     = false;
286         bool min_longlong = false;
287
288         if(c == 'U' || c == 'u') {
289                 is_unsigned = true;
290                 next_char();
291                 if(c == 'L' || c == 'l') {
292                         min_long = true;
293                         next_char();
294                         if(c == 'L' || c == 'l') {
295                                 min_longlong = true;
296                                 next_char();
297                         }
298                 }
299         } else if(c == 'l' || c == 'L') {
300                 min_long = true;
301                 next_char();
302                 if(c == 'l' || c == 'L') {
303                         min_longlong = true;
304                         next_char();
305                         if(c == 'u' || c == 'U') {
306                                 is_unsigned = true;
307                                 next_char();
308                         }
309                 } else if(c == 'u' || c == 'U') {
310                         is_unsigned = true;
311                         next_char();
312                         lexer_token.datatype = type_unsigned_long;
313                 }
314         }
315
316         if(!is_unsigned) {
317                 long long v = lexer_token.v.intvalue;
318                 if(!min_long) {
319                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
320                                 lexer_token.datatype = type_int;
321                                 return;
322                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
323                                 lexer_token.datatype = type_unsigned_int;
324                                 return;
325                         }
326                 }
327                 if(!min_longlong) {
328                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
329                                 lexer_token.datatype = type_long;
330                                 return;
331                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
332                                 lexer_token.datatype = type_unsigned_long;
333                                 return;
334                         }
335                 }
336                 unsigned long long uv = (unsigned long long) v;
337                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
338                         lexer_token.datatype = type_unsigned_long_long;
339                         return;
340                 }
341
342                 lexer_token.datatype = type_long_long;
343         } else {
344                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
345                 if(!min_long && v <= TARGET_UINT_MAX) {
346                         lexer_token.datatype = type_unsigned_int;
347                         return;
348                 }
349                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
350                         lexer_token.datatype = type_unsigned_long;
351                         return;
352                 }
353                 lexer_token.datatype = type_unsigned_long_long;
354         }
355 }
356
357 static void parse_floating_suffix(void)
358 {
359         switch(c) {
360         /* TODO: do something usefull with the suffixes... */
361         case 'f':
362         case 'F':
363                 next_char();
364                 lexer_token.datatype = type_float;
365                 break;
366         case 'l':
367         case 'L':
368                 next_char();
369                 lexer_token.datatype = type_long_double;
370                 break;
371         default:
372                 lexer_token.datatype = type_double;
373                 break;
374         }
375 }
376
/**
 * Returns the value of the digit character ch ('0'-'9', 'a'-'f', 'A'-'F')
 * or -1 if ch is no such character.
 */
static int int_string_digit_value(int ch)
{
        if (ch >= '0' && ch <= '9')
                return ch - '0';
        if (ch >= 'a' && ch <= 'f')
                return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F')
                return ch - 'A' + 10;
        return -1;
}

/**
 * A replacement for strtoull. Only those parts needed for
 * our parser are implemented.
 *
 * Digits are consumed until a character that is no digit of the base is
 * found, or until appending the next digit would exceed the range of
 * unsigned long long. In the latter case the result is the value of the
 * digits read so far and *endptr points at the first digit that was not
 * consumed, so callers detect overlong literals by *endptr != '\0'.
 *
 * @param s        the digit string (no prefix, no sign)
 * @param endptr   receives a pointer to the first character not consumed
 * @param base     8, 10 or 16
 * @return the parsed value; 0 if base is not supported
 */
static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
        const unsigned long long max = ~0ULL;
        unsigned long long       v   = 0;

        assert(base == 8 || base == 10 || base == 16);
        if (base == 8 || base == 10 || base == 16) {
                const unsigned long long ubase = (unsigned long long) base;

                for (;; ++s) {
                        const int digit = int_string_digit_value((unsigned char) *s);
                        if (digit < 0 || digit >= base)
                                break;
                        /* check for overrun: v * base + digit must fit */
                        if (v > (max - (unsigned long long) digit) / ubase)
                                break;
                        v = v * ubase + (unsigned long long) digit;
                }
        }

        *endptr = s;
        return v;
}
460
461 static void parse_number_hex(void)
462 {
463         assert(c == 'x' || c == 'X');
464         next_char();
465
466         while(isxdigit(c)) {
467                 obstack_1grow(&symbol_obstack, (char) c);
468                 next_char();
469         }
470         obstack_1grow(&symbol_obstack, '\0');
471         char *string = obstack_finish(&symbol_obstack);
472
473         if(c == '.' || c == 'p' || c == 'P') {
474                 next_char();
475                 panic("Hex floating point numbers not implemented yet");
476         }
477         if(*string == '\0') {
478                 parse_error("invalid hex number");
479                 lexer_token.type = T_ERROR;
480         }
481
482         const char *endptr;
483         lexer_token.type       = T_INTEGER;
484         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
485         if(*endptr != '\0') {
486                 parse_error("hex number literal too long");
487         }
488
489         obstack_free(&symbol_obstack, string);
490         parse_integer_suffix(true);
491 }
492
/**
 * Tells whether chr is one of the characters '0' to '7'.
 *
 * @param chr   the character to test
 * @return true if chr is an octal digit
 */
static inline bool is_octal_digit(int chr)
{
        if (chr < '0')
                return false;
        return chr <= '7';
}
497
498 static void parse_number_oct(void)
499 {
500         while(is_octal_digit(c)) {
501                 obstack_1grow(&symbol_obstack, (char) c);
502                 next_char();
503         }
504         obstack_1grow(&symbol_obstack, '\0');
505         char *string = obstack_finish(&symbol_obstack);
506
507         const char *endptr;
508         lexer_token.type       = T_INTEGER;
509         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
510         if(*endptr != '\0') {
511                 parse_error("octal number literal too long");
512         }
513
514         obstack_free(&symbol_obstack, string);
515         parse_integer_suffix(true);
516 }
517
518 static void parse_number_dec(void)
519 {
520         bool is_float = false;
521         while(isdigit(c)) {
522                 obstack_1grow(&symbol_obstack, (char) c);
523                 next_char();
524         }
525
526         if(c == '.') {
527                 obstack_1grow(&symbol_obstack, '.');
528                 next_char();
529
530                 while(isdigit(c)) {
531                         obstack_1grow(&symbol_obstack, (char) c);
532                         next_char();
533                 }
534                 is_float = true;
535         }
536         if(c == 'e' || c == 'E') {
537                 obstack_1grow(&symbol_obstack, 'e');
538                 next_char();
539
540                 if(c == '-' || c == '+') {
541                         obstack_1grow(&symbol_obstack, (char) c);
542                         next_char();
543                 }
544
545                 while(isdigit(c)) {
546                         obstack_1grow(&symbol_obstack, (char) c);
547                         next_char();
548                 }
549                 is_float = true;
550         }
551
552         obstack_1grow(&symbol_obstack, '\0');
553         char *string = obstack_finish(&symbol_obstack);
554
555         if(is_float) {
556                 char *endptr;
557                 lexer_token.type         = T_FLOATINGPOINT;
558                 lexer_token.v.floatvalue = strtold(string, &endptr);
559
560                 if(*endptr != '\0') {
561                         parse_error("invalid number literal");
562                 }
563
564                 parse_floating_suffix();
565         } else {
566                 const char *endptr;
567                 lexer_token.type       = T_INTEGER;
568                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
569
570                 if(*endptr != '\0') {
571                         parse_error("invalid number literal");
572                 }
573
574                 parse_integer_suffix(false);
575         }
576         obstack_free(&symbol_obstack, string);
577 }
578
579 static void parse_number(void)
580 {
581         if (c == '0') {
582                 next_char();
583                 switch (c) {
584                         case 'X':
585                         case 'x':
586                                 parse_number_hex();
587                                 break;
588                         case '0':
589                         case '1':
590                         case '2':
591                         case '3':
592                         case '4':
593                         case '5':
594                         case '6':
595                         case '7':
596                                 parse_number_oct();
597                                 break;
598                         case '8':
599                         case '9':
600                                 next_char();
601                                 parse_error("invalid octal number");
602                                 lexer_token.type = T_ERROR;
603                                 return;
604                         case '.':
605                         case 'e':
606                         case 'E':
607                         default:
608                                 obstack_1grow(&symbol_obstack, '0');
609                                 parse_number_dec();
610                                 return;
611                 }
612         } else {
613                 parse_number_dec();
614         }
615 }
616
617 static int parse_octal_sequence(const int first_digit)
618 {
619         assert(is_octal_digit(first_digit));
620         int value = first_digit - '0';
621         if (!is_octal_digit(c)) return value;
622         value = 8 * value + c - '0';
623         next_char();
624         if (!is_octal_digit(c)) return value;
625         value = 8 * value + c - '0';
626         next_char();
627         return (char_type)value;
628 }
629
630 static int parse_hex_sequence(void)
631 {
632         int value = 0;
633         while(1) {
634                 if (c >= '0' && c <= '9') {
635                         value = 16 * value + c - '0';
636                 } else if ('A' <= c && c <= 'F') {
637                         value = 16 * value + c - 'A' + 10;
638                 } else if ('a' <= c && c <= 'f') {
639                         value = 16 * value + c - 'a' + 10;
640                 } else {
641                         break;
642                 }
643                 next_char();
644         }
645
646         return (char_type)value;
647 }
648
649 static int parse_escape_sequence(void)
650 {
651         eat('\\');
652
653         int ec = c;
654         next_char();
655
656         switch(ec) {
657         case '"':  return '"';
658         case '\'': return '\'';
659         case '\\': return '\\';
660         case '?': return '\?';
661         case 'a': return '\a';
662         case 'b': return '\b';
663         case 'f': return '\f';
664         case 'n': return '\n';
665         case 'r': return '\r';
666         case 't': return '\t';
667         case 'v': return '\v';
668         case 'x':
669                 return parse_hex_sequence();
670         case '0':
671         case '1':
672         case '2':
673         case '3':
674         case '4':
675         case '5':
676         case '6':
677         case '7':
678                 return parse_octal_sequence(ec);
679         case EOF:
680                 parse_error("reached end of file while parsing escape sequence");
681                 return EOF;
682         default:
683                 parse_error("unknown escape sequence");
684                 return EOF;
685         }
686 }
687
688 string_t concat_strings(const string_t *const s1, const string_t *const s2)
689 {
690         const size_t len1 = s1->size - 1;
691         const size_t len2 = s2->size - 1;
692
693         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
694         memcpy(concat, s1->begin, len1);
695         memcpy(concat + len1, s2->begin, len2 + 1);
696
697 #if 0 /* TODO hash */
698         const char *result = strset_insert(&stringset, concat);
699         if(result != concat) {
700                 obstack_free(&symbol_obstack, concat);
701         }
702
703         return result;
704 #else
705         return (string_t){ concat, len1 + len2 + 1 };
706 #endif
707 }
708
709 static void parse_string_literal(void)
710 {
711         const unsigned start_linenr = lexer_token.source_position.linenr;
712
713         assert(c == '"');
714         next_char();
715
716         int tc;
717         while(1) {
718                 switch(c) {
719                 case '\\':
720                         tc = parse_escape_sequence();
721                         obstack_1grow(&symbol_obstack, (char) tc);
722                         break;
723
724                 case EOF:
725                         error_prefix_at(lexer_token.source_position.input_name,
726                                         start_linenr);
727                         fprintf(stderr, "string has no end\n");
728                         lexer_token.type = T_ERROR;
729                         return;
730
731                 case '"':
732                         next_char();
733                         goto end_of_string;
734
735                 default:
736                         obstack_1grow(&symbol_obstack, (char) c);
737                         next_char();
738                         break;
739                 }
740         }
741
742 end_of_string:
743
744         /* TODO: concatenate multiple strings separated by whitespace... */
745
746         /* add finishing 0 to the string */
747         obstack_1grow(&symbol_obstack, '\0');
748         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
749         const char *const string = obstack_finish(&symbol_obstack);
750
751 #if 0 /* TODO hash */
752         /* check if there is already a copy of the string */
753         result = strset_insert(&stringset, string);
754         if(result != string) {
755                 obstack_free(&symbol_obstack, string);
756         }
757 #else
758         const char *const result = string;
759 #endif
760
761         lexer_token.type           = T_STRING_LITERAL;
762         lexer_token.v.string.begin = result;
763         lexer_token.v.string.size  = size;
764 }
765
766 static void parse_wide_character_constant(void)
767 {
768         eat('\'');
769
770         int found_char = 0;
771         while(1) {
772                 switch(c) {
773                 case '\\':
774                         found_char = parse_escape_sequence();
775                         break;
776
777                 MATCH_NEWLINE(
778                         parse_error("newline while parsing character constant");
779                         break;
780                 )
781
782                 case '\'':
783                         next_char();
784                         goto end_of_wide_char_constant;
785
786                 case EOF:
787                         parse_error("EOF while parsing character constant");
788                         lexer_token.type = T_ERROR;
789                         return;
790
791                 default:
792                         if(found_char != 0) {
793                                 parse_error("more than 1 characters in character "
794                                             "constant");
795                                 goto end_of_wide_char_constant;
796                         } else {
797                                 found_char = c;
798                                 next_char();
799                         }
800                         break;
801                 }
802         }
803
804 end_of_wide_char_constant:
805         lexer_token.type       = T_INTEGER;
806         lexer_token.v.intvalue = found_char;
807         lexer_token.datatype   = type_wchar_t;
808 }
809
810 static void parse_wide_string_literal(void)
811 {
812         const unsigned start_linenr = lexer_token.source_position.linenr;
813
814         assert(c == '"');
815         next_char();
816
817         while(1) {
818                 switch(c) {
819                         case '\\': {
820                                 wchar_rep_t tc = parse_escape_sequence();
821                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
822                                 break;
823                         }
824
825                         case EOF:
826                                 error_prefix_at(lexer_token.source_position.input_name,
827                                                 start_linenr);
828                                 fprintf(stderr, "string has no end\n");
829                                 lexer_token.type = T_ERROR;
830                                 return;
831
832                         case '"':
833                                 next_char();
834                                 goto end_of_string;
835
836                         default: {
837                                 wchar_rep_t tc = c;
838                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
839                                 next_char();
840                                 break;
841                         }
842                 }
843         }
844
845 end_of_string:;
846
847         /* TODO: concatenate multiple strings separated by whitespace... */
848
849         /* add finishing 0 to the string */
850         wchar_rep_t nul = L'\0';
851         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
852         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
853         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
854
855 #if 0 /* TODO hash */
856         /* check if there is already a copy of the string */
857         const wchar_rep_t *const result = strset_insert(&stringset, string);
858         if(result != string) {
859                 obstack_free(&symbol_obstack, string);
860         }
861 #else
862         const wchar_rep_t *const result = string;
863 #endif
864
865         lexer_token.type                = T_WIDE_STRING_LITERAL;
866         lexer_token.v.wide_string.begin = result;
867         lexer_token.v.wide_string.size  = size;
868 }
869
870 static void parse_character_constant(void)
871 {
872         eat('\'');
873
874         int found_char = 0;
875         while(1) {
876                 switch(c) {
877                 case '\\':
878                         found_char = parse_escape_sequence();
879                         break;
880
881                 MATCH_NEWLINE(
882                         parse_error("newline while parsing character constant");
883                         break;
884                 )
885
886                 case '\'':
887                         next_char();
888                         goto end_of_char_constant;
889
890                 case EOF:
891                         parse_error("EOF while parsing character constant");
892                         lexer_token.type = T_ERROR;
893                         return;
894
895                 default:
896                         if(found_char != 0) {
897                                 parse_error("more than 1 characters in character "
898                                             "constant");
899                                 goto end_of_char_constant;
900                         } else {
901                                 found_char = c;
902                                 next_char();
903                         }
904                         break;
905                 }
906         }
907
908 end_of_char_constant:
909         lexer_token.type       = T_INTEGER;
910         lexer_token.v.intvalue = found_char;
911         lexer_token.datatype   = type_int;
912 }
913
914 static void skip_multiline_comment(void)
915 {
916         unsigned start_linenr = lexer_token.source_position.linenr;
917
918         while(1) {
919                 switch(c) {
920                 case '*':
921                         next_char();
922                         if(c == '/') {
923                                 next_char();
924                                 return;
925                         }
926                         break;
927
928                 MATCH_NEWLINE(break;)
929
930                 case EOF:
931                         error_prefix_at(lexer_token.source_position.input_name,
932                                         start_linenr);
933                         fprintf(stderr, "at end of file while looking for comment end\n");
934                         return;
935
936                 default:
937                         next_char();
938                         break;
939                 }
940         }
941 }
942
943 static void skip_line_comment(void)
944 {
945         while(1) {
946                 switch(c) {
947                 case EOF:
948                         return;
949
950                 case '\n':
951                 case '\r':
952                         return;
953
954                 default:
955                         next_char();
956                         break;
957                 }
958         }
959 }
960
961 static token_t pp_token;
962
963 static inline void next_pp_token(void)
964 {
965         lexer_next_preprocessing_token();
966         pp_token = lexer_token;
967 }
968
969 static void eat_until_newline(void)
970 {
971         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
972                 next_pp_token();
973         }
974 }
975
/**
 * Handles an #error directive.
 *
 * Prints the diagnostic at the current token position and then discards the
 * remaining preprocessing tokens of the directive line, so that they are not
 * handed to the parser as ordinary tokens.
 */
static void error_directive(void)
{
        error_prefix();
        fprintf(stderr, "#error directive: \n");

        /* skip the pp-tokens that follow on the directive line */
        eat_until_newline();
}
983
984 static void define_directive(void)
985 {
986         lexer_next_preprocessing_token();
987         if(lexer_token.type != T_IDENTIFIER) {
988                 parse_error("expected identifier after #define\n");
989                 eat_until_newline();
990         }
991 }
992
/**
 * Handles an #ifdef / #ifndef directive (not implemented yet).
 *
 * Currently only reads the token following the directive name.
 *
 * @param is_ifndef   non-zero for #ifndef, zero for #ifdef (unused so far)
 */
static void ifdef_directive(int is_ifndef)
{
        lexer_next_preprocessing_token();
        /* TODO: expect an identifier followed by a newline */
        (void) is_ifndef;
}
1000
/**
 * Handles an #endif directive (not implemented yet, does nothing).
 */
static void endif_directive(void)
{
        /* TODO: expect a newline */
}
1005
1006 static void parse_line_directive(void)
1007 {
1008         if(pp_token.type != T_INTEGER) {
1009                 parse_error("expected integer");
1010         } else {
1011                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1012                 next_pp_token();
1013         }
1014         if(pp_token.type == T_STRING_LITERAL) {
1015                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1016                 next_pp_token();
1017         }
1018
1019         eat_until_newline();
1020 }
1021
1022 static void parse_preprocessor_identifier(void)
1023 {
1024         assert(pp_token.type == T_IDENTIFIER);
1025         symbol_t *symbol = pp_token.v.symbol;
1026
1027         switch(symbol->pp_ID) {
1028         case TP_include:
1029                 printf("include - enable header name parsing!\n");
1030                 break;
1031         case TP_define:
1032                 define_directive();
1033                 break;
1034         case TP_ifdef:
1035                 ifdef_directive(0);
1036                 break;
1037         case TP_ifndef:
1038                 ifdef_directive(1);
1039                 break;
1040         case TP_endif:
1041                 endif_directive();
1042                 break;
1043         case TP_line:
1044                 next_pp_token();
1045                 parse_line_directive();
1046                 break;
1047         case TP_if:
1048         case TP_else:
1049         case TP_elif:
1050         case TP_undef:
1051         case TP_error:
1052                 error_directive();
1053                 break;
1054         case TP_pragma:
1055                 if (warning.unknown_pragmas) {
1056                         warningf(lexer_token.source_position, "encountered unknown #pragma");
1057                 }
1058                 eat_until_newline();
1059                 break;
1060         }
1061 }
1062
1063 static void parse_preprocessor_directive(void)
1064 {
1065         next_pp_token();
1066
1067         switch(pp_token.type) {
1068         case T_IDENTIFIER:
1069                 parse_preprocessor_identifier();
1070                 break;
1071         case T_INTEGER:
1072                 parse_line_directive();
1073                 break;
1074         default:
1075                 parse_error("invalid preprocessor directive");
1076                 eat_until_newline();
1077                 break;
1078         }
1079 }
1080
1081 #define MAYBE_PROLOG                                       \
1082                         next_char();                                   \
1083                         while(1) {                                     \
1084                                 switch(c) {
1085
1086 #define MAYBE(ch, set_type)                                \
1087                                 case ch:                                   \
1088                                         next_char();                           \
1089                                         lexer_token.type = set_type;           \
1090                                         return;
1091
1092 #define ELSE_CODE(code)                                    \
1093                                 default:                                   \
1094                                         code;                                  \
1095                                 }                                          \
1096                         } /* end of while(1) */                        \
1097                         break;
1098
1099 #define ELSE(set_type)                                     \
1100                 ELSE_CODE(                                         \
1101                         lexer_token.type = set_type;                   \
1102                         return;                                        \
1103                 )
1104
1105 void lexer_next_preprocessing_token(void)
1106 {
1107         while(1) {
1108                 switch(c) {
1109                 case ' ':
1110                 case '\t':
1111                         next_char();
1112                         break;
1113
1114                 MATCH_NEWLINE(
1115                         lexer_token.type = '\n';
1116                         return;
1117                 )
1118
1119                 SYMBOL_CHARS
1120                         parse_symbol();
1121                         /* might be a wide string ( L"string" ) */
1122                         if(lexer_token.type == T_IDENTIFIER &&
1123                             lexer_token.v.symbol == symbol_L) {
1124                             if(c == '"') {
1125                                         parse_wide_string_literal();
1126                                 } else if(c == '\'') {
1127                                         parse_wide_character_constant();
1128                                 }
1129                         }
1130                         return;
1131
1132                 DIGITS
1133                         parse_number();
1134                         return;
1135
1136                 case '"':
1137                         parse_string_literal();
1138                         return;
1139
1140                 case '\'':
1141                         parse_character_constant();
1142                         return;
1143
1144                 case '.':
1145                         MAYBE_PROLOG
1146                                 case '0':
1147                                 case '1':
1148                                 case '2':
1149                                 case '3':
1150                                 case '4':
1151                                 case '5':
1152                                 case '6':
1153                                 case '7':
1154                                 case '8':
1155                                 case '9':
1156                                         put_back(c);
1157                                         c = '.';
1158                                         parse_number_dec();
1159                                         return;
1160
1161                                 case '.':
1162                                         MAYBE_PROLOG
1163                                         MAYBE('.', T_DOTDOTDOT)
1164                                         ELSE_CODE(
1165                                                 put_back(c);
1166                                                 c = '.';
1167                                                 lexer_token.type = '.';
1168                                                 return;
1169                                         )
1170                         ELSE('.')
1171                 case '&':
1172                         MAYBE_PROLOG
1173                         MAYBE('&', T_ANDAND)
1174                         MAYBE('=', T_ANDEQUAL)
1175                         ELSE('&')
1176                 case '*':
1177                         MAYBE_PROLOG
1178                         MAYBE('=', T_ASTERISKEQUAL)
1179                         ELSE('*')
1180                 case '+':
1181                         MAYBE_PROLOG
1182                         MAYBE('+', T_PLUSPLUS)
1183                         MAYBE('=', T_PLUSEQUAL)
1184                         ELSE('+')
1185                 case '-':
1186                         MAYBE_PROLOG
1187                         MAYBE('>', T_MINUSGREATER)
1188                         MAYBE('-', T_MINUSMINUS)
1189                         MAYBE('=', T_MINUSEQUAL)
1190                         ELSE('-')
1191                 case '!':
1192                         MAYBE_PROLOG
1193                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1194                         ELSE('!')
1195                 case '/':
1196                         MAYBE_PROLOG
1197                         MAYBE('=', T_SLASHEQUAL)
1198                                 case '*':
1199                                         next_char();
1200                                         skip_multiline_comment();
1201                                         lexer_next_preprocessing_token();
1202                                         return;
1203                                 case '/':
1204                                         next_char();
1205                                         skip_line_comment();
1206                                         lexer_next_preprocessing_token();
1207                                         return;
1208                         ELSE('/')
1209                 case '%':
1210                         MAYBE_PROLOG
1211                         MAYBE('>', T_PERCENTGREATER)
1212                         MAYBE('=', T_PERCENTEQUAL)
1213                                 case ':':
1214                                         MAYBE_PROLOG
1215                                                 case '%':
1216                                                         MAYBE_PROLOG
1217                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1218                                                         ELSE_CODE(
1219                                                                 put_back(c);
1220                                                                 c = '%';
1221                                                                 lexer_token.type = T_PERCENTCOLON;
1222                                                                 return;
1223                                                         )
1224                                         ELSE(T_PERCENTCOLON)
1225                         ELSE('%')
1226                 case '<':
1227                         MAYBE_PROLOG
1228                         MAYBE(':', T_LESSCOLON)
1229                         MAYBE('%', T_LESSPERCENT)
1230                         MAYBE('=', T_LESSEQUAL)
1231                                 case '<':
1232                                         MAYBE_PROLOG
1233                                         MAYBE('=', T_LESSLESSEQUAL)
1234                                         ELSE(T_LESSLESS)
1235                         ELSE('<')
1236                 case '>':
1237                         MAYBE_PROLOG
1238                         MAYBE('=', T_GREATEREQUAL)
1239                                 case '>':
1240                                         MAYBE_PROLOG
1241                                         MAYBE('=', T_GREATERGREATEREQUAL)
1242                                         ELSE(T_GREATERGREATER)
1243                         ELSE('>')
1244                 case '^':
1245                         MAYBE_PROLOG
1246                         MAYBE('=', T_CARETEQUAL)
1247                         ELSE('^')
1248                 case '|':
1249                         MAYBE_PROLOG
1250                         MAYBE('=', T_PIPEEQUAL)
1251                         MAYBE('|', T_PIPEPIPE)
1252                         ELSE('|')
1253                 case ':':
1254                         MAYBE_PROLOG
1255                         MAYBE('>', T_COLONGREATER)
1256                         ELSE(':')
1257                 case '=':
1258                         MAYBE_PROLOG
1259                         MAYBE('=', T_EQUALEQUAL)
1260                         ELSE('=')
1261                 case '#':
1262                         MAYBE_PROLOG
1263                         MAYBE('#', T_HASHHASH)
1264                         ELSE('#')
1265
1266                 case '?':
1267                 case '[':
1268                 case ']':
1269                 case '(':
1270                 case ')':
1271                 case '{':
1272                 case '}':
1273                 case '~':
1274                 case ';':
1275                 case ',':
1276                 case '\\':
1277                         lexer_token.type = c;
1278                         next_char();
1279                         return;
1280
1281                 case EOF:
1282                         lexer_token.type = T_EOF;
1283                         return;
1284
1285                 default:
1286                         next_char();
1287                         error_prefix();
1288                         fprintf(stderr, "unknown character '%c' found\n", c);
1289                         lexer_token.type = T_ERROR;
1290                         return;
1291                 }
1292         }
1293 }
1294
1295 void lexer_next_token(void)
1296 {
1297         lexer_next_preprocessing_token();
1298         if(lexer_token.type != '\n')
1299                 return;
1300
1301 newline_found:
1302         do {
1303                 lexer_next_preprocessing_token();
1304         } while(lexer_token.type == '\n');
1305
1306         if(lexer_token.type == '#') {
1307                 parse_preprocessor_directive();
1308                 goto newline_found;
1309         }
1310 }
1311
1312 void init_lexer(void)
1313 {
1314         strset_init(&stringset);
1315 }
1316
1317 void lexer_open_stream(FILE *stream, const char *input_name)
1318 {
1319         input                                  = stream;
1320         lexer_token.source_position.linenr     = 0;
1321         lexer_token.source_position.input_name = input_name;
1322
1323         symbol_L = symbol_table_insert("L");
1324         bufpos = NULL;
1325         bufend = NULL;
1326
1327         /* place a virtual \n at the beginning so the lexer knows that we're
1328          * at the beginning of a line */
1329         c = '\n';
1330 }
1331
1332 void exit_lexer(void)
1333 {
1334         strset_destroy(&stringset);
1335 }
1336
1337 static __attribute__((unused))
1338 void dbg_pos(const source_position_t source_position)
1339 {
1340         fprintf(stdout, "%s:%u\n", source_position.input_name,
1341                 source_position.linenr);
1342         fflush(stdout);
1343 }