add beginnings of preprocessor code (and ugly hack to activate it) to svn
[cparser] / preprocessor.c
1 #include <config.h>
2
3 #include "token_t.h"
4 #include "symbol_t.h"
5 #include "adt/util.h"
6 #include "adt/error.h"
7 #include "lang_features.h"
8 #include "diagnostic.h"
9 #include "string_rep.h"
10
11 #include <assert.h>
12 #include <errno.h>
13 #include <string.h>
14 #include <stdbool.h>
15 #include <ctype.h>
16
17 //#define DEBUG_CHARS
18 #define MAX_PUTBACK 3
19
20 struct pp_definition_t {
21         symbol_t          *symbol;
22         source_position_t  source_position;
23         pp_definition_t   *parent_expansion;
24         size_t             expand_pos;
25         bool               is_variadic  : 1;
26         bool               is_expanding : 1;
27         size_t             argument_count;
28         token_t           *arguments;
29         size_t             list_len;
30         token_t           *replacement_list;
31 };
32
33 static int                c;
34 token_t                   pp_token;
35 static FILE              *input;
36 static char               buf[1024 + MAX_PUTBACK];
37 static const char        *bufend;
38 static const char        *bufpos;
39 static bool               resolve_escape_sequences = false;
40 static bool               print_spaces             = true;
41 static FILE              *out;
42 static struct obstack     pp_obstack;
43 static unsigned           counted_newlines;
44 static unsigned           counted_spaces;
45 static source_position_t  input_position;
46 static const char        *printed_input_name = NULL;
47 static pp_definition_t   *current_expansion  = NULL;
48 static bool               do_expansions;
49
50 static void next_preprocessing_token(void);
51
52 /**
53  * Prints a parse error message at the current token.
54  *
55  * @param msg   the error message
56  */
57 static void parse_error(const char *msg)
58 {
59         errorf(&pp_token.source_position,  "%s", msg);
60 }
61
62 static inline void next_real_char(void)
63 {
64         assert(bufpos <= bufend);
65         if (bufpos >= bufend) {
66                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
67                                  input);
68                 if(s == 0) {
69                         c = EOF;
70                         return;
71                 }
72                 bufpos = buf + MAX_PUTBACK;
73                 bufend = buf + MAX_PUTBACK + s;
74         }
75         c = *bufpos++;
76 }
77
78 /**
79  * Put a character back into the buffer.
80  *
81  * @param pc  the character to put back
82  */
83 static inline void put_back(int pc)
84 {
85         assert(bufpos > buf);
86         *(--bufpos - buf + buf) = (char) pc;
87
88 #ifdef DEBUG_CHARS
89         printf("putback '%c'\n", pc);
90 #endif
91 }
92
93 static inline void next_char(void);
94
95 #define MATCH_NEWLINE(code)                   \
96         case '\r':                                \
97                 next_char();                          \
98                 if(c == '\n') {                       \
99                         next_char();                      \
100                 }                                     \
101                 ++input_position.linenr;              \
102                 code                                  \
103         case '\n':                                \
104                 next_char();                          \
105                 ++input_position.linenr;              \
106                 code
107
108 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
109
110 static void maybe_concat_lines(void)
111 {
112         eat('\\');
113
114         switch(c) {
115         MATCH_NEWLINE(return;)
116
117         default:
118                 break;
119         }
120
121         put_back(c);
122         c = '\\';
123 }
124
125 /**
126  * Set c to the next input character, ie.
127  * after expanding trigraphs.
128  */
129 static inline void next_char(void)
130 {
131         next_real_char();
132
133         /* filter trigraphs and concatenated lines */
134         if(UNLIKELY(c == '\\')) {
135                 maybe_concat_lines();
136                 goto end_of_next_char;
137         }
138
139         if(LIKELY(c != '?'))
140                 goto end_of_next_char;
141
142         next_real_char();
143         if(LIKELY(c != '?')) {
144                 put_back(c);
145                 c = '?';
146                 goto end_of_next_char;
147         }
148
149         next_real_char();
150         switch(c) {
151         case '=': c = '#'; break;
152         case '(': c = '['; break;
153         case '/': c = '\\'; maybe_concat_lines(); break;
154         case ')': c = ']'; break;
155         case '\'': c = '^'; break;
156         case '<': c = '{'; break;
157         case '!': c = '|'; break;
158         case '>': c = '}'; break;
159         case '-': c = '~'; break;
160         default:
161                 put_back(c);
162                 put_back('?');
163                 c = '?';
164                 break;
165         }
166
167 end_of_next_char:;
168 #ifdef DEBUG_CHARS
169         printf("nchar '%c'\n", c);
170 #endif
171 }
172
173
174
175 /**
176  * Returns true if the given char is a octal digit.
177  *
178  * @param char  the character to check
179  */
180 static inline bool is_octal_digit(int chr)
181 {
182         switch(chr) {
183         case '0':
184         case '1':
185         case '2':
186         case '3':
187         case '4':
188         case '5':
189         case '6':
190         case '7':
191                 return true;
192         default:
193                 return false;
194         }
195 }
196
197 /**
198  * Returns the value of a digit.
199  * The only portable way to do it ...
200  */
201 static int digit_value(int digit) {
202         switch (digit) {
203         case '0': return 0;
204         case '1': return 1;
205         case '2': return 2;
206         case '3': return 3;
207         case '4': return 4;
208         case '5': return 5;
209         case '6': return 6;
210         case '7': return 7;
211         case '8': return 8;
212         case '9': return 9;
213         case 'a':
214         case 'A': return 10;
215         case 'b':
216         case 'B': return 11;
217         case 'c':
218         case 'C': return 12;
219         case 'd':
220         case 'D': return 13;
221         case 'e':
222         case 'E': return 14;
223         case 'f':
224         case 'F': return 15;
225         default:
226                 panic("wrong character given");
227         }
228 }
229
230 /**
231  * Parses an octal character sequence.
232  *
233  * @param first_digit  the already read first digit
234  */
235 static int parse_octal_sequence(const int first_digit)
236 {
237         assert(is_octal_digit(first_digit));
238         int value = digit_value(first_digit);
239         if (!is_octal_digit(c)) return value;
240         value = 8 * value + digit_value(c);
241         next_char();
242         if (!is_octal_digit(c)) return value;
243         value = 8 * value + digit_value(c);
244         next_char();
245
246         if(char_is_signed) {
247                 return (signed char) value;
248         } else {
249                 return (unsigned char) value;
250         }
251 }
252
253 /**
254  * Parses a hex character sequence.
255  */
256 static int parse_hex_sequence(void)
257 {
258         int value = 0;
259         while(isxdigit(c)) {
260                 value = 16 * value + digit_value(c);
261                 next_char();
262         }
263
264         if(char_is_signed) {
265                 return (signed char) value;
266         } else {
267                 return (unsigned char) value;
268         }
269 }
270
271 /**
272  * Parse an escape sequence.
273  */
274 static int parse_escape_sequence(void)
275 {
276         eat('\\');
277
278         int ec = c;
279         next_char();
280
281         switch(ec) {
282         case '"':  return '"';
283         case '\'': return '\'';
284         case '\\': return '\\';
285         case '?': return '\?';
286         case 'a': return '\a';
287         case 'b': return '\b';
288         case 'f': return '\f';
289         case 'n': return '\n';
290         case 'r': return '\r';
291         case 't': return '\t';
292         case 'v': return '\v';
293         case 'x':
294                 return parse_hex_sequence();
295         case '0':
296         case '1':
297         case '2':
298         case '3':
299         case '4':
300         case '5':
301         case '6':
302         case '7':
303                 return parse_octal_sequence(ec);
304         case EOF:
305                 parse_error("reached end of file while parsing escape sequence");
306                 return EOF;
307         default:
308                 parse_error("unknown escape sequence");
309                 return EOF;
310         }
311 }
312
313 static void parse_string_literal(void)
314 {
315         const unsigned start_linenr = input_position.linenr;
316
317         eat('"');
318
319         int tc;
320         while(1) {
321                 switch(c) {
322                 case '\\':
323                         if(resolve_escape_sequences) {
324                                 tc = parse_escape_sequence();
325                                 obstack_1grow(&symbol_obstack, (char) tc);
326                         } else {
327                                 obstack_1grow(&symbol_obstack, (char) c);
328                                 next_char();
329                                 obstack_1grow(&symbol_obstack, (char) c);
330                                 next_char();
331                         }
332                         break;
333
334                 case EOF: {
335                         source_position_t source_position;
336                         source_position.input_name = pp_token.source_position.input_name;
337                         source_position.linenr     = start_linenr;
338                         errorf(&source_position, "string has no end");
339                         pp_token.type = TP_ERROR;
340                         return;
341                 }
342
343                 case '"':
344                         next_char();
345                         goto end_of_string;
346
347                 default:
348                         obstack_1grow(&symbol_obstack, (char) c);
349                         next_char();
350                         break;
351                 }
352         }
353
354 end_of_string:
355         /* add finishing 0 to the string */
356         obstack_1grow(&symbol_obstack, '\0');
357         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
358         const char *const string = obstack_finish(&symbol_obstack);
359
360 #if 0 /* TODO hash */
361         /* check if there is already a copy of the string */
362         result = strset_insert(&stringset, string);
363         if(result != string) {
364                 obstack_free(&symbol_obstack, string);
365         }
366 #else
367         const char *const result = string;
368 #endif
369
370         pp_token.type           = TP_STRING_LITERAL;
371         pp_token.v.string.begin = result;
372         pp_token.v.string.size  = size;
373 }
374
375 static void parse_wide_character_constant(void)
376 {
377         eat('\'');
378
379         int found_char = 0;
380         while(1) {
381                 switch(c) {
382                 case '\\':
383                         found_char = parse_escape_sequence();
384                         break;
385
386                 MATCH_NEWLINE(
387                         parse_error("newline while parsing character constant");
388                         break;
389                 )
390
391                 case '\'':
392                         next_char();
393                         goto end_of_wide_char_constant;
394
395                 case EOF:
396                         parse_error("EOF while parsing character constant");
397                         pp_token.type = TP_ERROR;
398                         return;
399
400                 default:
401                         if(found_char != 0) {
402                                 parse_error("more than 1 characters in character "
403                                             "constant");
404                                 goto end_of_wide_char_constant;
405                         } else {
406                                 found_char = c;
407                                 next_char();
408                         }
409                         break;
410                 }
411         }
412
413 end_of_wide_char_constant:
414         pp_token.type       = TP_WIDE_CHARACTER_CONSTANT;
415         /* TODO... */
416 }
417
418 static void parse_wide_string_literal(void)
419 {
420         const unsigned start_linenr = input_position.linenr;
421
422         assert(c == '"');
423         next_char();
424
425         while(1) {
426                 switch(c) {
427                 case '\\': {
428                         wchar_rep_t tc = parse_escape_sequence();
429                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
430                         break;
431                 }
432
433                 case EOF: {
434                         source_position_t source_position;
435                         source_position.input_name = pp_token.source_position.input_name;
436                         source_position.linenr     = start_linenr;
437                         errorf(&source_position, "string has no end");
438                         pp_token.type = TP_ERROR;
439                         return;
440                 }
441
442                 case '"':
443                         next_char();
444                         goto end_of_string;
445
446                 default: {
447                         wchar_rep_t tc = c;
448                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
449                         next_char();
450                         break;
451                 }
452                 }
453         }
454
455 end_of_string:;
456         /* add finishing 0 to the string */
457         static const wchar_rep_t nul = L'\0';
458         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
459
460         const size_t size
461                 = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
462         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
463
464 #if 0 /* TODO hash */
465         /* check if there is already a copy of the string */
466         const wchar_rep_t *const result = strset_insert(&stringset, string);
467         if(result != string) {
468                 obstack_free(&symbol_obstack, string);
469         }
470 #else
471         const wchar_rep_t *const result = string;
472 #endif
473
474         pp_token.type                = TP_WIDE_STRING_LITERAL;
475         pp_token.v.wide_string.begin = result;
476         pp_token.v.wide_string.size  = size;
477 }
478
479 static void parse_character_constant(void)
480 {
481         const unsigned start_linenr = input_position.linenr;
482
483         eat('\'');
484
485         int tc;
486         while(1) {
487                 switch(c) {
488                 case '\\':
489                         tc = parse_escape_sequence();
490                         obstack_1grow(&symbol_obstack, (char) tc);
491                         break;
492
493                 MATCH_NEWLINE(
494                         parse_error("newline while parsing character constant");
495                         break;
496                 )
497
498                 case EOF: {
499                         source_position_t source_position;
500                         source_position.input_name = pp_token.source_position.input_name;
501                         source_position.linenr     = start_linenr;
502                         errorf(&source_position, "EOF while parsing character constant");
503                         pp_token.type = TP_ERROR;
504                         return;
505                 }
506
507                 case '\'':
508                         next_char();
509                         goto end_of_char_constant;
510
511                 default:
512                         obstack_1grow(&symbol_obstack, (char) c);
513                         next_char();
514                         break;
515
516                 }
517         }
518
519 end_of_char_constant:;
520         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
521         const char *const string = obstack_finish(&symbol_obstack);
522
523         pp_token.type           = TP_CHARACTER_CONSTANT;
524         pp_token.v.string.begin = string;
525         pp_token.v.string.size  = size;
526 }
527
528 #define SYMBOL_CHARS_WITHOUT_E_P \
529         case 'a': \
530         case 'b': \
531         case 'c': \
532         case 'd': \
533         case 'f': \
534         case 'g': \
535         case 'h': \
536         case 'i': \
537         case 'j': \
538         case 'k': \
539         case 'l': \
540         case 'm': \
541         case 'n': \
542         case 'o': \
543         case 'q': \
544         case 'r': \
545         case 's': \
546         case 't': \
547         case 'u': \
548         case 'v': \
549         case 'w': \
550         case 'x': \
551         case 'y': \
552         case 'z': \
553         case 'A': \
554         case 'B': \
555         case 'C': \
556         case 'D': \
557         case 'F': \
558         case 'G': \
559         case 'H': \
560         case 'I': \
561         case 'J': \
562         case 'K': \
563         case 'L': \
564         case 'M': \
565         case 'N': \
566         case 'O': \
567         case 'Q': \
568         case 'R': \
569         case 'S': \
570         case 'T': \
571         case 'U': \
572         case 'V': \
573         case 'W': \
574         case 'X': \
575         case 'Y': \
576         case 'Z': \
577         case '_':
578
579 #define SYMBOL_CHARS \
580         SYMBOL_CHARS_WITHOUT_E_P \
581         case 'e': \
582         case 'p': \
583         case 'E': \
584         case 'P':
585
586 #define DIGITS \
587         case '0':  \
588         case '1':  \
589         case '2':  \
590         case '3':  \
591         case '4':  \
592         case '5':  \
593         case '6':  \
594         case '7':  \
595         case '8':  \
596         case '9':
597
598 /**
599  * returns next final token from a preprocessor macro expansion
600  */
601 static void expand_next(void)
602 {
603         assert(current_expansion != NULL);
604
605         pp_definition_t *definition = current_expansion;
606
607 restart:
608         if(definition->list_len == 0
609                         || definition->expand_pos >= definition->list_len) {
610                 /* we're finished with the current macro, move up 1 level in the
611                  * expansion stack */
612                 pp_definition_t *parent = definition->parent_expansion;
613                 definition->parent_expansion = NULL;
614                 definition->is_expanding     = false;
615                 if(parent == NULL) {
616                         current_expansion = NULL;
617                         next_preprocessing_token();
618                         return;
619                 }
620                 definition        = parent;
621                 current_expansion = definition;
622                 goto restart;
623         }
624         pp_token = definition->replacement_list[definition->expand_pos];
625         ++definition->expand_pos;
626
627         if(pp_token.type != TP_IDENTIFIER)
628                 return;
629
630         pp_definition_t *symbol_definition = pp_token.v.symbol->pp_definition;
631         if(symbol_definition != NULL && !symbol_definition->is_expanding) {
632                 symbol_definition->parent_expansion = definition;
633                 symbol_definition->expand_pos       = 0;
634                 symbol_definition->is_expanding     = true;
635                 definition                          = symbol_definition;
636                 current_expansion                   = definition;
637                 goto restart;
638         }
639 }
640
641 static void parse_symbol(void)
642 {
643         obstack_1grow(&symbol_obstack, (char) c);
644         next_char();
645
646         while(1) {
647                 switch(c) {
648                 DIGITS
649                 SYMBOL_CHARS
650                         obstack_1grow(&symbol_obstack, (char) c);
651                         next_char();
652                         break;
653
654                 default:
655                         goto end_symbol;
656                 }
657         }
658
659 end_symbol:
660         obstack_1grow(&symbol_obstack, '\0');
661         char *string = obstack_finish(&symbol_obstack);
662
663         /* might be a wide string or character constant ( L"string"/L'c' ) */
664         if(c == '"' && string[0] == 'L' && string[1] == '\0') {
665                 obstack_free(&symbol_obstack, string);
666                 parse_wide_string_literal();
667                 return;
668         } else if(c == '\'' && string[0] == 'L' && string[1] == '\0') {
669                 obstack_free(&symbol_obstack, string);
670                 parse_wide_character_constant();
671                 return;
672         }
673
674         symbol_t *symbol = symbol_table_insert(string);
675
676         pp_token.type     = symbol->pp_ID;
677         pp_token.v.symbol = symbol;
678
679         /* we can free the memory from symbol obstack if we already had an entry in
680          * the symbol table */
681         if(symbol->string != string) {
682                 obstack_free(&symbol_obstack, string);
683         }
684
685         pp_definition_t *pp_definition = symbol->pp_definition;
686         if(do_expansions && pp_definition != NULL) {
687                 pp_definition->expand_pos   = 0;
688                 pp_definition->is_expanding = true,
689                 current_expansion           = pp_definition;
690                 expand_next();
691         }
692 }
693
694 static void parse_number(void)
695 {
696         obstack_1grow(&symbol_obstack, (char) c);
697         next_char();
698
699         while(1) {
700                 switch(c) {
701                 case '.':
702                 DIGITS
703                 SYMBOL_CHARS_WITHOUT_E_P
704                         obstack_1grow(&symbol_obstack, (char) c);
705                         next_char();
706                         break;
707
708                 case 'e':
709                 case 'p':
710                 case 'E':
711                 case 'P':
712                         obstack_1grow(&symbol_obstack, (char) c);
713                         next_char();
714                         if(c == '+' || c == '-') {
715                                 obstack_1grow(&symbol_obstack, (char) c);
716                                 next_char();
717                         }
718                         break;
719
720                 default:
721                         goto end_number;
722                 }
723         }
724
725 end_number:
726         obstack_1grow(&symbol_obstack, '\0');
727         size_t  size   = obstack_object_size(&symbol_obstack);
728         char   *string = obstack_finish(&symbol_obstack);
729
730         pp_token.type           = TP_NUMBER;
731         pp_token.v.string.begin = string;
732         pp_token.v.string.size  = size;
733 }
734
735 static void skip_multiline_comment(void)
736 {
737         unsigned start_linenr = input_position.linenr;
738
739         while(1) {
740                 switch(c) {
741                 case '/':
742                         next_char();
743                         if (c == '*') {
744                                 /* TODO: nested comment, warn here */
745                         }
746                         break;
747                 case '*':
748                         next_char();
749                         if(c == '/') {
750                                 next_char();
751                                 return;
752                         }
753                         break;
754
755                 MATCH_NEWLINE(
756                         if(print_spaces) {
757                                 counted_newlines++;
758                                 counted_spaces = 0;
759                         }
760                         break;
761                 )
762
763                 case EOF: {
764                         source_position_t source_position;
765                         source_position.input_name = pp_token.source_position.input_name;
766                         source_position.linenr     = start_linenr;
767                         errorf(&source_position, "at end of file while looking for comment end");
768                         return;
769                 }
770
771                 default:
772                         next_char();
773                         break;
774                 }
775         }
776 }
777
778 static void skip_line_comment(void)
779 {
780         while(1) {
781                 switch(c) {
782                 case EOF:
783                         return;
784
785                 case '\n':
786                 case '\r':
787                         return;
788
789                 default:
790                         next_char();
791                         break;
792                 }
793         }
794 }
795
796
797
798 #define MAYBE_PROLOG                                       \
799                         next_char();                                   \
800                         while(1) {                                     \
801                                 switch(c) {
802
803 #define MAYBE(ch, set_type)                                \
804                                 case ch:                                   \
805                                         next_char();                           \
806                                         pp_token.type = set_type;              \
807                                         return;
808
809 #define ELSE_CODE(code)                                    \
810                                 default:                                   \
811                                         code;                                  \
812                                 }                                          \
813                         } /* end of while(1) */                        \
814                         break;
815
816 #define ELSE(set_type)                                     \
817                 ELSE_CODE(                                         \
818                         pp_token.type = set_type;                      \
819                         return;                                        \
820                 )
821
822 static void next_preprocessing_token(void)
823 {
824         if(current_expansion != NULL) {
825                 expand_next();
826                 return;
827         }
828
829         pp_token.source_position = input_position;
830
831 restart:
832         switch(c) {
833         case ' ':
834         case '\t':
835                 if(print_spaces)
836                         counted_spaces++;
837                 next_char();
838                 goto restart;
839
840         MATCH_NEWLINE(
841                 counted_newlines++;
842                 counted_spaces = 0;
843                 pp_token.type = '\n';
844                 return;
845         )
846
847         SYMBOL_CHARS
848                 parse_symbol();
849                 return;
850
851         DIGITS
852                 parse_number();
853                 return;
854
855         case '"':
856                 parse_string_literal();
857                 return;
858
859         case '\'':
860                 parse_character_constant();
861                 return;
862
863         case '.':
864                 MAYBE_PROLOG
865                         case '0':
866                         case '1':
867                         case '2':
868                         case '3':
869                         case '4':
870                         case '5':
871                         case '6':
872                         case '7':
873                         case '8':
874                         case '9':
875                                 put_back(c);
876                                 c = '.';
877                                 parse_number();
878                                 return;
879
880                         case '.':
881                                 MAYBE_PROLOG
882                                 MAYBE('.', TP_DOTDOTDOT)
883                                 ELSE_CODE(
884                                         put_back(c);
885                                         c = '.';
886                                         pp_token.type = '.';
887                                         return;
888                                 )
889                 ELSE('.')
890         case '&':
891                 MAYBE_PROLOG
892                 MAYBE('&', TP_ANDAND)
893                 MAYBE('=', TP_ANDEQUAL)
894                 ELSE('&')
895         case '*':
896                 MAYBE_PROLOG
897                 MAYBE('=', TP_ASTERISKEQUAL)
898                 ELSE('*')
899         case '+':
900                 MAYBE_PROLOG
901                 MAYBE('+', TP_PLUSPLUS)
902                 MAYBE('=', TP_PLUSEQUAL)
903                 ELSE('+')
904         case '-':
905                 MAYBE_PROLOG
906                 MAYBE('>', TP_MINUSGREATER)
907                 MAYBE('-', TP_MINUSMINUS)
908                 MAYBE('=', TP_MINUSEQUAL)
909                 ELSE('-')
910         case '!':
911                 MAYBE_PROLOG
912                 MAYBE('=', TP_EXCLAMATIONMARKEQUAL)
913                 ELSE('!')
914         case '/':
915                 MAYBE_PROLOG
916                 MAYBE('=', TP_SLASHEQUAL)
917                         case '*':
918                                 next_char();
919                                 skip_multiline_comment();
920                                 if(print_spaces)
921                                         counted_spaces++;
922                                 goto restart;
923                         case '/':
924                                 next_char();
925                                 skip_line_comment();
926                                 if(print_spaces)
927                                         counted_spaces++;
928                                 goto restart;
929                 ELSE('/')
930         case '%':
931                 MAYBE_PROLOG
932                 MAYBE('>', '}')
933                 MAYBE('=', TP_PERCENTEQUAL)
934                         case ':':
935                                 MAYBE_PROLOG
936                                         case '%':
937                                                 MAYBE_PROLOG
938                                                 MAYBE(':', TP_HASHHASH)
939                                                 ELSE_CODE(
940                                                         put_back(c);
941                                                         c = '%';
942                                                         pp_token.type = '#';
943                                                         return;
944                                                 )
945                                 ELSE('#')
946                 ELSE('%')
947         case '<':
948                 MAYBE_PROLOG
949                 MAYBE(':', '[')
950                 MAYBE('%', '{')
951                 MAYBE('=', TP_LESSEQUAL)
952                         case '<':
953                                 MAYBE_PROLOG
954                                 MAYBE('=', TP_LESSLESSEQUAL)
955                                 ELSE(TP_LESSLESS)
956                 ELSE('<')
957         case '>':
958                 MAYBE_PROLOG
959                 MAYBE('=', TP_GREATEREQUAL)
960                         case '>':
961                                 MAYBE_PROLOG
962                                 MAYBE('=', TP_GREATERGREATEREQUAL)
963                                 ELSE(TP_GREATERGREATER)
964                 ELSE('>')
965         case '^':
966                 MAYBE_PROLOG
967                 MAYBE('=', TP_CARETEQUAL)
968                 ELSE('^')
969         case '|':
970                 MAYBE_PROLOG
971                 MAYBE('=', TP_PIPEEQUAL)
972                 MAYBE('|', TP_PIPEPIPE)
973                 ELSE('|')
974         case ':':
975                 MAYBE_PROLOG
976                 MAYBE('>', ']')
977                 ELSE(':')
978         case '=':
979                 MAYBE_PROLOG
980                 MAYBE('=', TP_EQUALEQUAL)
981                 ELSE('=')
982         case '#':
983                 MAYBE_PROLOG
984                 MAYBE('#', TP_HASHHASH)
985                 ELSE('#')
986
987         case '?':
988         case '[':
989         case ']':
990         case '(':
991         case ')':
992         case '{':
993         case '}':
994         case '~':
995         case ';':
996         case ',':
997         case '\\':
998                 pp_token.type = c;
999                 next_char();
1000                 return;
1001
1002         case EOF:
1003                 pp_token.type = TP_EOF;
1004                 return;
1005
1006         default:
1007                 next_char();
1008                 errorf(&pp_token.source_position, "unknown character '%c' found\n", c);
1009                 pp_token.type = TP_ERROR;
1010                 return;
1011         }
1012 }
1013
1014 static void print_quoted_string(const char *const string)
1015 {
1016         fputc('"', out);
1017         for (const char *c = string; *c != 0; ++c) {
1018                 switch(*c) {
1019                 case '"': fputs("\\\"", out); break;
1020                 case '\\':  fputs("\\\\", out); break;
1021                 case '\a':  fputs("\\a", out); break;
1022                 case '\b':  fputs("\\b", out); break;
1023                 case '\f':  fputs("\\f", out); break;
1024                 case '\n':  fputs("\\n", out); break;
1025                 case '\r':  fputs("\\r", out); break;
1026                 case '\t':  fputs("\\t", out); break;
1027                 case '\v':  fputs("\\v", out); break;
1028                 case '\?':  fputs("\\?", out); break;
1029                 default:
1030                         if(!isprint(*c)) {
1031                                 fprintf(out, "\\%03o", *c);
1032                                 break;
1033                         }
1034                         fputc(*c, out);
1035                         break;
1036                 }
1037         }
1038         fputc('"', out);
1039 }
1040
1041 static void print_line_directive(const source_position_t *pos)
1042 {
1043         fprintf(out, "# %d ", pos->linenr);
1044         print_quoted_string(pos->input_name);
1045         fputc('\n', out);
1046
1047         printed_input_name = pos->input_name;
1048 }
1049
1050 static bool had_non_space = false;
1051
1052 static void emit_pp_token(void)
1053 {
1054         if (printed_input_name != pp_token.source_position.input_name) {
1055                 print_line_directive(&pp_token.source_position);
1056         } else if (pp_token.type != '\n') {
1057                 if (counted_newlines >= 9) {
1058                         if (had_non_space) {
1059                                 fputc('\n', out);
1060                         }
1061                         print_line_directive(&pp_token.source_position);
1062                         counted_newlines = 0;
1063                 } else {
1064                         for (unsigned i = 0; i < counted_newlines; ++i)
1065                                 fputc('\n', out);
1066                         counted_newlines = 0;
1067                 }
1068                 for (unsigned i = 0; i < counted_spaces; ++i)
1069                         fputc(' ', out);
1070                 counted_spaces = 0;
1071                 had_non_space  = true;
1072         }
1073
1074         switch(pp_token.type) {
1075         case TP_IDENTIFIER:
1076                 fputs(pp_token.v.symbol->string, out);
1077                 break;
1078         case TP_NUMBER:
1079                 fputs(pp_token.v.string.begin, out);
1080                 break;
1081         case TP_STRING_LITERAL:
1082                 fputc('"', out);
1083                 fputs(pp_token.v.string.begin, out);
1084                 fputc('"', out);
1085                 break;
1086         case '\n':
1087                 break;
1088         default:
1089                 print_pp_token_type(out, pp_token.type);
1090                 break;
1091         }
1092 }
1093
1094 static void eat_pp(preprocessor_token_type_t type)
1095 {
1096         (void) type;
1097         assert(pp_token.type == type);
1098         next_preprocessing_token();
1099 }
1100
1101 static void eat_pp_directive(void)
1102 {
1103         while(pp_token.type != '\n' && pp_token.type != TP_EOF) {
1104                 next_preprocessing_token();
1105         }
1106 }
1107
1108 static bool strings_equal(const string_t *string1, const string_t *string2)
1109 {
1110         size_t size = string1->size;
1111         if(size != string2->size)
1112                 return false;
1113
1114         const char *c1 = string1->begin;
1115         const char *c2 = string2->begin;
1116         for(size_t i = 0; i < size; ++i, ++c1, ++c2) {
1117                 if(*c1 != *c2)
1118                         return false;
1119         }
1120         return true;
1121 }
1122
1123 static bool wide_strings_equal(const wide_string_t *string1,
1124                                const wide_string_t *string2)
1125 {
1126         size_t size = string1->size;
1127         if(size != string2->size)
1128                 return false;
1129
1130         const wchar_rep_t *c1 = string1->begin;
1131         const wchar_rep_t *c2 = string2->begin;
1132         for(size_t i = 0; i < size; ++i, ++c1, ++c2) {
1133                 if(*c1 != *c2)
1134                         return false;
1135         }
1136         return true;
1137 }
1138
1139 static bool pp_tokens_equal(const token_t *token1, const token_t *token2)
1140 {
1141         if(token1->type != token2->type)
1142                 return false;
1143
1144         switch(token1->type) {
1145         case TP_HEADERNAME:
1146                 /* TODO */
1147                 return false;
1148         case TP_IDENTIFIER:
1149                 return token1->v.symbol == token2->v.symbol;
1150         case TP_NUMBER:
1151         case TP_CHARACTER_CONSTANT:
1152         case TP_STRING_LITERAL:
1153                 return strings_equal(&token1->v.string, &token2->v.string);
1154
1155         case TP_WIDE_CHARACTER_CONSTANT:
1156         case TP_WIDE_STRING_LITERAL:
1157                 return wide_strings_equal(&token1->v.wide_string,
1158                                           &token2->v.wide_string);
1159         default:
1160                 return true;
1161         }
1162 }
1163
1164 static bool pp_definitions_equal(const pp_definition_t *definition1,
1165                                  const pp_definition_t *definition2)
1166 {
1167         if(definition1->list_len != definition2->list_len)
1168                 return false;
1169
1170         size_t         len = definition1->list_len;
1171         const token_t *t1  = definition1->replacement_list;
1172         const token_t *t2  = definition2->replacement_list;
1173         for(size_t i = 0; i < len; ++i, ++t1, ++t2) {
1174                 if(!pp_tokens_equal(t1, t2))
1175                         return false;
1176         }
1177         return true;
1178 }
1179
1180 static void parse_define_directive(void)
1181 {
1182         eat_pp(TP_define);
1183
1184         if(pp_token.type != TP_IDENTIFIER) {
1185                 errorf(&pp_token.source_position,
1186                        "expected identifier after #define, got '%T'", &pp_token);
1187                 eat_pp_directive();
1188                 return;
1189         }
1190         symbol_t *symbol = pp_token.v.symbol;
1191
1192         pp_definition_t *new_definition
1193                 = obstack_alloc(&pp_obstack, sizeof(new_definition[0]));
1194         memset(new_definition, 0, sizeof(new_definition[0]));
1195         new_definition->source_position = input_position;
1196
1197         /* this is probably the only place where spaces are significant in the
1198          * lexer (except for the fact that they separate tokens). #define b(x)
1199          * is something else than #define b (x) */
1200         //token_t *arguments = NULL;
1201         if(c == '(') {
1202                 next_preprocessing_token();
1203                 while(pp_token.type != ')') {
1204                         if(pp_token.type == TP_DOTDOTDOT) {
1205                                 new_definition->is_variadic = true;
1206                                 next_preprocessing_token();
1207                                 if(pp_token.type != ')') {
1208                                         errorf(&input_position,
1209                                                         "'...' not at end of macro argument list");
1210                                         continue;
1211                                 }
1212                         } else if(pp_token.type != TP_IDENTIFIER) {
1213                                 next_preprocessing_token();
1214                         }
1215                 }
1216         } else {
1217                 next_preprocessing_token();
1218         }
1219
1220         /* construct a new pp_definition on the obstack */
1221         assert(obstack_object_size(&pp_obstack) == 0);
1222         size_t list_len = 0;
1223         while(pp_token.type != '\n' && pp_token.type != TP_EOF) {
1224                 obstack_grow(&pp_obstack, &pp_token, sizeof(pp_token));
1225                 ++list_len;
1226                 next_preprocessing_token();
1227         }
1228
1229         new_definition->list_len         = list_len;
1230         new_definition->replacement_list = obstack_finish(&pp_obstack);
1231
1232         pp_definition_t *old_definition = symbol->pp_definition;
1233         if(old_definition != NULL) {
1234                 if(!pp_definitions_equal(old_definition, new_definition)) {
1235                         warningf(&input_position, "multiple definition of macro '%Y' (first defined %P)",
1236                                  symbol, &old_definition->source_position);
1237                 } else {
1238                         /* reuse the old definition */
1239                         obstack_free(&pp_obstack, new_definition);
1240                         new_definition = old_definition;
1241                 }
1242         }
1243
1244         symbol->pp_definition = new_definition;
1245 }
1246
1247 static void parse_undef_directive(void)
1248 {
1249         eat_pp(TP_undef);
1250
1251         if(pp_token.type != TP_IDENTIFIER) {
1252                 errorf(&input_position,
1253                        "expected identifier after #undef, got '%T'", &pp_token);
1254                 eat_pp_directive();
1255                 return;
1256         }
1257
1258         symbol_t *symbol = pp_token.v.symbol;
1259         symbol->pp_definition = NULL;
1260         next_preprocessing_token();
1261
1262         if(pp_token.type != '\n') {
1263                 warningf(&input_position, "extra tokens at end of #undef directive");
1264         }
1265         /* eat until '\n' */
1266         eat_pp_directive();
1267 }
1268
1269 static void parse_preprocessing_directive(void)
1270 {
1271         print_spaces  = false;
1272         do_expansions = false;
1273         eat_pp('#');
1274
1275         switch(pp_token.type) {
1276         case TP_define:
1277                 parse_define_directive();
1278                 break;
1279         case TP_undef:
1280                 parse_undef_directive();
1281                 break;
1282         default:
1283                 errorf(&pp_token.source_position,
1284                        "invalid preprocessing directive #%T", &pp_token);
1285                 eat_pp_directive();
1286                 break;
1287         }
1288
1289         print_spaces  = true;
1290         do_expansions = true;
1291
1292         /* eat '\n' */
1293         assert(pp_token.type == '\n' || pp_token.type == TP_EOF);
1294         next_preprocessing_token();
1295 }
1296
1297 int pptest_main(int argc, char **argv);
1298
1299 #define GCC_COMPAT_MODE
1300
1301 int pptest_main(int argc, char **argv)
1302 {
1303         init_symbol_table();
1304         init_tokens();
1305
1306         obstack_init(&pp_obstack);
1307
1308         const char *infname = "t.c";
1309         if (argc > 1)
1310                 infname = argv[1];
1311
1312         input = fopen(infname, "r");
1313         assert(input != NULL);
1314         input_position.input_name = infname;
1315         input_position.linenr     = 1;
1316
1317         bufpos       = NULL;
1318         bufend       = NULL;
1319         counted_newlines = 0;
1320         counted_spaces   = 0;
1321
1322         out = stdout;
1323
1324 #ifdef GCC_COMPAT_MODE
1325         /* this is here so we can directly compare "gcc -E" output and our output */
1326         fprintf(out, "# 1 \"%s\"\n", input_position.input_name);
1327         fputs("# 1 \"<built-in>\"\n", out);
1328         fputs("# 1 \"<command-line>\"\n", out);
1329 #endif
1330
1331         next_char();
1332
1333         next_preprocessing_token();
1334
1335         while(true) {
1336                 /* we're at a line begin */
1337                 if(pp_token.type == '#') {
1338                         parse_preprocessing_directive();
1339                 } else {
1340                         /* parse+emit a line */
1341                         while(pp_token.type != '\n') {
1342                                 if(pp_token.type == TP_EOF)
1343                                         goto end_of_main_loop;
1344                                 emit_pp_token();
1345                                 next_preprocessing_token();
1346                         }
1347                         emit_pp_token();
1348                         next_preprocessing_token();
1349                 }
1350         }
1351 end_of_main_loop:
1352
1353         if (counted_newlines > 0) {
1354                 fputc('\n', out);
1355         }
1356
1357         obstack_free(&pp_obstack, NULL);
1358
1359         exit_tokens();
1360         exit_symbol_table();
1361
1362         return 0;
1363 }