nsz Git - cparser/blob - preprocessor.c

   1 #include <config.h>
   2
   3 #include "token_t.h"
   4 #include "symbol_t.h"
   5 #include "adt/util.h"
   6 #include "adt/error.h"
   7 #include "lang_features.h"
   8 #include "diagnostic.h"
   9 #include "string_rep.h"
  10
  11 #include <assert.h>
  12 #include <errno.h>
  13 #include <string.h>
  14 #include <stdbool.h>
  15 #include <ctype.h>
  16
  17 //#define DEBUG_CHARS
  18 #define MAX_PUTBACK 3
  19
  20 struct pp_definition_t {
  21         symbol_t          *symbol;
  22         source_position_t  source_position;
  23         pp_definition_t   *parent_expansion;
  24         size_t             expand_pos;
  25         bool               is_variadic  : 1;
  26         bool               is_expanding : 1;
  27         size_t             argument_count;
  28         token_t           *arguments;
  29         size_t             list_len;
  30         token_t           *replacement_list;
  31 };
  32
  33 static int                c;
  34 token_t                   pp_token;
  35 static FILE              *input;
  36 static char               buf[1024 + MAX_PUTBACK];
  37 static const char        *bufend;
  38 static const char        *bufpos;
  39 static bool               resolve_escape_sequences = false;
  40 static bool               print_spaces             = true;
  41 static FILE              *out;
  42 static struct obstack     pp_obstack;
  43 static unsigned           counted_newlines;
  44 static unsigned           counted_spaces;
  45 static source_position_t  input_position;
  46 static const char        *printed_input_name = NULL;
  47 static pp_definition_t   *current_expansion  = NULL;
  48 static bool               do_expansions;
  49
  50 static void next_preprocessing_token(void);
  51
  52 /**
  53  * Prints a parse error message at the current token.
  54  *
  55  * @param msg   the error message
  56  */
  57 static void parse_error(const char *msg)
  58 {
  59         errorf(&pp_token.source_position,  "%s", msg);
  60 }
  61
  62 static inline void next_real_char(void)
  63 {
  64         assert(bufpos <= bufend);
  65         if (bufpos >= bufend) {
  66                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  67                                  input);
  68                 if(s == 0) {
  69                         c = EOF;
  70                         return;
  71                 }
  72                 bufpos = buf + MAX_PUTBACK;
  73                 bufend = buf + MAX_PUTBACK + s;
  74         }
  75         c = *bufpos++;
  76 }
  77
  78 /**
  79  * Put a character back into the buffer.
  80  *
  81  * @param pc  the character to put back
  82  */
  83 static inline void put_back(int pc)
  84 {
  85         assert(bufpos > buf);
  86         *(--bufpos - buf + buf) = (char) pc;
  87
  88 #ifdef DEBUG_CHARS
  89         printf("putback '%c'\n", pc);
  90 #endif
  91 }
  92
  93 static inline void next_char(void);
  94
  95 #define MATCH_NEWLINE(code)                   \
  96         case '\r':                                \
  97                 next_char();                          \
  98                 if(c == '\n') {                       \
  99                         next_char();                      \
 100                 }                                     \
 101                 ++input_position.linenr;              \
 102                 code                                  \
 103         case '\n':                                \
 104                 next_char();                          \
 105                 ++input_position.linenr;              \
 106                 code
 107
 108 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 109
 110 static void maybe_concat_lines(void)
 111 {
 112         eat('\\');
 113
 114         switch(c) {
 115         MATCH_NEWLINE(return;)
 116
 117         default:
 118                 break;
 119         }
 120
 121         put_back(c);
 122         c = '\\';
 123 }
 124
 125 /**
 126  * Set c to the next input character, ie.
 127  * after expanding trigraphs.
 128  */
 129 static inline void next_char(void)
 130 {
 131         next_real_char();
 132
 133         /* filter trigraphs and concatenated lines */
 134         if(UNLIKELY(c == '\\')) {
 135                 maybe_concat_lines();
 136                 goto end_of_next_char;
 137         }
 138
 139         if(LIKELY(c != '?'))
 140                 goto end_of_next_char;
 141
 142         next_real_char();
 143         if(LIKELY(c != '?')) {
 144                 put_back(c);
 145                 c = '?';
 146                 goto end_of_next_char;
 147         }
 148
 149         next_real_char();
 150         switch(c) {
 151         case '=': c = '#'; break;
 152         case '(': c = '['; break;
 153         case '/': c = '\\'; maybe_concat_lines(); break;
 154         case ')': c = ']'; break;
 155         case '\'': c = '^'; break;
 156         case '<': c = '{'; break;
 157         case '!': c = '|'; break;
 158         case '>': c = '}'; break;
 159         case '-': c = '~'; break;
 160         default:
 161                 put_back(c);
 162                 put_back('?');
 163                 c = '?';
 164                 break;
 165         }
 166
 167 end_of_next_char:;
 168 #ifdef DEBUG_CHARS
 169         printf("nchar '%c'\n", c);
 170 #endif
 171 }
 172
 173
 174
 175 /**
 176  * Returns true if the given char is a octal digit.
 177  *
 178  * @param char  the character to check
 179  */
 180 static inline bool is_octal_digit(int chr)
 181 {
 182         switch(chr) {
 183         case '0':
 184         case '1':
 185         case '2':
 186         case '3':
 187         case '4':
 188         case '5':
 189         case '6':
 190         case '7':
 191                 return true;
 192         default:
 193                 return false;
 194         }
 195 }
 196
 197 /**
 198  * Returns the value of a digit.
 199  * The only portable way to do it ...
 200  */
 201 static int digit_value(int digit) {
 202         switch (digit) {
 203         case '0': return 0;
 204         case '1': return 1;
 205         case '2': return 2;
 206         case '3': return 3;
 207         case '4': return 4;
 208         case '5': return 5;
 209         case '6': return 6;
 210         case '7': return 7;
 211         case '8': return 8;
 212         case '9': return 9;
 213         case 'a':
 214         case 'A': return 10;
 215         case 'b':
 216         case 'B': return 11;
 217         case 'c':
 218         case 'C': return 12;
 219         case 'd':
 220         case 'D': return 13;
 221         case 'e':
 222         case 'E': return 14;
 223         case 'f':
 224         case 'F': return 15;
 225         default:
 226                 panic("wrong character given");
 227         }
 228 }
 229
 230 /**
 231  * Parses an octal character sequence.
 232  *
 233  * @param first_digit  the already read first digit
 234  */
 235 static int parse_octal_sequence(const int first_digit)
 236 {
 237         assert(is_octal_digit(first_digit));
 238         int value = digit_value(first_digit);
 239         if (!is_octal_digit(c)) return value;
 240         value = 8 * value + digit_value(c);
 241         next_char();
 242         if (!is_octal_digit(c)) return value;
 243         value = 8 * value + digit_value(c);
 244         next_char();
 245
 246         if(char_is_signed) {
 247                 return (signed char) value;
 248         } else {
 249                 return (unsigned char) value;
 250         }
 251 }
 252
 253 /**
 254  * Parses a hex character sequence.
 255  */
 256 static int parse_hex_sequence(void)
 257 {
 258         int value = 0;
 259         while(isxdigit(c)) {
 260                 value = 16 * value + digit_value(c);
 261                 next_char();
 262         }
 263
 264         if(char_is_signed) {
 265                 return (signed char) value;
 266         } else {
 267                 return (unsigned char) value;
 268         }
 269 }
 270
 271 /**
 272  * Parse an escape sequence.
 273  */
 274 static int parse_escape_sequence(void)
 275 {
 276         eat('\\');
 277
 278         int ec = c;
 279         next_char();
 280
 281         switch(ec) {
 282         case '"':  return '"';
 283         case '\'': return '\'';
 284         case '\\': return '\\';
 285         case '?': return '\?';
 286         case 'a': return '\a';
 287         case 'b': return '\b';
 288         case 'f': return '\f';
 289         case 'n': return '\n';
 290         case 'r': return '\r';
 291         case 't': return '\t';
 292         case 'v': return '\v';
 293         case 'x':
 294                 return parse_hex_sequence();
 295         case '0':
 296         case '1':
 297         case '2':
 298         case '3':
 299         case '4':
 300         case '5':
 301         case '6':
 302         case '7':
 303                 return parse_octal_sequence(ec);
 304         case EOF:
 305                 parse_error("reached end of file while parsing escape sequence");
 306                 return EOF;
 307         default:
 308                 parse_error("unknown escape sequence");
 309                 return EOF;
 310         }
 311 }
 312
 313 static void parse_string_literal(void)
 314 {
 315         const unsigned start_linenr = input_position.linenr;
 316
 317         eat('"');
 318
 319         int tc;
 320         while(1) {
 321                 switch(c) {
 322                 case '\\':
 323                         if(resolve_escape_sequences) {
 324                                 tc = parse_escape_sequence();
 325                                 obstack_1grow(&symbol_obstack, (char) tc);
 326                         } else {
 327                                 obstack_1grow(&symbol_obstack, (char) c);
 328                                 next_char();
 329                                 obstack_1grow(&symbol_obstack, (char) c);
 330                                 next_char();
 331                         }
 332                         break;
 333
 334                 case EOF: {
 335                         source_position_t source_position;
 336                         source_position.input_name = pp_token.source_position.input_name;
 337                         source_position.linenr     = start_linenr;
 338                         errorf(&source_position, "string has no end");
 339                         pp_token.type = TP_ERROR;
 340                         return;
 341                 }
 342
 343                 case '"':
 344                         next_char();
 345                         goto end_of_string;
 346
 347                 default:
 348                         obstack_1grow(&symbol_obstack, (char) c);
 349                         next_char();
 350                         break;
 351                 }
 352         }
 353
 354 end_of_string:
 355         /* add finishing 0 to the string */
 356         obstack_1grow(&symbol_obstack, '\0');
 357         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
 358         const char *const string = obstack_finish(&symbol_obstack);
 359
 360 #if 0 /* TODO hash */
 361         /* check if there is already a copy of the string */
 362         result = strset_insert(&stringset, string);
 363         if(result != string) {
 364                 obstack_free(&symbol_obstack, string);
 365         }
 366 #else
 367         const char *const result = string;
 368 #endif
 369
 370         pp_token.type           = TP_STRING_LITERAL;
 371         pp_token.v.string.begin = result;
 372         pp_token.v.string.size  = size;
 373 }
 374
 375 static void parse_wide_character_constant(void)
 376 {
 377         eat('\'');
 378
 379         int found_char = 0;
 380         while(1) {
 381                 switch(c) {
 382                 case '\\':
 383                         found_char = parse_escape_sequence();
 384                         break;
 385
 386                 MATCH_NEWLINE(
 387                         parse_error("newline while parsing character constant");
 388                         break;
 389                 )
 390
 391                 case '\'':
 392                         next_char();
 393                         goto end_of_wide_char_constant;
 394
 395                 case EOF:
 396                         parse_error("EOF while parsing character constant");
 397                         pp_token.type = TP_ERROR;
 398                         return;
 399
 400                 default:
 401                         if(found_char != 0) {
 402                                 parse_error("more than 1 characters in character "
 403                                             "constant");
 404                                 goto end_of_wide_char_constant;
 405                         } else {
 406                                 found_char = c;
 407                                 next_char();
 408                         }
 409                         break;
 410                 }
 411         }
 412
 413 end_of_wide_char_constant:
 414         pp_token.type       = TP_WIDE_CHARACTER_CONSTANT;
 415         /* TODO... */
 416 }
 417
 418 static void parse_wide_string_literal(void)
 419 {
 420         const unsigned start_linenr = input_position.linenr;
 421
 422         assert(c == '"');
 423         next_char();
 424
 425         while(1) {
 426                 switch(c) {
 427                 case '\\': {
 428                         wchar_rep_t tc = parse_escape_sequence();
 429                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 430                         break;
 431                 }
 432
 433                 case EOF: {
 434                         source_position_t source_position;
 435                         source_position.input_name = pp_token.source_position.input_name;
 436                         source_position.linenr     = start_linenr;
 437                         errorf(&source_position, "string has no end");
 438                         pp_token.type = TP_ERROR;
 439                         return;
 440                 }
 441
 442                 case '"':
 443                         next_char();
 444                         goto end_of_string;
 445
 446                 default: {
 447                         wchar_rep_t tc = c;
 448                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 449                         next_char();
 450                         break;
 451                 }
 452                 }
 453         }
 454
 455 end_of_string:;
 456         /* add finishing 0 to the string */
 457         static const wchar_rep_t nul = L'\0';
 458         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
 459
 460         const size_t size
 461                 = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
 462         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
 463
 464 #if 0 /* TODO hash */
 465         /* check if there is already a copy of the string */
 466         const wchar_rep_t *const result = strset_insert(&stringset, string);
 467         if(result != string) {
 468                 obstack_free(&symbol_obstack, string);
 469         }
 470 #else
 471         const wchar_rep_t *const result = string;
 472 #endif
 473
 474         pp_token.type                = TP_WIDE_STRING_LITERAL;
 475         pp_token.v.wide_string.begin = result;
 476         pp_token.v.wide_string.size  = size;
 477 }
 478
 479 static void parse_character_constant(void)
 480 {
 481         const unsigned start_linenr = input_position.linenr;
 482
 483         eat('\'');
 484
 485         int tc;
 486         while(1) {
 487                 switch(c) {
 488                 case '\\':
 489                         tc = parse_escape_sequence();
 490                         obstack_1grow(&symbol_obstack, (char) tc);
 491                         break;
 492
 493                 MATCH_NEWLINE(
 494                         parse_error("newline while parsing character constant");
 495                         break;
 496                 )
 497
 498                 case EOF: {
 499                         source_position_t source_position;
 500                         source_position.input_name = pp_token.source_position.input_name;
 501                         source_position.linenr     = start_linenr;
 502                         errorf(&source_position, "EOF while parsing character constant");
 503                         pp_token.type = TP_ERROR;
 504                         return;
 505                 }
 506
 507                 case '\'':
 508                         next_char();
 509                         goto end_of_char_constant;
 510
 511                 default:
 512                         obstack_1grow(&symbol_obstack, (char) c);
 513                         next_char();
 514                         break;
 515
 516                 }
 517         }
 518
 519 end_of_char_constant:;
 520         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
 521         const char *const string = obstack_finish(&symbol_obstack);
 522
 523         pp_token.type           = TP_CHARACTER_CONSTANT;
 524         pp_token.v.string.begin = string;
 525         pp_token.v.string.size  = size;
 526 }
 527
 528 #define SYMBOL_CHARS_WITHOUT_E_P \
 529         case 'a': \
 530         case 'b': \
 531         case 'c': \
 532         case 'd': \
 533         case 'f': \
 534         case 'g': \
 535         case 'h': \
 536         case 'i': \
 537         case 'j': \
 538         case 'k': \
 539         case 'l': \
 540         case 'm': \
 541         case 'n': \
 542         case 'o': \
 543         case 'q': \
 544         case 'r': \
 545         case 's': \
 546         case 't': \
 547         case 'u': \
 548         case 'v': \
 549         case 'w': \
 550         case 'x': \
 551         case 'y': \
 552         case 'z': \
 553         case 'A': \
 554         case 'B': \
 555         case 'C': \
 556         case 'D': \
 557         case 'F': \
 558         case 'G': \
 559         case 'H': \
 560         case 'I': \
 561         case 'J': \
 562         case 'K': \
 563         case 'L': \
 564         case 'M': \
 565         case 'N': \
 566         case 'O': \
 567         case 'Q': \
 568         case 'R': \
 569         case 'S': \
 570         case 'T': \
 571         case 'U': \
 572         case 'V': \
 573         case 'W': \
 574         case 'X': \
 575         case 'Y': \
 576         case 'Z': \
 577         case '_':
 578
 579 #define SYMBOL_CHARS \
 580         SYMBOL_CHARS_WITHOUT_E_P \
 581         case 'e': \
 582         case 'p': \
 583         case 'E': \
 584         case 'P':
 585
 586 #define DIGITS \
 587         case '0':  \
 588         case '1':  \
 589         case '2':  \
 590         case '3':  \
 591         case '4':  \
 592         case '5':  \
 593         case '6':  \
 594         case '7':  \
 595         case '8':  \
 596         case '9':
 597
 598 /**
 599  * returns next final token from a preprocessor macro expansion
 600  */
 601 static void expand_next(void)
 602 {
 603         assert(current_expansion != NULL);
 604
 605         pp_definition_t *definition = current_expansion;
 606
 607 restart:
 608         if(definition->list_len == 0
 609                         || definition->expand_pos >= definition->list_len) {
 610                 /* we're finished with the current macro, move up 1 level in the
 611                  * expansion stack */
 612                 pp_definition_t *parent = definition->parent_expansion;
 613                 definition->parent_expansion = NULL;
 614                 definition->is_expanding     = false;
 615                 if(parent == NULL) {
 616                         current_expansion = NULL;
 617                         next_preprocessing_token();
 618                         return;
 619                 }
 620                 definition        = parent;
 621                 current_expansion = definition;
 622                 goto restart;
 623         }
 624         pp_token = definition->replacement_list[definition->expand_pos];
 625         ++definition->expand_pos;
 626
 627         if(pp_token.type != TP_IDENTIFIER)
 628                 return;
 629
 630         pp_definition_t *symbol_definition = pp_token.v.symbol->pp_definition;
 631         if(symbol_definition != NULL && !symbol_definition->is_expanding) {
 632                 symbol_definition->parent_expansion = definition;
 633                 symbol_definition->expand_pos       = 0;
 634                 symbol_definition->is_expanding     = true;
 635                 definition                          = symbol_definition;
 636                 current_expansion                   = definition;
 637                 goto restart;
 638         }
 639 }
 640
 641 static void parse_symbol(void)
 642 {
 643         obstack_1grow(&symbol_obstack, (char) c);
 644         next_char();
 645
 646         while(1) {
 647                 switch(c) {
 648                 DIGITS
 649                 SYMBOL_CHARS
 650                         obstack_1grow(&symbol_obstack, (char) c);
 651                         next_char();
 652                         break;
 653
 654                 default:
 655                         goto end_symbol;
 656                 }
 657         }
 658
 659 end_symbol:
 660         obstack_1grow(&symbol_obstack, '\0');
 661         char *string = obstack_finish(&symbol_obstack);
 662
 663         /* might be a wide string or character constant ( L"string"/L'c' ) */
 664         if(c == '"' && string[0] == 'L' && string[1] == '\0') {
 665                 obstack_free(&symbol_obstack, string);
 666                 parse_wide_string_literal();
 667                 return;
 668         } else if(c == '\'' && string[0] == 'L' && string[1] == '\0') {
 669                 obstack_free(&symbol_obstack, string);
 670                 parse_wide_character_constant();
 671                 return;
 672         }
 673
 674         symbol_t *symbol = symbol_table_insert(string);
 675
 676         pp_token.type     = symbol->pp_ID;
 677         pp_token.v.symbol = symbol;
 678
 679         /* we can free the memory from symbol obstack if we already had an entry in
 680          * the symbol table */
 681         if(symbol->string != string) {
 682                 obstack_free(&symbol_obstack, string);
 683         }
 684
 685         pp_definition_t *pp_definition = symbol->pp_definition;
 686         if(do_expansions && pp_definition != NULL) {
 687                 pp_definition->expand_pos   = 0;
 688                 pp_definition->is_expanding = true,
 689                 current_expansion           = pp_definition;
 690                 expand_next();
 691         }
 692 }
 693
 694 static void parse_number(void)
 695 {
 696         obstack_1grow(&symbol_obstack, (char) c);
 697         next_char();
 698
 699         while(1) {
 700                 switch(c) {
 701                 case '.':
 702                 DIGITS
 703                 SYMBOL_CHARS_WITHOUT_E_P
 704                         obstack_1grow(&symbol_obstack, (char) c);
 705                         next_char();
 706                         break;
 707
 708                 case 'e':
 709                 case 'p':
 710                 case 'E':
 711                 case 'P':
 712                         obstack_1grow(&symbol_obstack, (char) c);
 713                         next_char();
 714                         if(c == '+' || c == '-') {
 715                                 obstack_1grow(&symbol_obstack, (char) c);
 716                                 next_char();
 717                         }
 718                         break;
 719
 720                 default:
 721                         goto end_number;
 722                 }
 723         }
 724
 725 end_number:
 726         obstack_1grow(&symbol_obstack, '\0');
 727         size_t  size   = obstack_object_size(&symbol_obstack);
 728         char   *string = obstack_finish(&symbol_obstack);
 729
 730         pp_token.type           = TP_NUMBER;
 731         pp_token.v.string.begin = string;
 732         pp_token.v.string.size  = size;
 733 }
 734
 735 static void skip_multiline_comment(void)
 736 {
 737         unsigned start_linenr = input_position.linenr;
 738
 739         while(1) {
 740                 switch(c) {
 741                 case '/':
 742                         next_char();
 743                         if (c == '*') {
 744                                 /* TODO: nested comment, warn here */
 745                         }
 746                         break;
 747                 case '*':
 748                         next_char();
 749                         if(c == '/') {
 750                                 next_char();
 751                                 return;
 752                         }
 753                         break;
 754
 755                 MATCH_NEWLINE(
 756                         if(print_spaces) {
 757                                 counted_newlines++;
 758                                 counted_spaces = 0;
 759                         }
 760                         break;
 761                 )
 762
 763                 case EOF: {
 764                         source_position_t source_position;
 765                         source_position.input_name = pp_token.source_position.input_name;
 766                         source_position.linenr     = start_linenr;
 767                         errorf(&source_position, "at end of file while looking for comment end");
 768                         return;
 769                 }
 770
 771                 default:
 772                         next_char();
 773                         break;
 774                 }
 775         }
 776 }
 777
 778 static void skip_line_comment(void)
 779 {
 780         while(1) {
 781                 switch(c) {
 782                 case EOF:
 783                         return;
 784
 785                 case '\n':
 786                 case '\r':
 787                         return;
 788
 789                 default:
 790                         next_char();
 791                         break;
 792                 }
 793         }
 794 }
 795
 796
 797
 798 #define MAYBE_PROLOG                                       \
 799                         next_char();                                   \
 800                         while(1) {                                     \
 801                                 switch(c) {
 802
 803 #define MAYBE(ch, set_type)                                \
 804                                 case ch:                                   \
 805                                         next_char();                           \
 806                                         pp_token.type = set_type;              \
 807                                         return;
 808
 809 #define ELSE_CODE(code)                                    \
 810                                 default:                                   \
 811                                         code;                                  \
 812                                 }                                          \
 813                         } /* end of while(1) */                        \
 814                         break;
 815
 816 #define ELSE(set_type)                                     \
 817                 ELSE_CODE(                                         \
 818                         pp_token.type = set_type;                      \
 819                         return;                                        \
 820                 )
 821
 822 static void next_preprocessing_token(void)
 823 {
 824         if(current_expansion != NULL) {
 825                 expand_next();
 826                 return;
 827         }
 828
 829         pp_token.source_position = input_position;
 830
 831 restart:
 832         switch(c) {
 833         case ' ':
 834         case '\t':
 835                 if(print_spaces)
 836                         counted_spaces++;
 837                 next_char();
 838                 goto restart;
 839
 840         MATCH_NEWLINE(
 841                 counted_newlines++;
 842                 counted_spaces = 0;
 843                 pp_token.type = '\n';
 844                 return;
 845         )
 846
 847         SYMBOL_CHARS
 848                 parse_symbol();
 849                 return;
 850
 851         DIGITS
 852                 parse_number();
 853                 return;
 854
 855         case '"':
 856                 parse_string_literal();
 857                 return;
 858
 859         case '\'':
 860                 parse_character_constant();
 861                 return;
 862
 863         case '.':
 864                 MAYBE_PROLOG
 865                         case '0':
 866                         case '1':
 867                         case '2':
 868                         case '3':
 869                         case '4':
 870                         case '5':
 871                         case '6':
 872                         case '7':
 873                         case '8':
 874                         case '9':
 875                                 put_back(c);
 876                                 c = '.';
 877                                 parse_number();
 878                                 return;
 879
 880                         case '.':
 881                                 MAYBE_PROLOG
 882                                 MAYBE('.', TP_DOTDOTDOT)
 883                                 ELSE_CODE(
 884                                         put_back(c);
 885                                         c = '.';
 886                                         pp_token.type = '.';
 887                                         return;
 888                                 )
 889                 ELSE('.')
 890         case '&':
 891                 MAYBE_PROLOG
 892                 MAYBE('&', TP_ANDAND)
 893                 MAYBE('=', TP_ANDEQUAL)
 894                 ELSE('&')
 895         case '*':
 896                 MAYBE_PROLOG
 897                 MAYBE('=', TP_ASTERISKEQUAL)
 898                 ELSE('*')
 899         case '+':
 900                 MAYBE_PROLOG
 901                 MAYBE('+', TP_PLUSPLUS)
 902                 MAYBE('=', TP_PLUSEQUAL)
 903                 ELSE('+')
 904         case '-':
 905                 MAYBE_PROLOG
 906                 MAYBE('>', TP_MINUSGREATER)
 907                 MAYBE('-', TP_MINUSMINUS)
 908                 MAYBE('=', TP_MINUSEQUAL)
 909                 ELSE('-')
 910         case '!':
 911                 MAYBE_PROLOG
 912                 MAYBE('=', TP_EXCLAMATIONMARKEQUAL)
 913                 ELSE('!')
 914         case '/':
 915                 MAYBE_PROLOG
 916                 MAYBE('=', TP_SLASHEQUAL)
 917                         case '*':
 918                                 next_char();
 919                                 skip_multiline_comment();
 920                                 if(print_spaces)
 921                                         counted_spaces++;
 922                                 goto restart;
 923                         case '/':
 924                                 next_char();
 925                                 skip_line_comment();
 926                                 if(print_spaces)
 927                                         counted_spaces++;
 928                                 goto restart;
 929                 ELSE('/')
 930         case '%':
 931                 MAYBE_PROLOG
 932                 MAYBE('>', '}')
 933                 MAYBE('=', TP_PERCENTEQUAL)
 934                         case ':':
 935                                 MAYBE_PROLOG
 936                                         case '%':
 937                                                 MAYBE_PROLOG
 938                                                 MAYBE(':', TP_HASHHASH)
 939                                                 ELSE_CODE(
 940                                                         put_back(c);
 941                                                         c = '%';
 942                                                         pp_token.type = '#';
 943                                                         return;
 944                                                 )
 945                                 ELSE('#')
 946                 ELSE('%')
 947         case '<':
 948                 MAYBE_PROLOG
 949                 MAYBE(':', '[')
 950                 MAYBE('%', '{')
 951                 MAYBE('=', TP_LESSEQUAL)
 952                         case '<':
 953                                 MAYBE_PROLOG
 954                                 MAYBE('=', TP_LESSLESSEQUAL)
 955                                 ELSE(TP_LESSLESS)
 956                 ELSE('<')
 957         case '>':
 958                 MAYBE_PROLOG
 959                 MAYBE('=', TP_GREATEREQUAL)
 960                         case '>':
 961                                 MAYBE_PROLOG
 962                                 MAYBE('=', TP_GREATERGREATEREQUAL)
 963                                 ELSE(TP_GREATERGREATER)
 964                 ELSE('>')
 965         case '^':
 966                 MAYBE_PROLOG
 967                 MAYBE('=', TP_CARETEQUAL)
 968                 ELSE('^')
 969         case '|':
 970                 MAYBE_PROLOG
 971                 MAYBE('=', TP_PIPEEQUAL)
 972                 MAYBE('|', TP_PIPEPIPE)
 973                 ELSE('|')
 974         case ':':
 975                 MAYBE_PROLOG
 976                 MAYBE('>', ']')
 977                 ELSE(':')
 978         case '=':
 979                 MAYBE_PROLOG
 980                 MAYBE('=', TP_EQUALEQUAL)
 981                 ELSE('=')
 982         case '#':
 983                 MAYBE_PROLOG
 984                 MAYBE('#', TP_HASHHASH)
 985                 ELSE('#')
 986
 987         case '?':
 988         case '[':
 989         case ']':
 990         case '(':
 991         case ')':
 992         case '{':
 993         case '}':
 994         case '~':
 995         case ';':
 996         case ',':
 997         case '\\':
 998                 pp_token.type = c;
 999                 next_char();
1000                 return;
1001
1002         case EOF:
1003                 pp_token.type = TP_EOF;
1004                 return;
1005
1006         default:
1007                 next_char();
1008                 errorf(&pp_token.source_position, "unknown character '%c' found\n", c);
1009                 pp_token.type = TP_ERROR;
1010                 return;
1011         }
1012 }
1013
1014 static void print_quoted_string(const char *const string)
1015 {
1016         fputc('"', out);
1017         for (const char *c = string; *c != 0; ++c) {
1018                 switch(*c) {
1019                 case '"': fputs("\\\"", out); break;
1020                 case '\\':  fputs("\\\\", out); break;
1021                 case '\a':  fputs("\\a", out); break;
1022                 case '\b':  fputs("\\b", out); break;
1023                 case '\f':  fputs("\\f", out); break;
1024                 case '\n':  fputs("\\n", out); break;
1025                 case '\r':  fputs("\\r", out); break;
1026                 case '\t':  fputs("\\t", out); break;
1027                 case '\v':  fputs("\\v", out); break;
1028                 case '\?':  fputs("\\?", out); break;
1029                 default:
1030                         if(!isprint(*c)) {
1031                                 fprintf(out, "\\%03o", *c);
1032                                 break;
1033                         }
1034                         fputc(*c, out);
1035                         break;
1036                 }
1037         }
1038         fputc('"', out);
1039 }
1040
1041 static void print_line_directive(const source_position_t *pos)
1042 {
1043         fprintf(out, "# %d ", pos->linenr);
1044         print_quoted_string(pos->input_name);
1045         fputc('\n', out);
1046
1047         printed_input_name = pos->input_name;
1048 }
1049
1050 static bool had_non_space = false;
1051
1052 static void emit_pp_token(void)
1053 {
1054         if (printed_input_name != pp_token.source_position.input_name) {
1055                 print_line_directive(&pp_token.source_position);
1056         } else if (pp_token.type != '\n') {
1057                 if (counted_newlines >= 9) {
1058                         if (had_non_space) {
1059                                 fputc('\n', out);
1060                         }
1061                         print_line_directive(&pp_token.source_position);
1062                         counted_newlines = 0;
1063                 } else {
1064                         for (unsigned i = 0; i < counted_newlines; ++i)
1065                                 fputc('\n', out);
1066                         counted_newlines = 0;
1067                 }
1068                 for (unsigned i = 0; i < counted_spaces; ++i)
1069                         fputc(' ', out);
1070                 counted_spaces = 0;
1071                 had_non_space  = true;
1072         }
1073
1074         switch(pp_token.type) {
1075         case TP_IDENTIFIER:
1076                 fputs(pp_token.v.symbol->string, out);
1077                 break;
1078         case TP_NUMBER:
1079                 fputs(pp_token.v.string.begin, out);
1080                 break;
1081         case TP_STRING_LITERAL:
1082                 fputc('"', out);
1083                 fputs(pp_token.v.string.begin, out);
1084                 fputc('"', out);
1085                 break;
1086         case '\n':
1087                 break;
1088         default:
1089                 print_pp_token_type(out, pp_token.type);
1090                 break;
1091         }
1092 }
1093
1094 static void eat_pp(preprocessor_token_type_t type)
1095 {
1096         (void) type;
1097         assert(pp_token.type == type);
1098         next_preprocessing_token();
1099 }
1100
1101 static void eat_pp_directive(void)
1102 {
1103         while(pp_token.type != '\n' && pp_token.type != TP_EOF) {
1104                 next_preprocessing_token();
1105         }
1106 }
1107
1108 static bool strings_equal(const string_t *string1, const string_t *string2)
1109 {
1110         size_t size = string1->size;
1111         if(size != string2->size)
1112                 return false;
1113
1114         const char *c1 = string1->begin;
1115         const char *c2 = string2->begin;
1116         for(size_t i = 0; i < size; ++i, ++c1, ++c2) {
1117                 if(*c1 != *c2)
1118                         return false;
1119         }
1120         return true;
1121 }
1122
1123 static bool wide_strings_equal(const wide_string_t *string1,
1124                                const wide_string_t *string2)
1125 {
1126         size_t size = string1->size;
1127         if(size != string2->size)
1128                 return false;
1129
1130         const wchar_rep_t *c1 = string1->begin;
1131         const wchar_rep_t *c2 = string2->begin;
1132         for(size_t i = 0; i < size; ++i, ++c1, ++c2) {
1133                 if(*c1 != *c2)
1134                         return false;
1135         }
1136         return true;
1137 }
1138
1139 static bool pp_tokens_equal(const token_t *token1, const token_t *token2)
1140 {
1141         if(token1->type != token2->type)
1142                 return false;
1143
1144         switch(token1->type) {
1145         case TP_HEADERNAME:
1146                 /* TODO */
1147                 return false;
1148         case TP_IDENTIFIER:
1149                 return token1->v.symbol == token2->v.symbol;
1150         case TP_NUMBER:
1151         case TP_CHARACTER_CONSTANT:
1152         case TP_STRING_LITERAL:
1153                 return strings_equal(&token1->v.string, &token2->v.string);
1154
1155         case TP_WIDE_CHARACTER_CONSTANT:
1156         case TP_WIDE_STRING_LITERAL:
1157                 return wide_strings_equal(&token1->v.wide_string,
1158                                           &token2->v.wide_string);
1159         default:
1160                 return true;
1161         }
1162 }
1163
1164 static bool pp_definitions_equal(const pp_definition_t *definition1,
1165                                  const pp_definition_t *definition2)
1166 {
1167         if(definition1->list_len != definition2->list_len)
1168                 return false;
1169
1170         size_t         len = definition1->list_len;
1171         const token_t *t1  = definition1->replacement_list;
1172         const token_t *t2  = definition2->replacement_list;
1173         for(size_t i = 0; i < len; ++i, ++t1, ++t2) {
1174                 if(!pp_tokens_equal(t1, t2))
1175                         return false;
1176         }
1177         return true;
1178 }
1179
1180 static void parse_define_directive(void)
1181 {
1182         eat_pp(TP_define);
1183
1184         if(pp_token.type != TP_IDENTIFIER) {
1185                 errorf(&pp_token.source_position,
1186                        "expected identifier after #define, got '%T'", &pp_token);
1187                 eat_pp_directive();
1188                 return;
1189         }
1190         symbol_t *symbol = pp_token.v.symbol;
1191
1192         pp_definition_t *new_definition
1193                 = obstack_alloc(&pp_obstack, sizeof(new_definition[0]));
1194         memset(new_definition, 0, sizeof(new_definition[0]));
1195         new_definition->source_position = input_position;
1196
1197         /* this is probably the only place where spaces are significant in the
1198          * lexer (except for the fact that they separate tokens). #define b(x)
1199          * is something else than #define b (x) */
1200         //token_t *arguments = NULL;
1201         if(c == '(') {
1202                 next_preprocessing_token();
1203                 while(pp_token.type != ')') {
1204                         if(pp_token.type == TP_DOTDOTDOT) {
1205                                 new_definition->is_variadic = true;
1206                                 next_preprocessing_token();
1207                                 if(pp_token.type != ')') {
1208                                         errorf(&input_position,
1209                                                         "'...' not at end of macro argument list");
1210                                         continue;
1211                                 }
1212                         } else if(pp_token.type != TP_IDENTIFIER) {
1213                                 next_preprocessing_token();
1214                         }
1215                 }
1216         } else {
1217                 next_preprocessing_token();
1218         }
1219
1220         /* construct a new pp_definition on the obstack */
1221         assert(obstack_object_size(&pp_obstack) == 0);
1222         size_t list_len = 0;
1223         while(pp_token.type != '\n' && pp_token.type != TP_EOF) {
1224                 obstack_grow(&pp_obstack, &pp_token, sizeof(pp_token));
1225                 ++list_len;
1226                 next_preprocessing_token();
1227         }
1228
1229         new_definition->list_len         = list_len;
1230         new_definition->replacement_list = obstack_finish(&pp_obstack);
1231
1232         pp_definition_t *old_definition = symbol->pp_definition;
1233         if(old_definition != NULL) {
1234                 if(!pp_definitions_equal(old_definition, new_definition)) {
1235                         warningf(&input_position, "multiple definition of macro '%Y' (first defined %P)",
1236                                  symbol, &old_definition->source_position);
1237                 } else {
1238                         /* reuse the old definition */
1239                         obstack_free(&pp_obstack, new_definition);
1240                         new_definition = old_definition;
1241                 }
1242         }
1243
1244         symbol->pp_definition = new_definition;
1245 }
1246
1247 static void parse_undef_directive(void)
1248 {
1249         eat_pp(TP_undef);
1250
1251         if(pp_token.type != TP_IDENTIFIER) {
1252                 errorf(&input_position,
1253                        "expected identifier after #undef, got '%T'", &pp_token);
1254                 eat_pp_directive();
1255                 return;
1256         }
1257
1258         symbol_t *symbol = pp_token.v.symbol;
1259         symbol->pp_definition = NULL;
1260         next_preprocessing_token();
1261
1262         if(pp_token.type != '\n') {
1263                 warningf(&input_position, "extra tokens at end of #undef directive");
1264         }
1265         /* eat until '\n' */
1266         eat_pp_directive();
1267 }
1268
1269 static void parse_preprocessing_directive(void)
1270 {
1271         print_spaces  = false;
1272         do_expansions = false;
1273         eat_pp('#');
1274
1275         switch(pp_token.type) {
1276         case TP_define:
1277                 parse_define_directive();
1278                 break;
1279         case TP_undef:
1280                 parse_undef_directive();
1281                 break;
1282         default:
1283                 errorf(&pp_token.source_position,
1284                        "invalid preprocessing directive #%T", &pp_token);
1285                 eat_pp_directive();
1286                 break;
1287         }
1288
1289         print_spaces  = true;
1290         do_expansions = true;
1291
1292         /* eat '\n' */
1293         assert(pp_token.type == '\n' || pp_token.type == TP_EOF);
1294         next_preprocessing_token();
1295 }
1296
1297 int pptest_main(int argc, char **argv);
1298
1299 #define GCC_COMPAT_MODE
1300
1301 int pptest_main(int argc, char **argv)
1302 {
1303         init_symbol_table();
1304         init_tokens();
1305
1306         obstack_init(&pp_obstack);
1307
1308         const char *infname = "t.c";
1309         if (argc > 1)
1310                 infname = argv[1];
1311
1312         input = fopen(infname, "r");
1313         assert(input != NULL);
1314         input_position.input_name = infname;
1315         input_position.linenr     = 1;
1316
1317         bufpos       = NULL;
1318         bufend       = NULL;
1319         counted_newlines = 0;
1320         counted_spaces   = 0;
1321
1322         out = stdout;
1323
1324 #ifdef GCC_COMPAT_MODE
1325         /* this is here so we can directly compare "gcc -E" output and our output */
1326         fprintf(out, "# 1 \"%s\"\n", input_position.input_name);
1327         fputs("# 1 \"<built-in>\"\n", out);
1328         fputs("# 1 \"<command-line>\"\n", out);
1329 #endif
1330
1331         next_char();
1332
1333         next_preprocessing_token();
1334
1335         while(true) {
1336                 /* we're at a line begin */
1337                 if(pp_token.type == '#') {
1338                         parse_preprocessing_directive();
1339                 } else {
1340                         /* parse+emit a line */
1341                         while(pp_token.type != '\n') {
1342                                 if(pp_token.type == TP_EOF)
1343                                         goto end_of_main_loop;
1344                                 emit_pp_token();
1345                                 next_preprocessing_token();
1346                         }
1347                         emit_pp_token();
1348                         next_preprocessing_token();
1349                 }
1350         }
1351 end_of_main_loop:
1352
1353         if (counted_newlines > 0) {
1354                 fputc('\n', out);
1355         }
1356
1357         obstack_free(&pp_obstack, NULL);
1358
1359         exit_tokens();
1360         exit_symbol_table();
1361
1362         return 0;
1363 }