nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7 #include "adt/strset.h"
   8
   9 #include <assert.h>
  10 #include <errno.h>
  11 #include <string.h>
  12 #include <ctype.h>
  13
/* define to make next_char()/put_back() print every character to stdout */
//#define DEBUG_CHARS
/* bytes reserved in front of the read area of buf so that put_back() can
 * step bufpos backwards after a refill */
#define MAX_PUTBACK 3

/* current input character; next_char() sets it to EOF when fread() returns
 * no more data */
static int         c;
/* token most recently produced by the lexer (non-static, visible to other
 * translation units) */
token_t            lexer_token;
/* stream that next_char() reads from */
static FILE       *input;
/* read buffer; fread() fills it starting at buf + MAX_PUTBACK */
static char        buf[1024 + MAX_PUTBACK];
/* one past the last valid byte of the most recent read */
static const char *bufend;
/* position of the current character inside buf */
static const char *bufpos;
/* set the finished string literals are inserted into (see strset_insert()
 * calls below) */
static strset_t    stringset;
//static FILE      **input_stack;
//static char      **buf_stack;
  26
  27 static
  28 void error_prefix_at(const char *input_name, unsigned linenr)
  29 {
  30         fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
  31 }
  32
  33 static
  34 void error_prefix(void)
  35 {
  36         error_prefix_at(lexer_token.source_position.input_name,
  37                         lexer_token.source_position.linenr);
  38 }
  39
  40 static
  41 void parse_error(const char *msg)
  42 {
  43         error_prefix();
  44         fprintf(stderr, "%s\n", msg);
  45 }
  46
  47 static inline
  48 void next_char(void)
  49 {
  50         bufpos++;
  51         if(bufpos >= bufend) {
  52                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  53                                  input);
  54                 if(s == 0) {
  55                         c = EOF;
  56                         return;
  57                 }
  58                 bufpos = buf + MAX_PUTBACK;
  59                 bufend = buf + MAX_PUTBACK + s;
  60         }
  61         c = *(bufpos);
  62 #ifdef DEBUG_CHARS
  63         printf("nchar '%c'\n", c);
  64 #endif
  65 }
  66
  67 static inline
  68 void put_back(int pc)
  69 {
  70         char *p = (char*) bufpos - 1;
  71         bufpos--;
  72         assert(p >= buf);
  73         *p = pc;
  74
  75 #ifdef DEBUG_CHARS
  76         printf("putback '%c'\n", pc);
  77 #endif
  78 }
  79
  80
  81 static
  82 int replace_trigraph(void)
  83 {
  84 #define MATCH_TRIGRAPH(ch,replacement)           \
  85         case ch:                                     \
  86                 c = replacement;                         \
  87                 return 1;
  88
  89         switch(c) {
  90         MATCH_TRIGRAPH('=', '#')
  91         MATCH_TRIGRAPH('(', '[')
  92         MATCH_TRIGRAPH('/', '\\')
  93         MATCH_TRIGRAPH(')', ']')
  94         MATCH_TRIGRAPH('\'', '^')
  95         MATCH_TRIGRAPH('<', '{')
  96         MATCH_TRIGRAPH('!', '|')
  97         MATCH_TRIGRAPH('>', '}')
  98         MATCH_TRIGRAPH('-', '~')
  99         default:
 100                 break;
 101         }
 102
 103         return 0;
 104 }
 105
/*
 * Switch case for '?' that deals with a possible trigraph ("??x").
 *
 * Expands to a `case '?':` label, so it has to be used directly inside a
 * switch(c).  Control flow of the expansion:
 *  - '?' not followed by a second '?': the lookahead character is put back,
 *    c is reset to '?' and no_trigraph_code runs.
 *  - "??" followed by a character replace_trigraph() accepts: c now holds
 *    the replacement and the switch is left with break.
 *  - "??" followed by anything else: both lookahead characters are put back,
 *    c is reset to '?' and no_trigraph_code runs.
 *
 * custom_putback is expanded in front of each put_back sequence; every use
 * visible in this file passes it empty.  no_trigraph_code has to leave the
 * case on its own (break/return): in the first branch execution would
 * otherwise continue with the second next_char(), and after the last line it
 * would fall into whatever label follows the macro.
 */
#define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
	case '?':                                  \
		next_char();                           \
		if(c != '?') {                         \
			custom_putback;                    \
			put_back(c);                       \
			c = '?';                           \
			no_trigraph_code;                  \
		}                                      \
		next_char();                           \
		if(replace_trigraph()) {               \
			break;                             \
		}                                      \
		custom_putback;                        \
		put_back('?');                         \
		put_back(c);                           \
		c = '?';                               \
		no_trigraph_code;
 124
/*
 * Consumes one line ending if c currently is one.
 *
 * "\r", "\r\n" and "\n" each count as a single line ending: the characters
 * are skipped, the line number in lexer_token.source_position is incremented
 * and newline_code is executed.  If c is neither '\r' nor '\n' nothing
 * happens.  The expansion is a bare if / else-if chain without trailing
 * else, so it is meant to be used as a statement of its own.
 */
#define EAT_NEWLINE(newline_code)              \
	if(c == '\r') {                            \
		next_char();                           \
		if(c == '\n')                          \
			next_char();                       \
		lexer_token.source_position.linenr++;  \
		newline_code;                          \
	} else if(c == '\n') {                     \
		next_char();                           \
		lexer_token.source_position.linenr++;  \
		newline_code;                          \
	}
 137
 138 #define SYMBOL_CHARS  \
 139         case 'a':         \
 140         case 'b':         \
 141         case 'c':         \
 142         case 'd':         \
 143         case 'e':         \
 144         case 'f':         \
 145         case 'g':         \
 146         case 'h':         \
 147         case 'i':         \
 148         case 'j':         \
 149         case 'k':         \
 150         case 'l':         \
 151         case 'm':         \
 152         case 'n':         \
 153         case 'o':         \
 154         case 'p':         \
 155         case 'q':         \
 156         case 'r':         \
 157         case 's':         \
 158         case 't':         \
 159         case 'u':         \
 160         case 'v':         \
 161         case 'w':         \
 162         case 'x':         \
 163         case 'y':         \
 164         case 'z':         \
 165         case 'A':         \
 166         case 'B':         \
 167         case 'C':         \
 168         case 'D':         \
 169         case 'E':         \
 170         case 'F':         \
 171         case 'G':         \
 172         case 'H':         \
 173         case 'I':         \
 174         case 'J':         \
 175         case 'K':         \
 176         case 'L':         \
 177         case 'M':         \
 178         case 'N':         \
 179         case 'O':         \
 180         case 'P':         \
 181         case 'Q':         \
 182         case 'R':         \
 183         case 'S':         \
 184         case 'T':         \
 185         case 'U':         \
 186         case 'V':         \
 187         case 'W':         \
 188         case 'X':         \
 189         case 'Y':         \
 190         case 'Z':         \
 191         case '_':
 192
 193 #define DIGITS        \
 194         case '0':         \
 195         case '1':         \
 196         case '2':         \
 197         case '3':         \
 198         case '4':         \
 199         case '5':         \
 200         case '6':         \
 201         case '7':         \
 202         case '8':         \
 203         case '9':
 204
 205 static
 206 void parse_symbol(void)
 207 {
 208         symbol_t *symbol;
 209         char     *string;
 210
 211         obstack_1grow(&symbol_obstack, c);
 212         next_char();
 213
 214         while(1) {
 215                 switch(c) {
 216                 case '\\':
 217                         next_char();
 218                         EAT_NEWLINE(break;)
 219                         goto end_symbol;
 220
 221                 DIGITS
 222                 SYMBOL_CHARS
 223                         obstack_1grow(&symbol_obstack, c);
 224                         next_char();
 225                         break;
 226
 227                 case '?':
 228                         next_char();
 229                         if(c != '?') {
 230                                 put_back(c);
 231                                 c = '?';
 232                                 goto end_symbol;
 233                         }
 234                         next_char();
 235                         if(replace_trigraph())
 236                                 break;
 237                         put_back('?');
 238                         put_back(c);
 239                         c = '?';
 240                         goto end_symbol;
 241
 242                 default:
 243                         goto end_symbol;
 244                 }
 245         }
 246 end_symbol:
 247         obstack_1grow(&symbol_obstack, '\0');
 248
 249         string = obstack_finish(&symbol_obstack);
 250         symbol = symbol_table_insert(string);
 251
 252         lexer_token.type     = symbol->ID;
 253         lexer_token.v.symbol = symbol;
 254
 255         if(symbol->string != string) {
 256                 obstack_free(&symbol_obstack, string);
 257         }
 258 }
 259
 260 static
 261 void parse_number_hex(void)
 262 {
 263         assert(c == 'x' || c == 'X');
 264         next_char();
 265
 266         if (!isdigit(c) &&
 267                 !('A' <= c && c <= 'F') &&
 268                 !('a' <= c && c <= 'f')) {
 269                 parse_error("premature end of hex number literal");
 270                 lexer_token.type = T_ERROR;
 271                 return;
 272         }
 273
 274         int value = 0;
 275         while(1) {
 276                 if (isdigit(c)) {
 277                         value = 16 * value + c - '0';
 278                 } else if ('A' <= c && c <= 'F') {
 279                         value = 16 * value + c - 'A' + 10;
 280                 } else if ('a' <= c && c <= 'f') {
 281                         value = 16 * value + c - 'a' + 10;
 282                 } else {
 283                         lexer_token.type     = T_INTEGER;
 284                         lexer_token.v.intvalue = value;
 285                         return;
 286                 }
 287                 next_char();
 288         }
 289 }
 290
 291 static
 292 void parse_number_oct(void)
 293 {
 294         assert(c == 'o' || c == 'O');
 295         next_char();
 296
 297         int value = 0;
 298         while(1) {
 299                 if ('0' <= c && c <= '7') {
 300                         value = 8 * value + c - '0';
 301                 } else {
 302                         lexer_token.type       = T_INTEGER;
 303                         lexer_token.v.intvalue = value;
 304                         return;
 305                 }
 306                 next_char();
 307         }
 308 }
 309
 310 static
 311 void parse_number_dec(int first_char)
 312 {
 313         int value = 0;
 314         if(first_char > 0) {
 315                 assert(first_char >= '0' && first_char <= '9');
 316                 value = first_char - '0';
 317         }
 318
 319         for(;;) {
 320                 if (isdigit(c)) {
 321                         value = 10 * value + c - '0';
 322                 } else {
 323                         lexer_token.type       = T_INTEGER;
 324                         lexer_token.v.intvalue = value;
 325                         return;
 326                 }
 327                 next_char();
 328         }
 329 }
 330
 331 static
 332 void parse_number(void)
 333 {
 334         // TODO check for overflow
 335         // TODO check for various invalid inputs sequences
 336
 337         if (c == '0') {
 338                 next_char();
 339                 switch (c) {
 340                         case 'X':
 341                         case 'x': parse_number_hex(); break;
 342                         case 'o':
 343                         case 'O': parse_number_oct(); break;
 344                         default:  parse_number_dec('0');
 345                 }
 346         } else {
 347                 parse_number_dec(0);
 348         }
 349         if(c == 'U' || c == 'U') {
 350                 /* TODO do something with the suffixes... */
 351                 next_char();
 352                 if(c == 'L' || c == 'l') {
 353                         next_char();
 354                         if(c == 'L' || c == 'l') {
 355                                 next_char();
 356                         }
 357                 }
 358         } else if(c == 'l' || c == 'L') {
 359                 next_char();
 360                 if(c == 'l' || c == 'L') {
 361                         next_char();
 362                         if(c == 'u' || c == 'U') {
 363                                 next_char();
 364                         }
 365                 } else if(c == 'u' || c == 'U') {
 366                         next_char();
 367                 }
 368         }
 369 }
 370
 371 static int parse_octal_sequence(void)
 372 {
 373         int value = 0;
 374         while(1) {
 375                 if(c < '0' || c > '7')
 376                         break;
 377                 value = 8 * value + c - '0';
 378                 next_char();
 379         }
 380
 381         return value;
 382 }
 383
 384 static int parse_hex_sequence(void)
 385 {
 386         int value = 0;
 387         while(1) {
 388                 if (c >= '0' && c <= '9') {
 389                         value = 16 * value + c - '0';
 390                 } else if ('A' <= c && c <= 'F') {
 391                         value = 16 * value + c - 'A' + 10;
 392                 } else if ('a' <= c && c <= 'f') {
 393                         value = 16 * value + c - 'a' + 10;
 394                 } else {
 395                         break;
 396                 }
 397                 next_char();
 398         }
 399
 400         return value;
 401 }
 402
 403 static int parse_escape_sequence(void)
 404 {
 405         while(1) {
 406                 int ec = c;
 407                 next_char();
 408
 409                 switch(ec) {
 410                 case '"': return '"';
 411                 case '\'': return'\'';
 412                 case '\\':
 413                         EAT_NEWLINE(break;)
 414                         return '\\';
 415                 case 'a': return '\a';
 416                 case 'b': return '\b';
 417                 case 'f': return '\f';
 418                 case 'n': return '\n';
 419                 case 'r': return '\r';
 420                 case 't': return '\t';
 421                 case 'v': return '\v';
 422                 case 'x':
 423                         return parse_hex_sequence();
 424                 case '0':
 425                 case '1':
 426                 case '2':
 427                 case '3':
 428                 case '4':
 429                 case '5':
 430                 case '6':
 431                 case '7':
 432                         return parse_octal_sequence();
 433                 case '?':
 434                         if(c != '?') {
 435                                 return '?';
 436                         }
 437                         /* might be a trigraph */
 438                         next_char();
 439                         if(replace_trigraph()) {
 440                                 break;
 441                         }
 442                         put_back(c);
 443                         c = '?';
 444                         return '?';
 445
 446                 case EOF:
 447                         parse_error("reached end of file while parsing escape sequence");
 448                         return EOF;
 449                 default:
 450                         parse_error("unknown escape sequence");
 451                         return EOF;
 452                 }
 453         }
 454 }
 455
 456 const char *concat_strings(const char *s1, const char *s2)
 457 {
 458         size_t  len1   = strlen(s1);
 459         size_t  len2   = strlen(s2);
 460
 461         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 462         memcpy(concat, s1, len1);
 463         memcpy(concat + len1, s2, len2 + 1);
 464
 465         const char *result = strset_insert(&stringset, concat);
 466         if(result != concat) {
 467                 obstack_free(&symbol_obstack, concat);
 468         }
 469
 470         return result;
 471 }
 472
 473 static
 474 void parse_string_literal(void)
 475 {
 476         unsigned    start_linenr = lexer_token.source_position.linenr;
 477         char       *string;
 478         const char *result;
 479
 480         assert(c == '"');
 481         next_char();
 482
 483         while(1) {
 484                 switch(c) {
 485                 SKIP_TRIGRAPHS(,
 486                         obstack_1grow(&symbol_obstack, '?');
 487                         next_char();
 488                         break;
 489                 )
 490
 491                 case '\\':
 492                         next_char();
 493                         EAT_NEWLINE(break;)
 494                         int ec = parse_escape_sequence();
 495                         obstack_1grow(&symbol_obstack, ec);
 496                         break;
 497
 498                 case EOF:
 499                         error_prefix_at(lexer_token.source_position.input_name,
 500                                         start_linenr);
 501                         fprintf(stderr, "string has no end\n");
 502                         lexer_token.type = T_ERROR;
 503                         return;
 504
 505                 case '"':
 506                         next_char();
 507                         goto end_of_string;
 508
 509                 default:
 510                         obstack_1grow(&symbol_obstack, c);
 511                         next_char();
 512                         break;
 513                 }
 514         }
 515
 516 end_of_string:
 517
 518         /* TODO: concatenate multiple strings separated by whitespace... */
 519
 520         /* add finishing 0 to the string */
 521         obstack_1grow(&symbol_obstack, '\0');
 522         string = obstack_finish(&symbol_obstack);
 523
 524         /* check if there is already a copy of the string */
 525         result = strset_insert(&stringset, string);
 526         if(result != string) {
 527                 obstack_free(&symbol_obstack, string);
 528         }
 529
 530         lexer_token.type     = T_STRING_LITERAL;
 531         lexer_token.v.string = result;
 532 }
 533
/*
 * Switch cases for line endings.
 *
 * Expands to `case '\r':` and `case '\n':` labels, so it has to be used
 * directly inside a switch(c).  "\r", "\r\n" and "\n" are each consumed as
 * one line ending, the line number in lexer_token.source_position is
 * incremented and `code` runs.  `code` is expanded once per label and has
 * to leave the case itself (break/return): otherwise the '\r' case falls
 * through into the '\n' case, and the '\n' case into whatever label follows
 * the macro.
 */
#define MATCH_NEWLINE(code)                   \
	case '\r':                                \
		next_char();                          \
		if(c == '\n') {                       \
			next_char();                      \
		}                                     \
		lexer_token.source_position.linenr++; \
		code;                                 \
	case '\n':                                \
		next_char();                          \
		lexer_token.source_position.linenr++; \
		code;
 546
 547 static
 548 void parse_character_constant(void)
 549 {
 550         assert(c == '\'');
 551         next_char();
 552
 553         int found_char = 0;
 554         while(1) {
 555                 switch(c) {
 556                 SKIP_TRIGRAPHS(,
 557                         next_char();
 558                         found_char = '?';
 559                         break;
 560                 )
 561
 562                 case '\\':
 563                         next_char();
 564                         EAT_NEWLINE(break;)
 565                         found_char = parse_escape_sequence();
 566                         break;
 567
 568                 MATCH_NEWLINE(
 569                         parse_error("newline while parsing character constant");
 570                         break;
 571                 )
 572
 573                 case '\'':
 574                         next_char();
 575                         goto end_of_char_constant;
 576
 577                 case EOF:
 578                         parse_error("EOF while parsing character constant");
 579                         lexer_token.type = T_ERROR;
 580                         return;
 581
 582                 default:
 583                         if(found_char != 0) {
 584                                 parse_error("more than 1 characters in character "
 585                                             "constant");
 586                                 goto end_of_char_constant;
 587                         } else {
 588                                 found_char = c;
 589                                 next_char();
 590                         }
 591                         break;
 592                 }
 593         }
 594
 595 end_of_char_constant:
 596         lexer_token.type       = T_INTEGER;
 597         lexer_token.v.intvalue = found_char;
 598 }
 599
 600 static
 601 void skip_multiline_comment(void)
 602 {
 603         unsigned start_linenr = lexer_token.source_position.linenr;
 604         int had_star = 0;
 605
 606         while(1) {
 607                 switch(c) {
 608                 case '*':
 609                         next_char();
 610                         had_star = 1;
 611                         break;
 612
 613                 case '/':
 614                         next_char();
 615                         if(had_star) {
 616                                 return;
 617                         }
 618                         had_star = 0;
 619                         break;
 620
 621                 case '\\':
 622                         next_char();
 623                         EAT_NEWLINE(break;)
 624                         had_star = 0;
 625                         break;
 626
 627                 case '?':
 628                         next_char();
 629                         if(c != '?') {
 630                                 had_star = 0;
 631                                 break;
 632                         }
 633                         next_char();
 634                         if(replace_trigraph())
 635                                 break;
 636                         put_back(c);
 637                         c = '?';
 638                         had_star = 0;
 639                         /* we don't put back the 2nd ? as the comment text is discarded
 640                          * anyway */
 641                         break;
 642
 643                 MATCH_NEWLINE(had_star = 0; break;)
 644
 645                 case EOF:
 646                         error_prefix_at(lexer_token.source_position.input_name,
 647                                         start_linenr);
 648                         fprintf(stderr, "at end of file while looking for comment end\n");
 649                         return;
 650                 default:
 651                         had_star = 0;
 652                         next_char();
 653                         break;
 654                 }
 655         }
 656 }
 657
 658 static
 659 void skip_line_comment(void)
 660 {
 661         while(1) {
 662                 switch(c) {
 663                 case '?':
 664                         next_char();
 665                         if(c != '?')
 666                                 break;
 667                         next_char();
 668                         if(replace_trigraph())
 669                                 break;
 670                         put_back('?');
 671                         /* we don't put back the 2nd ? as the comment text is discarded
 672                          * anyway */
 673                         break;
 674
 675                 case '\\':
 676                         next_char();
 677                         if(c == '\n') {
 678                                 next_char();
 679                                 lexer_token.source_position.linenr++;
 680                         }
 681                         break;
 682
 683                 case EOF:
 684                 case '\r':
 685                 case '\n':
 686                         return;
 687
 688                 default:
 689                         next_char();
 690                         break;
 691                 }
 692         }
 693 }
 694
 695 static token_t pp_token;
 696
 697 static inline
 698 void next_pp_token(void)
 699 {
 700         lexer_next_preprocessing_token();
 701         pp_token = lexer_token;
 702 }
 703
 704 static
 705 void eat_until_newline(void)
 706 {
 707         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 708                 next_pp_token();
 709         }
 710 }
 711
 712 static
 713 void error_directive(void)
 714 {
 715         error_prefix();
 716         fprintf(stderr, "#error directive: \n");
 717
 718         /* parse pp-tokens until new-line */
 719 }
 720
 721 static
 722 void define_directive(void)
 723 {
 724         lexer_next_preprocessing_token();
 725         if(lexer_token.type != T_IDENTIFIER) {
 726                 parse_error("expected identifier after #define\n");
 727                 eat_until_newline();
 728         }
 729 }
 730
 731 static
 732 void ifdef_directive(int is_ifndef)
 733 {
 734         (void) is_ifndef;
 735         lexer_next_preprocessing_token();
 736         //expect_identifier();
 737         //extect_newline();
 738 }
 739
 740 static
 741 void endif_directive(void)
 742 {
 743         //expect_newline();
 744 }
 745
 746 static
 747 void parse_line_directive(void)
 748 {
 749         if(pp_token.type != T_INTEGER) {
 750                 parse_error("expected integer");
 751         } else {
 752                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
 753                 next_pp_token();
 754         }
 755         if(pp_token.type == T_STRING_LITERAL) {
 756                 lexer_token.source_position.input_name = pp_token.v.string;
 757                 next_pp_token();
 758         }
 759
 760         eat_until_newline();
 761 }
 762
 763 static
 764 void parse_preprocessor_identifier(void)
 765 {
 766         assert(pp_token.type == T_IDENTIFIER);
 767         symbol_t *symbol = pp_token.v.symbol;
 768
 769         switch(symbol->pp_ID) {
 770         case TP_include:
 771                 printf("include - enable header name parsing!\n");
 772                 break;
 773         case TP_define:
 774                 define_directive();
 775                 break;
 776         case TP_ifdef:
 777                 ifdef_directive(0);
 778                 break;
 779         case TP_ifndef:
 780                 ifdef_directive(1);
 781                 break;
 782         case TP_endif:
 783                 endif_directive();
 784                 break;
 785         case TP_line:
 786                 next_pp_token();
 787                 parse_line_directive();
 788                 break;
 789         case TP_if:
 790         case TP_else:
 791         case TP_elif:
 792         case TP_undef:
 793         case TP_error:
 794                 error_directive();
 795                 break;
 796         case TP_pragma:
 797                 break;
 798         }
 799 }
 800
 801 static
 802 void parse_preprocessor_directive()
 803 {
 804         next_pp_token();
 805
 806         switch(pp_token.type) {
 807         case T_IDENTIFIER:
 808                 parse_preprocessor_identifier();
 809                 break;
 810         case T_INTEGER:
 811                 parse_line_directive();
 812                 break;
 813         default:
 814                 parse_error("invalid preprocessor directive");
 815                 eat_until_newline();
 816                 break;
 817         }
 818 }
 819
/*
 * Helper macros for lexing operators that consist of several characters.
 * They only work together: MAYBE_PROLOG opens a loop and a switch that the
 * matching ELSE / ELSE_CODE closes again.
 *
 * MAYBE_PROLOG consumes the operator character seen so far and opens
 * `while(1) { switch(c) {` for the next character.
 */
#define MAYBE_PROLOG                                       \
			next_char();                                   \
			while(1) {                                     \
				switch(c) {

/*
 * MAYBE(ch, set_type): if the next character is ch, consume it, set the
 * token type to set_type and return from the enclosing function.
 */
#define MAYBE(ch, set_type)                                \
				case ch:                                   \
					next_char();                           \
					lexer_token.type = set_type;           \
					return;

/*
 * ELSE_CODE(code): closes what MAYBE_PROLOG opened.  `code` runs when the
 * next character matched none of the preceding cases.  Before that, a
 * trigraph is replaced and a backslash followed by a line ending is
 * skipped; in both cases the switch is re-entered with the new character.
 * A backslash that is not followed by a line ending falls through to the
 * default case, so `code` then runs with c already past the backslash.
 * The expansion ends with `break;` for the case the construct was placed
 * in.
 */
#define ELSE_CODE(code)                                    \
				SKIP_TRIGRAPHS(,                           \
					code;                                  \
				)                                          \
														   \
				case '\\':                                 \
					next_char();                           \
					EAT_NEWLINE(break;)                    \
					/* fallthrough */                      \
				default:                                   \
					code;                                  \
				}                                          \
			} /* end of while(1) */                        \
			break;

/*
 * ELSE(set_type): ELSE_CODE variant that sets the token type to set_type
 * and returns from the enclosing function.
 */
#define ELSE(set_type)                                     \
		ELSE_CODE(                                         \
			lexer_token.type = set_type;                   \
			return;                                        \
		)
 851
 852 void lexer_next_preprocessing_token(void)
 853 {
 854         while(1) {
 855                 switch(c) {
 856                 case ' ':
 857                 case '\t':
 858                         next_char();
 859                         break;
 860
 861                 MATCH_NEWLINE(
 862                         lexer_token.type = '\n';
 863                         return;
 864                 )
 865
 866                 SYMBOL_CHARS
 867                         parse_symbol();
 868                         return;
 869
 870                 DIGITS
 871                         parse_number();
 872                         return;
 873
 874                 case '"':
 875                         parse_string_literal();
 876                         return;
 877
 878                 case '\'':
 879                         parse_character_constant();
 880                         return;
 881
 882                 case '\\':
 883                         next_char();
 884                         if(c == '\n') {
 885                                 next_char();
 886                                 lexer_token.source_position.linenr++;
 887                                 break;
 888                         } else {
 889                                 parse_error("unexpected '\\' found");
 890                                 lexer_token.type = T_ERROR;
 891                         }
 892                         return;
 893
 894                 case '.':
 895                         MAYBE_PROLOG
 896                                 case '.':
 897                                         MAYBE_PROLOG
 898                                         MAYBE('.', T_DOTDOTDOT)
 899                                         ELSE_CODE(
 900                                                 put_back(c);
 901                                                 c = '.';
 902                                                 lexer_token.type = '.';
 903                                                 return;
 904                                         )
 905                         ELSE('.')
 906                 case '&':
 907                         MAYBE_PROLOG
 908                         MAYBE('&', T_ANDAND)
 909                         MAYBE('=', T_ANDEQUAL)
 910                         ELSE('&')
 911                 case '*':
 912                         MAYBE_PROLOG
 913                         MAYBE('=', T_ASTERISKEQUAL)
 914                         ELSE('*')
 915                 case '+':
 916                         MAYBE_PROLOG
 917                         MAYBE('+', T_PLUSPLUS)
 918                         MAYBE('=', T_PLUSEQUAL)
 919                         ELSE('+')
 920                 case '-':
 921                         MAYBE_PROLOG
 922                         MAYBE('>', T_MINUSGREATER)
 923                         MAYBE('-', T_MINUSMINUS)
 924                         MAYBE('=', T_MINUSEQUAL)
 925                         ELSE('-')
 926                 case '!':
 927                         MAYBE_PROLOG
 928                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
 929                         ELSE('!')
 930                 case '/':
 931                         MAYBE_PROLOG
 932                         MAYBE('=', T_SLASHEQUAL)
 933                                 case '*':
 934                                         next_char();
 935                                         skip_multiline_comment();
 936                                         lexer_next_preprocessing_token();
 937                                         return;
 938                                 case '/':
 939                                         next_char();
 940                                         skip_line_comment();
 941                                         lexer_next_preprocessing_token();
 942                                         return;
 943                         ELSE('/')
 944                 case '%':
 945                         MAYBE_PROLOG
 946                         MAYBE('>', T_PERCENTGREATER)
 947                         MAYBE('=', T_PERCENTEQUAL)
 948                                 case ':':
 949                                         MAYBE_PROLOG
 950                                                 case '%':
 951                                                         MAYBE_PROLOG
 952                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
 953                                                         ELSE_CODE(
 954                                                                 put_back(c);
 955                                                                 c = '%';
 956                                                                 lexer_token.type = T_PERCENTCOLON;
 957                                                                 return;
 958                                                         )
 959                                         ELSE(T_PERCENTCOLON)
 960                         ELSE('%')
 961                 case '<':
 962                         MAYBE_PROLOG
 963                         MAYBE(':', T_LESSCOLON)
 964                         MAYBE('%', T_LESSPERCENT)
 965                         MAYBE('=', T_LESSEQUAL)
 966                                 case '<':
 967                                         MAYBE_PROLOG
 968                                         MAYBE('=', T_LESSLESSEQUAL)
 969                                         ELSE(T_LESSLESS)
 970                         ELSE('<')
 971                 case '>':
 972                         MAYBE_PROLOG
 973                         MAYBE('=', T_GREATEREQUAL)
 974                                 case '>':
 975                                         MAYBE_PROLOG
 976                                         MAYBE('=', T_GREATERGREATEREQUAL)
 977                                         ELSE(T_GREATERGREATER)
 978                         ELSE('>')
 979                 case '^':
 980                         MAYBE_PROLOG
 981                         MAYBE('=', T_CARETEQUAL)
 982                         ELSE('^')
 983                 case '|':
 984                         MAYBE_PROLOG
 985                         MAYBE('=', T_PIPEEQUAL)
 986                         MAYBE('|', T_PIPEPIPE)
 987                         ELSE('|')
 988                 case ':':
 989                         MAYBE_PROLOG
 990                         MAYBE('>', T_COLONGREATER)
 991                         ELSE(':')
 992                 case '=':
 993                         MAYBE_PROLOG
 994                         MAYBE('=', T_EQUALEQUAL)
 995                         ELSE('=')
 996                 case '#':
 997                         MAYBE_PROLOG
 998                         MAYBE('#', T_HASHHASH)
 999                         ELSE('#')
1000
1001                 case '?':
1002                         next_char();
1003                         /* just a simple ? */
1004                         if(c != '?') {
1005                                 lexer_token.type = '?';
1006                                 return;
1007                         }
1008                         /* might be a trigraph */
1009                         next_char();
1010                         if(replace_trigraph()) {
1011                                 break;
1012                         }
1013                         put_back(c);
1014                         c = '?';
1015                         lexer_token.type = '?';
1016                         return;
1017
1018                 case '[':
1019                 case ']':
1020                 case '(':
1021                 case ')':
1022                 case '{':
1023                 case '}':
1024                 case '~':
1025                 case ';':
1026                 case ',':
1027                         lexer_token.type = c;
1028                         next_char();
1029                         return;
1030
1031                 case EOF:
1032                         lexer_token.type = T_EOF;
1033                         return;
1034
1035                 default:
1036                         next_char();
1037                         error_prefix();
1038                         fprintf(stderr, "unknown character '%c' found\n", c);
1039                         lexer_token.type = T_ERROR;
1040                         return;
1041                 }
1042         }
1043 }
1044
1045 void lexer_next_token(void)
1046 {
1047         lexer_next_preprocessing_token();
1048         if(lexer_token.type != '\n')
1049                 return;
1050
1051 newline_found:
1052         do {
1053                 lexer_next_preprocessing_token();
1054         } while(lexer_token.type == '\n');
1055
1056         if(lexer_token.type == '#') {
1057                 parse_preprocessor_directive();
1058                 goto newline_found;
1059         }
1060 }
1061
1062 void init_lexer(void)
1063 {
1064         strset_init(&stringset);
1065 }
1066
1067 void lexer_open_stream(FILE *stream, const char *input_name)
1068 {
1069         input                                  = stream;
1070         lexer_token.source_position.linenr     = 0;
1071         lexer_token.source_position.input_name = input_name;
1072
1073         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
1074          * beginning of a line */
1075         c = '\n';
1076 }
1077
1078 void exit_lexer(void)
1079 {
1080         strset_destroy(&stringset);
1081 }
1082
1083 static __attribute__((unused))
1084 void dbg_pos(const source_position_t source_position)
1085 {
1086         fprintf(stdout, "%s:%d\n", source_position.input_name,
1087                 source_position.linenr);
1088         fflush(stdout);
1089 }