nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer_t.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7
   8 #include <assert.h>
   9 #include <errno.h>
  10 #include <string.h>
  11 #include <ctype.h>
  12
  13 //#define DEBUG_CHARS
/* Number of bytes kept free at the start of buf; next_char() always refills
 * the buffer starting at buf + MAX_PUTBACK, so put_back() has room to step
 * backwards right after a refill. */
#define MAX_PUTBACK 3

/* The character the lexer is currently looking at (EOF at end of input). */
static int               c;
/* Position used for diagnostics; linenr is advanced by the newline macros. */
source_position_t source_position;
/* Stream that next_char() reads from. */
static FILE             *input;
/* Read buffer: MAX_PUTBACK bytes of put-back room followed by the data that
 * fread() delivers (at most sizeof(buf) - MAX_PUTBACK bytes per refill). */
static char              buf[1027];
/* One past the last valid byte in buf. */
static const char       *bufend;
/* Position in buf of the character currently held in c. */
static const char       *bufpos;
/* Set into which parse_string_literal() inserts finished string literals. */
static strset_t          stringset;
//static FILE            **input_stack;
//static char            **buf_stack;
  25
  26 static
  27 void error_prefix_at(const char *input_name, unsigned linenr)
  28 {
  29         fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
  30 }
  31
  32 static
  33 void error_prefix()
  34 {
  35         error_prefix_at(source_position.input_name, source_position.linenr);
  36 }
  37
  38 static
  39 void parse_error(const char *msg)
  40 {
  41         error_prefix();
  42         fprintf(stderr, "%s\n", msg);
  43 }
  44
  45 static inline
  46 void next_char()
  47 {
  48         bufpos++;
  49         if(bufpos >= bufend) {
  50                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  51                                  input);
  52                 if(s == 0) {
  53                         c = EOF;
  54                         return;
  55                 }
  56                 bufpos = buf + MAX_PUTBACK;
  57                 bufend = buf + MAX_PUTBACK + s;
  58         }
  59         c = *(bufpos);
  60 #ifdef DEBUG_CHARS
  61         printf("nchar '%c'\n", c);
  62 #endif
  63 }
  64
  65 static inline
  66 void put_back(int pc)
  67 {
  68         char *p = (char*) bufpos - 1;
  69         bufpos--;
  70         assert(p >= buf);
  71         *p = pc;
  72
  73 #ifdef DEBUG_CHARS
  74         printf("putback '%c'\n", pc);
  75 #endif
  76 }
  77
  78
  79 static
  80 int replace_trigraph(void)
  81 {
  82 #define MATCH_TRIGRAPH(ch,replacement)           \
  83         case ch:                                     \
  84                 c = replacement;                         \
  85                 return 1;
  86
  87         switch(c) {
  88         MATCH_TRIGRAPH('=', '#')
  89         MATCH_TRIGRAPH('(', '[')
  90         MATCH_TRIGRAPH('/', '\\')
  91         MATCH_TRIGRAPH(')', ']')
  92         MATCH_TRIGRAPH('\'', '^')
  93         MATCH_TRIGRAPH('<', '{')
  94         MATCH_TRIGRAPH('!', '|')
  95         MATCH_TRIGRAPH('>', '}')
  96         MATCH_TRIGRAPH('-', '~')
  97         default:
  98                 break;
  99         }
 100
 101         return 0;
 102 }
 103
 104 #define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
 105         case '?':                                  \
 106                 next_char();                           \
 107                 if(c != '?') {                         \
 108                         custom_putback;                    \
 109                         put_back(c);                       \
 110                         c = '?';                           \
 111                         no_trigraph_code;                  \
 112                 }                                      \
 113                 next_char();                           \
 114                 if(replace_trigraph()) {               \
 115                         break;                             \
 116                 }                                      \
 117                 custom_putback;                        \
 118                 put_back('?');                         \
 119                 put_back(c);                           \
 120                 c = '?';                               \
 121                 no_trigraph_code;
 122
 123 #define EAT_NEWLINE(newline_code)              \
 124         if(c == '\r') {                            \
 125                 next_char();                           \
 126                 if(c == '\n')                          \
 127                         next_char();                       \
 128                 source_position.linenr++;              \
 129                 newline_code;                          \
 130         } else if(c == '\n') {                     \
 131                 next_char();                           \
 132                 source_position.linenr++;              \
 133                 newline_code;                          \
 134         }
 135
 136 #define SYMBOL_CHARS  \
 137         case 'a':         \
 138         case 'b':         \
 139         case 'c':         \
 140         case 'd':         \
 141         case 'e':         \
 142         case 'f':         \
 143         case 'g':         \
 144         case 'h':         \
 145         case 'i':         \
 146         case 'j':         \
 147         case 'k':         \
 148         case 'l':         \
 149         case 'm':         \
 150         case 'n':         \
 151         case 'o':         \
 152         case 'p':         \
 153         case 'q':         \
 154         case 'r':         \
 155         case 's':         \
 156         case 't':         \
 157         case 'u':         \
 158         case 'v':         \
 159         case 'w':         \
 160         case 'x':         \
 161         case 'y':         \
 162         case 'z':         \
 163         case 'A':         \
 164         case 'B':         \
 165         case 'C':         \
 166         case 'D':         \
 167         case 'E':         \
 168         case 'F':         \
 169         case 'G':         \
 170         case 'H':         \
 171         case 'I':         \
 172         case 'J':         \
 173         case 'K':         \
 174         case 'L':         \
 175         case 'M':         \
 176         case 'N':         \
 177         case 'O':         \
 178         case 'P':         \
 179         case 'Q':         \
 180         case 'R':         \
 181         case 'S':         \
 182         case 'T':         \
 183         case 'U':         \
 184         case 'V':         \
 185         case 'W':         \
 186         case 'X':         \
 187         case 'Y':         \
 188         case 'Z':         \
 189         case '_':
 190
 191 #define DIGITS        \
 192         case '0':         \
 193         case '1':         \
 194         case '2':         \
 195         case '3':         \
 196         case '4':         \
 197         case '5':         \
 198         case '6':         \
 199         case '7':         \
 200         case '8':         \
 201         case '9':
 202
 203 static
 204 void parse_symbol(token_t *token)
 205 {
 206         symbol_t *symbol;
 207         char     *string;
 208
 209         obstack_1grow(&symbol_obstack, c);
 210         next_char();
 211
 212         while(1) {
 213                 switch(c) {
 214                 case '\\':
 215                         next_char();
 216                         EAT_NEWLINE(break;)
 217                         goto end_symbol;
 218
 219                 DIGITS
 220                 SYMBOL_CHARS
 221                         obstack_1grow(&symbol_obstack, c);
 222                         next_char();
 223                         break;
 224
 225                 case '?':
 226                         next_char();
 227                         if(c != '?') {
 228                                 put_back(c);
 229                                 c = '?';
 230                                 goto end_symbol;
 231                         }
 232                         next_char();
 233                         if(replace_trigraph())
 234                                 break;
 235                         put_back('?');
 236                         put_back(c);
 237                         c = '?';
 238                         goto end_symbol;
 239
 240                 default:
 241                         goto end_symbol;
 242                 }
 243         }
 244 end_symbol:
 245         obstack_1grow(&symbol_obstack, '\0');
 246
 247         string = obstack_finish(&symbol_obstack);
 248         symbol = symbol_table_insert(string);
 249
 250         token->type     = symbol->ID;
 251         token->v.symbol = symbol;
 252
 253         if(symbol->string != string) {
 254                 obstack_free(&symbol_obstack, string);
 255         }
 256 }
 257
 258 static
 259 void parse_number_hex(token_t *token)
 260 {
 261         assert(c == 'x' || c == 'X');
 262         next_char();
 263
 264         if (!isdigit(c) &&
 265                 !('A' <= c && c <= 'F') &&
 266                 !('a' <= c && c <= 'f')) {
 267                 parse_error("premature end of hex number literal");
 268                 token->type = T_ERROR;
 269                 return;
 270         }
 271
 272         int value = 0;
 273         for(;;) {
 274                 if (isdigit(c)) {
 275                         value = 16 * value + c - '0';
 276                 } else if ('A' <= c && c <= 'F') {
 277                         value = 16 * value + c - 'A' + 10;
 278                 } else if ('a' <= c && c <= 'f') {
 279                         value = 16 * value + c - 'a' + 10;
 280                 } else {
 281                         token->type     = T_INTEGER;
 282                         token->v.intvalue = value;
 283                         return;
 284                 }
 285                 next_char();
 286         }
 287 }
 288
 289 static
 290 void parse_number_oct(token_t *token)
 291 {
 292         assert(c == 'o' || c == 'O');
 293         next_char();
 294
 295         int value = 0;
 296         for(;;) {
 297                 if ('0' <= c && c <= '7') {
 298                         value = 8 * value + c - '0';
 299                 } else {
 300                         token->type     = T_INTEGER;
 301                         token->v.intvalue = value;
 302                         return;
 303                 }
 304                 next_char();
 305         }
 306 }
 307
 308 static
 309 void parse_number_dec(token_t *token, int first_char)
 310 {
 311         int value = 0;
 312         if(first_char > 0) {
 313                 assert(first_char >= '0' && first_char <= '9');
 314                 value = first_char - '0';
 315         }
 316
 317         for(;;) {
 318                 if (isdigit(c)) {
 319                         value = 10 * value + c - '0';
 320                 } else {
 321                         token->type     = T_INTEGER;
 322                         token->v.intvalue = value;
 323                         return;
 324                 }
 325                 next_char();
 326         }
 327 }
 328
 329 static
 330 void parse_number(token_t *token)
 331 {
 332         // TODO check for overflow
 333         // TODO check for various invalid inputs sequences
 334
 335         if (c == '0') {
 336                 next_char();
 337                 switch (c) {
 338                         case 'X':
 339                         case 'x': parse_number_hex(token); break;
 340                         case 'o':
 341                         case 'O': parse_number_oct(token); break;
 342                         default:  parse_number_dec(token, '0');
 343                 }
 344         } else {
 345                 parse_number_dec(token, 0);
 346         }
 347 }
 348
 349 static
 350 int parse_escape_sequence()
 351 {
 352         while(1) {
 353                 int ec = c;
 354                 next_char();
 355
 356                 switch(ec) {
 357                 case '"': return '"';
 358                 case '\'': return'\'';
 359                 case '\\':
 360                         EAT_NEWLINE(break;)
 361                         return '\\';
 362                 case 'a': return '\a';
 363                 case 'b': return '\b';
 364                 case 'f': return '\f';
 365                 case 'n': return '\n';
 366                 case 'r': return '\r';
 367                 case 't': return '\t';
 368                 case 'v': return '\v';
 369                 case 'x': /* TODO parse hex number ... */
 370                         parse_error("hex escape sequences not implemented yet");
 371                         return EOF;
 372                 case '0':
 373                 case '1':
 374                 case '2':
 375                 case '3':
 376                 case '4':
 377                 case '5':
 378                 case '6':
 379                 case '7':
 380                         /* TODO parse octal number ... */
 381                         parse_error("octal escape sequences not implemented yet");
 382                         return EOF;
 383                 case '?':
 384                         if(c != '?') {
 385                                 return '?';
 386                         }
 387                         /* might be a trigraph */
 388                         next_char();
 389                         if(replace_trigraph()) {
 390                                 break;
 391                         }
 392                         put_back(c);
 393                         c = '?';
 394                         return '?';
 395
 396                 case EOF:
 397                         parse_error("reached end of file while parsing escape sequence");
 398                         return EOF;
 399                 default:
 400                         parse_error("unknown escape sequence");
 401                         return EOF;
 402                 }
 403         }
 404 }
 405
 406 static
 407 void parse_string_literal(token_t *token)
 408 {
 409         unsigned    start_linenr = source_position.linenr;
 410         char       *string;
 411         const char *result;
 412
 413         assert(c == '"');
 414         next_char();
 415
 416         while(1) {
 417                 switch(c) {
 418                 SKIP_TRIGRAPHS(,
 419                         obstack_1grow(&symbol_obstack, '?');
 420                         next_char();
 421                         break;
 422                 )
 423
 424                 case '\\':
 425                         next_char();
 426                         EAT_NEWLINE(break;)
 427                         int ec = parse_escape_sequence();
 428                         obstack_1grow(&symbol_obstack, ec);
 429                         break;
 430
 431                 case EOF:
 432                         error_prefix_at(source_position.input_name, start_linenr);
 433                         fprintf(stderr, "string has no end\n");
 434                         token->type = T_ERROR;
 435                         return;
 436
 437                 case '"':
 438                         next_char();
 439                         goto end_of_string;
 440
 441                 default:
 442                         obstack_1grow(&symbol_obstack, c);
 443                         next_char();
 444                         break;
 445                 }
 446         }
 447
 448 end_of_string:
 449
 450         /* TODO: concatenate multiple strings separated by whitespace... */
 451
 452         /* add finishing 0 to the string */
 453         obstack_1grow(&symbol_obstack, '\0');
 454         string = obstack_finish(&symbol_obstack);
 455
 456         /* check if there is already a copy of the string */
 457         result = strset_insert(&stringset, string);
 458         if(result != string) {
 459                 obstack_free(&symbol_obstack, string);
 460         }
 461
 462         token->type     = T_STRING_LITERAL;
 463         token->v.string = result;
 464 }
 465
 466 #define MATCH_NEWLINE(code)                 \
 467         case '\r':                              \
 468                 next_char();                        \
 469                 if(c == '\n') {                     \
 470                         next_char();                    \
 471                 }                                   \
 472                 source_position.linenr++;           \
 473                 code;                               \
 474         case '\n':                              \
 475                 next_char();                        \
 476                 source_position.linenr++;           \
 477                 code;
 478
 479 static
 480 void parse_character_constant(token_t *token)
 481 {
 482         assert(c == '\'');
 483         next_char();
 484
 485         int found_char = 0;
 486         while(1) {
 487                 switch(c) {
 488                 SKIP_TRIGRAPHS(,
 489                         found_char = '?';
 490                         break;
 491                 )
 492
 493                 case '\\':
 494                         next_char();
 495                         EAT_NEWLINE(break;)
 496                         found_char = '\\';
 497                         break;
 498
 499                 MATCH_NEWLINE(
 500                         parse_error("newline while parsing character constant");
 501                         break;
 502                 )
 503
 504                 case '\'':
 505                         next_char();
 506                         goto end_of_char_constant;
 507
 508                 case EOF:
 509                         parse_error("EOF while parsing character constant");
 510                         token->type = T_ERROR;
 511                         return;
 512
 513                 default:
 514                         if(found_char != 0) {
 515                                 parse_error("more than 1 characters in character "
 516                                             "constant");
 517                                 goto end_of_char_constant;
 518                         } else {
 519                                 found_char = c;
 520                                 next_char();
 521                         }
 522                         break;
 523                 }
 524         }
 525
 526 end_of_char_constant:
 527         token->type       = T_INTEGER;
 528         token->v.intvalue = found_char;
 529 }
 530
 531 static
 532 void skip_multiline_comment(void)
 533 {
 534         unsigned start_linenr = source_position.linenr;
 535         int had_star = 0;
 536
 537         while(1) {
 538                 switch(c) {
 539                 case '*':
 540                         next_char();
 541                         had_star = 1;
 542                         break;
 543
 544                 case '/':
 545                         next_char();
 546                         if(had_star) {
 547                                 return;
 548                         }
 549                         had_star = 0;
 550                         break;
 551
 552                 case '\\':
 553                         next_char();
 554                         EAT_NEWLINE(break;)
 555                         had_star = 0;
 556                         break;
 557
 558                 case '?':
 559                         next_char();
 560                         if(c != '?') {
 561                                 had_star = 0;
 562                                 break;
 563                         }
 564                         next_char();
 565                         if(replace_trigraph())
 566                                 break;
 567                         put_back(c);
 568                         c = '?';
 569                         had_star = 0;
 570                         /* we don't put back the 2nd ? as the comment text is discarded
 571                          * anyway */
 572                         break;
 573
 574                 MATCH_NEWLINE(had_star = 0; break;)
 575
 576                 case EOF:
 577                         error_prefix_at(source_position.input_name, start_linenr);
 578                         fprintf(stderr, "at end of file while looking for comment end\n");
 579                         return;
 580                 default:
 581                         had_star = 0;
 582                         next_char();
 583                         break;
 584                 }
 585         }
 586 }
 587
 588 static
 589 void skip_line_comment(void)
 590 {
 591         while(1) {
 592                 switch(c) {
 593                 case '?':
 594                         next_char();
 595                         if(c != '?')
 596                                 break;
 597                         next_char();
 598                         if(replace_trigraph())
 599                                 break;
 600                         put_back('?');
 601                         /* we don't put back the 2nd ? as the comment text is discarded
 602                          * anyway */
 603                         break;
 604
 605                 case '\\':
 606                         next_char();
 607                         if(c == '\n') {
 608                                 next_char();
 609                                 source_position.linenr++;
 610                         }
 611                         break;
 612
 613                 case EOF:
 614                 case '\r':
 615                 case '\n':
 616                         return;
 617
 618                 default:
 619                         next_char();
 620                         break;
 621                 }
 622         }
 623 }
 624
 625 static token_t pp_token;
 626
 627 static inline
 628 void next_pp_token(void)
 629 {
 630         lexer_next_preprocessing_token(&pp_token);
 631 }
 632
 633 static
 634 void eat_until_newline(void)
 635 {
 636         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 637                 next_pp_token();
 638         }
 639 }
 640
 641 static
 642 void error_directive(void)
 643 {
 644         error_prefix();
 645         fprintf(stderr, "#error directive: \n");
 646
 647         /* parse pp-tokens until new-line */
 648 }
 649
 650 static
 651 void define_directive(void)
 652 {
 653         token_t temptoken;
 654
 655         lexer_next_preprocessing_token(&temptoken);
 656         if(temptoken.type != T_IDENTIFIER) {
 657                 parse_error("expected identifier after #define\n");
 658                 eat_until_newline();
 659         }
 660 }
 661
 662 static
 663 void ifdef_directive(int is_ifndef)
 664 {
 665         (void) is_ifndef;
 666         token_t temptoken;
 667         lexer_next_preprocessing_token(&temptoken);
 668         //expect_identifier();
 669         //extect_newline();
 670 }
 671
/* Handle an #endif directive (stub): currently does nothing. */
static
void endif_directive(void)
{
        //expect_newline();
}
 677
 678 static
 679 void parse_line_directive(void)
 680 {
 681         if(pp_token.type != T_INTEGER) {
 682                 parse_error("expected integer");
 683         } else {
 684                 source_position.linenr = pp_token.v.intvalue - 1;
 685                 next_pp_token();
 686         }
 687         if(pp_token.type == T_STRING_LITERAL) {
 688                 source_position.input_name = pp_token.v.string;
 689                 next_pp_token();
 690         }
 691
 692         eat_until_newline();
 693 }
 694
 695 static
 696 void parse_preprocessor_identifier(void)
 697 {
 698         assert(pp_token.type == T_IDENTIFIER);
 699         symbol_t *symbol = pp_token.v.symbol;
 700
 701         switch(symbol->pp_ID) {
 702         case TP_include:
 703                 printf("include - enable header name parsing!\n");
 704                 break;
 705         case TP_define:
 706                 define_directive();
 707                 break;
 708         case TP_ifdef:
 709                 ifdef_directive(0);
 710                 break;
 711         case TP_ifndef:
 712                 ifdef_directive(1);
 713                 break;
 714         case TP_endif:
 715                 endif_directive();
 716                 break;
 717         case TP_line:
 718                 next_pp_token();
 719                 parse_line_directive();
 720                 break;
 721         case TP_if:
 722         case TP_else:
 723         case TP_elif:
 724         case TP_undef:
 725         case TP_error:
 726                 error_directive();
 727                 break;
 728         case TP_pragma:
 729                 break;
 730         }
 731 }
 732
 733 static
 734 void parse_preprocessor_directive()
 735 {
 736         next_pp_token();
 737
 738         switch(pp_token.type) {
 739         case T_IDENTIFIER:
 740                 parse_preprocessor_identifier();
 741                 break;
 742         case T_INTEGER:
 743                 parse_line_directive();
 744                 break;
 745         default:
 746                 parse_error("invalid preprocessor directive");
 747                 eat_until_newline();
 748                 break;
 749         }
 750 }
 751
/*
 * Building blocks for lexing punctuators that share a prefix. They are meant
 * to be used together, inside a `case` arm of a switch(c) in a function that
 * has a `token_t *token` in scope:
 *
 *     case '&':
 *             MAYBE_PROLOG
 *             MAYBE('&', T_ANDAND)
 *             MAYBE('=', T_ANDEQUAL)
 *             ELSE('&')
 *
 * The braces are deliberately unbalanced within each macro: MAYBE_PROLOG
 * opens a loop and a switch, ELSE_CODE / ELSE close them again.
 */

/* Consume the current character and open `while(1) { switch(c) {` on the
 * character that follows it. Must be closed by ELSE_CODE or ELSE. */
#define MAYBE_PROLOG                                       \
                        next_char();                                   \
                        while(1) {                                     \
                                switch(c) {

/* If the next character is ch: consume it, set token->type to set_type and
 * return from the enclosing function. */
#define MAYBE(ch, set_type)                                \
                                case ch:                                   \
                                        next_char();                           \
                                        token->type = set_type;                \
                                        return;

/*
 * Final alternative: run `code` when none of the preceding MAYBE cases
 * matched, then close the switch and the loop opened by MAYBE_PROLOG.
 * A trigraph is replaced first and the switch is dispatched again on the
 * replacement; a '?' that starts no trigraph runs `code`. A backslash
 * followed by a newline is skipped as a line splice; if the backslash is not
 * followed by a newline, control falls through to `default` with the
 * backslash already consumed. The trailing `break` leaves the enclosing
 * switch in case `code` did not return.
 */
#define ELSE_CODE(code)                                    \
                                SKIP_TRIGRAPHS(,                           \
                                        code;                                  \
                                )                                          \
                                                                                                                   \
                                case '\\':                                 \
                                        next_char();                           \
                                        EAT_NEWLINE(break;)                    \
                                        /* fallthrough */                      \
                                default:                                   \
                                        code;                                  \
                                }                                          \
                        } /* end of while(1) */                        \
                        break;

/* ELSE_CODE variant for the common case: set token->type to set_type and
 * return, without consuming the character that did not match. */
#define ELSE(set_type)                                     \
                ELSE_CODE(                                         \
                        token->type = set_type;                        \
                        return;                                        \
                )
 783
 784 void lexer_next_preprocessing_token(token_t *token)
 785 {
 786         while(1) {
 787                 switch(c) {
 788                 case ' ':
 789                 case '\t':
 790                         next_char();
 791                         break;
 792
 793                 MATCH_NEWLINE(
 794                         token->type = '\n';
 795                         return;
 796                 )
 797
 798                 SYMBOL_CHARS
 799                         parse_symbol(token);
 800                         return;
 801
 802                 DIGITS
 803                         parse_number(token);
 804                         return;
 805
 806                 case '"':
 807                         parse_string_literal(token);
 808                         return;
 809
 810                 case '\'':
 811                         parse_character_constant(token);
 812                         return;
 813
 814                 case '\\':
 815                         next_char();
 816                         if(c == '\n') {
 817                                 next_char();
 818                                 source_position.linenr++;
 819                                 break;
 820                         } else {
 821                                 parse_error("unexpected '\\' found");
 822                                 token->type = T_ERROR;
 823                         }
 824                         return;
 825
 826                 case '.':
 827                         MAYBE_PROLOG
 828                                 case '.':
 829                                         MAYBE_PROLOG
 830                                         MAYBE('.', T_DOTDOTDOT)
 831                                         ELSE_CODE(
 832                                                 put_back(c);
 833                                                 c = '.';
 834                                                 token->type = '.';
 835                                                 return;
 836                                         )
 837                         ELSE('.')
 838                 case '&':
 839                         MAYBE_PROLOG
 840                         MAYBE('&', T_ANDAND)
 841                         MAYBE('=', T_ANDEQUAL)
 842                         ELSE('&')
 843                 case '*':
 844                         MAYBE_PROLOG
 845                         MAYBE('=', T_ASTERISKEQUAL)
 846                         ELSE('*')
 847                 case '+':
 848                         MAYBE_PROLOG
 849                         MAYBE('+', T_PLUSPLUS)
 850                         MAYBE('=', T_PLUSEQUAL)
 851                         ELSE('+')
 852                 case '-':
 853                         MAYBE_PROLOG
 854                         MAYBE('-', T_MINUSMINUS)
 855                         MAYBE('=', T_MINUSEQUAL)
 856                         ELSE('-')
 857                 case '!':
 858                         MAYBE_PROLOG
 859                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
 860                         ELSE('!')
 861                 case '/':
 862                         MAYBE_PROLOG
 863                         MAYBE('=', T_SLASHEQUAL)
 864                                 case '*':
 865                                         next_char();
 866                                         skip_multiline_comment();
 867                                         lexer_next_preprocessing_token(token);
 868                                         return;
 869                                 case '/':
 870                                         next_char();
 871                                         skip_line_comment();
 872                                         lexer_next_preprocessing_token(token);
 873                                         return;
 874                         ELSE('/')
 875                 case '%':
 876                         MAYBE_PROLOG
 877                         MAYBE('>', T_PERCENTGREATER)
 878                         MAYBE('=', T_PERCENTEQUAL)
 879                                 case ':':
 880                                         MAYBE_PROLOG
 881                                                 case '%':
 882                                                         MAYBE_PROLOG
 883                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
 884                                                         ELSE_CODE(
 885                                                                 put_back(c);
 886                                                                 c = '%';
 887                                                                 token->type = T_PERCENTCOLON;
 888                                                                 return;
 889                                                         )
 890                                         ELSE(T_PERCENTCOLON)
 891                         ELSE('%')
 892                 case '<':
 893                         MAYBE_PROLOG
 894                         MAYBE(':', T_LESSCOLON)
 895                         MAYBE('%', T_LESSPERCENT)
 896                                 case '<':
 897                                         MAYBE_PROLOG
 898                                         MAYBE('=', T_LESSLESSEQUAL)
 899                                         ELSE(T_LESSLESS)
 900                         ELSE('<')
 901                 case '>':
 902                         MAYBE_PROLOG
 903                                 case '>':
 904                                         MAYBE_PROLOG
 905                                         MAYBE('=', T_GREATERGREATEREQUAL)
 906                                         ELSE(T_GREATERGREATER)
 907                         ELSE('>')
 908                 case '^':
 909                         MAYBE_PROLOG
 910                         MAYBE('=', T_CARETEQUAL)
 911                         ELSE('^')
 912                 case '|':
 913                         MAYBE_PROLOG
 914                         MAYBE('=', T_PIPEEQUAL)
 915                         MAYBE('|', T_PIPEPIPE)
 916                         ELSE('|')
 917                 case ':':
 918                         MAYBE_PROLOG
 919                         MAYBE('>', T_COLONGREATER)
 920                         ELSE(':')
 921                 case '=':
 922                         MAYBE_PROLOG
 923                         MAYBE('=', T_EQUALEQUAL)
 924                         ELSE('=')
 925                 case '#':
 926                         MAYBE_PROLOG
 927                         MAYBE('#', T_HASHHASH)
 928                         ELSE('#')
 929
 930                 case '?':
 931                         next_char();
 932                         /* just a simple ? */
 933                         if(c != '?') {
 934                                 token->type = '?';
 935                                 return;
 936                         }
 937                         /* might be a trigraph */
 938                         next_char();
 939                         if(replace_trigraph()) {
 940                                 break;
 941                         }
 942                         put_back(c);
 943                         c = '?';
 944                         token->type = '?';
 945                         return;
 946
 947                 case '[':
 948                 case ']':
 949                 case '(':
 950                 case ')':
 951                 case '{':
 952                 case '}':
 953                 case '~':
 954                 case ';':
 955                 case ',':
 956                         token->type = c;
 957                         next_char();
 958                         return;
 959
 960                 case EOF:
 961                         token->type = T_EOF;
 962                         return;
 963
 964                 default:
 965                         next_char();
 966                         error_prefix();
 967                         fprintf(stderr, "unknown character '%c' found\n", c);
 968                         token->type = T_ERROR;
 969                         return;
 970                 }
 971         }
 972 }
 973
 974 void lexer_next_token(token_t *token)
 975 {
 976         lexer_next_preprocessing_token(token);
 977         if(token->type != '\n')
 978                 return;
 979
 980 newline_found:
 981         do {
 982                 lexer_next_preprocessing_token(token);
 983         } while(token->type == '\n');
 984
 985         if(token->type == '#') {
 986                 parse_preprocessor_directive();
 987                 goto newline_found;
 988         }
 989 }
 990
 991 void init_lexer(void)
 992 {
 993         strset_init(&stringset);
 994 }
 995
 996 void lexer_open_stream(FILE *stream, const char *input_name)
 997 {
 998         input                      = stream;
 999         source_position.linenr     = 0;
1000         source_position.input_name = input_name;
1001
1002         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
1003          * beginning of a line */
1004         c = '\n';
1005 }
1006
1007 void exit_lexer(void)
1008 {
1009         strset_destroy(&stringset);
1010 }
1011
1012 static __attribute__((unused))
1013 void dbg_pos(const source_position_t source_position)
1014 {
1015         fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);
1016         fflush(stdout);
1017 }