nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7 #include "adt/strset.h"
   8
   9 #include <assert.h>
  10 #include <errno.h>
  11 #include <string.h>
  12 #include <ctype.h>
  13
  14 //#define DEBUG_CHARS
  15 #define MAX_PUTBACK 3
  16
  17 static int         c;
  18 token_t            lexer_token;
  19 static FILE       *input;
  20 static char        buf[1024 + MAX_PUTBACK];
  21 static const char *bufend;
  22 static const char *bufpos;
  23 static strset_t    stringset;
  24 //static FILE      **input_stack;
  25 //static char      **buf_stack;
  26
  27 static
  28 void error_prefix_at(const char *input_name, unsigned linenr)
  29 {
  30         fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
  31 }
  32
  33 static
  34 void error_prefix(void)
  35 {
  36         error_prefix_at(lexer_token.source_position.input_name,
  37                         lexer_token.source_position.linenr);
  38 }
  39
  40 static
  41 void parse_error(const char *msg)
  42 {
  43         error_prefix();
  44         fprintf(stderr, "%s\n", msg);
  45 }
  46
  47 static inline
  48 void next_char(void)
  49 {
  50         bufpos++;
  51         if(bufpos >= bufend) {
  52                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  53                                  input);
  54                 if(s == 0) {
  55                         c = EOF;
  56                         return;
  57                 }
  58                 bufpos = buf + MAX_PUTBACK;
  59                 bufend = buf + MAX_PUTBACK + s;
  60         }
  61         c = *(bufpos);
  62 #ifdef DEBUG_CHARS
  63         printf("nchar '%c'\n", c);
  64 #endif
  65 }
  66
  67 static inline
  68 void put_back(int pc)
  69 {
  70         char *p = (char*) bufpos - 1;
  71         bufpos--;
  72         assert(p >= buf);
  73         *p = pc;
  74
  75 #ifdef DEBUG_CHARS
  76         printf("putback '%c'\n", pc);
  77 #endif
  78 }
  79
  80
  81 static
  82 int replace_trigraph(void)
  83 {
  84 #define MATCH_TRIGRAPH(ch,replacement)           \
  85         case ch:                                     \
  86                 c = replacement;                         \
  87                 return 1;
  88
  89         switch(c) {
  90         MATCH_TRIGRAPH('=', '#')
  91         MATCH_TRIGRAPH('(', '[')
  92         MATCH_TRIGRAPH('/', '\\')
  93         MATCH_TRIGRAPH(')', ']')
  94         MATCH_TRIGRAPH('\'', '^')
  95         MATCH_TRIGRAPH('<', '{')
  96         MATCH_TRIGRAPH('!', '|')
  97         MATCH_TRIGRAPH('>', '}')
  98         MATCH_TRIGRAPH('-', '~')
  99         default:
 100                 break;
 101         }
 102
 103         return 0;
 104 }
 105
 106 #define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
 107         case '?':                                  \
 108                 next_char();                           \
 109                 if(c != '?') {                         \
 110                         custom_putback;                    \
 111                         put_back(c);                       \
 112                         c = '?';                           \
 113                         no_trigraph_code;                  \
 114                 }                                      \
 115                 next_char();                           \
 116                 if(replace_trigraph()) {               \
 117                         break;                             \
 118                 }                                      \
 119                 custom_putback;                        \
 120                 put_back('?');                         \
 121                 put_back(c);                           \
 122                 c = '?';                               \
 123                 no_trigraph_code;
 124
 125 #define EAT_NEWLINE(newline_code)              \
 126         if(c == '\r') {                            \
 127                 next_char();                           \
 128                 if(c == '\n')                          \
 129                         next_char();                       \
 130                 lexer_token.source_position.linenr++;  \
 131                 newline_code;                          \
 132         } else if(c == '\n') {                     \
 133                 next_char();                           \
 134                 lexer_token.source_position.linenr++;  \
 135                 newline_code;                          \
 136         }
 137
 138 #define SYMBOL_CHARS  \
 139         case 'a':         \
 140         case 'b':         \
 141         case 'c':         \
 142         case 'd':         \
 143         case 'e':         \
 144         case 'f':         \
 145         case 'g':         \
 146         case 'h':         \
 147         case 'i':         \
 148         case 'j':         \
 149         case 'k':         \
 150         case 'l':         \
 151         case 'm':         \
 152         case 'n':         \
 153         case 'o':         \
 154         case 'p':         \
 155         case 'q':         \
 156         case 'r':         \
 157         case 's':         \
 158         case 't':         \
 159         case 'u':         \
 160         case 'v':         \
 161         case 'w':         \
 162         case 'x':         \
 163         case 'y':         \
 164         case 'z':         \
 165         case 'A':         \
 166         case 'B':         \
 167         case 'C':         \
 168         case 'D':         \
 169         case 'E':         \
 170         case 'F':         \
 171         case 'G':         \
 172         case 'H':         \
 173         case 'I':         \
 174         case 'J':         \
 175         case 'K':         \
 176         case 'L':         \
 177         case 'M':         \
 178         case 'N':         \
 179         case 'O':         \
 180         case 'P':         \
 181         case 'Q':         \
 182         case 'R':         \
 183         case 'S':         \
 184         case 'T':         \
 185         case 'U':         \
 186         case 'V':         \
 187         case 'W':         \
 188         case 'X':         \
 189         case 'Y':         \
 190         case 'Z':         \
 191         case '_':
 192
 193 #define DIGITS        \
 194         case '0':         \
 195         case '1':         \
 196         case '2':         \
 197         case '3':         \
 198         case '4':         \
 199         case '5':         \
 200         case '6':         \
 201         case '7':         \
 202         case '8':         \
 203         case '9':
 204
 205 static
 206 void parse_symbol(void)
 207 {
 208         symbol_t *symbol;
 209         char     *string;
 210
 211         obstack_1grow(&symbol_obstack, c);
 212         next_char();
 213
 214         while(1) {
 215                 switch(c) {
 216                 case '\\':
 217                         next_char();
 218                         EAT_NEWLINE(break;)
 219                         goto end_symbol;
 220
 221                 DIGITS
 222                 SYMBOL_CHARS
 223                         obstack_1grow(&symbol_obstack, c);
 224                         next_char();
 225                         break;
 226
 227                 case '?':
 228                         next_char();
 229                         if(c != '?') {
 230                                 put_back(c);
 231                                 c = '?';
 232                                 goto end_symbol;
 233                         }
 234                         next_char();
 235                         if(replace_trigraph())
 236                                 break;
 237                         put_back('?');
 238                         put_back(c);
 239                         c = '?';
 240                         goto end_symbol;
 241
 242                 default:
 243                         goto end_symbol;
 244                 }
 245         }
 246 end_symbol:
 247         obstack_1grow(&symbol_obstack, '\0');
 248
 249         string = obstack_finish(&symbol_obstack);
 250         symbol = symbol_table_insert(string);
 251
 252         lexer_token.type     = symbol->ID;
 253         lexer_token.v.symbol = symbol;
 254
 255         if(symbol->string != string) {
 256                 obstack_free(&symbol_obstack, string);
 257         }
 258 }
 259
 260 static
 261 void parse_number_hex(void)
 262 {
 263         assert(c == 'x' || c == 'X');
 264         next_char();
 265
 266         if (!isdigit(c) &&
 267                 !('A' <= c && c <= 'F') &&
 268                 !('a' <= c && c <= 'f')) {
 269                 parse_error("premature end of hex number literal");
 270                 lexer_token.type = T_ERROR;
 271                 return;
 272         }
 273
 274         int value = 0;
 275         for(;;) {
 276                 if (isdigit(c)) {
 277                         value = 16 * value + c - '0';
 278                 } else if ('A' <= c && c <= 'F') {
 279                         value = 16 * value + c - 'A' + 10;
 280                 } else if ('a' <= c && c <= 'f') {
 281                         value = 16 * value + c - 'a' + 10;
 282                 } else {
 283                         lexer_token.type     = T_INTEGER;
 284                         lexer_token.v.intvalue = value;
 285                         return;
 286                 }
 287                 next_char();
 288         }
 289 }
 290
 291 static
 292 void parse_number_oct(void)
 293 {
 294         assert(c == 'o' || c == 'O');
 295         next_char();
 296
 297         int value = 0;
 298         for(;;) {
 299                 if ('0' <= c && c <= '7') {
 300                         value = 8 * value + c - '0';
 301                 } else {
 302                         lexer_token.type       = T_INTEGER;
 303                         lexer_token.v.intvalue = value;
 304                         return;
 305                 }
 306                 next_char();
 307         }
 308 }
 309
 310 static
 311 void parse_number_dec(int first_char)
 312 {
 313         int value = 0;
 314         if(first_char > 0) {
 315                 assert(first_char >= '0' && first_char <= '9');
 316                 value = first_char - '0';
 317         }
 318
 319         for(;;) {
 320                 if (isdigit(c)) {
 321                         value = 10 * value + c - '0';
 322                 } else {
 323                         lexer_token.type       = T_INTEGER;
 324                         lexer_token.v.intvalue = value;
 325                         return;
 326                 }
 327                 next_char();
 328         }
 329 }
 330
 331 static
 332 void parse_number(void)
 333 {
 334         // TODO check for overflow
 335         // TODO check for various invalid inputs sequences
 336
 337         if (c == '0') {
 338                 next_char();
 339                 switch (c) {
 340                         case 'X':
 341                         case 'x': parse_number_hex(); break;
 342                         case 'o':
 343                         case 'O': parse_number_oct(); break;
 344                         default:  parse_number_dec('0');
 345                 }
 346         } else {
 347                 parse_number_dec(0);
 348         }
 349 }
 350
 351 static
 352 int parse_escape_sequence(void)
 353 {
 354         while(1) {
 355                 int ec = c;
 356                 next_char();
 357
 358                 switch(ec) {
 359                 case '"': return '"';
 360                 case '\'': return'\'';
 361                 case '\\':
 362                         EAT_NEWLINE(break;)
 363                         return '\\';
 364                 case 'a': return '\a';
 365                 case 'b': return '\b';
 366                 case 'f': return '\f';
 367                 case 'n': return '\n';
 368                 case 'r': return '\r';
 369                 case 't': return '\t';
 370                 case 'v': return '\v';
 371                 case 'x': /* TODO parse hex number ... */
 372                         parse_error("hex escape sequences not implemented yet");
 373                         return EOF;
 374                 case '0':
 375                 case '1':
 376                 case '2':
 377                 case '3':
 378                 case '4':
 379                 case '5':
 380                 case '6':
 381                 case '7':
 382                         /* TODO parse octal number ... */
 383                         parse_error("octal escape sequences not implemented yet");
 384                         return EOF;
 385                 case '?':
 386                         if(c != '?') {
 387                                 return '?';
 388                         }
 389                         /* might be a trigraph */
 390                         next_char();
 391                         if(replace_trigraph()) {
 392                                 break;
 393                         }
 394                         put_back(c);
 395                         c = '?';
 396                         return '?';
 397
 398                 case EOF:
 399                         parse_error("reached end of file while parsing escape sequence");
 400                         return EOF;
 401                 default:
 402                         parse_error("unknown escape sequence");
 403                         return EOF;
 404                 }
 405         }
 406 }
 407
 408 static
 409 void parse_string_literal(void)
 410 {
 411         unsigned    start_linenr = lexer_token.source_position.linenr;
 412         char       *string;
 413         const char *result;
 414
 415         assert(c == '"');
 416         next_char();
 417
 418         while(1) {
 419                 switch(c) {
 420                 SKIP_TRIGRAPHS(,
 421                         obstack_1grow(&symbol_obstack, '?');
 422                         next_char();
 423                         break;
 424                 )
 425
 426                 case '\\':
 427                         next_char();
 428                         EAT_NEWLINE(break;)
 429                         int ec = parse_escape_sequence();
 430                         obstack_1grow(&symbol_obstack, ec);
 431                         break;
 432
 433                 case EOF:
 434                         error_prefix_at(lexer_token.source_position.input_name,
 435                                         start_linenr);
 436                         fprintf(stderr, "string has no end\n");
 437                         lexer_token.type = T_ERROR;
 438                         return;
 439
 440                 case '"':
 441                         next_char();
 442                         goto end_of_string;
 443
 444                 default:
 445                         obstack_1grow(&symbol_obstack, c);
 446                         next_char();
 447                         break;
 448                 }
 449         }
 450
 451 end_of_string:
 452
 453         /* TODO: concatenate multiple strings separated by whitespace... */
 454
 455         /* add finishing 0 to the string */
 456         obstack_1grow(&symbol_obstack, '\0');
 457         string = obstack_finish(&symbol_obstack);
 458
 459         /* check if there is already a copy of the string */
 460         result = strset_insert(&stringset, string);
 461         if(result != string) {
 462                 obstack_free(&symbol_obstack, string);
 463         }
 464
 465         lexer_token.type     = T_STRING_LITERAL;
 466         lexer_token.v.string = result;
 467 }
 468
 469 #define MATCH_NEWLINE(code)                   \
 470         case '\r':                                \
 471                 next_char();                          \
 472                 if(c == '\n') {                       \
 473                         next_char();                      \
 474                 }                                     \
 475                 lexer_token.source_position.linenr++; \
 476                 code;                                 \
 477         case '\n':                                \
 478                 next_char();                          \
 479                 lexer_token.source_position.linenr++; \
 480                 code;
 481
 482 static
 483 void parse_character_constant(void)
 484 {
 485         assert(c == '\'');
 486         next_char();
 487
 488         int found_char = 0;
 489         while(1) {
 490                 switch(c) {
 491                 SKIP_TRIGRAPHS(,
 492                         found_char = '?';
 493                         break;
 494                 )
 495
 496                 case '\\':
 497                         next_char();
 498                         EAT_NEWLINE(break;)
 499                         found_char = '\\';
 500                         break;
 501
 502                 MATCH_NEWLINE(
 503                         parse_error("newline while parsing character constant");
 504                         break;
 505                 )
 506
 507                 case '\'':
 508                         next_char();
 509                         goto end_of_char_constant;
 510
 511                 case EOF:
 512                         parse_error("EOF while parsing character constant");
 513                         lexer_token.type = T_ERROR;
 514                         return;
 515
 516                 default:
 517                         if(found_char != 0) {
 518                                 parse_error("more than 1 characters in character "
 519                                             "constant");
 520                                 goto end_of_char_constant;
 521                         } else {
 522                                 found_char = c;
 523                                 next_char();
 524                         }
 525                         break;
 526                 }
 527         }
 528
 529 end_of_char_constant:
 530         lexer_token.type       = T_INTEGER;
 531         lexer_token.v.intvalue = found_char;
 532 }
 533
 534 static
 535 void skip_multiline_comment(void)
 536 {
 537         unsigned start_linenr = lexer_token.source_position.linenr;
 538         int had_star = 0;
 539
 540         while(1) {
 541                 switch(c) {
 542                 case '*':
 543                         next_char();
 544                         had_star = 1;
 545                         break;
 546
 547                 case '/':
 548                         next_char();
 549                         if(had_star) {
 550                                 return;
 551                         }
 552                         had_star = 0;
 553                         break;
 554
 555                 case '\\':
 556                         next_char();
 557                         EAT_NEWLINE(break;)
 558                         had_star = 0;
 559                         break;
 560
 561                 case '?':
 562                         next_char();
 563                         if(c != '?') {
 564                                 had_star = 0;
 565                                 break;
 566                         }
 567                         next_char();
 568                         if(replace_trigraph())
 569                                 break;
 570                         put_back(c);
 571                         c = '?';
 572                         had_star = 0;
 573                         /* we don't put back the 2nd ? as the comment text is discarded
 574                          * anyway */
 575                         break;
 576
 577                 MATCH_NEWLINE(had_star = 0; break;)
 578
 579                 case EOF:
 580                         error_prefix_at(lexer_token.source_position.input_name,
 581                                         start_linenr);
 582                         fprintf(stderr, "at end of file while looking for comment end\n");
 583                         return;
 584                 default:
 585                         had_star = 0;
 586                         next_char();
 587                         break;
 588                 }
 589         }
 590 }
 591
 592 static
 593 void skip_line_comment(void)
 594 {
 595         while(1) {
 596                 switch(c) {
 597                 case '?':
 598                         next_char();
 599                         if(c != '?')
 600                                 break;
 601                         next_char();
 602                         if(replace_trigraph())
 603                                 break;
 604                         put_back('?');
 605                         /* we don't put back the 2nd ? as the comment text is discarded
 606                          * anyway */
 607                         break;
 608
 609                 case '\\':
 610                         next_char();
 611                         if(c == '\n') {
 612                                 next_char();
 613                                 lexer_token.source_position.linenr++;
 614                         }
 615                         break;
 616
 617                 case EOF:
 618                 case '\r':
 619                 case '\n':
 620                         return;
 621
 622                 default:
 623                         next_char();
 624                         break;
 625                 }
 626         }
 627 }
 628
 629 static token_t pp_token;
 630
 631 static inline
 632 void next_pp_token(void)
 633 {
 634         lexer_next_preprocessing_token();
 635         pp_token = lexer_token;
 636 }
 637
 638 static
 639 void eat_until_newline(void)
 640 {
 641         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 642                 next_pp_token();
 643         }
 644 }
 645
 646 static
 647 void error_directive(void)
 648 {
 649         error_prefix();
 650         fprintf(stderr, "#error directive: \n");
 651
 652         /* parse pp-tokens until new-line */
 653 }
 654
 655 static
 656 void define_directive(void)
 657 {
 658         lexer_next_preprocessing_token();
 659         if(lexer_token.type != T_IDENTIFIER) {
 660                 parse_error("expected identifier after #define\n");
 661                 eat_until_newline();
 662         }
 663 }
 664
 665 static
 666 void ifdef_directive(int is_ifndef)
 667 {
 668         (void) is_ifndef;
 669         lexer_next_preprocessing_token();
 670         //expect_identifier();
 671         //extect_newline();
 672 }
 673
 674 static
 675 void endif_directive(void)
 676 {
 677         //expect_newline();
 678 }
 679
 680 static
 681 void parse_line_directive(void)
 682 {
 683         if(pp_token.type != T_INTEGER) {
 684                 parse_error("expected integer");
 685         } else {
 686                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
 687                 next_pp_token();
 688         }
 689         if(pp_token.type == T_STRING_LITERAL) {
 690                 lexer_token.source_position.input_name = pp_token.v.string;
 691                 next_pp_token();
 692         }
 693
 694         eat_until_newline();
 695 }
 696
 697 static
 698 void parse_preprocessor_identifier(void)
 699 {
 700         assert(pp_token.type == T_IDENTIFIER);
 701         symbol_t *symbol = pp_token.v.symbol;
 702
 703         switch(symbol->pp_ID) {
 704         case TP_include:
 705                 printf("include - enable header name parsing!\n");
 706                 break;
 707         case TP_define:
 708                 define_directive();
 709                 break;
 710         case TP_ifdef:
 711                 ifdef_directive(0);
 712                 break;
 713         case TP_ifndef:
 714                 ifdef_directive(1);
 715                 break;
 716         case TP_endif:
 717                 endif_directive();
 718                 break;
 719         case TP_line:
 720                 next_pp_token();
 721                 parse_line_directive();
 722                 break;
 723         case TP_if:
 724         case TP_else:
 725         case TP_elif:
 726         case TP_undef:
 727         case TP_error:
 728                 error_directive();
 729                 break;
 730         case TP_pragma:
 731                 break;
 732         }
 733 }
 734
 735 static
 736 void parse_preprocessor_directive()
 737 {
 738         next_pp_token();
 739
 740         switch(pp_token.type) {
 741         case T_IDENTIFIER:
 742                 parse_preprocessor_identifier();
 743                 break;
 744         case T_INTEGER:
 745                 parse_line_directive();
 746                 break;
 747         default:
 748                 parse_error("invalid preprocessor directive");
 749                 eat_until_newline();
 750                 break;
 751         }
 752 }
 753
 754 #define MAYBE_PROLOG                                       \
 755                         next_char();                                   \
 756                         while(1) {                                     \
 757                                 switch(c) {
 758
 759 #define MAYBE(ch, set_type)                                \
 760                                 case ch:                                   \
 761                                         next_char();                           \
 762                                         lexer_token.type = set_type;           \
 763                                         return;
 764
 765 #define ELSE_CODE(code)                                    \
 766                                 SKIP_TRIGRAPHS(,                           \
 767                                         code;                                  \
 768                                 )                                          \
 769                                                                                                                    \
 770                                 case '\\':                                 \
 771                                         next_char();                           \
 772                                         EAT_NEWLINE(break;)                    \
 773                                         /* fallthrough */                      \
 774                                 default:                                   \
 775                                         code;                                  \
 776                                 }                                          \
 777                         } /* end of while(1) */                        \
 778                         break;
 779
 780 #define ELSE(set_type)                                     \
 781                 ELSE_CODE(                                         \
 782                         lexer_token.type = set_type;                   \
 783                         return;                                        \
 784                 )
 785
 786 void lexer_next_preprocessing_token(void)
 787 {
 788         while(1) {
 789                 switch(c) {
 790                 case ' ':
 791                 case '\t':
 792                         next_char();
 793                         break;
 794
 795                 MATCH_NEWLINE(
 796                         lexer_token.type = '\n';
 797                         return;
 798                 )
 799
 800                 SYMBOL_CHARS
 801                         parse_symbol();
 802                         return;
 803
 804                 DIGITS
 805                         parse_number();
 806                         return;
 807
 808                 case '"':
 809                         parse_string_literal();
 810                         return;
 811
 812                 case '\'':
 813                         parse_character_constant();
 814                         return;
 815
 816                 case '\\':
 817                         next_char();
 818                         if(c == '\n') {
 819                                 next_char();
 820                                 lexer_token.source_position.linenr++;
 821                                 break;
 822                         } else {
 823                                 parse_error("unexpected '\\' found");
 824                                 lexer_token.type = T_ERROR;
 825                         }
 826                         return;
 827
 828                 case '.':
 829                         MAYBE_PROLOG
 830                                 case '.':
 831                                         MAYBE_PROLOG
 832                                         MAYBE('.', T_DOTDOTDOT)
 833                                         ELSE_CODE(
 834                                                 put_back(c);
 835                                                 c = '.';
 836                                                 lexer_token.type = '.';
 837                                                 return;
 838                                         )
 839                         ELSE('.')
 840                 case '&':
 841                         MAYBE_PROLOG
 842                         MAYBE('&', T_ANDAND)
 843                         MAYBE('=', T_ANDEQUAL)
 844                         ELSE('&')
 845                 case '*':
 846                         MAYBE_PROLOG
 847                         MAYBE('=', T_ASTERISKEQUAL)
 848                         ELSE('*')
 849                 case '+':
 850                         MAYBE_PROLOG
 851                         MAYBE('+', T_PLUSPLUS)
 852                         MAYBE('=', T_PLUSEQUAL)
 853                         ELSE('+')
 854                 case '-':
 855                         MAYBE_PROLOG
 856                         MAYBE('-', T_MINUSMINUS)
 857                         MAYBE('=', T_MINUSEQUAL)
 858                         ELSE('-')
 859                 case '!':
 860                         MAYBE_PROLOG
 861                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
 862                         ELSE('!')
 863                 case '/':
 864                         MAYBE_PROLOG
 865                         MAYBE('=', T_SLASHEQUAL)
 866                                 case '*':
 867                                         next_char();
 868                                         skip_multiline_comment();
 869                                         lexer_next_preprocessing_token();
 870                                         return;
 871                                 case '/':
 872                                         next_char();
 873                                         skip_line_comment();
 874                                         lexer_next_preprocessing_token();
 875                                         return;
 876                         ELSE('/')
 877                 case '%':
 878                         MAYBE_PROLOG
 879                         MAYBE('>', T_PERCENTGREATER)
 880                         MAYBE('=', T_PERCENTEQUAL)
 881                                 case ':':
 882                                         MAYBE_PROLOG
 883                                                 case '%':
 884                                                         MAYBE_PROLOG
 885                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
 886                                                         ELSE_CODE(
 887                                                                 put_back(c);
 888                                                                 c = '%';
 889                                                                 lexer_token.type = T_PERCENTCOLON;
 890                                                                 return;
 891                                                         )
 892                                         ELSE(T_PERCENTCOLON)
 893                         ELSE('%')
 894                 case '<':
 895                         MAYBE_PROLOG
 896                         MAYBE(':', T_LESSCOLON)
 897                         MAYBE('%', T_LESSPERCENT)
 898                                 case '<':
 899                                         MAYBE_PROLOG
 900                                         MAYBE('=', T_LESSLESSEQUAL)
 901                                         ELSE(T_LESSLESS)
 902                         ELSE('<')
 903                 case '>':
 904                         MAYBE_PROLOG
 905                                 case '>':
 906                                         MAYBE_PROLOG
 907                                         MAYBE('=', T_GREATERGREATEREQUAL)
 908                                         ELSE(T_GREATERGREATER)
 909                         ELSE('>')
 910                 case '^':
 911                         MAYBE_PROLOG
 912                         MAYBE('=', T_CARETEQUAL)
 913                         ELSE('^')
 914                 case '|':
 915                         MAYBE_PROLOG
 916                         MAYBE('=', T_PIPEEQUAL)
 917                         MAYBE('|', T_PIPEPIPE)
 918                         ELSE('|')
 919                 case ':':
 920                         MAYBE_PROLOG
 921                         MAYBE('>', T_COLONGREATER)
 922                         ELSE(':')
 923                 case '=':
 924                         MAYBE_PROLOG
 925                         MAYBE('=', T_EQUALEQUAL)
 926                         ELSE('=')
 927                 case '#':
 928                         MAYBE_PROLOG
 929                         MAYBE('#', T_HASHHASH)
 930                         ELSE('#')
 931
 932                 case '?':
 933                         next_char();
 934                         /* just a simple ? */
 935                         if(c != '?') {
 936                                 lexer_token.type = '?';
 937                                 return;
 938                         }
 939                         /* might be a trigraph */
 940                         next_char();
 941                         if(replace_trigraph()) {
 942                                 break;
 943                         }
 944                         put_back(c);
 945                         c = '?';
 946                         lexer_token.type = '?';
 947                         return;
 948
 949                 case '[':
 950                 case ']':
 951                 case '(':
 952                 case ')':
 953                 case '{':
 954                 case '}':
 955                 case '~':
 956                 case ';':
 957                 case ',':
 958                         lexer_token.type = c;
 959                         next_char();
 960                         return;
 961
 962                 case EOF:
 963                         lexer_token.type = T_EOF;
 964                         return;
 965
 966                 default:
 967                         next_char();
 968                         error_prefix();
 969                         fprintf(stderr, "unknown character '%c' found\n", c);
 970                         lexer_token.type = T_ERROR;
 971                         return;
 972                 }
 973         }
 974 }
 975
 976 void lexer_next_token(void)
 977 {
 978         lexer_next_preprocessing_token();
 979         if(lexer_token.type != '\n')
 980                 return;
 981
 982 newline_found:
 983         do {
 984                 lexer_next_preprocessing_token();
 985         } while(lexer_token.type == '\n');
 986
 987         if(lexer_token.type == '#') {
 988                 parse_preprocessor_directive();
 989                 goto newline_found;
 990         }
 991 }
 992
 993 void init_lexer(void)
 994 {
 995         strset_init(&stringset);
 996 }
 997
 998 void lexer_open_stream(FILE *stream, const char *input_name)
 999 {
1000         input                                  = stream;
1001         lexer_token.source_position.linenr     = 0;
1002         lexer_token.source_position.input_name = input_name;
1003
1004         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
1005          * beginning of a line */
1006         c = '\n';
1007 }
1008
1009 void exit_lexer(void)
1010 {
1011         strset_destroy(&stringset);
1012 }
1013
1014 static __attribute__((unused))
1015 void dbg_pos(const source_position_t source_position)
1016 {
1017         fprintf(stdout, "%s:%d\n", source_position.input_name,
1018                 source_position.linenr);
1019         fflush(stdout);
1020 }