nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer_t.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7
   8 #include <assert.h>
   9 #include <errno.h>
  10 #include <string.h>
  11 #include <ctype.h>
  12
  13 //#define DEBUG_CHARS
/* Number of bytes kept free at the start of buf: next_char() refills the
 * buffer behind this gap so that put_back() can still step backwards. */
#define MAX_PUTBACK 3

/* current input character as set by next_char(), or EOF */
static int               c;
/* current position; input_name and linenr are read and updated in this file */
source_position_t source_position;
/* stream that next_char() reads from */
static FILE             *input;
/* read buffer, filled by next_char() starting at offset MAX_PUTBACK */
static char              buf[1027];
/* one past the last valid byte in buf */
static const char       *bufend;
/* position of the current character inside buf */
static const char       *bufpos;
/* set used by parse_string_literal() to share identical string literals */
static strset_t          stringset;
//static FILE            **input_stack;
//static char            **buf_stack;
  25
  26 static
  27 void error_prefix_at(const char *input_name, unsigned linenr)
  28 {
  29         fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
  30 }
  31
  32 static
  33 void error_prefix()
  34 {
  35         error_prefix_at(source_position.input_name, source_position.linenr);
  36 }
  37
  38 static
  39 void parse_error(const char *msg)
  40 {
  41         error_prefix();
  42         fprintf(stderr, "%s\n", msg);
  43 }
  44
  45 static inline
  46 void next_char()
  47 {
  48         bufpos++;
  49         if(bufpos >= bufend) {
  50                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  51                                  input);
  52                 if(s == 0) {
  53                         c = EOF;
  54                         return;
  55                 }
  56                 bufpos = buf + MAX_PUTBACK;
  57                 bufend = buf + MAX_PUTBACK + s;
  58         }
  59         c = *(bufpos);
  60 #ifdef DEBUG_CHARS
  61         printf("nchar '%c'\n", c);
  62 #endif
  63 }
  64
  65 static inline
  66 void put_back(int pc)
  67 {
  68         char *p = (char*) bufpos - 1;
  69         bufpos--;
  70         assert(p >= buf);
  71         *p = pc;
  72
  73 #ifdef DEBUG_CHARS
  74         printf("putback '%c'\n", pc);
  75 #endif
  76 }
  77
  78
  79 static
  80 int replace_trigraph(void)
  81 {
  82 #define MATCH_TRIGRAPH(ch,replacement)           \
  83         case ch:                                     \
  84                 c = replacement;                         \
  85                 return 1;
  86
  87         switch(c) {
  88         MATCH_TRIGRAPH('=', '#')
  89         MATCH_TRIGRAPH('(', '[')
  90         MATCH_TRIGRAPH('/', '\\')
  91         MATCH_TRIGRAPH(')', ']')
  92         MATCH_TRIGRAPH('\'', '^')
  93         MATCH_TRIGRAPH('<', '{')
  94         MATCH_TRIGRAPH('!', '|')
  95         MATCH_TRIGRAPH('>', '}')
  96         MATCH_TRIGRAPH('-', '~')
  97         default:
  98                 break;
  99         }
 100
 101         return 0;
 102 }
 103
 104 #define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
 105         case '?':                                  \
 106                 next_char();                           \
 107                 if(c != '?') {                         \
 108                         custom_putback;                    \
 109                         put_back(c);                       \
 110                         c = '?';                           \
 111                         no_trigraph_code;                  \
 112                 }                                      \
 113                 next_char();                           \
 114                 if(replace_trigraph()) {               \
 115                         break;                             \
 116                 }                                      \
 117                 custom_putback;                        \
 118                 put_back('?');                         \
 119                 put_back(c);                           \
 120                 c = '?';                               \
 121                 no_trigraph_code;
 122
 123 #define EAT_NEWLINE(newline_code)              \
 124         if(c == '\r') {                            \
 125                 next_char();                           \
 126                 if(c == '\n')                          \
 127                         next_char();                       \
 128                 source_position.linenr++;              \
 129                 newline_code;                          \
 130         } else if(c == '\n') {                     \
 131                 next_char();                           \
 132                 source_position.linenr++;              \
 133                 newline_code;                          \
 134         }
 135
 136 #define SYMBOL_CHARS  \
 137         case 'a':         \
 138         case 'b':         \
 139         case 'c':         \
 140         case 'd':         \
 141         case 'e':         \
 142         case 'f':         \
 143         case 'g':         \
 144         case 'h':         \
 145         case 'i':         \
 146         case 'j':         \
 147         case 'k':         \
 148         case 'l':         \
 149         case 'm':         \
 150         case 'n':         \
 151         case 'o':         \
 152         case 'p':         \
 153         case 'q':         \
 154         case 'r':         \
 155         case 's':         \
 156         case 't':         \
 157         case 'u':         \
 158         case 'v':         \
 159         case 'w':         \
 160         case 'x':         \
 161         case 'y':         \
 162         case 'z':         \
 163         case 'A':         \
 164         case 'B':         \
 165         case 'C':         \
 166         case 'D':         \
 167         case 'E':         \
 168         case 'F':         \
 169         case 'G':         \
 170         case 'H':         \
 171         case 'I':         \
 172         case 'J':         \
 173         case 'K':         \
 174         case 'L':         \
 175         case 'M':         \
 176         case 'N':         \
 177         case 'O':         \
 178         case 'P':         \
 179         case 'Q':         \
 180         case 'R':         \
 181         case 'S':         \
 182         case 'T':         \
 183         case 'U':         \
 184         case 'V':         \
 185         case 'W':         \
 186         case 'X':         \
 187         case 'Y':         \
 188         case 'Z':         \
 189         case '_':
 190
 191 static
 192 void parse_symbol(token_t *token)
 193 {
 194         symbol_t *symbol;
 195         char     *string;
 196
 197         obstack_1grow(&symbol_obstack, c);
 198         next_char();
 199
 200         while(1) {
 201                 switch(c) {
 202                 case '\\':
 203                         next_char();
 204                         EAT_NEWLINE(break;)
 205                         goto end_symbol;
 206
 207                 SYMBOL_CHARS
 208                         obstack_1grow(&symbol_obstack, c);
 209                         next_char();
 210                         break;
 211
 212                 case '?':
 213                         next_char();
 214                         if(c != '?') {
 215                                 put_back(c);
 216                                 c = '?';
 217                                 goto end_symbol;
 218                         }
 219                         next_char();
 220                         if(replace_trigraph())
 221                                 break;
 222                         put_back('?');
 223                         put_back(c);
 224                         c = '?';
 225                         goto end_symbol;
 226
 227                 default:
 228                         goto end_symbol;
 229                 }
 230         }
 231 end_symbol:
 232         obstack_1grow(&symbol_obstack, '\0');
 233
 234         string = obstack_finish(&symbol_obstack);
 235         symbol = symbol_table_insert(string);
 236
 237         token->type     = symbol->ID;
 238         token->v.symbol = symbol;
 239
 240         if(symbol->string != string) {
 241                 obstack_free(&symbol_obstack, string);
 242         }
 243 }
 244
 245 static
 246 void parse_number_hex(token_t *token)
 247 {
 248         assert(c == 'x' || c == 'X');
 249         next_char();
 250
 251         if (!isdigit(c) &&
 252                 !('A' <= c && c <= 'F') &&
 253                 !('a' <= c && c <= 'f')) {
 254                 parse_error("premature end of hex number literal");
 255                 token->type = T_ERROR;
 256                 return;
 257         }
 258
 259         int value = 0;
 260         for(;;) {
 261                 if (isdigit(c)) {
 262                         value = 16 * value + c - '0';
 263                 } else if ('A' <= c && c <= 'F') {
 264                         value = 16 * value + c - 'A' + 10;
 265                 } else if ('a' <= c && c <= 'f') {
 266                         value = 16 * value + c - 'a' + 10;
 267                 } else {
 268                         token->type     = T_INTEGER;
 269                         token->v.intvalue = value;
 270                         return;
 271                 }
 272                 next_char();
 273         }
 274 }
 275
 276 static
 277 void parse_number_oct(token_t *token)
 278 {
 279         assert(c == 'o' || c == 'O');
 280         next_char();
 281
 282         int value = 0;
 283         for(;;) {
 284                 if ('0' <= c && c <= '7') {
 285                         value = 8 * value + c - '0';
 286                 } else {
 287                         token->type     = T_INTEGER;
 288                         token->v.intvalue = value;
 289                         return;
 290                 }
 291                 next_char();
 292         }
 293 }
 294
 295 static
 296 void parse_number_dec(token_t *token, int first_char)
 297 {
 298         int value = 0;
 299         if(first_char > 0) {
 300                 assert(first_char >= '0' && first_char <= '9');
 301                 value = first_char - '0';
 302         }
 303
 304         for(;;) {
 305                 if (isdigit(c)) {
 306                         value = 10 * value + c - '0';
 307                 } else {
 308                         token->type     = T_INTEGER;
 309                         token->v.intvalue = value;
 310                         return;
 311                 }
 312                 next_char();
 313         }
 314 }
 315
 316 static
 317 void parse_number(token_t *token)
 318 {
 319         // TODO check for overflow
 320         // TODO check for various invalid inputs sequences
 321
 322         if (c == '0') {
 323                 next_char();
 324                 switch (c) {
 325                         case 'X':
 326                         case 'x': parse_number_hex(token); break;
 327                         case 'o':
 328                         case 'O': parse_number_oct(token); break;
 329                         default:  parse_number_dec(token, '0');
 330                 }
 331         } else {
 332                 parse_number_dec(token, 0);
 333         }
 334 }
 335
 336 static
 337 int parse_escape_sequence()
 338 {
 339         while(1) {
 340                 int ec = c;
 341                 next_char();
 342
 343                 switch(ec) {
 344                 case '"': return '"';
 345                 case '\'': return'\'';
 346                 case '\\':
 347                         EAT_NEWLINE(break;)
 348                         return '\\';
 349                 case 'a': return '\a';
 350                 case 'b': return '\b';
 351                 case 'f': return '\f';
 352                 case 'n': return '\n';
 353                 case 'r': return '\r';
 354                 case 't': return '\t';
 355                 case 'v': return '\v';
 356                 case 'x': /* TODO parse hex number ... */
 357                         parse_error("hex escape sequences not implemented yet");
 358                         return EOF;
 359                 case 0:
 360                 case 1:
 361                 case 2:
 362                 case 3:
 363                 case 4:
 364                 case 5:
 365                 case 6:
 366                 case 7:
 367                         /* TODO parse octal number ... */
 368                         parse_error("octal escape sequences not implemented yet");
 369                         return EOF;
 370                 case '?':
 371                         if(c != '?') {
 372                                 return '?';
 373                         }
 374                         /* might be a trigraph */
 375                         next_char();
 376                         if(replace_trigraph()) {
 377                                 break;
 378                         }
 379                         put_back(c);
 380                         c = '?';
 381                         return '?';
 382
 383                 case EOF:
 384                         parse_error("reached end of file while parsing escape sequence");
 385                         return EOF;
 386                 default:
 387                         parse_error("unknown escape sequence");
 388                         return EOF;
 389                 }
 390         }
 391 }
 392
 393 static
 394 void parse_string_literal(token_t *token)
 395 {
 396         unsigned    start_linenr = source_position.linenr;
 397         char       *string;
 398         const char *result;
 399
 400         assert(c == '"');
 401         next_char();
 402
 403         while(1) {
 404                 switch(c) {
 405                 SKIP_TRIGRAPHS(,
 406                         obstack_1grow(&symbol_obstack, '?');
 407                         next_char();
 408                         break;
 409                 )
 410
 411                 case '\\':
 412                         next_char();
 413                         EAT_NEWLINE(break;)
 414                         int ec = parse_escape_sequence();
 415                         obstack_1grow(&symbol_obstack, ec);
 416                         break;
 417
 418                 case EOF:
 419                         error_prefix_at(source_position.input_name, start_linenr);
 420                         fprintf(stderr, "string has no end\n");
 421                         token->type = T_ERROR;
 422                         return;
 423
 424                 case '"':
 425                         next_char();
 426                         goto end_of_string;
 427
 428                 default:
 429                         obstack_1grow(&symbol_obstack, c);
 430                         next_char();
 431                         break;
 432                 }
 433         }
 434
 435 end_of_string:
 436
 437         /* TODO: concatenate multiple strings separated by whitespace... */
 438
 439         /* add finishing 0 to the string */
 440         obstack_1grow(&symbol_obstack, '\0');
 441         string = obstack_finish(&symbol_obstack);
 442
 443         /* check if there is already a copy of the string */
 444         result = strset_insert(&stringset, string);
 445         if(result != string) {
 446                 obstack_free(&symbol_obstack, string);
 447         }
 448
 449         token->type     = T_STRING_LITERAL;
 450         token->v.string = result;
 451 }
 452
/* Expands to the cases '\r' and '\n' of a switch. Each case consumes the
 * line end ("\r\n" counts as one), increments source_position.linenr and
 * then runs code. There is no break between the two cases, so code has to
 * leave the switch itself (break/return); otherwise the '\r' case falls
 * through into the '\n' case. */
#define MATCH_NEWLINE(code)                 \
	case '\r':                              \
		next_char();                        \
		if(c == '\n') {                     \
			next_char();                    \
		}                                   \
		source_position.linenr++;           \
		code;                               \
	case '\n':                              \
		next_char();                        \
		source_position.linenr++;           \
		code;
 465
 466 static
 467 void parse_character_constant(token_t *token)
 468 {
 469         assert(c == '\'');
 470         next_char();
 471
 472         int found_char = 0;
 473         while(1) {
 474                 switch(c) {
 475                 SKIP_TRIGRAPHS(,
 476                         found_char = '?';
 477                         break;
 478                 )
 479
 480                 case '\\':
 481                         next_char();
 482                         EAT_NEWLINE(break;)
 483                         found_char = '\\';
 484                         break;
 485
 486                 MATCH_NEWLINE(
 487                         parse_error("newline while parsing character constant");
 488                         break;
 489                 )
 490
 491                 case '\'':
 492                         next_char();
 493                         goto end_of_char_constant;
 494
 495                 case EOF:
 496                         parse_error("EOF while parsing character constant");
 497                         token->type = T_ERROR;
 498                         return;
 499
 500                 default:
 501                         if(found_char != 0) {
 502                                 parse_error("more than 1 characters in character "
 503                                             "constant");
 504                                 goto end_of_char_constant;
 505                         } else {
 506                                 found_char = c;
 507                                 next_char();
 508                         }
 509                         break;
 510                 }
 511         }
 512
 513 end_of_char_constant:
 514         token->type       = T_INTEGER;
 515         token->v.intvalue = found_char;
 516 }
 517
 518 static
 519 void skip_multiline_comment(void)
 520 {
 521         unsigned start_linenr = source_position.linenr;
 522         int had_star = 0;
 523
 524         while(1) {
 525                 switch(c) {
 526                 case '*':
 527                         next_char();
 528                         had_star = 1;
 529                         break;
 530
 531                 case '/':
 532                         next_char();
 533                         if(had_star) {
 534                                 return;
 535                         }
 536                         had_star = 0;
 537                         break;
 538
 539                 case '\\':
 540                         next_char();
 541                         EAT_NEWLINE(break;)
 542                         had_star = 0;
 543                         break;
 544
 545                 case '?':
 546                         next_char();
 547                         if(c != '?') {
 548                                 had_star = 0;
 549                                 break;
 550                         }
 551                         next_char();
 552                         if(replace_trigraph())
 553                                 break;
 554                         put_back(c);
 555                         c = '?';
 556                         had_star = 0;
 557                         /* we don't put back the 2nd ? as the comment text is discarded
 558                          * anyway */
 559                         break;
 560
 561                 MATCH_NEWLINE(had_star = 0; break;)
 562
 563                 case EOF:
 564                         error_prefix_at(source_position.input_name, start_linenr);
 565                         fprintf(stderr, "at end of file while looking for comment end\n");
 566                         return;
 567                 default:
 568                         had_star = 0;
 569                         next_char();
 570                         break;
 571                 }
 572         }
 573 }
 574
 575 static
 576 void skip_line_comment(void)
 577 {
 578         while(1) {
 579                 switch(c) {
 580                 case '?':
 581                         next_char();
 582                         if(c != '?')
 583                                 break;
 584                         next_char();
 585                         if(replace_trigraph())
 586                                 break;
 587                         put_back('?');
 588                         /* we don't put back the 2nd ? as the comment text is discarded
 589                          * anyway */
 590                         break;
 591
 592                 case '\\':
 593                         next_char();
 594                         if(c == '\n') {
 595                                 next_char();
 596                                 source_position.linenr++;
 597                         }
 598                         break;
 599
 600                 case EOF:
 601                 case '\r':
 602                 case '\n':
 603                         return;
 604
 605                 default:
 606                         next_char();
 607                         break;
 608                 }
 609         }
 610 }
 611
/* forward declaration: the preprocessor helpers below call it before its
 * definition */
static
void lexer_next_preprocessing_token(token_t *token);

/* token most recently read by next_pp_token() */
static token_t pp_token;
 616
 617 static inline
 618 void next_pp_token(void)
 619 {
 620         lexer_next_preprocessing_token(&pp_token);
 621 }
 622
 623 static
 624 void eat_until_newline(void)
 625 {
 626         /* TODO */
 627 }
 628
 629 static
 630 void error_directive(void)
 631 {
 632         error_prefix();
 633         fprintf(stderr, "#error directive: \n");
 634
 635         /* parse pp-tokens until new-line */
 636 }
 637
 638 static
 639 void define_directive(void)
 640 {
 641         token_t temptoken;
 642
 643         lexer_next_preprocessing_token(&temptoken);
 644         if(temptoken.type != T_IDENTIFIER) {
 645                 parse_error("expected identifier after #define\n");
 646                 eat_until_newline();
 647         }
 648 }
 649
 650 static
 651 void ifdef_directive(int is_ifndef)
 652 {
 653         (void) is_ifndef;
 654         token_t temptoken;
 655         lexer_next_preprocessing_token(&temptoken);
 656         //expect_identifier();
 657         //extect_newline();
 658 }
 659
 660 static
 661 void endif_directive(void)
 662 {
 663         //expect_newline();
 664 }
 665
 666 static
 667 void parse_line_directive(void)
 668 {
 669         if(pp_token.type != T_INTEGER) {
 670                 parse_error("expected integer");
 671         } else {
 672                 source_position.linenr = pp_token.v.intvalue - 1;
 673                 next_pp_token();
 674         }
 675         if(pp_token.type == T_STRING_LITERAL) {
 676                 source_position.input_name = pp_token.v.string;
 677                 next_pp_token();
 678         }
 679
 680         while(pp_token.type != T_EOF && pp_token.type != '\n') {
 681                 next_pp_token();
 682         }
 683 }
 684
 685 static
 686 void parse_preprocessor_identifier(void)
 687 {
 688         assert(pp_token.type == T_IDENTIFIER);
 689         symbol_t *symbol = pp_token.v.symbol;
 690
 691         switch(symbol->pp_ID) {
 692         case TP_include:
 693                 printf("include - enable header name parsing!\n");
 694                 break;
 695         case TP_define:
 696                 define_directive();
 697                 break;
 698         case TP_ifdef:
 699                 ifdef_directive(0);
 700                 break;
 701         case TP_ifndef:
 702                 ifdef_directive(1);
 703                 break;
 704         case TP_endif:
 705                 endif_directive();
 706                 break;
 707         case TP_line:
 708                 next_pp_token();
 709                 parse_line_directive();
 710                 break;
 711         case TP_if:
 712         case TP_else:
 713         case TP_elif:
 714         case TP_undef:
 715         case TP_error:
 716                 error_directive();
 717                 break;
 718         case TP_pragma:
 719                 break;
 720         }
 721 }
 722
 723 static
 724 void parse_preprocessor_directive(token_t *result_token)
 725 {
 726         next_pp_token();
 727
 728         switch(pp_token.type) {
 729         case T_IDENTIFIER:
 730                 parse_preprocessor_identifier();
 731                 break;
 732         case T_INTEGER:
 733                 parse_line_directive();
 734                 break;
 735         }
 736
 737         lexer_next_token(result_token);
 738 }
 739
/* Building blocks for operator tokens that may continue with further
 * characters. They are meant to be used together inside a function that has
 * a "token" pointer, in the order MAYBE_PROLOG, MAYBE..., ELSE/ELSE_CODE.
 *
 * MAYBE_PROLOG consumes the current character and opens a
 * "while(1) { switch(c) {" that is deliberately left unclosed; ELSE_CODE
 * closes it again. */
#define MAYBE_PROLOG                                       \
			next_char();                                   \
			while(1) {                                     \
				switch(c) {

/* Case for continuation character ch: consumes it, sets token->type to
 * set_type and returns from the enclosing function. */
#define MAYBE(ch, set_type)                                \
				case ch:                                   \
					next_char();                           \
					token->type = set_type;                \
					return;

/* Final part: handles trigraphs and backslash-newline splices (after which
 * the switch is re-evaluated), runs code for every other character, and
 * closes the switch and the loop opened by MAYBE_PROLOG.
 * NOTE(review): a backslash that is not followed by a newline falls through
 * to "default" with c already at the character behind it - confirm that
 * dropping the backslash is intended. */
#define ELSE_CODE(code)                                    \
				SKIP_TRIGRAPHS(,                           \
					code;                                  \
				)                                          \
				                                           \
				case '\\':                                 \
					next_char();                           \
					EAT_NEWLINE(break;)                    \
					/* fallthrough */                      \
				default:                                   \
					code;                                  \
				}                                          \
			} /* end of while(1) */                        \
			break;

/* ELSE_CODE variant that sets token->type to set_type and returns. */
#define ELSE(set_type)                                     \
		ELSE_CODE(                                         \
			token->type = set_type;                        \
			return;                                        \
		)
 771
 772 static
 773 void eat_whitespace()
 774 {
 775         while(1) {
 776                 switch(c) {
 777                 case ' ':
 778                 case '\t':
 779                         next_char();
 780                         break;
 781
 782                 case '\r':
 783                 case '\n':
 784                         return;
 785
 786                 case '\\':
 787                         next_char();
 788                         if(c == '\n') {
 789                                 next_char();
 790                                 source_position.linenr++;
 791                                 break;
 792                         }
 793
 794                         put_back(c);
 795                         c = '\\';
 796                         return;
 797
 798                 SKIP_TRIGRAPHS(,
 799                         return;
 800                 )
 801
 802                 case '/':
 803                         next_char();
 804                         while(1) {
 805                                 switch(c) {
 806                                 case '*':
 807                                         next_char();
 808                                         skip_multiline_comment();
 809                                         eat_whitespace();
 810                                         return;
 811                                 case '/':
 812                                         next_char();
 813                                         skip_line_comment();
 814                                         eat_whitespace();
 815                                         return;
 816
 817                                 SKIP_TRIGRAPHS(
 818                                                 put_back('?');
 819                                         ,
 820                                                 c = '/';
 821                                                 return;
 822                                 )
 823
 824                                 case '\\':
 825                                         next_char();
 826                                         EAT_NEWLINE(break;)
 827                                         /* fallthrough */
 828                                 default:
 829                                         return;
 830                                 }
 831                         }
 832                         break;
 833
 834                 default:
 835                         return;
 836                 }
 837         }
 838 }
 839
 840 static
 841 void lexer_next_preprocessing_token(token_t *token)
 842 {
 843         while(1) {
 844                 switch(c) {
 845                 case ' ':
 846                 case '\t':
 847                         next_char();
 848                         break;
 849
 850                 MATCH_NEWLINE(
 851                         eat_whitespace();
 852                         if(c == '#') {
 853                                 next_char();
 854                                 parse_preprocessor_directive(token);
 855                                 return;
 856                         }
 857                         token->type = '\n';
 858                         return;
 859                 )
 860
 861                 SYMBOL_CHARS
 862                         parse_symbol(token);
 863                         return;
 864
 865                 case '0':
 866                 case '1':
 867                 case '2':
 868                 case '3':
 869                 case '4':
 870                 case '5':
 871                 case '6':
 872                 case '7':
 873                 case '8':
 874                 case '9':
 875                         parse_number(token);
 876                         return;
 877
 878                 case '"':
 879                         parse_string_literal(token);
 880                         return;
 881
 882                 case '\'':
 883                         parse_character_constant(token);
 884                         return;
 885
 886                 case '\\':
 887                         next_char();
 888                         if(c == '\n') {
 889                                 next_char();
 890                                 source_position.linenr++;
 891                                 break;
 892                         } else {
 893                                 parse_error("unexpected '\\' found");
 894                                 token->type = T_ERROR;
 895                         }
 896                         return;
 897
 898                 case '.':
 899                         MAYBE_PROLOG
 900                                 case '.':
 901                                         MAYBE_PROLOG
 902                                         MAYBE('.', T_DOTDOTDOT)
 903                                         ELSE_CODE(
 904                                                 put_back(c);
 905                                                 c = '.';
 906                                                 token->type = '.';
 907                                                 return;
 908                                         )
 909                         ELSE('.')
 910                 case '&':
 911                         MAYBE_PROLOG
 912                         MAYBE('&', T_ANDAND)
 913                         MAYBE('=', T_ANDEQUAL)
 914                         ELSE('&')
 915                 case '*':
 916                         MAYBE_PROLOG
 917                         MAYBE('=', T_ASTERISKEQUAL)
 918                         ELSE('*')
 919                 case '+':
 920                         MAYBE_PROLOG
 921                         MAYBE('+', T_PLUSPLUS)
 922                         MAYBE('=', T_PLUSEQUAL)
 923                         ELSE('+')
 924                 case '-':
 925                         MAYBE_PROLOG
 926                         MAYBE('-', T_MINUSMINUS)
 927                         MAYBE('=', T_MINUSEQUAL)
 928                         ELSE('-')
 929                 case '!':
 930                         MAYBE_PROLOG
 931                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
 932                         ELSE('!')
 933                 case '/':
 934                         MAYBE_PROLOG
 935                         MAYBE('=', T_SLASHEQUAL)
 936                                 case '*':
 937                                         next_char();
 938                                         skip_multiline_comment();
 939                                         lexer_next_preprocessing_token(token);
 940                                         return;
 941                                 case '/':
 942                                         next_char();
 943                                         skip_line_comment();
 944                                         lexer_next_preprocessing_token(token);
 945                                         return;
 946                         ELSE('/')
 947                 case '%':
 948                         MAYBE_PROLOG
 949                         MAYBE('>', T_PERCENTGREATER)
 950                         MAYBE('=', T_PERCENTEQUAL)
 951                                 case ':':
 952                                         MAYBE_PROLOG
 953                                                 case '%':
 954                                                         MAYBE_PROLOG
 955                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
 956                                                         ELSE_CODE(
 957                                                                 put_back(c);
 958                                                                 c = '%';
 959                                                                 token->type = T_PERCENTCOLON;
 960                                                                 return;
 961                                                         )
 962                                         ELSE(T_PERCENTCOLON)
 963                         ELSE('%')
 964                 case '<':
 965                         MAYBE_PROLOG
 966                         MAYBE(':', T_LESSCOLON)
 967                         MAYBE('%', T_LESSPERCENT)
 968                                 case '<':
 969                                         MAYBE_PROLOG
 970                                         MAYBE('=', T_LESSLESSEQUAL)
 971                                         ELSE(T_LESSLESS)
 972                         ELSE('<')
 973                 case '>':
 974                         MAYBE_PROLOG
 975                                 case '>':
 976                                         MAYBE_PROLOG
 977                                         MAYBE('=', T_GREATERGREATEREQUAL)
 978                                         ELSE(T_GREATERGREATER)
 979                         ELSE('>')
 980                 case '^':
 981                         MAYBE_PROLOG
 982                         MAYBE('=', T_CARETEQUAL)
 983                         ELSE('^')
 984                 case '|':
 985                         MAYBE_PROLOG
 986                         MAYBE('=', T_PIPEEQUAL)
 987                         MAYBE('|', T_PIPEPIPE)
 988                         ELSE('|')
 989                 case ':':
 990                         MAYBE_PROLOG
 991                         MAYBE('>', T_COLONGREATER)
 992                         ELSE(':')
 993                 case '=':
 994                         MAYBE_PROLOG
 995                         MAYBE('=', T_EQUALEQUAL)
 996                         ELSE('=')
 997                 case '#':
 998                         MAYBE_PROLOG
 999                         MAYBE('#', T_HASHHASH)
1000                         ELSE('#')
1001
1002                 case '?':
1003                         next_char();
1004                         /* just a simple ? */
1005                         if(c != '?') {
1006                                 token->type = '?';
1007                                 return;
1008                         }
1009                         /* might be a trigraph */
1010                         next_char();
1011                         if(replace_trigraph()) {
1012                                 break;
1013                         }
1014                         put_back(c);
1015                         c = '?';
1016                         token->type = '?';
1017                         return;
1018
1019                 case '[':
1020                 case ']':
1021                 case '(':
1022                 case ')':
1023                 case '{':
1024                 case '}':
1025                 case '~':
1026                 case ';':
1027                 case ',':
1028                         token->type = c;
1029                         next_char();
1030                         return;
1031
1032                 case EOF:
1033                         token->type = T_EOF;
1034                         return;
1035
1036                 default:
1037                         next_char();
1038                         error_prefix();
1039                         fprintf(stderr, "unknown character '%c' found\n", c);
1040                         token->type = T_ERROR;
1041                         return;
1042                 }
1043         }
1044 }
1045
1046 void lexer_next_token(token_t *token)
1047 {
1048         do {
1049                 lexer_next_preprocessing_token(token);
1050         } while(token->type == '\n');
1051 }
1052
1053 void init_lexer(void)
1054 {
1055         strset_init(&stringset);
1056 }
1057
1058 void lexer_open_stream(FILE *stream, const char *input_name)
1059 {
1060         input                      = stream;
1061         source_position.linenr     = 0;
1062         source_position.input_name = input_name;
1063
1064         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
1065          * beginning of a line */
1066         c = '\n';
1067 }
1068
1069 void exit_lexer(void)
1070 {
1071         strset_destroy(&stringset);
1072 }
1073
1074 static __attribute__((unused))
1075 void dbg_pos(const source_position_t source_position)
1076 {
1077         fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);
1078         fflush(stdout);
1079 }