nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7 #include "adt/strset.h"
   8
   9 #include <assert.h>
  10 #include <errno.h>
  11 #include <string.h>
  12 #include <ctype.h>
  13
  /* When DEBUG_CHARS is defined, next_char() and put_back() print every
   * character they handle. */
  14 //#define DEBUG_CHARS
  /* Number of bytes kept free at the start of buf; next_char() always refills
   * the buffer behind this reserved area. */
  15 #define MAX_PUTBACK 3
  16
  /* current input character as set by next_char(), or EOF */
  17 static int         c;
  /* token record filled in by the parse_* functions of this file */
  18 token_t            lexer_token;
  /* stream that next_char() reads from */
  19 static FILE       *input;
  /* read buffer: MAX_PUTBACK reserved bytes followed by the data from fread() */
  20 static char        buf[1024 + MAX_PUTBACK];
  /* one past the last valid byte in buf */
  21 static const char *bufend;
  /* position in buf of the character currently held in c */
  22 static const char *bufpos;
  /* set that string literals are inserted into (see strset_insert() calls) */
  23 static strset_t    stringset;
  24 //static FILE      **input_stack;
  25 //static char      **buf_stack;
  26
  /**
   * Writes the "<input>:<line>: Error: " prefix of a diagnostic to stderr.
   *
   * @param input_name  name of the input the diagnostic refers to
   * @param linenr      line number the diagnostic refers to
   */
  static
  void error_prefix_at(const char *input_name, unsigned linenr)
  {
          /* linenr is unsigned, so the matching conversion is %u (not %d) */
          fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
  }
  32
  33 static
  34 void error_prefix(void)
  35 {
  36         error_prefix_at(lexer_token.source_position.input_name,
  37                         lexer_token.source_position.linenr);
  38 }
  39
  /**
   * Reports a lexer error on stderr: the position prefix of the current token
   * followed by the message and a newline.
   *
   * @param msg  message text, printed verbatim
   */
  static void parse_error(const char *msg)
  {
          error_prefix();
          fprintf(stderr, "%s\n", msg);
  }
  46
  47 static inline
  48 void next_char(void)
  49 {
  50         bufpos++;
  51         if(bufpos >= bufend) {
  52                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  53                                  input);
  54                 if(s == 0) {
  55                         c = EOF;
  56                         return;
  57                 }
  58                 bufpos = buf + MAX_PUTBACK;
  59                 bufend = buf + MAX_PUTBACK + s;
  60         }
  61         c = *(bufpos);
  62 #ifdef DEBUG_CHARS
  63         printf("nchar '%c'\n", c);
  64 #endif
  65 }
  66
  67 static inline
  68 void put_back(int pc)
  69 {
  70         char *p = (char*) bufpos - 1;
  71         bufpos--;
  72         assert(p >= buf);
  73         *p = pc;
  74
  75 #ifdef DEBUG_CHARS
  76         printf("putback '%c'\n", pc);
  77 #endif
  78 }
  79
  80
  81 static
  82 int replace_trigraph(void)
  83 {
  84 #define MATCH_TRIGRAPH(ch,replacement)           \
  85         case ch:                                     \
  86                 c = replacement;                         \
  87                 return 1;
  88
  89         switch(c) {
  90         MATCH_TRIGRAPH('=', '#')
  91         MATCH_TRIGRAPH('(', '[')
  92         MATCH_TRIGRAPH('/', '\\')
  93         MATCH_TRIGRAPH(')', ']')
  94         MATCH_TRIGRAPH('\'', '^')
  95         MATCH_TRIGRAPH('<', '{')
  96         MATCH_TRIGRAPH('!', '|')
  97         MATCH_TRIGRAPH('>', '}')
  98         MATCH_TRIGRAPH('-', '~')
  99         default:
 100                 break;
 101         }
 102
 103         return 0;
 104 }
 105
 /*
  * Expands to a complete `case '?':` for a switch over c.
  *
  *  - "?x" (second character is not '?'): custom_putback is executed, x is
  *    put back, c is reset to '?' and no_trigraph_code is executed.
  *  - "??x" where replace_trigraph() succeeds: c now holds the replacement
  *    character and the case is left with `break`.
  *  - "??x" otherwise: custom_putback is executed, the characters are put
  *    back, c is reset to '?' and no_trigraph_code is executed.
  *
  * Nothing follows the first expansion of no_trigraph_code except the closing
  * brace, so the supplied code has to leave the case itself; every use
  * visible in this file ends it with break, return or goto.
  */
 106 #define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
 107         case '?':                                  \
 108                 next_char();                           \
 109                 if(c != '?') {                         \
 110                         custom_putback;                    \
 111                         put_back(c);                       \
 112                         c = '?';                           \
 113                         no_trigraph_code;                  \
 114                 }                                      \
 115                 next_char();                           \
 116                 if(replace_trigraph()) {               \
 117                         break;                             \
 118                 }                                      \
 119                 custom_putback;                        \
 120                 put_back('?');                         \
 121                 put_back(c);                           \
 122                 c = '?';                               \
 123                 no_trigraph_code;
 124
 /*
  * If c starts a line ending ("\r", "\r\n" or "\n"), consumes it, increments
  * the line counter of lexer_token and then executes newline_code. Does
  * nothing for any other character.
  */
 #define EAT_NEWLINE(newline_code)                      \
         if(c == '\r' || c == '\n') {                   \
                 const int eat_started_with_cr = (c == '\r'); \
                 next_char();                           \
                 if(eat_started_with_cr && c == '\n')   \
                         next_char();                   \
                 lexer_token.source_position.linenr++;  \
                 newline_code;                          \
         }
 137
 /* case labels for the characters a symbol may start with: a-z, A-Z and '_' */
 #define SYMBOL_CHARS                                                \
         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': \
         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': \
         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': \
         case 's': case 't': case 'u': case 'v': case 'w': case 'x': \
         case 'y': case 'z':                                         \
         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': \
         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': \
         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': \
         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': \
         case 'Y': case 'Z':                                         \
         case '_':
 192
 /* case labels for the decimal digits '0' to '9' */
 #define DIGITS                                            \
         case '0': case '1': case '2': case '3': case '4': \
         case '5': case '6': case '7': case '8': case '9':
 204
 205 static
 206 void parse_symbol(void)
 207 {
 208         symbol_t *symbol;
 209         char     *string;
 210
 211         obstack_1grow(&symbol_obstack, c);
 212         next_char();
 213
 214         while(1) {
 215                 switch(c) {
 216                 case '\\':
 217                         next_char();
 218                         EAT_NEWLINE(break;)
 219                         goto end_symbol;
 220
 221                 DIGITS
 222                 SYMBOL_CHARS
 223                         obstack_1grow(&symbol_obstack, c);
 224                         next_char();
 225                         break;
 226
 227                 case '?':
 228                         next_char();
 229                         if(c != '?') {
 230                                 put_back(c);
 231                                 c = '?';
 232                                 goto end_symbol;
 233                         }
 234                         next_char();
 235                         if(replace_trigraph())
 236                                 break;
 237                         put_back('?');
 238                         put_back(c);
 239                         c = '?';
 240                         goto end_symbol;
 241
 242                 default:
 243                         goto end_symbol;
 244                 }
 245         }
 246 end_symbol:
 247         obstack_1grow(&symbol_obstack, '\0');
 248
 249         string = obstack_finish(&symbol_obstack);
 250         symbol = symbol_table_insert(string);
 251
 252         lexer_token.type     = symbol->ID;
 253         lexer_token.v.symbol = symbol;
 254
 255         if(symbol->string != string) {
 256                 obstack_free(&symbol_obstack, string);
 257         }
 258 }
 259
 260 static
 261 void parse_number_hex(void)
 262 {
 263         assert(c == 'x' || c == 'X');
 264         next_char();
 265
 266         if (!isdigit(c) &&
 267                 !('A' <= c && c <= 'F') &&
 268                 !('a' <= c && c <= 'f')) {
 269                 parse_error("premature end of hex number literal");
 270                 lexer_token.type = T_ERROR;
 271                 return;
 272         }
 273
 274         int value = 0;
 275         for(;;) {
 276                 if (isdigit(c)) {
 277                         value = 16 * value + c - '0';
 278                 } else if ('A' <= c && c <= 'F') {
 279                         value = 16 * value + c - 'A' + 10;
 280                 } else if ('a' <= c && c <= 'f') {
 281                         value = 16 * value + c - 'a' + 10;
 282                 } else {
 283                         lexer_token.type     = T_INTEGER;
 284                         lexer_token.v.intvalue = value;
 285                         return;
 286                 }
 287                 next_char();
 288         }
 289 }
 290
 291 static
 292 void parse_number_oct(void)
 293 {
 294         assert(c == 'o' || c == 'O');
 295         next_char();
 296
 297         int value = 0;
 298         for(;;) {
 299                 if ('0' <= c && c <= '7') {
 300                         value = 8 * value + c - '0';
 301                 } else {
 302                         lexer_token.type       = T_INTEGER;
 303                         lexer_token.v.intvalue = value;
 304                         return;
 305                 }
 306                 next_char();
 307         }
 308 }
 309
 310 static
 311 void parse_number_dec(int first_char)
 312 {
 313         int value = 0;
 314         if(first_char > 0) {
 315                 assert(first_char >= '0' && first_char <= '9');
 316                 value = first_char - '0';
 317         }
 318
 319         for(;;) {
 320                 if (isdigit(c)) {
 321                         value = 10 * value + c - '0';
 322                 } else {
 323                         lexer_token.type       = T_INTEGER;
 324                         lexer_token.v.intvalue = value;
 325                         return;
 326                 }
 327                 next_char();
 328         }
 329 }
 330
 331 static
 332 void parse_number(void)
 333 {
 334         // TODO check for overflow
 335         // TODO check for various invalid inputs sequences
 336
 337         if (c == '0') {
 338                 next_char();
 339                 switch (c) {
 340                         case 'X':
 341                         case 'x': parse_number_hex(); break;
 342                         case 'o':
 343                         case 'O': parse_number_oct(); break;
 344                         default:  parse_number_dec('0');
 345                 }
 346         } else {
 347                 parse_number_dec(0);
 348         }
 349 }
 350
 351 static
 352 int parse_escape_sequence(void)
 353 {
 354         while(1) {
 355                 int ec = c;
 356                 next_char();
 357
 358                 switch(ec) {
 359                 case '"': return '"';
 360                 case '\'': return'\'';
 361                 case '\\':
 362                         EAT_NEWLINE(break;)
 363                         return '\\';
 364                 case 'a': return '\a';
 365                 case 'b': return '\b';
 366                 case 'f': return '\f';
 367                 case 'n': return '\n';
 368                 case 'r': return '\r';
 369                 case 't': return '\t';
 370                 case 'v': return '\v';
 371                 case 'x': /* TODO parse hex number ... */
 372                         parse_error("hex escape sequences not implemented yet");
 373                         return EOF;
 374                 case '0':
 375                 case '1':
 376                 case '2':
 377                 case '3':
 378                 case '4':
 379                 case '5':
 380                 case '6':
 381                 case '7':
 382                         /* TODO parse octal number ... */
 383                         parse_error("octal escape sequences not implemented yet");
 384                         return EOF;
 385                 case '?':
 386                         if(c != '?') {
 387                                 return '?';
 388                         }
 389                         /* might be a trigraph */
 390                         next_char();
 391                         if(replace_trigraph()) {
 392                                 break;
 393                         }
 394                         put_back(c);
 395                         c = '?';
 396                         return '?';
 397
 398                 case EOF:
 399                         parse_error("reached end of file while parsing escape sequence");
 400                         return EOF;
 401                 default:
 402                         parse_error("unknown escape sequence");
 403                         return EOF;
 404                 }
 405         }
 406 }
 407
 408 const char *concat_strings(const char *s1, const char *s2)
 409 {
 410         size_t  len1   = strlen(s1);
 411         size_t  len2   = strlen(s2);
 412
 413         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 414         memcpy(concat, s1, len1);
 415         memcpy(concat + len1, s2, len2 + 1);
 416
 417         const char *result = strset_insert(&stringset, concat);
 418         if(result != concat) {
 419                 obstack_free(&symbol_obstack, concat);
 420         }
 421
 422         return result;
 423 }
 424
 425 static
 426 void parse_string_literal(void)
 427 {
 428         unsigned    start_linenr = lexer_token.source_position.linenr;
 429         char       *string;
 430         const char *result;
 431
 432         assert(c == '"');
 433         next_char();
 434
 435         while(1) {
 436                 switch(c) {
 437                 SKIP_TRIGRAPHS(,
 438                         obstack_1grow(&symbol_obstack, '?');
 439                         next_char();
 440                         break;
 441                 )
 442
 443                 case '\\':
 444                         next_char();
 445                         EAT_NEWLINE(break;)
 446                         int ec = parse_escape_sequence();
 447                         obstack_1grow(&symbol_obstack, ec);
 448                         break;
 449
 450                 case EOF:
 451                         error_prefix_at(lexer_token.source_position.input_name,
 452                                         start_linenr);
 453                         fprintf(stderr, "string has no end\n");
 454                         lexer_token.type = T_ERROR;
 455                         return;
 456
 457                 case '"':
 458                         next_char();
 459                         goto end_of_string;
 460
 461                 default:
 462                         obstack_1grow(&symbol_obstack, c);
 463                         next_char();
 464                         break;
 465                 }
 466         }
 467
 468 end_of_string:
 469
 470         /* TODO: concatenate multiple strings separated by whitespace... */
 471
 472         /* add finishing 0 to the string */
 473         obstack_1grow(&symbol_obstack, '\0');
 474         string = obstack_finish(&symbol_obstack);
 475
 476         /* check if there is already a copy of the string */
 477         result = strset_insert(&stringset, string);
 478         if(result != string) {
 479                 obstack_free(&symbol_obstack, string);
 480         }
 481
 482         lexer_token.type     = T_STRING_LITERAL;
 483         lexer_token.v.string = result;
 484 }
 485
 /*
  * Expands to the `case '\r':` and `case '\n':` labels for a switch over c.
  * Each case consumes the line ending ("\r", "\r\n" or "\n"), increments the
  * line counter of lexer_token and then executes `code`.
  *
  * Nothing separates the two cases, so `code` has to leave the switch itself;
  * every use visible in this file ends it with break or return.
  */
 486 #define MATCH_NEWLINE(code)                   \
 487         case '\r':                                \
 488                 next_char();                          \
 489                 if(c == '\n') {                       \
 490                         next_char();                      \
 491                 }                                     \
 492                 lexer_token.source_position.linenr++; \
 493                 code;                                 \
 494         case '\n':                                \
 495                 next_char();                          \
 496                 lexer_token.source_position.linenr++; \
 497                 code;
 498
 499 static
 500 void parse_character_constant(void)
 501 {
 502         assert(c == '\'');
 503         next_char();
 504
 505         int found_char = 0;
 506         while(1) {
 507                 switch(c) {
 508                 SKIP_TRIGRAPHS(,
 509                         found_char = '?';
 510                         break;
 511                 )
 512
 513                 case '\\':
 514                         next_char();
 515                         EAT_NEWLINE(break;)
 516                         found_char = '\\';
 517                         break;
 518
 519                 MATCH_NEWLINE(
 520                         parse_error("newline while parsing character constant");
 521                         break;
 522                 )
 523
 524                 case '\'':
 525                         next_char();
 526                         goto end_of_char_constant;
 527
 528                 case EOF:
 529                         parse_error("EOF while parsing character constant");
 530                         lexer_token.type = T_ERROR;
 531                         return;
 532
 533                 default:
 534                         if(found_char != 0) {
 535                                 parse_error("more than 1 characters in character "
 536                                             "constant");
 537                                 goto end_of_char_constant;
 538                         } else {
 539                                 found_char = c;
 540                                 next_char();
 541                         }
 542                         break;
 543                 }
 544         }
 545
 546 end_of_char_constant:
 547         lexer_token.type       = T_INTEGER;
 548         lexer_token.v.intvalue = found_char;
 549 }
 550
 551 static
 552 void skip_multiline_comment(void)
 553 {
 554         unsigned start_linenr = lexer_token.source_position.linenr;
 555         int had_star = 0;
 556
 557         while(1) {
 558                 switch(c) {
 559                 case '*':
 560                         next_char();
 561                         had_star = 1;
 562                         break;
 563
 564                 case '/':
 565                         next_char();
 566                         if(had_star) {
 567                                 return;
 568                         }
 569                         had_star = 0;
 570                         break;
 571
 572                 case '\\':
 573                         next_char();
 574                         EAT_NEWLINE(break;)
 575                         had_star = 0;
 576                         break;
 577
 578                 case '?':
 579                         next_char();
 580                         if(c != '?') {
 581                                 had_star = 0;
 582                                 break;
 583                         }
 584                         next_char();
 585                         if(replace_trigraph())
 586                                 break;
 587                         put_back(c);
 588                         c = '?';
 589                         had_star = 0;
 590                         /* we don't put back the 2nd ? as the comment text is discarded
 591                          * anyway */
 592                         break;
 593
 594                 MATCH_NEWLINE(had_star = 0; break;)
 595
 596                 case EOF:
 597                         error_prefix_at(lexer_token.source_position.input_name,
 598                                         start_linenr);
 599                         fprintf(stderr, "at end of file while looking for comment end\n");
 600                         return;
 601                 default:
 602                         had_star = 0;
 603                         next_char();
 604                         break;
 605                 }
 606         }
 607 }
 608
 609 static
 610 void skip_line_comment(void)
 611 {
 612         while(1) {
 613                 switch(c) {
 614                 case '?':
 615                         next_char();
 616                         if(c != '?')
 617                                 break;
 618                         next_char();
 619                         if(replace_trigraph())
 620                                 break;
 621                         put_back('?');
 622                         /* we don't put back the 2nd ? as the comment text is discarded
 623                          * anyway */
 624                         break;
 625
 626                 case '\\':
 627                         next_char();
 628                         if(c == '\n') {
 629                                 next_char();
 630                                 lexer_token.source_position.linenr++;
 631                         }
 632                         break;
 633
 634                 case EOF:
 635                 case '\r':
 636                 case '\n':
 637                         return;
 638
 639                 default:
 640                         next_char();
 641                         break;
 642                 }
 643         }
 644 }
 645
 646 static token_t pp_token;
 647
 648 static inline
 649 void next_pp_token(void)
 650 {
 651         lexer_next_preprocessing_token();
 652         pp_token = lexer_token;
 653 }
 654
 655 static
 656 void eat_until_newline(void)
 657 {
 658         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 659                 next_pp_token();
 660         }
 661 }
 662
 /**
  * Reports an #error directive on stderr. The text of the directive is not
  * read or printed yet.
  */
 static void error_directive(void)
 {
         error_prefix();
         fprintf(stderr, "#error directive: \n");

         /* TODO: parse pp-tokens until new-line */
 }
 671
 672 static
 673 void define_directive(void)
 674 {
 675         lexer_next_preprocessing_token();
 676         if(lexer_token.type != T_IDENTIFIER) {
 677                 parse_error("expected identifier after #define\n");
 678                 eat_until_newline();
 679         }
 680 }
 681
 /**
  * Handles "#ifdef" / "#ifndef". Only reads the token after the directive
  * name so far; the condition is not evaluated.
  *
  * @param is_ifndef  non-zero for "#ifndef" (currently unused)
  */
 static void ifdef_directive(int is_ifndef)
 {
         (void) is_ifndef;

         lexer_next_preprocessing_token();
         /* TODO: expect_identifier(); expect_newline(); */
 }
 690
 /**
  * Handles "#endif". Nothing is done yet.
  */
 static void endif_directive(void)
 {
         /* TODO: expect_newline(); */
 }
 696
 697 static
 698 void parse_line_directive(void)
 699 {
 700         if(pp_token.type != T_INTEGER) {
 701                 parse_error("expected integer");
 702         } else {
 703                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
 704                 next_pp_token();
 705         }
 706         if(pp_token.type == T_STRING_LITERAL) {
 707                 lexer_token.source_position.input_name = pp_token.v.string;
 708                 next_pp_token();
 709         }
 710
 711         eat_until_newline();
 712 }
 713
 714 static
 715 void parse_preprocessor_identifier(void)
 716 {
 717         assert(pp_token.type == T_IDENTIFIER);
 718         symbol_t *symbol = pp_token.v.symbol;
 719
 720         switch(symbol->pp_ID) {
 721         case TP_include:
 722                 printf("include - enable header name parsing!\n");
 723                 break;
 724         case TP_define:
 725                 define_directive();
 726                 break;
 727         case TP_ifdef:
 728                 ifdef_directive(0);
 729                 break;
 730         case TP_ifndef:
 731                 ifdef_directive(1);
 732                 break;
 733         case TP_endif:
 734                 endif_directive();
 735                 break;
 736         case TP_line:
 737                 next_pp_token();
 738                 parse_line_directive();
 739                 break;
 740         case TP_if:
 741         case TP_else:
 742         case TP_elif:
 743         case TP_undef:
 744         case TP_error:
 745                 error_directive();
 746                 break;
 747         case TP_pragma:
 748                 break;
 749         }
 750 }
 751
 752 static
 753 void parse_preprocessor_directive()
 754 {
 755         next_pp_token();
 756
 757         switch(pp_token.type) {
 758         case T_IDENTIFIER:
 759                 parse_preprocessor_identifier();
 760                 break;
 761         case T_INTEGER:
 762                 parse_line_directive();
 763                 break;
 764         default:
 765                 parse_error("invalid preprocessor directive");
 766                 eat_until_newline();
 767                 break;
 768         }
 769 }
 770
 /*
  * Opens the handling of a multi-character token: consumes the current
  * character and starts a `while(1) { switch(c) {` over the following one.
  * Both blocks are left open; ELSE_CODE / ELSE supply the closing braces.
  */
 771 #define MAYBE_PROLOG                                       \
 772                         next_char();                                   \
 773                         while(1) {                                     \
 774                                 switch(c) {
 775
 /*
  * One alternative inside a MAYBE_PROLOG switch: if the next character is ch,
  * it is consumed, the token type is set to set_type and the enclosing
  * function returns.
  */
 776 #define MAYBE(ch, set_type)                                \
 777                                 case ch:                                   \
 778                                         next_char();                           \
 779                                         lexer_token.type = set_type;           \
 780                                         return;
 781
 /*
  * Closes the switch and the while loop opened by MAYBE_PROLOG. Adds the
  * trigraph case (with `code` as its no-trigraph action), a backslash case
  * that skips a line continuation and otherwise falls through to the default
  * case, and the default case itself, which executes `code`. The final
  * `break` follows the closed loop.
  */
 782 #define ELSE_CODE(code)                                    \
 783                                 SKIP_TRIGRAPHS(,                           \
 784                                         code;                                  \
 785                                 )                                          \
 786                                                                                                                    \
 787                                 case '\\':                                 \
 788                                         next_char();                           \
 789                                         EAT_NEWLINE(break;)                    \
 790                                         /* fallthrough */                      \
 791                                 default:                                   \
 792                                         code;                                  \
 793                                 }                                          \
 794                         } /* end of while(1) */                        \
 795                         break;
 796
 /*
  * Shorthand for the common ELSE_CODE action: set the token type to set_type
  * and return from the enclosing function.
  */
 797 #define ELSE(set_type)                                     \
 798                 ELSE_CODE(                                         \
 799                         lexer_token.type = set_type;                   \
 800                         return;                                        \
 801                 )
 802
 803 void lexer_next_preprocessing_token(void)
 804 {
 805         while(1) {
 806                 switch(c) {
 807                 case ' ':
 808                 case '\t':
 809                         next_char();
 810                         break;
 811
 812                 MATCH_NEWLINE(
 813                         lexer_token.type = '\n';
 814                         return;
 815                 )
 816
 817                 SYMBOL_CHARS
 818                         parse_symbol();
 819                         return;
 820
 821                 DIGITS
 822                         parse_number();
 823                         return;
 824
 825                 case '"':
 826                         parse_string_literal();
 827                         return;
 828
 829                 case '\'':
 830                         parse_character_constant();
 831                         return;
 832
 833                 case '\\':
 834                         next_char();
 835                         if(c == '\n') {
 836                                 next_char();
 837                                 lexer_token.source_position.linenr++;
 838                                 break;
 839                         } else {
 840                                 parse_error("unexpected '\\' found");
 841                                 lexer_token.type = T_ERROR;
 842                         }
 843                         return;
 844
 845                 case '.':
 846                         MAYBE_PROLOG
 847                                 case '.':
 848                                         MAYBE_PROLOG
 849                                         MAYBE('.', T_DOTDOTDOT)
 850                                         ELSE_CODE(
 851                                                 put_back(c);
 852                                                 c = '.';
 853                                                 lexer_token.type = '.';
 854                                                 return;
 855                                         )
 856                         ELSE('.')
 857                 case '&':
 858                         MAYBE_PROLOG
 859                         MAYBE('&', T_ANDAND)
 860                         MAYBE('=', T_ANDEQUAL)
 861                         ELSE('&')
 862                 case '*':
 863                         MAYBE_PROLOG
 864                         MAYBE('=', T_ASTERISKEQUAL)
 865                         ELSE('*')
 866                 case '+':
 867                         MAYBE_PROLOG
 868                         MAYBE('+', T_PLUSPLUS)
 869                         MAYBE('=', T_PLUSEQUAL)
 870                         ELSE('+')
 871                 case '-':
 872                         MAYBE_PROLOG
 873                         MAYBE('-', T_MINUSMINUS)
 874                         MAYBE('=', T_MINUSEQUAL)
 875                         ELSE('-')
 876                 case '!':
 877                         MAYBE_PROLOG
 878                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
 879                         ELSE('!')
 880                 case '/':
 881                         MAYBE_PROLOG
 882                         MAYBE('=', T_SLASHEQUAL)
 883                                 case '*':
 884                                         next_char();
 885                                         skip_multiline_comment();
 886                                         lexer_next_preprocessing_token();
 887                                         return;
 888                                 case '/':
 889                                         next_char();
 890                                         skip_line_comment();
 891                                         lexer_next_preprocessing_token();
 892                                         return;
 893                         ELSE('/')
 894                 case '%':
 895                         MAYBE_PROLOG
 896                         MAYBE('>', T_PERCENTGREATER)
 897                         MAYBE('=', T_PERCENTEQUAL)
 898                                 case ':':
 899                                         MAYBE_PROLOG
 900                                                 case '%':
 901                                                         MAYBE_PROLOG
 902                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
 903                                                         ELSE_CODE(
 904                                                                 put_back(c);
 905                                                                 c = '%';
 906                                                                 lexer_token.type = T_PERCENTCOLON;
 907                                                                 return;
 908                                                         )
 909                                         ELSE(T_PERCENTCOLON)
 910                         ELSE('%')
 911                 case '<':
 912                         MAYBE_PROLOG
 913                         MAYBE(':', T_LESSCOLON)
 914                         MAYBE('%', T_LESSPERCENT)
 915                                 case '<':
 916                                         MAYBE_PROLOG
 917                                         MAYBE('=', T_LESSLESSEQUAL)
 918                                         ELSE(T_LESSLESS)
 919                         ELSE('<')
 920                 case '>':
 921                         MAYBE_PROLOG
 922                                 case '>':
 923                                         MAYBE_PROLOG
 924                                         MAYBE('=', T_GREATERGREATEREQUAL)
 925                                         ELSE(T_GREATERGREATER)
 926                         ELSE('>')
 927                 case '^':
 928                         MAYBE_PROLOG
 929                         MAYBE('=', T_CARETEQUAL)
 930                         ELSE('^')
 931                 case '|':
 932                         MAYBE_PROLOG
 933                         MAYBE('=', T_PIPEEQUAL)
 934                         MAYBE('|', T_PIPEPIPE)
 935                         ELSE('|')
 936                 case ':':
 937                         MAYBE_PROLOG
 938                         MAYBE('>', T_COLONGREATER)
 939                         ELSE(':')
 940                 case '=':
 941                         MAYBE_PROLOG
 942                         MAYBE('=', T_EQUALEQUAL)
 943                         ELSE('=')
 944                 case '#':
 945                         MAYBE_PROLOG
 946                         MAYBE('#', T_HASHHASH)
 947                         ELSE('#')
 948
 949                 case '?':
 950                         next_char();
 951                         /* just a simple ? */
 952                         if(c != '?') {
 953                                 lexer_token.type = '?';
 954                                 return;
 955                         }
 956                         /* might be a trigraph */
 957                         next_char();
 958                         if(replace_trigraph()) {
 959                                 break;
 960                         }
 961                         put_back(c);
 962                         c = '?';
 963                         lexer_token.type = '?';
 964                         return;
 965
 966                 case '[':
 967                 case ']':
 968                 case '(':
 969                 case ')':
 970                 case '{':
 971                 case '}':
 972                 case '~':
 973                 case ';':
 974                 case ',':
 975                         lexer_token.type = c;
 976                         next_char();
 977                         return;
 978
 979                 case EOF:
 980                         lexer_token.type = T_EOF;
 981                         return;
 982
 983                 default:
 984                         next_char();
 985                         error_prefix();
 986                         fprintf(stderr, "unknown character '%c' found\n", c);
 987                         lexer_token.type = T_ERROR;
 988                         return;
 989                 }
 990         }
 991 }
 992
 993 void lexer_next_token(void)
 994 {
 995         lexer_next_preprocessing_token();
 996         if(lexer_token.type != '\n')
 997                 return;
 998
 999 newline_found:
1000         do {
1001                 lexer_next_preprocessing_token();
1002         } while(lexer_token.type == '\n');
1003
1004         if(lexer_token.type == '#') {
1005                 parse_preprocessor_directive();
1006                 goto newline_found;
1007         }
1008 }
1009
1010 void init_lexer(void)
1011 {
1012         strset_init(&stringset);
1013 }
1014
1015 void lexer_open_stream(FILE *stream, const char *input_name)
1016 {
1017         input                                  = stream;
1018         lexer_token.source_position.linenr     = 0;
1019         lexer_token.source_position.input_name = input_name;
1020
1021         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
1022          * beginning of a line */
1023         c = '\n';
1024 }
1025
1026 void exit_lexer(void)
1027 {
1028         strset_destroy(&stringset);
1029 }
1030
1031 static __attribute__((unused))
1032 void dbg_pos(const source_position_t source_position)
1033 {
1034         fprintf(stdout, "%s:%d\n", source_position.input_name,
1035                 source_position.linenr);
1036         fflush(stdout);
1037 }