nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7 #include "adt/strset.h"
   8 #include "adt/util.h"
   9
  10 #include <assert.h>
  11 #include <errno.h>
  12 #include <string.h>
  13 #include <ctype.h>
  14
/* Uncomment to trace every character that is read or put back on stdout. */
//#define DEBUG_CHARS
/* Number of bytes reserved in front of the read area of buf, so that
 * characters can be pushed back with put_back(). */
#define MAX_PUTBACK 3

/* the current character; the lexer functions dispatch on it (EOF at the end
 * of the input) */
static int         c;
/* the token produced by the most recent lexer call */
token_t            lexer_token;
/* stream the characters are read from, set by lexer_open_stream() */
static FILE       *input;
/* read buffer; fread() fills it starting at buf + MAX_PUTBACK */
static char        buf[1024 + MAX_PUTBACK];
/* one past the last valid byte in buf */
static const char *bufend;
/* position of the current character inside buf */
static const char *bufpos;
/* set of all string literals seen so far; equal strings are stored once */
static strset_t    stringset;
  25
  26 static void error_prefix_at(const char *input_name, unsigned linenr)
  27 {
  28         fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
  29 }
  30
  31 static void error_prefix(void)
  32 {
  33         error_prefix_at(lexer_token.source_position.input_name,
  34                         lexer_token.source_position.linenr);
  35 }
  36
  37 static void parse_error(const char *msg)
  38 {
  39         error_prefix();
  40         fprintf(stderr, "%s\n", msg);
  41 }
  42
  43 static inline void next_real_char(void)
  44 {
  45         bufpos++;
  46         if(bufpos >= bufend) {
  47                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  48                                  input);
  49                 if(s == 0) {
  50                         c = EOF;
  51                         return;
  52                 }
  53                 bufpos = buf + MAX_PUTBACK;
  54                 bufend = buf + MAX_PUTBACK + s;
  55         }
  56         c = *(bufpos);
  57 }
  58
  59 static inline void put_back(int pc)
  60 {
  61         assert(bufpos >= buf);
  62         assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
  63
  64         char *p = buf + (bufpos - buf);
  65         *p = pc;
  66
  67         /* going backwards in the buffer is legal as long as it's not more often
  68          * than MAX_PUTBACK */
  69         bufpos--;
  70
  71 #ifdef DEBUG_CHARS
  72         printf("putback '%c'\n", pc);
  73 #endif
  74 }
  75
/* forward declaration: the macros below expand to next_char() calls in code
 * that is placed before the definition of next_char() */
static inline void next_char(void);

/*
 * Expands to the case labels '\r' and '\n' for use inside a switch(c).
 * Both labels consume the line break (a '\r' directly followed by '\n' counts
 * as a single break), increment the line number in
 * lexer_token.source_position and then execute `code`.
 * `code` has to leave the case on its own (return, break, goto): otherwise
 * the '\r' label falls through into the '\n' label.
 */
#define MATCH_NEWLINE(code)                   \
        case '\r':                                \
                next_char();                          \
                if(c == '\n') {                       \
                        next_char();                      \
                }                                     \
                lexer_token.source_position.linenr++; \
                code;                                 \
        case '\n':                                \
                next_char();                          \
                lexer_token.source_position.linenr++; \
                code;

/* Asserts that the current character is c_type and advances to the next one. */
#define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
  92
  93 static void maybe_concat_lines(void)
  94 {
  95         eat('\\');
  96
  97         switch(c) {
  98         MATCH_NEWLINE(return;)
  99
 100         default:
 101                 break;
 102         }
 103
 104         put_back(c);
 105         c = '\\';
 106 }
 107
/**
 * Advances c to the next character of the input.
 *
 * Currently this is just next_real_char(). The block under "#if 0" is a
 * disabled filter that would join lines ending in a backslash and replace
 * trigraphs ("??=" and friends) before the rest of the lexer sees them.
 */
static inline void next_char(void)
{
        next_real_char();

#if 0
        /* a backslash may start a line continuation */
        if(UNLIKELY(c == '\\')) {
                maybe_concat_lines();
                goto end_of_next_char;
        }

        /* trigraphs start with two question marks */
        if(LIKELY(c != '?'))
                goto end_of_next_char;

        next_real_char();
        if(LIKELY(c != '?')) {
                /* single '?': undo the lookahead */
                put_back(c);
                c = '?';
                goto end_of_next_char;
        }

        /* "??" was read, the third character selects the replacement */
        next_real_char();
        switch(c) {
        case '=': c = '#'; break;
        case '(': c = '['; break;
        case '/': c = '\\'; maybe_concat_lines(); break;
        case ')': c = ']'; break;
        case '\'': c = '^'; break;
        case '<': c = '{'; break;
        case '!': c = '|'; break;
        case '>': c = '}'; break;
        case '-': c = '~'; break;
        default:
                /* no trigraph: put back both lookahead characters and deliver
                 * the first '?' */
                put_back('?');
                put_back(c);
                c = '?';
                break;
        }

end_of_next_char:
#endif
        /* maybe_concat_lines is only used by the disabled code above; this
         * reference keeps it from being reported as unused */
        (void) maybe_concat_lines;
#ifdef DEBUG_CHARS
        printf("nchar '%c'\n", c);
#else
        ;
#endif
}
 156
/*
 * Case labels for all characters that may start an identifier: the letters
 * a-z, A-Z and the underscore. To be used inside a switch; the statements
 * for these characters follow directly after the macro.
 */
#define SYMBOL_CHARS  \
        case 'a':         \
        case 'b':         \
        case 'c':         \
        case 'd':         \
        case 'e':         \
        case 'f':         \
        case 'g':         \
        case 'h':         \
        case 'i':         \
        case 'j':         \
        case 'k':         \
        case 'l':         \
        case 'm':         \
        case 'n':         \
        case 'o':         \
        case 'p':         \
        case 'q':         \
        case 'r':         \
        case 's':         \
        case 't':         \
        case 'u':         \
        case 'v':         \
        case 'w':         \
        case 'x':         \
        case 'y':         \
        case 'z':         \
        case 'A':         \
        case 'B':         \
        case 'C':         \
        case 'D':         \
        case 'E':         \
        case 'F':         \
        case 'G':         \
        case 'H':         \
        case 'I':         \
        case 'J':         \
        case 'K':         \
        case 'L':         \
        case 'M':         \
        case 'N':         \
        case 'O':         \
        case 'P':         \
        case 'Q':         \
        case 'R':         \
        case 'S':         \
        case 'T':         \
        case 'U':         \
        case 'V':         \
        case 'W':         \
        case 'X':         \
        case 'Y':         \
        case 'Z':         \
        case '_':
 211
/*
 * Case labels for the decimal digits 0-9. To be used inside a switch; the
 * statements for these characters follow directly after the macro.
 */
#define DIGITS        \
        case '0':         \
        case '1':         \
        case '2':         \
        case '3':         \
        case '4':         \
        case '5':         \
        case '6':         \
        case '7':         \
        case '8':         \
        case '9':
 223
 224 static void parse_symbol(void)
 225 {
 226         symbol_t *symbol;
 227         char     *string;
 228
 229         obstack_1grow(&symbol_obstack, c);
 230         next_char();
 231
 232         while(1) {
 233                 switch(c) {
 234                 DIGITS
 235                 SYMBOL_CHARS
 236                         obstack_1grow(&symbol_obstack, c);
 237                         next_char();
 238                         break;
 239
 240                 default:
 241                         goto end_symbol;
 242                 }
 243         }
 244
 245 end_symbol:
 246         obstack_1grow(&symbol_obstack, '\0');
 247
 248         string = obstack_finish(&symbol_obstack);
 249         symbol = symbol_table_insert(string);
 250
 251         lexer_token.type     = symbol->ID;
 252         lexer_token.v.symbol = symbol;
 253
 254         if(symbol->string != string) {
 255                 obstack_free(&symbol_obstack, string);
 256         }
 257 }
 258
 259 static void parse_integer_suffix(void)
 260 {
 261         if(c == 'U' || c == 'U') {
 262                 /* TODO do something with the suffixes... */
 263                 next_char();
 264                 if(c == 'L' || c == 'l') {
 265                         next_char();
 266                         if(c == 'L' || c == 'l') {
 267                                 next_char();
 268                         }
 269                 }
 270         } else if(c == 'l' || c == 'L') {
 271                 next_char();
 272                 if(c == 'l' || c == 'L') {
 273                         next_char();
 274                         if(c == 'u' || c == 'U') {
 275                                 next_char();
 276                         }
 277                 } else if(c == 'u' || c == 'U') {
 278                         next_char();
 279                 }
 280         }
 281 }
 282
 283 static void parse_floating_suffix(void)
 284 {
 285         switch(c) {
 286         /* TODO: do something usefull with the suffixes... */
 287         case 'f':
 288         case 'F':
 289         case 'l':
 290         case 'L':
 291                 next_char();
 292                 break;
 293         default:
 294                 break;
 295         }
 296 }
 297
 298 static void parse_number_hex(void)
 299 {
 300         assert(c == 'x' || c == 'X');
 301         next_char();
 302
 303         if (!isdigit(c) &&
 304                 !('A' <= c && c <= 'F') &&
 305                 !('a' <= c && c <= 'f')) {
 306                 parse_error("premature end of hex number literal");
 307                 lexer_token.type = T_ERROR;
 308                 return;
 309         }
 310
 311         int value = 0;
 312         while(1) {
 313                 if (isdigit(c)) {
 314                         value = 16 * value + c - '0';
 315                 } else if ('A' <= c && c <= 'F') {
 316                         value = 16 * value + c - 'A' + 10;
 317                 } else if ('a' <= c && c <= 'f') {
 318                         value = 16 * value + c - 'a' + 10;
 319                 } else {
 320                         parse_integer_suffix();
 321
 322                         lexer_token.type       = T_INTEGER;
 323                         lexer_token.v.intvalue = value;
 324                         return;
 325                 }
 326                 next_char();
 327         }
 328
 329         if(c == '.' || c == 'p' || c == 'P') {
 330                 next_char();
 331                 panic("Hex floating point numbers not implemented yet");
 332         }
 333 }
 334
 335 static void parse_number_oct(void)
 336 {
 337         int value = 0;
 338         while(c >= '0' && c <= '7') {
 339                 value = 8 * value + c - '0';
 340                 next_char();
 341         }
 342         if (c == '8' || c == '9') {
 343                 parse_error("invalid octal number");
 344                 lexer_token.type = T_ERROR;
 345                 return;
 346         }
 347
 348         lexer_token.type       = T_INTEGER;
 349         lexer_token.v.intvalue = value;
 350
 351         parse_integer_suffix();
 352 }
 353
 354 static void parse_floatingpoint_exponent(long double value)
 355 {
 356         unsigned int expo = 0;
 357         long double  factor = 10.;
 358
 359         if(c == '-') {
 360                 next_char();
 361                 factor = 0.1;
 362         } else if(c == '+') {
 363                 next_char();
 364         }
 365
 366         while(c >= '0' && c <= '9') {
 367                 expo = 10 * expo + (c - '0');
 368                 next_char();
 369         }
 370
 371         while(1) {
 372                 if(expo & 1)
 373                         value *= factor;
 374                 expo >>= 1;
 375                 if(expo == 0)
 376                         break;
 377                 factor *= factor;
 378         }
 379
 380         lexer_token.type         = T_FLOATINGPOINT;
 381         lexer_token.v.floatvalue = value;
 382
 383         parse_floating_suffix();
 384 }
 385
 386 static void parse_floatingpoint_fract(int integer_part)
 387 {
 388         long double value  = integer_part;
 389         long double factor = 1.;
 390
 391         while(c >= '0' && c <= '9') {
 392                 factor *= 0.1;
 393                 value  += (c - '0') * factor;
 394                 next_char();
 395         }
 396
 397         if(c == 'e' || c == 'E') {
 398                 next_char();
 399                 parse_floatingpoint_exponent(value);
 400                 return;
 401         }
 402
 403         lexer_token.type         = T_FLOATINGPOINT;
 404         lexer_token.v.floatvalue = value;
 405
 406         parse_floating_suffix();
 407 }
 408
 409 static void parse_number_dec(void)
 410 {
 411         int value = 0;
 412
 413         while(isdigit(c)) {
 414                 value = 10 * value + c - '0';
 415                 next_char();
 416         }
 417
 418         if(c == '.') {
 419                 next_char();
 420                 parse_floatingpoint_fract(value);
 421                 return;
 422         }
 423         if(c == 'e' || c == 'E') {
 424                 next_char();
 425                 parse_floatingpoint_exponent(value);
 426                 return;
 427         }
 428         parse_integer_suffix();
 429
 430         lexer_token.type       = T_INTEGER;
 431         lexer_token.v.intvalue = value;
 432 }
 433
 434 static void parse_number(void)
 435 {
 436         if (c == '0') {
 437                 next_char();
 438                 switch (c) {
 439                         case 'X':
 440                         case 'x':
 441                                 parse_number_hex();
 442                                 break;
 443                         case '0':
 444                         case '1':
 445                         case '2':
 446                         case '3':
 447                         case '4':
 448                         case '5':
 449                         case '6':
 450                         case '7':
 451                                 parse_number_oct();
 452                                 break;
 453                         case '.':
 454                                 next_char();
 455                                 parse_floatingpoint_fract(0);
 456                                 break;
 457                         case 'e':
 458                         case 'E':
 459                                 parse_floatingpoint_exponent(0);
 460                                 break;
 461                         case '8':
 462                         case '9':
 463                                 next_char();
 464                                 parse_error("invalid octal number");
 465                                 lexer_token.type = T_ERROR;
 466                                 return;
 467                         default:
 468                                 put_back(c);
 469                                 c = '0';
 470                                 parse_number_dec();
 471                                 return;
 472                 }
 473         } else {
 474                 parse_number_dec();
 475         }
 476 }
 477
 478 static int parse_octal_sequence(void)
 479 {
 480         int value = 0;
 481         while(1) {
 482                 if(c < '0' || c > '7')
 483                         break;
 484                 value = 8 * value + c - '0';
 485                 next_char();
 486         }
 487
 488         return value;
 489 }
 490
 491 static int parse_hex_sequence(void)
 492 {
 493         int value = 0;
 494         while(1) {
 495                 if (c >= '0' && c <= '9') {
 496                         value = 16 * value + c - '0';
 497                 } else if ('A' <= c && c <= 'F') {
 498                         value = 16 * value + c - 'A' + 10;
 499                 } else if ('a' <= c && c <= 'f') {
 500                         value = 16 * value + c - 'a' + 10;
 501                 } else {
 502                         break;
 503                 }
 504                 next_char();
 505         }
 506
 507         return value;
 508 }
 509
 510 static int parse_escape_sequence(void)
 511 {
 512         eat('\\');
 513
 514         int ec = c;
 515         next_char();
 516
 517         switch(ec) {
 518         case '"':  return '"';
 519         case '\'': return'\'';
 520         case '\\': return '\\';
 521         case '?': return '\?';
 522         case 'a': return '\a';
 523         case 'b': return '\b';
 524         case 'f': return '\f';
 525         case 'n': return '\n';
 526         case 'r': return '\r';
 527         case 't': return '\t';
 528         case 'v': return '\v';
 529         case 'x':
 530                 return parse_hex_sequence();
 531         case '0':
 532         case '1':
 533         case '2':
 534         case '3':
 535         case '4':
 536         case '5':
 537         case '6':
 538         case '7':
 539                 return parse_octal_sequence();
 540         case EOF:
 541                 parse_error("reached end of file while parsing escape sequence");
 542                 return EOF;
 543         default:
 544                 parse_error("unknown escape sequence");
 545                 return EOF;
 546         }
 547 }
 548
 549 const char *concat_strings(const char *s1, const char *s2)
 550 {
 551         size_t  len1   = strlen(s1);
 552         size_t  len2   = strlen(s2);
 553
 554         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 555         memcpy(concat, s1, len1);
 556         memcpy(concat + len1, s2, len2 + 1);
 557
 558         const char *result = strset_insert(&stringset, concat);
 559         if(result != concat) {
 560                 obstack_free(&symbol_obstack, concat);
 561         }
 562
 563         return result;
 564 }
 565
 566 static void parse_string_literal(void)
 567 {
 568         unsigned    start_linenr = lexer_token.source_position.linenr;
 569         char       *string;
 570         const char *result;
 571
 572         assert(c == '"');
 573         next_char();
 574
 575         int tc;
 576         while(1) {
 577                 switch(c) {
 578                 case '\\':
 579                         tc = parse_escape_sequence();
 580                         obstack_1grow(&symbol_obstack, tc);
 581                         break;
 582
 583                 case EOF:
 584                         error_prefix_at(lexer_token.source_position.input_name,
 585                                         start_linenr);
 586                         fprintf(stderr, "string has no end\n");
 587                         lexer_token.type = T_ERROR;
 588                         return;
 589
 590                 case '"':
 591                         next_char();
 592                         goto end_of_string;
 593
 594                 default:
 595                         obstack_1grow(&symbol_obstack, c);
 596                         next_char();
 597                         break;
 598                 }
 599         }
 600
 601 end_of_string:
 602
 603         /* TODO: concatenate multiple strings separated by whitespace... */
 604
 605         /* add finishing 0 to the string */
 606         obstack_1grow(&symbol_obstack, '\0');
 607         string = obstack_finish(&symbol_obstack);
 608
 609         /* check if there is already a copy of the string */
 610         result = strset_insert(&stringset, string);
 611         if(result != string) {
 612                 obstack_free(&symbol_obstack, string);
 613         }
 614
 615         lexer_token.type     = T_STRING_LITERAL;
 616         lexer_token.v.string = result;
 617 }
 618
 619 static void parse_character_constant(void)
 620 {
 621         eat('\'');
 622
 623         int found_char = 0;
 624         while(1) {
 625                 switch(c) {
 626                 case '\\':
 627                         found_char = parse_escape_sequence();
 628                         break;
 629
 630                 MATCH_NEWLINE(
 631                         parse_error("newline while parsing character constant");
 632                         break;
 633                 )
 634
 635                 case '\'':
 636                         next_char();
 637                         goto end_of_char_constant;
 638
 639                 case EOF:
 640                         parse_error("EOF while parsing character constant");
 641                         lexer_token.type = T_ERROR;
 642                         return;
 643
 644                 default:
 645                         if(found_char != 0) {
 646                                 parse_error("more than 1 characters in character "
 647                                             "constant");
 648                                 goto end_of_char_constant;
 649                         } else {
 650                                 found_char = c;
 651                                 next_char();
 652                         }
 653                         break;
 654                 }
 655         }
 656
 657 end_of_char_constant:
 658         lexer_token.type       = T_INTEGER;
 659         lexer_token.v.intvalue = found_char;
 660 }
 661
 662 static void skip_multiline_comment(void)
 663 {
 664         unsigned start_linenr = lexer_token.source_position.linenr;
 665
 666         while(1) {
 667                 switch(c) {
 668                 case '*':
 669                         next_char();
 670                         if(c == '/') {
 671                                 next_char();
 672                                 return;
 673                         }
 674                         break;
 675
 676                 MATCH_NEWLINE(break;)
 677
 678                 case EOF:
 679                         error_prefix_at(lexer_token.source_position.input_name,
 680                                         start_linenr);
 681                         fprintf(stderr, "at end of file while looking for comment end\n");
 682                         return;
 683
 684                 default:
 685                         next_char();
 686                         break;
 687                 }
 688         }
 689 }
 690
 691 static void skip_line_comment(void)
 692 {
 693         while(1) {
 694                 switch(c) {
 695                 case EOF:
 696                         return;
 697
 698                 case '\n':
 699                 case '\r':
 700                         return;
 701
 702                 default:
 703                         next_char();
 704                         break;
 705                 }
 706         }
 707 }
 708
 709 static token_t pp_token;
 710
 711 static inline void next_pp_token(void)
 712 {
 713         lexer_next_preprocessing_token();
 714         pp_token = lexer_token;
 715 }
 716
 717 static void eat_until_newline(void)
 718 {
 719         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 720                 next_pp_token();
 721         }
 722 }
 723
 724 static void error_directive(void)
 725 {
 726         error_prefix();
 727         fprintf(stderr, "#error directive: \n");
 728
 729         /* parse pp-tokens until new-line */
 730 }
 731
 732 static void define_directive(void)
 733 {
 734         lexer_next_preprocessing_token();
 735         if(lexer_token.type != T_IDENTIFIER) {
 736                 parse_error("expected identifier after #define\n");
 737                 eat_until_newline();
 738         }
 739 }
 740
 741 static void ifdef_directive(int is_ifndef)
 742 {
 743         (void) is_ifndef;
 744         lexer_next_preprocessing_token();
 745         //expect_identifier();
 746         //extect_newline();
 747 }
 748
 749 static void endif_directive(void)
 750 {
 751         //expect_newline();
 752 }
 753
 754 static void parse_line_directive(void)
 755 {
 756         if(pp_token.type != T_INTEGER) {
 757                 parse_error("expected integer");
 758         } else {
 759                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
 760                 next_pp_token();
 761         }
 762         if(pp_token.type == T_STRING_LITERAL) {
 763                 lexer_token.source_position.input_name = pp_token.v.string;
 764                 next_pp_token();
 765         }
 766
 767         eat_until_newline();
 768 }
 769
 770 static void parse_preprocessor_identifier(void)
 771 {
 772         assert(pp_token.type == T_IDENTIFIER);
 773         symbol_t *symbol = pp_token.v.symbol;
 774
 775         switch(symbol->pp_ID) {
 776         case TP_include:
 777                 printf("include - enable header name parsing!\n");
 778                 break;
 779         case TP_define:
 780                 define_directive();
 781                 break;
 782         case TP_ifdef:
 783                 ifdef_directive(0);
 784                 break;
 785         case TP_ifndef:
 786                 ifdef_directive(1);
 787                 break;
 788         case TP_endif:
 789                 endif_directive();
 790                 break;
 791         case TP_line:
 792                 next_pp_token();
 793                 parse_line_directive();
 794                 break;
 795         case TP_if:
 796         case TP_else:
 797         case TP_elif:
 798         case TP_undef:
 799         case TP_error:
 800                 error_directive();
 801                 break;
 802         case TP_pragma:
 803                 break;
 804         }
 805 }
 806
 807 static void parse_preprocessor_directive(void)
 808 {
 809         next_pp_token();
 810
 811         switch(pp_token.type) {
 812         case T_IDENTIFIER:
 813                 parse_preprocessor_identifier();
 814                 break;
 815         case T_INTEGER:
 816                 parse_line_directive();
 817                 break;
 818         default:
 819                 parse_error("invalid preprocessor directive");
 820                 eat_until_newline();
 821                 break;
 822         }
 823 }
 824
/*
 * Helper macros for operators that consist of several characters. They are
 * used inside the switch(c) of lexer_next_preprocessing_token():
 *
 *   case '&':
 *       MAYBE_PROLOG
 *       MAYBE('&', T_ANDAND)
 *       MAYBE('=', T_ANDEQUAL)
 *       ELSE('&')
 *
 * MAYBE_PROLOG consumes the first character of the operator and opens a
 * "while(1) { switch(c) {" on the character that follows. It has to be
 * closed by ELSE or ELSE_CODE.
 */
#define MAYBE_PROLOG                                       \
                        next_char();                                   \
                        while(1) {                                     \
                                switch(c) {

/* Case for the lookahead character ch: consumes it and returns with the
 * token type set_type. */
#define MAYBE(ch, set_type)                                \
                                case ch:                                   \
                                        next_char();                           \
                                        lexer_token.type = set_type;           \
                                        return;

/* Default case that runs `code` for any other lookahead character, then
 * closes the switch and the loop opened by MAYBE_PROLOG. */
#define ELSE_CODE(code)                                    \
                                default:                                   \
                                        code;                                  \
                                }                                          \
                        } /* end of while(1) */                        \
                        break;

/* Default case that returns with the token type set_type; the lookahead
 * character is not consumed. */
#define ELSE(set_type)                                     \
                ELSE_CODE(                                         \
                        lexer_token.type = set_type;                   \
                        return;                                        \
                )
 848
 849 void lexer_next_preprocessing_token(void)
 850 {
 851         while(1) {
 852                 switch(c) {
 853                 case ' ':
 854                 case '\t':
 855                         next_char();
 856                         break;
 857
 858                 MATCH_NEWLINE(
 859                         lexer_token.type = '\n';
 860                         return;
 861                 )
 862
 863                 SYMBOL_CHARS
 864                         parse_symbol();
 865                         return;
 866
 867                 DIGITS
 868                         parse_number();
 869                         return;
 870
 871                 case '"':
 872                         parse_string_literal();
 873                         return;
 874
 875                 case '\'':
 876                         parse_character_constant();
 877                         return;
 878
 879                 case '.':
 880                         MAYBE_PROLOG
 881                                 case '.':
 882                                         MAYBE_PROLOG
 883                                         MAYBE('.', T_DOTDOTDOT)
 884                                         ELSE_CODE(
 885                                                 put_back(c);
 886                                                 c = '.';
 887                                                 lexer_token.type = '.';
 888                                                 return;
 889                                         )
 890                         ELSE('.')
 891                 case '&':
 892                         MAYBE_PROLOG
 893                         MAYBE('&', T_ANDAND)
 894                         MAYBE('=', T_ANDEQUAL)
 895                         ELSE('&')
 896                 case '*':
 897                         MAYBE_PROLOG
 898                         MAYBE('=', T_ASTERISKEQUAL)
 899                         ELSE('*')
 900                 case '+':
 901                         MAYBE_PROLOG
 902                         MAYBE('+', T_PLUSPLUS)
 903                         MAYBE('=', T_PLUSEQUAL)
 904                         ELSE('+')
 905                 case '-':
 906                         MAYBE_PROLOG
 907                         MAYBE('>', T_MINUSGREATER)
 908                         MAYBE('-', T_MINUSMINUS)
 909                         MAYBE('=', T_MINUSEQUAL)
 910                         ELSE('-')
 911                 case '!':
 912                         MAYBE_PROLOG
 913                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
 914                         ELSE('!')
 915                 case '/':
 916                         MAYBE_PROLOG
 917                         MAYBE('=', T_SLASHEQUAL)
 918                                 case '*':
 919                                         next_char();
 920                                         skip_multiline_comment();
 921                                         lexer_next_preprocessing_token();
 922                                         return;
 923                                 case '/':
 924                                         next_char();
 925                                         skip_line_comment();
 926                                         lexer_next_preprocessing_token();
 927                                         return;
 928                         ELSE('/')
 929                 case '%':
 930                         MAYBE_PROLOG
 931                         MAYBE('>', T_PERCENTGREATER)
 932                         MAYBE('=', T_PERCENTEQUAL)
 933                                 case ':':
 934                                         MAYBE_PROLOG
 935                                                 case '%':
 936                                                         MAYBE_PROLOG
 937                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
 938                                                         ELSE_CODE(
 939                                                                 put_back(c);
 940                                                                 c = '%';
 941                                                                 lexer_token.type = T_PERCENTCOLON;
 942                                                                 return;
 943                                                         )
 944                                         ELSE(T_PERCENTCOLON)
 945                         ELSE('%')
 946                 case '<':
 947                         MAYBE_PROLOG
 948                         MAYBE(':', T_LESSCOLON)
 949                         MAYBE('%', T_LESSPERCENT)
 950                         MAYBE('=', T_LESSEQUAL)
 951                                 case '<':
 952                                         MAYBE_PROLOG
 953                                         MAYBE('=', T_LESSLESSEQUAL)
 954                                         ELSE(T_LESSLESS)
 955                         ELSE('<')
 956                 case '>':
 957                         MAYBE_PROLOG
 958                         MAYBE('=', T_GREATEREQUAL)
 959                                 case '>':
 960                                         MAYBE_PROLOG
 961                                         MAYBE('=', T_GREATERGREATEREQUAL)
 962                                         ELSE(T_GREATERGREATER)
 963                         ELSE('>')
 964                 case '^':
 965                         MAYBE_PROLOG
 966                         MAYBE('=', T_CARETEQUAL)
 967                         ELSE('^')
 968                 case '|':
 969                         MAYBE_PROLOG
 970                         MAYBE('=', T_PIPEEQUAL)
 971                         MAYBE('|', T_PIPEPIPE)
 972                         ELSE('|')
 973                 case ':':
 974                         MAYBE_PROLOG
 975                         MAYBE('>', T_COLONGREATER)
 976                         ELSE(':')
 977                 case '=':
 978                         MAYBE_PROLOG
 979                         MAYBE('=', T_EQUALEQUAL)
 980                         ELSE('=')
 981                 case '#':
 982                         MAYBE_PROLOG
 983                         MAYBE('#', T_HASHHASH)
 984                         ELSE('#')
 985
 986                 case '?':
 987                 case '[':
 988                 case ']':
 989                 case '(':
 990                 case ')':
 991                 case '{':
 992                 case '}':
 993                 case '~':
 994                 case ';':
 995                 case ',':
 996                 case '\\':
 997                         lexer_token.type = c;
 998                         next_char();
 999                         return;
1000
1001                 case EOF:
1002                         lexer_token.type = T_EOF;
1003                         return;
1004
1005                 default:
1006                         next_char();
1007                         error_prefix();
1008                         fprintf(stderr, "unknown character '%c' found\n", c);
1009                         lexer_token.type = T_ERROR;
1010                         return;
1011                 }
1012         }
1013 }
1014
1015 void lexer_next_token(void)
1016 {
1017         lexer_next_preprocessing_token();
1018         if(lexer_token.type != '\n')
1019                 return;
1020
1021 newline_found:
1022         do {
1023                 lexer_next_preprocessing_token();
1024         } while(lexer_token.type == '\n');
1025
1026         if(lexer_token.type == '#') {
1027                 parse_preprocessor_directive();
1028                 goto newline_found;
1029         }
1030 }
1031
1032 void init_lexer(void)
1033 {
1034         strset_init(&stringset);
1035 }
1036
1037 void lexer_open_stream(FILE *stream, const char *input_name)
1038 {
1039         input                                  = stream;
1040         lexer_token.source_position.linenr     = 0;
1041         lexer_token.source_position.input_name = input_name;
1042
1043         /* place a virtual \n at the beginning so the lexer knows that we're
1044          * at the beginning of a line */
1045         c = '\n';
1046 }
1047
1048 void exit_lexer(void)
1049 {
1050         strset_destroy(&stringset);
1051 }
1052
1053 static __attribute__((unused))
1054 void dbg_pos(const source_position_t source_position)
1055 {
1056         fprintf(stdout, "%s:%d\n", source_position.input_name,
1057                 source_position.linenr);
1058         fflush(stdout);
1059 }