nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7 #include "adt/strset.h"
   8 #include "adt/util.h"
   9
  10 #include <assert.h>
  11 #include <errno.h>
  12 #include <string.h>
  13 #include <ctype.h>
  14
/* Uncomment to trace every character read or put back via printf(). */
//#define DEBUG_CHARS
/* Number of bytes reserved in front of the read area of buf for put_back(). */
#define MAX_PUTBACK 3

static int         c;           /* current input character, or EOF */
token_t            lexer_token; /* token most recently produced by the lexer */
symbol_t          *symbol_L;    /* compared against identifiers to detect the L of L"..." */
static FILE       *input;       /* stream next_real_char() reads from */
static char        buf[1024 + MAX_PUTBACK]; /* read buffer; fread() fills it from buf + MAX_PUTBACK on */
static const char *bufend;      /* one past the last byte delivered by fread() */
static const char *bufpos;      /* position of the current character inside buf */
static strset_t    stringset;   /* set of string literals; identical strings share one copy */
  26
  27 static void error_prefix_at(const char *input_name, unsigned linenr)
  28 {
  29         fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
  30 }
  31
  32 static void error_prefix(void)
  33 {
  34         error_prefix_at(lexer_token.source_position.input_name,
  35                         lexer_token.source_position.linenr);
  36 }
  37
  38 static void parse_error(const char *msg)
  39 {
  40         error_prefix();
  41         fprintf(stderr, "%s\n", msg);
  42 }
  43
  44 static inline void next_real_char(void)
  45 {
  46         bufpos++;
  47         if(bufpos >= bufend) {
  48                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  49                                  input);
  50                 if(s == 0) {
  51                         c = EOF;
  52                         return;
  53                 }
  54                 bufpos = buf + MAX_PUTBACK;
  55                 bufend = buf + MAX_PUTBACK + s;
  56         }
  57         c = *(bufpos);
  58 }
  59
  60 static inline void put_back(int pc)
  61 {
  62         assert(bufpos >= buf);
  63         assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
  64
  65         char *p = buf + (bufpos - buf);
  66         *p = pc;
  67
  68         /* going backwards in the buffer is legal as long as it's not more often
  69          * than MAX_PUTBACK */
  70         bufpos--;
  71
  72 #ifdef DEBUG_CHARS
  73         printf("putback '%c'\n", pc);
  74 #endif
  75 }
  76
  77 static inline void next_char(void);
  78
  79 #define MATCH_NEWLINE(code)                   \
  80         case '\r':                                \
  81                 next_char();                          \
  82                 if(c == '\n') {                       \
  83                         next_char();                      \
  84                 }                                     \
  85                 lexer_token.source_position.linenr++; \
  86                 code;                                 \
  87         case '\n':                                \
  88                 next_char();                          \
  89                 lexer_token.source_position.linenr++; \
  90                 code;
  91
  92 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
  93
  94 static void maybe_concat_lines(void)
  95 {
  96         eat('\\');
  97
  98         switch(c) {
  99         MATCH_NEWLINE(return;)
 100
 101         default:
 102                 break;
 103         }
 104
 105         put_back(c);
 106         c = '\\';
 107 }
 108
/**
 * Advances the lexer by one character, leaving the result in c.
 *
 * At the moment this is plain next_real_char(): the filter below, which
 * would splice backslash-newline and translate trigraphs, is compiled out
 * with #if 0.
 */
static inline void next_char(void)
{
        next_real_char();

#if 0
        /* filter trigraphs */
        if(UNLIKELY(c == '\\')) {
                maybe_concat_lines();
                goto end_of_next_char;
        }

        if(LIKELY(c != '?'))
                goto end_of_next_char;

        next_real_char();
        if(LIKELY(c != '?')) {
                put_back(c);
                c = '?';
                goto end_of_next_char;
        }

        next_real_char();
        switch(c) {
        case '=': c = '#'; break;
        case '(': c = '['; break;
        case '/': c = '\\'; maybe_concat_lines(); break;
        case ')': c = ']'; break;
        case '\'': c = '^'; break;
        case '<': c = '{'; break;
        case '!': c = '|'; break;
        case '>': c = '}'; break;
        case '-': c = '~'; break;
        default:
                put_back('?');
                put_back(c);
                c = '?';
                break;
        }

end_of_next_char:;
#endif
        /* maybe_concat_lines() is only called from the disabled block above;
         * presumably this reference silences an unused-function warning */
        (void) maybe_concat_lines;
#ifdef DEBUG_CHARS
        printf("nchar '%c'\n", c);
#endif
}
 155
 156 #define SYMBOL_CHARS  \
 157         case 'a':         \
 158         case 'b':         \
 159         case 'c':         \
 160         case 'd':         \
 161         case 'e':         \
 162         case 'f':         \
 163         case 'g':         \
 164         case 'h':         \
 165         case 'i':         \
 166         case 'j':         \
 167         case 'k':         \
 168         case 'l':         \
 169         case 'm':         \
 170         case 'n':         \
 171         case 'o':         \
 172         case 'p':         \
 173         case 'q':         \
 174         case 'r':         \
 175         case 's':         \
 176         case 't':         \
 177         case 'u':         \
 178         case 'v':         \
 179         case 'w':         \
 180         case 'x':         \
 181         case 'y':         \
 182         case 'z':         \
 183         case 'A':         \
 184         case 'B':         \
 185         case 'C':         \
 186         case 'D':         \
 187         case 'E':         \
 188         case 'F':         \
 189         case 'G':         \
 190         case 'H':         \
 191         case 'I':         \
 192         case 'J':         \
 193         case 'K':         \
 194         case 'L':         \
 195         case 'M':         \
 196         case 'N':         \
 197         case 'O':         \
 198         case 'P':         \
 199         case 'Q':         \
 200         case 'R':         \
 201         case 'S':         \
 202         case 'T':         \
 203         case 'U':         \
 204         case 'V':         \
 205         case 'W':         \
 206         case 'X':         \
 207         case 'Y':         \
 208         case 'Z':         \
 209         case '_':
 210
 211 #define DIGITS        \
 212         case '0':         \
 213         case '1':         \
 214         case '2':         \
 215         case '3':         \
 216         case '4':         \
 217         case '5':         \
 218         case '6':         \
 219         case '7':         \
 220         case '8':         \
 221         case '9':
 222
 223 static void parse_symbol(void)
 224 {
 225         symbol_t *symbol;
 226         char     *string;
 227
 228         obstack_1grow(&symbol_obstack, c);
 229         next_char();
 230
 231         while(1) {
 232                 switch(c) {
 233                 DIGITS
 234                 SYMBOL_CHARS
 235                         obstack_1grow(&symbol_obstack, c);
 236                         next_char();
 237                         break;
 238
 239                 default:
 240                         goto end_symbol;
 241                 }
 242         }
 243
 244 end_symbol:
 245         obstack_1grow(&symbol_obstack, '\0');
 246
 247         string = obstack_finish(&symbol_obstack);
 248         symbol = symbol_table_insert(string);
 249
 250         lexer_token.type     = symbol->ID;
 251         lexer_token.v.symbol = symbol;
 252
 253         if(symbol->string != string) {
 254                 obstack_free(&symbol_obstack, string);
 255         }
 256 }
 257
 258 static void parse_integer_suffix(void)
 259 {
 260         if(c == 'U' || c == 'U') {
 261                 /* TODO do something with the suffixes... */
 262                 next_char();
 263                 if(c == 'L' || c == 'l') {
 264                         next_char();
 265                         if(c == 'L' || c == 'l') {
 266                                 next_char();
 267                         }
 268                 }
 269         } else if(c == 'l' || c == 'L') {
 270                 next_char();
 271                 if(c == 'l' || c == 'L') {
 272                         next_char();
 273                         if(c == 'u' || c == 'U') {
 274                                 next_char();
 275                         }
 276                 } else if(c == 'u' || c == 'U') {
 277                         next_char();
 278                 }
 279         }
 280 }
 281
 282 static void parse_floating_suffix(void)
 283 {
 284         switch(c) {
 285         /* TODO: do something usefull with the suffixes... */
 286         case 'f':
 287         case 'F':
 288         case 'l':
 289         case 'L':
 290                 next_char();
 291                 break;
 292         default:
 293                 break;
 294         }
 295 }
 296
 297 static void parse_number_hex(void)
 298 {
 299         assert(c == 'x' || c == 'X');
 300         next_char();
 301
 302         if (!isdigit(c) &&
 303                 !('A' <= c && c <= 'F') &&
 304                 !('a' <= c && c <= 'f')) {
 305                 parse_error("premature end of hex number literal");
 306                 lexer_token.type = T_ERROR;
 307                 return;
 308         }
 309
 310         int value = 0;
 311         while(1) {
 312                 if (isdigit(c)) {
 313                         value = 16 * value + c - '0';
 314                 } else if ('A' <= c && c <= 'F') {
 315                         value = 16 * value + c - 'A' + 10;
 316                 } else if ('a' <= c && c <= 'f') {
 317                         value = 16 * value + c - 'a' + 10;
 318                 } else {
 319                         parse_integer_suffix();
 320
 321                         lexer_token.type       = T_INTEGER;
 322                         lexer_token.v.intvalue = value;
 323                         return;
 324                 }
 325                 next_char();
 326         }
 327
 328         if(c == '.' || c == 'p' || c == 'P') {
 329                 next_char();
 330                 panic("Hex floating point numbers not implemented yet");
 331         }
 332 }
 333
 334 static void parse_number_oct(void)
 335 {
 336         int value = 0;
 337         while(c >= '0' && c <= '7') {
 338                 value = 8 * value + c - '0';
 339                 next_char();
 340         }
 341         if (c == '8' || c == '9') {
 342                 parse_error("invalid octal number");
 343                 lexer_token.type = T_ERROR;
 344                 return;
 345         }
 346
 347         lexer_token.type       = T_INTEGER;
 348         lexer_token.v.intvalue = value;
 349
 350         parse_integer_suffix();
 351 }
 352
 353 static void parse_floatingpoint_exponent(long double value)
 354 {
 355         unsigned int expo = 0;
 356         long double  factor = 10.;
 357
 358         if(c == '-') {
 359                 next_char();
 360                 factor = 0.1;
 361         } else if(c == '+') {
 362                 next_char();
 363         }
 364
 365         while(c >= '0' && c <= '9') {
 366                 expo = 10 * expo + (c - '0');
 367                 next_char();
 368         }
 369
 370         while(1) {
 371                 if(expo & 1)
 372                         value *= factor;
 373                 expo >>= 1;
 374                 if(expo == 0)
 375                         break;
 376                 factor *= factor;
 377         }
 378
 379         lexer_token.type         = T_FLOATINGPOINT;
 380         lexer_token.v.floatvalue = value;
 381
 382         parse_floating_suffix();
 383 }
 384
 385 static void parse_floatingpoint_fract(int integer_part)
 386 {
 387         long double value  = integer_part;
 388         long double factor = 1.;
 389
 390         while(c >= '0' && c <= '9') {
 391                 factor *= 0.1;
 392                 value  += (c - '0') * factor;
 393                 next_char();
 394         }
 395
 396         if(c == 'e' || c == 'E') {
 397                 next_char();
 398                 parse_floatingpoint_exponent(value);
 399                 return;
 400         }
 401
 402         lexer_token.type         = T_FLOATINGPOINT;
 403         lexer_token.v.floatvalue = value;
 404
 405         parse_floating_suffix();
 406 }
 407
 408 static void parse_number_dec(void)
 409 {
 410         int value = 0;
 411
 412         while(isdigit(c)) {
 413                 value = 10 * value + c - '0';
 414                 next_char();
 415         }
 416
 417         if(c == '.') {
 418                 next_char();
 419                 parse_floatingpoint_fract(value);
 420                 return;
 421         }
 422         if(c == 'e' || c == 'E') {
 423                 next_char();
 424                 parse_floatingpoint_exponent(value);
 425                 return;
 426         }
 427         parse_integer_suffix();
 428
 429         lexer_token.type       = T_INTEGER;
 430         lexer_token.v.intvalue = value;
 431 }
 432
 433 static void parse_number(void)
 434 {
 435         if (c == '0') {
 436                 next_char();
 437                 switch (c) {
 438                         case 'X':
 439                         case 'x':
 440                                 parse_number_hex();
 441                                 break;
 442                         case '0':
 443                         case '1':
 444                         case '2':
 445                         case '3':
 446                         case '4':
 447                         case '5':
 448                         case '6':
 449                         case '7':
 450                                 parse_number_oct();
 451                                 break;
 452                         case '.':
 453                                 next_char();
 454                                 parse_floatingpoint_fract(0);
 455                                 break;
 456                         case 'e':
 457                         case 'E':
 458                                 parse_floatingpoint_exponent(0);
 459                                 break;
 460                         case '8':
 461                         case '9':
 462                                 next_char();
 463                                 parse_error("invalid octal number");
 464                                 lexer_token.type = T_ERROR;
 465                                 return;
 466                         default:
 467                                 put_back(c);
 468                                 c = '0';
 469                                 parse_number_dec();
 470                                 return;
 471                 }
 472         } else {
 473                 parse_number_dec();
 474         }
 475 }
 476
 477 static inline int is_octal_digit(int chr)
 478 {
 479         return '0' <= chr && chr <= '7';
 480 }
 481
 482 static int parse_octal_sequence(const int first_digit)
 483 {
 484         assert(is_octal_digit(first_digit));
 485         int value = first_digit - '0';
 486         if (!is_octal_digit(c)) return value;
 487         value = 8 * value + c - '0';
 488         next_char();
 489         if (!is_octal_digit(c)) return value;
 490         value = 8 * value + c - '0';
 491         next_char();
 492         return value;
 493 }
 494
 495 static int parse_hex_sequence(void)
 496 {
 497         int value = 0;
 498         while(1) {
 499                 if (c >= '0' && c <= '9') {
 500                         value = 16 * value + c - '0';
 501                 } else if ('A' <= c && c <= 'F') {
 502                         value = 16 * value + c - 'A' + 10;
 503                 } else if ('a' <= c && c <= 'f') {
 504                         value = 16 * value + c - 'a' + 10;
 505                 } else {
 506                         break;
 507                 }
 508                 next_char();
 509         }
 510
 511         return value;
 512 }
 513
 514 static int parse_escape_sequence(void)
 515 {
 516         eat('\\');
 517
 518         int ec = c;
 519         next_char();
 520
 521         switch(ec) {
 522         case '"':  return '"';
 523         case '\'': return '\'';
 524         case '\\': return '\\';
 525         case '?': return '\?';
 526         case 'a': return '\a';
 527         case 'b': return '\b';
 528         case 'f': return '\f';
 529         case 'n': return '\n';
 530         case 'r': return '\r';
 531         case 't': return '\t';
 532         case 'v': return '\v';
 533         case 'x':
 534                 return parse_hex_sequence();
 535         case '0':
 536         case '1':
 537         case '2':
 538         case '3':
 539         case '4':
 540         case '5':
 541         case '6':
 542         case '7':
 543                 return parse_octal_sequence(ec);
 544         case EOF:
 545                 parse_error("reached end of file while parsing escape sequence");
 546                 return EOF;
 547         default:
 548                 parse_error("unknown escape sequence");
 549                 return EOF;
 550         }
 551 }
 552
 553 const char *concat_strings(const char *s1, const char *s2)
 554 {
 555         size_t  len1   = strlen(s1);
 556         size_t  len2   = strlen(s2);
 557
 558         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 559         memcpy(concat, s1, len1);
 560         memcpy(concat + len1, s2, len2 + 1);
 561
 562         const char *result = strset_insert(&stringset, concat);
 563         if(result != concat) {
 564                 obstack_free(&symbol_obstack, concat);
 565         }
 566
 567         return result;
 568 }
 569
 570 static void parse_string_literal(void)
 571 {
 572         unsigned    start_linenr = lexer_token.source_position.linenr;
 573         char       *string;
 574         const char *result;
 575
 576         assert(c == '"');
 577         next_char();
 578
 579         int tc;
 580         while(1) {
 581                 switch(c) {
 582                 case '\\':
 583                         tc = parse_escape_sequence();
 584                         obstack_1grow(&symbol_obstack, tc);
 585                         break;
 586
 587                 case EOF:
 588                         error_prefix_at(lexer_token.source_position.input_name,
 589                                         start_linenr);
 590                         fprintf(stderr, "string has no end\n");
 591                         lexer_token.type = T_ERROR;
 592                         return;
 593
 594                 case '"':
 595                         next_char();
 596                         goto end_of_string;
 597
 598                 default:
 599                         obstack_1grow(&symbol_obstack, c);
 600                         next_char();
 601                         break;
 602                 }
 603         }
 604
 605 end_of_string:
 606
 607         /* TODO: concatenate multiple strings separated by whitespace... */
 608
 609         /* add finishing 0 to the string */
 610         obstack_1grow(&symbol_obstack, '\0');
 611         string = obstack_finish(&symbol_obstack);
 612
 613         /* check if there is already a copy of the string */
 614         result = strset_insert(&stringset, string);
 615         if(result != string) {
 616                 obstack_free(&symbol_obstack, string);
 617         }
 618
 619         lexer_token.type     = T_STRING_LITERAL;
 620         lexer_token.v.string = result;
 621 }
 622
 623 static void parse_character_constant(void)
 624 {
 625         eat('\'');
 626
 627         int found_char = 0;
 628         while(1) {
 629                 switch(c) {
 630                 case '\\':
 631                         found_char = parse_escape_sequence();
 632                         break;
 633
 634                 MATCH_NEWLINE(
 635                         parse_error("newline while parsing character constant");
 636                         break;
 637                 )
 638
 639                 case '\'':
 640                         next_char();
 641                         goto end_of_char_constant;
 642
 643                 case EOF:
 644                         parse_error("EOF while parsing character constant");
 645                         lexer_token.type = T_ERROR;
 646                         return;
 647
 648                 default:
 649                         if(found_char != 0) {
 650                                 parse_error("more than 1 characters in character "
 651                                             "constant");
 652                                 goto end_of_char_constant;
 653                         } else {
 654                                 found_char = c;
 655                                 next_char();
 656                         }
 657                         break;
 658                 }
 659         }
 660
 661 end_of_char_constant:
 662         lexer_token.type       = T_INTEGER;
 663         lexer_token.v.intvalue = found_char;
 664 }
 665
 666 static void skip_multiline_comment(void)
 667 {
 668         unsigned start_linenr = lexer_token.source_position.linenr;
 669
 670         while(1) {
 671                 switch(c) {
 672                 case '*':
 673                         next_char();
 674                         if(c == '/') {
 675                                 next_char();
 676                                 return;
 677                         }
 678                         break;
 679
 680                 MATCH_NEWLINE(break;)
 681
 682                 case EOF:
 683                         error_prefix_at(lexer_token.source_position.input_name,
 684                                         start_linenr);
 685                         fprintf(stderr, "at end of file while looking for comment end\n");
 686                         return;
 687
 688                 default:
 689                         next_char();
 690                         break;
 691                 }
 692         }
 693 }
 694
 695 static void skip_line_comment(void)
 696 {
 697         while(1) {
 698                 switch(c) {
 699                 case EOF:
 700                         return;
 701
 702                 case '\n':
 703                 case '\r':
 704                         return;
 705
 706                 default:
 707                         next_char();
 708                         break;
 709                 }
 710         }
 711 }
 712
 713 static token_t pp_token;
 714
 715 static inline void next_pp_token(void)
 716 {
 717         lexer_next_preprocessing_token();
 718         pp_token = lexer_token;
 719 }
 720
 721 static void eat_until_newline(void)
 722 {
 723         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 724                 next_pp_token();
 725         }
 726 }
 727
 728 static void error_directive(void)
 729 {
 730         error_prefix();
 731         fprintf(stderr, "#error directive: \n");
 732
 733         /* parse pp-tokens until new-line */
 734 }
 735
 736 static void define_directive(void)
 737 {
 738         lexer_next_preprocessing_token();
 739         if(lexer_token.type != T_IDENTIFIER) {
 740                 parse_error("expected identifier after #define\n");
 741                 eat_until_newline();
 742         }
 743 }
 744
 745 static void ifdef_directive(int is_ifndef)
 746 {
 747         (void) is_ifndef;
 748         lexer_next_preprocessing_token();
 749         //expect_identifier();
 750         //extect_newline();
 751 }
 752
 753 static void endif_directive(void)
 754 {
 755         //expect_newline();
 756 }
 757
 758 static void parse_line_directive(void)
 759 {
 760         if(pp_token.type != T_INTEGER) {
 761                 parse_error("expected integer");
 762         } else {
 763                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
 764                 next_pp_token();
 765         }
 766         if(pp_token.type == T_STRING_LITERAL) {
 767                 lexer_token.source_position.input_name = pp_token.v.string;
 768                 next_pp_token();
 769         }
 770
 771         eat_until_newline();
 772 }
 773
 774 static void parse_preprocessor_identifier(void)
 775 {
 776         assert(pp_token.type == T_IDENTIFIER);
 777         symbol_t *symbol = pp_token.v.symbol;
 778
 779         switch(symbol->pp_ID) {
 780         case TP_include:
 781                 printf("include - enable header name parsing!\n");
 782                 break;
 783         case TP_define:
 784                 define_directive();
 785                 break;
 786         case TP_ifdef:
 787                 ifdef_directive(0);
 788                 break;
 789         case TP_ifndef:
 790                 ifdef_directive(1);
 791                 break;
 792         case TP_endif:
 793                 endif_directive();
 794                 break;
 795         case TP_line:
 796                 next_pp_token();
 797                 parse_line_directive();
 798                 break;
 799         case TP_if:
 800         case TP_else:
 801         case TP_elif:
 802         case TP_undef:
 803         case TP_error:
 804                 error_directive();
 805                 break;
 806         case TP_pragma:
 807                 break;
 808         }
 809 }
 810
 811 static void parse_preprocessor_directive(void)
 812 {
 813         next_pp_token();
 814
 815         switch(pp_token.type) {
 816         case T_IDENTIFIER:
 817                 parse_preprocessor_identifier();
 818                 break;
 819         case T_INTEGER:
 820                 parse_line_directive();
 821                 break;
 822         default:
 823                 parse_error("invalid preprocessor directive");
 824                 eat_until_newline();
 825                 break;
 826         }
 827 }
 828
/*
 * Helper macros for lexing multi-character punctuators inside a switch
 * case. They are used as a group: MAYBE_PROLOG, any number of MAYBE()
 * (or hand-written case labels), then ELSE() or ELSE_CODE(). The braces
 * opened by MAYBE_PROLOG are closed by ELSE_CODE.
 */

/* Consumes the current character and opens "while(1) { switch(c) {" on the
 * character that follows it. */
#define MAYBE_PROLOG                                       \
                        next_char();                                   \
                        while(1) {                                     \
                                switch(c) {

/* Case for a following character ch: consumes it, sets the token type to
 * set_type and returns from the enclosing function. */
#define MAYBE(ch, set_type)                                \
                                case ch:                                   \
                                        next_char();                           \
                                        lexer_token.type = set_type;           \
                                        return;

/* Default case running `code`; then closes the switch and the while(1)
 * opened by MAYBE_PROLOG and ends the surrounding case with break. */
#define ELSE_CODE(code)                                    \
                                default:                                   \
                                        code;                                  \
                                }                                          \
                        } /* end of while(1) */                        \
                        break;

/* Default case that sets the token type to set_type and returns without
 * consuming the current character. */
#define ELSE(set_type)                                     \
                ELSE_CODE(                                         \
                        lexer_token.type = set_type;                   \
                        return;                                        \
                )
 852
 853 void lexer_next_preprocessing_token(void)
 854 {
 855         while(1) {
 856                 switch(c) {
 857                 case ' ':
 858                 case '\t':
 859                         next_char();
 860                         break;
 861
 862                 MATCH_NEWLINE(
 863                         lexer_token.type = '\n';
 864                         return;
 865                 )
 866
 867                 SYMBOL_CHARS
 868                         parse_symbol();
 869                         /* might be a wide string ( L"string" ) */
 870                         if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
 871                            lexer_token.v.symbol == symbol_L)) {
 872                                 parse_string_literal();
 873                                 return;
 874                         }
 875                         return;
 876
 877                 DIGITS
 878                         parse_number();
 879                         return;
 880
 881                 case '"':
 882                         parse_string_literal();
 883                         return;
 884
 885                 case '\'':
 886                         parse_character_constant();
 887                         return;
 888
 889                 case '.':
 890                         MAYBE_PROLOG
 891                                 case '.':
 892                                         MAYBE_PROLOG
 893                                         MAYBE('.', T_DOTDOTDOT)
 894                                         ELSE_CODE(
 895                                                 put_back(c);
 896                                                 c = '.';
 897                                                 lexer_token.type = '.';
 898                                                 return;
 899                                         )
 900                         ELSE('.')
 901                 case '&':
 902                         MAYBE_PROLOG
 903                         MAYBE('&', T_ANDAND)
 904                         MAYBE('=', T_ANDEQUAL)
 905                         ELSE('&')
 906                 case '*':
 907                         MAYBE_PROLOG
 908                         MAYBE('=', T_ASTERISKEQUAL)
 909                         ELSE('*')
 910                 case '+':
 911                         MAYBE_PROLOG
 912                         MAYBE('+', T_PLUSPLUS)
 913                         MAYBE('=', T_PLUSEQUAL)
 914                         ELSE('+')
 915                 case '-':
 916                         MAYBE_PROLOG
 917                         MAYBE('>', T_MINUSGREATER)
 918                         MAYBE('-', T_MINUSMINUS)
 919                         MAYBE('=', T_MINUSEQUAL)
 920                         ELSE('-')
 921                 case '!':
 922                         MAYBE_PROLOG
 923                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
 924                         ELSE('!')
 925                 case '/':
 926                         MAYBE_PROLOG
 927                         MAYBE('=', T_SLASHEQUAL)
 928                                 case '*':
 929                                         next_char();
 930                                         skip_multiline_comment();
 931                                         lexer_next_preprocessing_token();
 932                                         return;
 933                                 case '/':
 934                                         next_char();
 935                                         skip_line_comment();
 936                                         lexer_next_preprocessing_token();
 937                                         return;
 938                         ELSE('/')
 939                 case '%':
 940                         MAYBE_PROLOG
 941                         MAYBE('>', T_PERCENTGREATER)
 942                         MAYBE('=', T_PERCENTEQUAL)
 943                                 case ':':
 944                                         MAYBE_PROLOG
 945                                                 case '%':
 946                                                         MAYBE_PROLOG
 947                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
 948                                                         ELSE_CODE(
 949                                                                 put_back(c);
 950                                                                 c = '%';
 951                                                                 lexer_token.type = T_PERCENTCOLON;
 952                                                                 return;
 953                                                         )
 954                                         ELSE(T_PERCENTCOLON)
 955                         ELSE('%')
 956                 case '<':
 957                         MAYBE_PROLOG
 958                         MAYBE(':', T_LESSCOLON)
 959                         MAYBE('%', T_LESSPERCENT)
 960                         MAYBE('=', T_LESSEQUAL)
 961                                 case '<':
 962                                         MAYBE_PROLOG
 963                                         MAYBE('=', T_LESSLESSEQUAL)
 964                                         ELSE(T_LESSLESS)
 965                         ELSE('<')
 966                 case '>':
 967                         MAYBE_PROLOG
 968                         MAYBE('=', T_GREATEREQUAL)
 969                                 case '>':
 970                                         MAYBE_PROLOG
 971                                         MAYBE('=', T_GREATERGREATEREQUAL)
 972                                         ELSE(T_GREATERGREATER)
 973                         ELSE('>')
 974                 case '^':
 975                         MAYBE_PROLOG
 976                         MAYBE('=', T_CARETEQUAL)
 977                         ELSE('^')
 978                 case '|':
 979                         MAYBE_PROLOG
 980                         MAYBE('=', T_PIPEEQUAL)
 981                         MAYBE('|', T_PIPEPIPE)
 982                         ELSE('|')
 983                 case ':':
 984                         MAYBE_PROLOG
 985                         MAYBE('>', T_COLONGREATER)
 986                         ELSE(':')
 987                 case '=':
 988                         MAYBE_PROLOG
 989                         MAYBE('=', T_EQUALEQUAL)
 990                         ELSE('=')
 991                 case '#':
 992                         MAYBE_PROLOG
 993                         MAYBE('#', T_HASHHASH)
 994                         ELSE('#')
 995
 996                 case '?':
 997                 case '[':
 998                 case ']':
 999                 case '(':
1000                 case ')':
1001                 case '{':
1002                 case '}':
1003                 case '~':
1004                 case ';':
1005                 case ',':
1006                 case '\\':
1007                         lexer_token.type = c;
1008                         next_char();
1009                         return;
1010
1011                 case EOF:
1012                         lexer_token.type = T_EOF;
1013                         return;
1014
1015                 default:
1016                         next_char();
1017                         error_prefix();
1018                         fprintf(stderr, "unknown character '%c' found\n", c);
1019                         lexer_token.type = T_ERROR;
1020                         return;
1021                 }
1022         }
1023 }
1024
1025 void lexer_next_token(void)
1026 {
1027         lexer_next_preprocessing_token();
1028         if(lexer_token.type != '\n')
1029                 return;
1030
1031 newline_found:
1032         do {
1033                 lexer_next_preprocessing_token();
1034         } while(lexer_token.type == '\n');
1035
1036         if(lexer_token.type == '#') {
1037                 parse_preprocessor_directive();
1038                 goto newline_found;
1039         }
1040 }
1041
1042 void init_lexer(void)
1043 {
1044         strset_init(&stringset);
1045 }
1046
1047 void lexer_open_stream(FILE *stream, const char *input_name)
1048 {
1049         input                                  = stream;
1050         lexer_token.source_position.linenr     = 0;
1051         lexer_token.source_position.input_name = input_name;
1052
1053         symbol_L = symbol_table_insert("L");
1054
1055         /* place a virtual \n at the beginning so the lexer knows that we're
1056          * at the beginning of a line */
1057         c = '\n';
1058 }
1059
1060 void exit_lexer(void)
1061 {
1062         strset_destroy(&stringset);
1063 }
1064
1065 static __attribute__((unused))
1066 void dbg_pos(const source_position_t source_position)
1067 {
1068         fprintf(stdout, "%s:%d\n", source_position.input_name,
1069                 source_position.linenr);
1070         fflush(stdout);
1071 }