nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7 #include "adt/strset.h"
   8 #include "adt/util.h"
   9
  10 #include <assert.h>
  11 #include <errno.h>
  12 #include <string.h>
  13 #include <stdbool.h>
  14 #include <ctype.h>
  15
  16 //#define DEBUG_CHARS  (define to print every character that is read or put back)
  17 #define MAX_PUTBACK 3 /* bytes reserved at the start of buf so put_back() may step back */
  18
  19 static int         c; /* character most recently delivered by next_char(), or EOF */
  20 token_t            lexer_token; /* token that the parse_* functions fill in */
  21 symbol_t          *symbol_L; /* compared with an identifier's symbol to detect the L"..." prefix */
  22 static FILE       *input; /* stream next_real_char() reads from */
  23 static char        buf[1024 + MAX_PUTBACK]; /* read buffer; fread() fills it starting at buf + MAX_PUTBACK */
  24 static const char *bufend; /* one past the last byte fread() stored in buf */
  25 static const char *bufpos; /* read position in buf; next_real_char() advances it before reading */
  26 static strset_t    stringset; /* set used to share identical string literals */
  27
  28 static void error_prefix_at(const char *input_name, unsigned linenr)
  29 {
  30         fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
  31 }
  32
  33 static void error_prefix(void)
  34 {
  35         error_prefix_at(lexer_token.source_position.input_name,
  36                         lexer_token.source_position.linenr);
  37 }
  38
  39 static void parse_error(const char *msg)
  40 {
  41         error_prefix();
  42         fprintf(stderr, "%s\n", msg);
  43 }
  44
  45 static inline void next_real_char(void)
  46 {
  47         bufpos++;
  48         if(bufpos >= bufend) {
  49                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  50                                  input);
  51                 if(s == 0) {
  52                         c = EOF;
  53                         return;
  54                 }
  55                 bufpos = buf + MAX_PUTBACK;
  56                 bufend = buf + MAX_PUTBACK + s;
  57         }
  58         c = *(bufpos);
  59 }
  60
  61 static inline void put_back(int pc)
  62 {
  63         assert(bufpos >= buf);
  64         //assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
  65
  66         char *p = buf + (bufpos - buf);
  67         *p = pc;
  68
  69         /* going backwards in the buffer is legal as long as it's not more often
  70          * than MAX_PUTBACK */
  71         bufpos--;
  72
  73 #ifdef DEBUG_CHARS
  74         printf("putback '%c'\n", pc);
  75 #endif
  76 }
  77
  78 static inline void next_char(void);
  79
/*
 * Expands to the case labels '\r' and '\n' for use inside a switch over c.
 * Either label consumes the line ending ("\r", "\r\n" or "\n") with
 * next_char(), increments lexer_token.source_position.linenr and then runs
 * `code`. `code` is pasted twice, once per label; since no break is added
 * here, `code` must leave the case itself (return, break, ...) or the '\r'
 * label falls through into the '\n' label.
 */
  80 #define MATCH_NEWLINE(code)                   \
  81         case '\r':                                \
  82                 next_char();                          \
  83                 if(c == '\n') {                       \
  84                         next_char();                      \
  85                 }                                     \
  86                 lexer_token.source_position.linenr++; \
  87                 code;                                 \
  88         case '\n':                                \
  89                 next_char();                          \
  90                 lexer_token.source_position.linenr++; \
  91                 code;
  92
  93 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
  94
  95 static void maybe_concat_lines(void)
  96 {
  97         eat('\\');
  98
  99         switch(c) {
 100         MATCH_NEWLINE(return;)
 101
 102         default:
 103                 break;
 104         }
 105
 106         put_back(c);
 107         c = '\\';
 108 }
 109
 110 static inline void next_char(void)
 111 {
 112         next_real_char();
 113
 114         /* filter trigraphs */
 115         if(UNLIKELY(c == '\\')) {
 116                 maybe_concat_lines();
 117                 goto end_of_next_char;
 118         }
 119
 120         if(LIKELY(c != '?'))
 121                 goto end_of_next_char;
 122
 123         next_real_char();
 124         if(LIKELY(c != '?')) {
 125                 put_back(c);
 126                 c = '?';
 127                 goto end_of_next_char;
 128         }
 129
 130         next_real_char();
 131         switch(c) {
 132         case '=': c = '#'; break;
 133         case '(': c = '['; break;
 134         case '/': c = '\\'; maybe_concat_lines(); break;
 135         case ')': c = ']'; break;
 136         case '\'': c = '^'; break;
 137         case '<': c = '{'; break;
 138         case '!': c = '|'; break;
 139         case '>': c = '}'; break;
 140         case '-': c = '~'; break;
 141         default:
 142                 put_back('?');
 143                 put_back(c);
 144                 c = '?';
 145                 break;
 146         }
 147
 148 end_of_next_char:;
 149 #ifdef DEBUG_CHARS
 150         printf("nchar '%c'\n", c);
 151 #endif
 152 }
 153
 154 #define SYMBOL_CHARS  \
 155         case 'a':         \
 156         case 'b':         \
 157         case 'c':         \
 158         case 'd':         \
 159         case 'e':         \
 160         case 'f':         \
 161         case 'g':         \
 162         case 'h':         \
 163         case 'i':         \
 164         case 'j':         \
 165         case 'k':         \
 166         case 'l':         \
 167         case 'm':         \
 168         case 'n':         \
 169         case 'o':         \
 170         case 'p':         \
 171         case 'q':         \
 172         case 'r':         \
 173         case 's':         \
 174         case 't':         \
 175         case 'u':         \
 176         case 'v':         \
 177         case 'w':         \
 178         case 'x':         \
 179         case 'y':         \
 180         case 'z':         \
 181         case 'A':         \
 182         case 'B':         \
 183         case 'C':         \
 184         case 'D':         \
 185         case 'E':         \
 186         case 'F':         \
 187         case 'G':         \
 188         case 'H':         \
 189         case 'I':         \
 190         case 'J':         \
 191         case 'K':         \
 192         case 'L':         \
 193         case 'M':         \
 194         case 'N':         \
 195         case 'O':         \
 196         case 'P':         \
 197         case 'Q':         \
 198         case 'R':         \
 199         case 'S':         \
 200         case 'T':         \
 201         case 'U':         \
 202         case 'V':         \
 203         case 'W':         \
 204         case 'X':         \
 205         case 'Y':         \
 206         case 'Z':         \
 207         case '_':
 208
 209 #define DIGITS        \
 210         case '0':         \
 211         case '1':         \
 212         case '2':         \
 213         case '3':         \
 214         case '4':         \
 215         case '5':         \
 216         case '6':         \
 217         case '7':         \
 218         case '8':         \
 219         case '9':
 220
 221 static void parse_symbol(void)
 222 {
 223         symbol_t *symbol;
 224         char     *string;
 225
 226         obstack_1grow(&symbol_obstack, c);
 227         next_char();
 228
 229         while(1) {
 230                 switch(c) {
 231                 DIGITS
 232                 SYMBOL_CHARS
 233                         obstack_1grow(&symbol_obstack, c);
 234                         next_char();
 235                         break;
 236
 237                 default:
 238                         goto end_symbol;
 239                 }
 240         }
 241
 242 end_symbol:
 243         obstack_1grow(&symbol_obstack, '\0');
 244
 245         string = obstack_finish(&symbol_obstack);
 246         symbol = symbol_table_insert(string);
 247
 248         lexer_token.type     = symbol->ID;
 249         lexer_token.v.symbol = symbol;
 250
 251         if(symbol->string != string) {
 252                 obstack_free(&symbol_obstack, string);
 253         }
 254 }
 255
 256 static void parse_integer_suffix(void)
 257 {
 258         if(c == 'U' || c == 'U') {
 259                 /* TODO do something with the suffixes... */
 260                 next_char();
 261                 if(c == 'L' || c == 'l') {
 262                         next_char();
 263                         if(c == 'L' || c == 'l') {
 264                                 next_char();
 265                         }
 266                 }
 267         } else if(c == 'l' || c == 'L') {
 268                 next_char();
 269                 if(c == 'l' || c == 'L') {
 270                         next_char();
 271                         if(c == 'u' || c == 'U') {
 272                                 next_char();
 273                         }
 274                 } else if(c == 'u' || c == 'U') {
 275                         next_char();
 276                 }
 277         }
 278 }
 279
 280 static void parse_floating_suffix(void)
 281 {
 282         switch(c) {
 283         /* TODO: do something usefull with the suffixes... */
 284         case 'f':
 285         case 'F':
 286         case 'l':
 287         case 'L':
 288                 next_char();
 289                 break;
 290         default:
 291                 break;
 292         }
 293 }
 294
 295 static inline bool is_hex_digit(int c)
 296 {
 297         return (c >= '0' && c <= '9')
 298                         || (c >= 'a' && c <= 'z')
 299                         || (c >= 'A' && c <= 'Z');
 300 }
 301
 302 static void parse_number_hex(void)
 303 {
 304         assert(c == 'x' || c == 'X');
 305         next_char();
 306
 307         while(is_hex_digit(c)) {
 308                 obstack_1grow(&symbol_obstack, c);
 309                 next_char();
 310         }
 311         obstack_1grow(&symbol_obstack, '\0');
 312         char *string = obstack_finish(&symbol_obstack);
 313
 314         if(c == '.' || c == 'p' || c == 'P') {
 315                 next_char();
 316                 panic("Hex floating point numbers not implemented yet");
 317         }
 318         if(*string == '\0') {
 319                 parse_error("invalid hex number");
 320                 lexer_token.type = T_ERROR;
 321         }
 322
 323         char *endptr;
 324         int value = strtol(string, &endptr, 16);
 325         if(*endptr != '\0') {
 326                 parse_error("hex number literal too long");
 327         }
 328
 329         lexer_token.type       = T_INTEGER;
 330         lexer_token.v.intvalue = value;
 331
 332         parse_integer_suffix();
 333         obstack_free(&symbol_obstack, string);
 334 }
 335
 336 static inline bool is_octal_digit(int chr)
 337 {
 338         return '0' <= chr && chr <= '7';
 339 }
 340
 341 static void parse_number_oct(void)
 342 {
 343         while(is_octal_digit(c)) {
 344                 obstack_1grow(&symbol_obstack, c);
 345                 next_char();
 346         }
 347         obstack_1grow(&symbol_obstack, '\0');
 348         char *string = obstack_finish(&symbol_obstack);
 349
 350         char *endptr;
 351         int value = strtol(string, &endptr, 8);
 352         if(*endptr != '\0') {
 353                 parse_error("octal number literal too long");
 354         }
 355
 356         lexer_token.type       = T_INTEGER;
 357         lexer_token.v.intvalue = value;
 358
 359         parse_integer_suffix();
 360         obstack_free(&symbol_obstack, string);
 361 }
 362
 363 static void parse_number_dec(void)
 364 {
 365         bool is_float = false;
 366         while(isdigit(c)) {
 367                 obstack_1grow(&symbol_obstack, c);
 368                 next_char();
 369         }
 370
 371         if(c == '.') {
 372                 obstack_1grow(&symbol_obstack, '.');
 373                 next_char();
 374
 375                 while(isdigit(c)) {
 376                         obstack_1grow(&symbol_obstack, c);
 377                         next_char();
 378                 }
 379                 is_float = true;
 380         }
 381         if(c == 'e' || c == 'E') {
 382                 obstack_1grow(&symbol_obstack, 'e');
 383                 next_char();
 384
 385                 if(c == '-' || c == '+') {
 386                         obstack_1grow(&symbol_obstack, c);
 387                         next_char();
 388                 }
 389
 390                 while(isdigit(c)) {
 391                         obstack_1grow(&symbol_obstack, c);
 392                         next_char();
 393                 }
 394                 is_float = true;
 395         }
 396
 397         obstack_1grow(&symbol_obstack, '\0');
 398         char *string = obstack_finish(&symbol_obstack);
 399
 400         char *endptr;
 401         if(is_float) {
 402                 lexer_token.type         = T_FLOATINGPOINT;
 403                 lexer_token.v.floatvalue = strtod(string, &endptr);
 404
 405                 if(*endptr != '\0') {
 406                         parse_error("invalid number literal");
 407                 }
 408
 409                 parse_floating_suffix();
 410         } else {
 411                 lexer_token.type       = T_INTEGER;
 412                 lexer_token.v.intvalue = strtol(string, &endptr, 10);
 413
 414                 if(*endptr != '\0') {
 415                         parse_error("invalid number literal");
 416                 }
 417
 418                 parse_integer_suffix();
 419         }
 420         obstack_free(&symbol_obstack, string);
 421 }
 422
 423 static void parse_number(void)
 424 {
 425         if (c == '0') {
 426                 next_char();
 427                 switch (c) {
 428                         case 'X':
 429                         case 'x':
 430                                 parse_number_hex();
 431                                 break;
 432                         case '0':
 433                         case '1':
 434                         case '2':
 435                         case '3':
 436                         case '4':
 437                         case '5':
 438                         case '6':
 439                         case '7':
 440                                 parse_number_oct();
 441                                 break;
 442                         case '8':
 443                         case '9':
 444                                 next_char();
 445                                 parse_error("invalid octal number");
 446                                 lexer_token.type = T_ERROR;
 447                                 return;
 448                         case '.':
 449                         case 'e':
 450                         case 'E':
 451                         default:
 452                                 obstack_1grow(&symbol_obstack, '0');
 453                                 parse_number_dec();
 454                                 return;
 455                 }
 456         } else {
 457                 parse_number_dec();
 458         }
 459 }
 460
 461 static int parse_octal_sequence(const int first_digit)
 462 {
 463         assert(is_octal_digit(first_digit));
 464         int value = first_digit - '0';
 465         if (!is_octal_digit(c)) return value;
 466         value = 8 * value + c - '0';
 467         next_char();
 468         if (!is_octal_digit(c)) return value;
 469         value = 8 * value + c - '0';
 470         next_char();
 471         return value;
 472 }
 473
 474 static int parse_hex_sequence(void)
 475 {
 476         int value = 0;
 477         while(1) {
 478                 if (c >= '0' && c <= '9') {
 479                         value = 16 * value + c - '0';
 480                 } else if ('A' <= c && c <= 'F') {
 481                         value = 16 * value + c - 'A' + 10;
 482                 } else if ('a' <= c && c <= 'f') {
 483                         value = 16 * value + c - 'a' + 10;
 484                 } else {
 485                         break;
 486                 }
 487                 next_char();
 488         }
 489
 490         return value;
 491 }
 492
 493 static int parse_escape_sequence(void)
 494 {
 495         eat('\\');
 496
 497         int ec = c;
 498         next_char();
 499
 500         switch(ec) {
 501         case '"':  return '"';
 502         case '\'': return '\'';
 503         case '\\': return '\\';
 504         case '?': return '\?';
 505         case 'a': return '\a';
 506         case 'b': return '\b';
 507         case 'f': return '\f';
 508         case 'n': return '\n';
 509         case 'r': return '\r';
 510         case 't': return '\t';
 511         case 'v': return '\v';
 512         case 'x':
 513                 return parse_hex_sequence();
 514         case '0':
 515         case '1':
 516         case '2':
 517         case '3':
 518         case '4':
 519         case '5':
 520         case '6':
 521         case '7':
 522                 return parse_octal_sequence(ec);
 523         case EOF:
 524                 parse_error("reached end of file while parsing escape sequence");
 525                 return EOF;
 526         default:
 527                 parse_error("unknown escape sequence");
 528                 return EOF;
 529         }
 530 }
 531
 532 const char *concat_strings(const char *s1, const char *s2)
 533 {
 534         size_t  len1   = strlen(s1);
 535         size_t  len2   = strlen(s2);
 536
 537         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 538         memcpy(concat, s1, len1);
 539         memcpy(concat + len1, s2, len2 + 1);
 540
 541         const char *result = strset_insert(&stringset, concat);
 542         if(result != concat) {
 543                 obstack_free(&symbol_obstack, concat);
 544         }
 545
 546         return result;
 547 }
 548
 549 static void parse_string_literal(void)
 550 {
 551         unsigned    start_linenr = lexer_token.source_position.linenr;
 552         char       *string;
 553         const char *result;
 554
 555         assert(c == '"');
 556         next_char();
 557
 558         int tc;
 559         while(1) {
 560                 switch(c) {
 561                 case '\\':
 562                         tc = parse_escape_sequence();
 563                         obstack_1grow(&symbol_obstack, tc);
 564                         break;
 565
 566                 case EOF:
 567                         error_prefix_at(lexer_token.source_position.input_name,
 568                                         start_linenr);
 569                         fprintf(stderr, "string has no end\n");
 570                         lexer_token.type = T_ERROR;
 571                         return;
 572
 573                 case '"':
 574                         next_char();
 575                         goto end_of_string;
 576
 577                 default:
 578                         obstack_1grow(&symbol_obstack, c);
 579                         next_char();
 580                         break;
 581                 }
 582         }
 583
 584 end_of_string:
 585
 586         /* TODO: concatenate multiple strings separated by whitespace... */
 587
 588         /* add finishing 0 to the string */
 589         obstack_1grow(&symbol_obstack, '\0');
 590         string = obstack_finish(&symbol_obstack);
 591
 592         /* check if there is already a copy of the string */
 593         result = strset_insert(&stringset, string);
 594         if(result != string) {
 595                 obstack_free(&symbol_obstack, string);
 596         }
 597
 598         lexer_token.type     = T_STRING_LITERAL;
 599         lexer_token.v.string = result;
 600 }
 601
 602 static void parse_character_constant(void)
 603 {
 604         eat('\'');
 605
 606         int found_char = 0;
 607         while(1) {
 608                 switch(c) {
 609                 case '\\':
 610                         found_char = parse_escape_sequence();
 611                         break;
 612
 613                 MATCH_NEWLINE(
 614                         parse_error("newline while parsing character constant");
 615                         break;
 616                 )
 617
 618                 case '\'':
 619                         next_char();
 620                         goto end_of_char_constant;
 621
 622                 case EOF:
 623                         parse_error("EOF while parsing character constant");
 624                         lexer_token.type = T_ERROR;
 625                         return;
 626
 627                 default:
 628                         if(found_char != 0) {
 629                                 parse_error("more than 1 characters in character "
 630                                             "constant");
 631                                 goto end_of_char_constant;
 632                         } else {
 633                                 found_char = c;
 634                                 next_char();
 635                         }
 636                         break;
 637                 }
 638         }
 639
 640 end_of_char_constant:
 641         lexer_token.type       = T_INTEGER;
 642         lexer_token.v.intvalue = found_char;
 643 }
 644
 645 static void skip_multiline_comment(void)
 646 {
 647         unsigned start_linenr = lexer_token.source_position.linenr;
 648
 649         while(1) {
 650                 switch(c) {
 651                 case '*':
 652                         next_char();
 653                         if(c == '/') {
 654                                 next_char();
 655                                 return;
 656                         }
 657                         break;
 658
 659                 MATCH_NEWLINE(break;)
 660
 661                 case EOF:
 662                         error_prefix_at(lexer_token.source_position.input_name,
 663                                         start_linenr);
 664                         fprintf(stderr, "at end of file while looking for comment end\n");
 665                         return;
 666
 667                 default:
 668                         next_char();
 669                         break;
 670                 }
 671         }
 672 }
 673
 674 static void skip_line_comment(void)
 675 {
 676         while(1) {
 677                 switch(c) {
 678                 case EOF:
 679                         return;
 680
 681                 case '\n':
 682                 case '\r':
 683                         return;
 684
 685                 default:
 686                         next_char();
 687                         break;
 688                 }
 689         }
 690 }
 691
 692 static token_t pp_token;
 693
 694 static inline void next_pp_token(void)
 695 {
 696         lexer_next_preprocessing_token();
 697         pp_token = lexer_token;
 698 }
 699
 700 static void eat_until_newline(void)
 701 {
 702         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 703                 next_pp_token();
 704         }
 705 }
 706
 707 static void error_directive(void)
 708 {
 709         error_prefix();
 710         fprintf(stderr, "#error directive: \n");
 711
 712         /* parse pp-tokens until new-line */
 713 }
 714
 715 static void define_directive(void)
 716 {
 717         lexer_next_preprocessing_token();
 718         if(lexer_token.type != T_IDENTIFIER) {
 719                 parse_error("expected identifier after #define\n");
 720                 eat_until_newline();
 721         }
 722 }
 723
 724 static void ifdef_directive(int is_ifndef)
 725 {
 726         (void) is_ifndef;
 727         lexer_next_preprocessing_token();
 728         //expect_identifier();
 729         //extect_newline();
 730 }
 731
 732 static void endif_directive(void)
 733 {
 734         //expect_newline();
 735 }
 736
 737 static void parse_line_directive(void)
 738 {
 739         if(pp_token.type != T_INTEGER) {
 740                 parse_error("expected integer");
 741         } else {
 742                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
 743                 next_pp_token();
 744         }
 745         if(pp_token.type == T_STRING_LITERAL) {
 746                 lexer_token.source_position.input_name = pp_token.v.string;
 747                 next_pp_token();
 748         }
 749
 750         eat_until_newline();
 751 }
 752
 753 static void parse_preprocessor_identifier(void)
 754 {
 755         assert(pp_token.type == T_IDENTIFIER);
 756         symbol_t *symbol = pp_token.v.symbol;
 757
 758         switch(symbol->pp_ID) {
 759         case TP_include:
 760                 printf("include - enable header name parsing!\n");
 761                 break;
 762         case TP_define:
 763                 define_directive();
 764                 break;
 765         case TP_ifdef:
 766                 ifdef_directive(0);
 767                 break;
 768         case TP_ifndef:
 769                 ifdef_directive(1);
 770                 break;
 771         case TP_endif:
 772                 endif_directive();
 773                 break;
 774         case TP_line:
 775                 next_pp_token();
 776                 parse_line_directive();
 777                 break;
 778         case TP_if:
 779         case TP_else:
 780         case TP_elif:
 781         case TP_undef:
 782         case TP_error:
 783                 error_directive();
 784                 break;
 785         case TP_pragma:
 786                 break;
 787         }
 788 }
 789
 790 static void parse_preprocessor_directive(void)
 791 {
 792         next_pp_token();
 793
 794         switch(pp_token.type) {
 795         case T_IDENTIFIER:
 796                 parse_preprocessor_identifier();
 797                 break;
 798         case T_INTEGER:
 799                 parse_line_directive();
 800                 break;
 801         default:
 802                 parse_error("invalid preprocessor directive");
 803                 eat_until_newline();
 804                 break;
 805         }
 806 }
 807
/*
 * Helper macros for lexing operators that consist of several characters.
 * They are meant to be used together inside a case of a switch over c and
 * deliberately contain unbalanced braces: MAYBE_PROLOG opens a while(1) loop
 * and a switch, ELSE_CODE / ELSE close both again.
 *
 * MAYBE_PROLOG consumes the current character and opens a switch on the
 * character that follows it.
 */
 808 #define MAYBE_PROLOG                                       \
 809                         next_char();                                   \
 810                         while(1) {                                     \
 811                                 switch(c) {
 812
/* If the following character is `ch`: consume it, set the token type to
 * `set_type` and return from the enclosing function. */
 813 #define MAYBE(ch, set_type)                                \
 814                                 case ch:                                   \
 815                                         next_char();                           \
 816                                         lexer_token.type = set_type;           \
 817                                         return;
 818
/* Default label: run `code` when no MAYBE matched, then close the switch and
 * the loop opened by MAYBE_PROLOG. The trailing break belongs to the case of
 * the surrounding switch. */
 819 #define ELSE_CODE(code)                                    \
 820                                 default:                                   \
 821                                         code;                                  \
 822                                 }                                          \
 823                         } /* end of while(1) */                        \
 824                         break;
 825
/* Common form of ELSE_CODE: set the token type to `set_type` and return
 * without consuming the character that follows. */
 826 #define ELSE(set_type)                                     \
 827                 ELSE_CODE(                                         \
 828                         lexer_token.type = set_type;                   \
 829                         return;                                        \
 830                 )
 831
 832 void lexer_next_preprocessing_token(void)
 833 {
 834         while(1) {
 835                 switch(c) {
 836                 case ' ':
 837                 case '\t':
 838                         next_char();
 839                         break;
 840
 841                 MATCH_NEWLINE(
 842                         lexer_token.type = '\n';
 843                         return;
 844                 )
 845
 846                 SYMBOL_CHARS
 847                         parse_symbol();
 848                         /* might be a wide string ( L"string" ) */
 849                         if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
 850                            lexer_token.v.symbol == symbol_L)) {
 851                                 parse_string_literal();
 852                                 return;
 853                         }
 854                         return;
 855
 856                 DIGITS
 857                         parse_number();
 858                         return;
 859
 860                 case '"':
 861                         parse_string_literal();
 862                         return;
 863
 864                 case '\'':
 865                         parse_character_constant();
 866                         return;
 867
 868                 case '.':
 869                         MAYBE_PROLOG
 870                                 case '.':
 871                                         MAYBE_PROLOG
 872                                         MAYBE('.', T_DOTDOTDOT)
 873                                         ELSE_CODE(
 874                                                 put_back(c);
 875                                                 c = '.';
 876                                                 lexer_token.type = '.';
 877                                                 return;
 878                                         )
 879                         ELSE('.')
 880                 case '&':
 881                         MAYBE_PROLOG
 882                         MAYBE('&', T_ANDAND)
 883                         MAYBE('=', T_ANDEQUAL)
 884                         ELSE('&')
 885                 case '*':
 886                         MAYBE_PROLOG
 887                         MAYBE('=', T_ASTERISKEQUAL)
 888                         ELSE('*')
 889                 case '+':
 890                         MAYBE_PROLOG
 891                         MAYBE('+', T_PLUSPLUS)
 892                         MAYBE('=', T_PLUSEQUAL)
 893                         ELSE('+')
 894                 case '-':
 895                         MAYBE_PROLOG
 896                         MAYBE('>', T_MINUSGREATER)
 897                         MAYBE('-', T_MINUSMINUS)
 898                         MAYBE('=', T_MINUSEQUAL)
 899                         ELSE('-')
 900                 case '!':
 901                         MAYBE_PROLOG
 902                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
 903                         ELSE('!')
 904                 case '/':
 905                         MAYBE_PROLOG
 906                         MAYBE('=', T_SLASHEQUAL)
 907                                 case '*':
 908                                         next_char();
 909                                         skip_multiline_comment();
 910                                         lexer_next_preprocessing_token();
 911                                         return;
 912                                 case '/':
 913                                         next_char();
 914                                         skip_line_comment();
 915                                         lexer_next_preprocessing_token();
 916                                         return;
 917                         ELSE('/')
 918                 case '%':
 919                         MAYBE_PROLOG
 920                         MAYBE('>', T_PERCENTGREATER)
 921                         MAYBE('=', T_PERCENTEQUAL)
 922                                 case ':':
 923                                         MAYBE_PROLOG
 924                                                 case '%':
 925                                                         MAYBE_PROLOG
 926                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
 927                                                         ELSE_CODE(
 928                                                                 put_back(c);
 929                                                                 c = '%';
 930                                                                 lexer_token.type = T_PERCENTCOLON;
 931                                                                 return;
 932                                                         )
 933                                         ELSE(T_PERCENTCOLON)
 934                         ELSE('%')
 935                 case '<':
 936                         MAYBE_PROLOG
 937                         MAYBE(':', T_LESSCOLON)
 938                         MAYBE('%', T_LESSPERCENT)
 939                         MAYBE('=', T_LESSEQUAL)
 940                                 case '<':
 941                                         MAYBE_PROLOG
 942                                         MAYBE('=', T_LESSLESSEQUAL)
 943                                         ELSE(T_LESSLESS)
 944                         ELSE('<')
 945                 case '>':
 946                         MAYBE_PROLOG
 947                         MAYBE('=', T_GREATEREQUAL)
 948                                 case '>':
 949                                         MAYBE_PROLOG
 950                                         MAYBE('=', T_GREATERGREATEREQUAL)
 951                                         ELSE(T_GREATERGREATER)
 952                         ELSE('>')
 953                 case '^':
 954                         MAYBE_PROLOG
 955                         MAYBE('=', T_CARETEQUAL)
 956                         ELSE('^')
 957                 case '|':
 958                         MAYBE_PROLOG
 959                         MAYBE('=', T_PIPEEQUAL)
 960                         MAYBE('|', T_PIPEPIPE)
 961                         ELSE('|')
 962                 case ':':
 963                         MAYBE_PROLOG
 964                         MAYBE('>', T_COLONGREATER)
 965                         ELSE(':')
 966                 case '=':
 967                         MAYBE_PROLOG
 968                         MAYBE('=', T_EQUALEQUAL)
 969                         ELSE('=')
 970                 case '#':
 971                         MAYBE_PROLOG
 972                         MAYBE('#', T_HASHHASH)
 973                         ELSE('#')
 974
 975                 case '?':
 976                 case '[':
 977                 case ']':
 978                 case '(':
 979                 case ')':
 980                 case '{':
 981                 case '}':
 982                 case '~':
 983                 case ';':
 984                 case ',':
 985                 case '\\':
 986                         lexer_token.type = c;
 987                         next_char();
 988                         return;
 989
 990                 case EOF:
 991                         lexer_token.type = T_EOF;
 992                         return;
 993
 994                 default:
 995                         next_char();
 996                         error_prefix();
 997                         fprintf(stderr, "unknown character '%c' found\n", c);
 998                         lexer_token.type = T_ERROR;
 999                         return;
1000                 }
1001         }
1002 }
1003
1004 void lexer_next_token(void)
1005 {
1006         lexer_next_preprocessing_token();
1007         if(lexer_token.type != '\n')
1008                 return;
1009
1010 newline_found:
1011         do {
1012                 lexer_next_preprocessing_token();
1013         } while(lexer_token.type == '\n');
1014
1015         if(lexer_token.type == '#') {
1016                 parse_preprocessor_directive();
1017                 goto newline_found;
1018         }
1019 }
1020
1021 void init_lexer(void)
1022 {
1023         strset_init(&stringset);
1024 }
1025
1026 void lexer_open_stream(FILE *stream, const char *input_name)
1027 {
1028         input                                  = stream;
1029         lexer_token.source_position.linenr     = 0;
1030         lexer_token.source_position.input_name = input_name;
1031
1032         symbol_L = symbol_table_insert("L");
1033
1034         /* place a virtual \n at the beginning so the lexer knows that we're
1035          * at the beginning of a line */
1036         c = '\n';
1037 }
1038
1039 void exit_lexer(void)
1040 {
1041         strset_destroy(&stringset);
1042 }
1043
1044 static __attribute__((unused))
1045 void dbg_pos(const source_position_t source_position)
1046 {
1047         fprintf(stdout, "%s:%d\n", source_position.input_name,
1048                 source_position.linenr);
1049         fflush(stdout);
1050 }