nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7 #include "adt/strset.h"
   8 #include "adt/util.h"
   9 #include "type_t.h"
  10
  11 #include <assert.h>
  12 #include <errno.h>
  13 #include <string.h>
  14 #include <stdbool.h>
  15 #include <ctype.h>
  16
  17 //#define DEBUG_CHARS
  18 #define MAX_PUTBACK 3
  19
  20 static int         c;
  21 token_t            lexer_token;
  22 symbol_t          *symbol_L;
  23 static FILE       *input;
  24 static char        buf[1024 + MAX_PUTBACK];
  25 static const char *bufend;
  26 static const char *bufpos;
  27 static strset_t    stringset;
  28
  29 static type_t     *type_int        = NULL;
  30 static type_t     *type_uint       = NULL;
  31 static type_t     *type_long       = NULL;
  32 static type_t     *type_ulong      = NULL;
  33 static type_t     *type_longlong   = NULL;
  34 static type_t     *type_ulonglong  = NULL;
  35 static type_t     *type_float      = NULL;
  36 static type_t     *type_double     = NULL;
  37 static type_t     *type_longdouble = NULL;
  38
  39 static void error_prefix_at(const char *input_name, unsigned linenr)
  40 {
  41         fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
  42 }
  43
  44 static void error_prefix(void)
  45 {
  46         error_prefix_at(lexer_token.source_position.input_name,
  47                         lexer_token.source_position.linenr);
  48 }
  49
  50 static void parse_error(const char *msg)
  51 {
  52         error_prefix();
  53         fprintf(stderr, "%s\n", msg);
  54 }
  55
  56 static inline void next_real_char(void)
  57 {
  58         bufpos++;
  59         if(bufpos >= bufend) {
  60                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  61                                  input);
  62                 if(s == 0) {
  63                         c = EOF;
  64                         return;
  65                 }
  66                 bufpos = buf + MAX_PUTBACK;
  67                 bufend = buf + MAX_PUTBACK + s;
  68         }
  69         c = *(bufpos);
  70 }
  71
  72 static inline void put_back(int pc)
  73 {
  74         assert(bufpos >= buf);
  75         //assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
  76
  77         char *p = buf + (bufpos - buf);
  78         *p = pc;
  79
  80         /* going backwards in the buffer is legal as long as it's not more often
  81          * than MAX_PUTBACK */
  82         bufpos--;
  83
  84 #ifdef DEBUG_CHARS
  85         printf("putback '%c'\n", pc);
  86 #endif
  87 }
  88
  89 static inline void next_char(void);
  90
  91 #define MATCH_NEWLINE(code)                   \
  92         case '\r':                                \
  93                 next_char();                          \
  94                 if(c == '\n') {                       \
  95                         next_char();                      \
  96                 }                                     \
  97                 lexer_token.source_position.linenr++; \
  98                 code;                                 \
  99         case '\n':                                \
 100                 next_char();                          \
 101                 lexer_token.source_position.linenr++; \
 102                 code;
 103
 104 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 105
 106 static void maybe_concat_lines(void)
 107 {
 108         eat('\\');
 109
 110         switch(c) {
 111         MATCH_NEWLINE(return;)
 112
 113         default:
 114                 break;
 115         }
 116
 117         put_back(c);
 118         c = '\\';
 119 }
 120
 121 static inline void next_char(void)
 122 {
 123         next_real_char();
 124
 125         /* filter trigraphs */
 126         if(UNLIKELY(c == '\\')) {
 127                 maybe_concat_lines();
 128                 goto end_of_next_char;
 129         }
 130
 131         if(LIKELY(c != '?'))
 132                 goto end_of_next_char;
 133
 134         next_real_char();
 135         if(LIKELY(c != '?')) {
 136                 put_back(c);
 137                 c = '?';
 138                 goto end_of_next_char;
 139         }
 140
 141         next_real_char();
 142         switch(c) {
 143         case '=': c = '#'; break;
 144         case '(': c = '['; break;
 145         case '/': c = '\\'; maybe_concat_lines(); break;
 146         case ')': c = ']'; break;
 147         case '\'': c = '^'; break;
 148         case '<': c = '{'; break;
 149         case '!': c = '|'; break;
 150         case '>': c = '}'; break;
 151         case '-': c = '~'; break;
 152         default:
 153                 put_back('?');
 154                 put_back(c);
 155                 c = '?';
 156                 break;
 157         }
 158
 159 end_of_next_char:;
 160 #ifdef DEBUG_CHARS
 161         printf("nchar '%c'\n", c);
 162 #endif
 163 }
 164
 165 #define SYMBOL_CHARS  \
 166         case 'a':         \
 167         case 'b':         \
 168         case 'c':         \
 169         case 'd':         \
 170         case 'e':         \
 171         case 'f':         \
 172         case 'g':         \
 173         case 'h':         \
 174         case 'i':         \
 175         case 'j':         \
 176         case 'k':         \
 177         case 'l':         \
 178         case 'm':         \
 179         case 'n':         \
 180         case 'o':         \
 181         case 'p':         \
 182         case 'q':         \
 183         case 'r':         \
 184         case 's':         \
 185         case 't':         \
 186         case 'u':         \
 187         case 'v':         \
 188         case 'w':         \
 189         case 'x':         \
 190         case 'y':         \
 191         case 'z':         \
 192         case 'A':         \
 193         case 'B':         \
 194         case 'C':         \
 195         case 'D':         \
 196         case 'E':         \
 197         case 'F':         \
 198         case 'G':         \
 199         case 'H':         \
 200         case 'I':         \
 201         case 'J':         \
 202         case 'K':         \
 203         case 'L':         \
 204         case 'M':         \
 205         case 'N':         \
 206         case 'O':         \
 207         case 'P':         \
 208         case 'Q':         \
 209         case 'R':         \
 210         case 'S':         \
 211         case 'T':         \
 212         case 'U':         \
 213         case 'V':         \
 214         case 'W':         \
 215         case 'X':         \
 216         case 'Y':         \
 217         case 'Z':         \
 218         case '_':
 219
 220 #define DIGITS        \
 221         case '0':         \
 222         case '1':         \
 223         case '2':         \
 224         case '3':         \
 225         case '4':         \
 226         case '5':         \
 227         case '6':         \
 228         case '7':         \
 229         case '8':         \
 230         case '9':
 231
 232 static void parse_symbol(void)
 233 {
 234         symbol_t *symbol;
 235         char     *string;
 236
 237         obstack_1grow(&symbol_obstack, c);
 238         next_char();
 239
 240         while(1) {
 241                 switch(c) {
 242                 DIGITS
 243                 SYMBOL_CHARS
 244                         obstack_1grow(&symbol_obstack, c);
 245                         next_char();
 246                         break;
 247
 248                 default:
 249                         goto end_symbol;
 250                 }
 251         }
 252
 253 end_symbol:
 254         obstack_1grow(&symbol_obstack, '\0');
 255
 256         string = obstack_finish(&symbol_obstack);
 257         symbol = symbol_table_insert(string);
 258
 259         lexer_token.type     = symbol->ID;
 260         lexer_token.v.symbol = symbol;
 261
 262         if(symbol->string != string) {
 263                 obstack_free(&symbol_obstack, string);
 264         }
 265 }
 266
 267 static void parse_integer_suffix(void)
 268 {
 269         if(c == 'U' || c == 'U') {
 270                 next_char();
 271                 if(c == 'L' || c == 'l') {
 272                         next_char();
 273                         if(c == 'L' || c == 'l') {
 274                                 next_char();
 275                                 lexer_token.datatype = type_ulonglong;
 276                         } else {
 277                                 lexer_token.datatype = type_ulong;
 278                         }
 279                 } else {
 280                         lexer_token.datatype = type_uint;
 281                 }
 282         } else if(c == 'l' || c == 'L') {
 283                 next_char();
 284                 if(c == 'l' || c == 'L') {
 285                         next_char();
 286                         if(c == 'u' || c == 'U') {
 287                                 next_char();
 288                                 lexer_token.datatype = type_ulonglong;
 289                         } else {
 290                                 lexer_token.datatype = type_longlong;
 291                         }
 292                 } else if(c == 'u' || c == 'U') {
 293                         next_char();
 294                         lexer_token.datatype = type_ulong;
 295                 } else {
 296                         lexer_token.datatype = type_int;
 297                 }
 298         } else {
 299                 lexer_token.datatype = type_int;
 300         }
 301 }
 302
 303 static void parse_floating_suffix(void)
 304 {
 305         switch(c) {
 306         /* TODO: do something usefull with the suffixes... */
 307         case 'f':
 308         case 'F':
 309                 next_char();
 310                 lexer_token.datatype = type_float;
 311                 break;
 312         case 'l':
 313         case 'L':
 314                 next_char();
 315                 lexer_token.datatype = type_longdouble;
 316                 break;
 317         default:
 318                 lexer_token.datatype = type_double;
 319                 break;
 320         }
 321 }
 322
 323 static inline bool is_hex_digit(int c)
 324 {
 325         return (c >= '0' && c <= '9')
 326                         || (c >= 'a' && c <= 'f')
 327                         || (c >= 'A' && c <= 'F');
 328 }
 329
 330 static void parse_number_hex(void)
 331 {
 332         assert(c == 'x' || c == 'X');
 333         next_char();
 334
 335         while(is_hex_digit(c)) {
 336                 obstack_1grow(&symbol_obstack, c);
 337                 next_char();
 338         }
 339         obstack_1grow(&symbol_obstack, '\0');
 340         char *string = obstack_finish(&symbol_obstack);
 341
 342         if(c == '.' || c == 'p' || c == 'P') {
 343                 next_char();
 344                 panic("Hex floating point numbers not implemented yet");
 345         }
 346         if(*string == '\0') {
 347                 parse_error("invalid hex number");
 348                 lexer_token.type = T_ERROR;
 349         }
 350
 351         char *endptr;
 352         lexer_token.type       = T_INTEGER;
 353         lexer_token.v.intvalue = strtoull(string, &endptr, 16);
 354         if(*endptr != '\0') {
 355                 parse_error("hex number literal too long");
 356         }
 357
 358         obstack_free(&symbol_obstack, string);
 359         parse_integer_suffix();
 360 }
 361
 362 static inline bool is_octal_digit(int chr)
 363 {
 364         return '0' <= chr && chr <= '7';
 365 }
 366
 367 static void parse_number_oct(void)
 368 {
 369         while(is_octal_digit(c)) {
 370                 obstack_1grow(&symbol_obstack, c);
 371                 next_char();
 372         }
 373         obstack_1grow(&symbol_obstack, '\0');
 374         char *string = obstack_finish(&symbol_obstack);
 375
 376         char *endptr;
 377         lexer_token.type       = T_INTEGER;
 378         lexer_token.v.intvalue = strtoull(string, &endptr, 8);
 379         if(*endptr != '\0') {
 380                 parse_error("octal number literal too long");
 381         }
 382
 383         obstack_free(&symbol_obstack, string);
 384         parse_integer_suffix();
 385 }
 386
 387 static void parse_number_dec(void)
 388 {
 389         bool is_float = false;
 390         while(isdigit(c)) {
 391                 obstack_1grow(&symbol_obstack, c);
 392                 next_char();
 393         }
 394
 395         if(c == '.') {
 396                 obstack_1grow(&symbol_obstack, '.');
 397                 next_char();
 398
 399                 while(isdigit(c)) {
 400                         obstack_1grow(&symbol_obstack, c);
 401                         next_char();
 402                 }
 403                 is_float = true;
 404         }
 405         if(c == 'e' || c == 'E') {
 406                 obstack_1grow(&symbol_obstack, 'e');
 407                 next_char();
 408
 409                 if(c == '-' || c == '+') {
 410                         obstack_1grow(&symbol_obstack, c);
 411                         next_char();
 412                 }
 413
 414                 while(isdigit(c)) {
 415                         obstack_1grow(&symbol_obstack, c);
 416                         next_char();
 417                 }
 418                 is_float = true;
 419         }
 420
 421         obstack_1grow(&symbol_obstack, '\0');
 422         char *string = obstack_finish(&symbol_obstack);
 423
 424         char *endptr;
 425         if(is_float) {
 426                 lexer_token.type         = T_FLOATINGPOINT;
 427                 lexer_token.v.floatvalue = strtold(string, &endptr);
 428
 429                 if(*endptr != '\0') {
 430                         parse_error("invalid number literal");
 431                 }
 432
 433                 parse_floating_suffix();
 434         } else {
 435                 lexer_token.type       = T_INTEGER;
 436                 lexer_token.v.intvalue = strtoull(string, &endptr, 10);
 437
 438                 if(*endptr != '\0') {
 439                         parse_error("invalid number literal");
 440                 }
 441
 442                 parse_integer_suffix();
 443         }
 444         obstack_free(&symbol_obstack, string);
 445 }
 446
 447 static void parse_number(void)
 448 {
 449         if (c == '0') {
 450                 next_char();
 451                 switch (c) {
 452                         case 'X':
 453                         case 'x':
 454                                 parse_number_hex();
 455                                 break;
 456                         case '0':
 457                         case '1':
 458                         case '2':
 459                         case '3':
 460                         case '4':
 461                         case '5':
 462                         case '6':
 463                         case '7':
 464                                 parse_number_oct();
 465                                 break;
 466                         case '8':
 467                         case '9':
 468                                 next_char();
 469                                 parse_error("invalid octal number");
 470                                 lexer_token.type = T_ERROR;
 471                                 return;
 472                         case '.':
 473                         case 'e':
 474                         case 'E':
 475                         default:
 476                                 obstack_1grow(&symbol_obstack, '0');
 477                                 parse_number_dec();
 478                                 return;
 479                 }
 480         } else {
 481                 parse_number_dec();
 482         }
 483 }
 484
 485 static int parse_octal_sequence(const int first_digit)
 486 {
 487         assert(is_octal_digit(first_digit));
 488         int value = first_digit - '0';
 489         if (!is_octal_digit(c)) return value;
 490         value = 8 * value + c - '0';
 491         next_char();
 492         if (!is_octal_digit(c)) return value;
 493         value = 8 * value + c - '0';
 494         next_char();
 495         return value;
 496 }
 497
 498 static int parse_hex_sequence(void)
 499 {
 500         int value = 0;
 501         while(1) {
 502                 if (c >= '0' && c <= '9') {
 503                         value = 16 * value + c - '0';
 504                 } else if ('A' <= c && c <= 'F') {
 505                         value = 16 * value + c - 'A' + 10;
 506                 } else if ('a' <= c && c <= 'f') {
 507                         value = 16 * value + c - 'a' + 10;
 508                 } else {
 509                         break;
 510                 }
 511                 next_char();
 512         }
 513
 514         return value;
 515 }
 516
 517 static int parse_escape_sequence(void)
 518 {
 519         eat('\\');
 520
 521         int ec = c;
 522         next_char();
 523
 524         switch(ec) {
 525         case '"':  return '"';
 526         case '\'': return '\'';
 527         case '\\': return '\\';
 528         case '?': return '\?';
 529         case 'a': return '\a';
 530         case 'b': return '\b';
 531         case 'f': return '\f';
 532         case 'n': return '\n';
 533         case 'r': return '\r';
 534         case 't': return '\t';
 535         case 'v': return '\v';
 536         case 'x':
 537                 return parse_hex_sequence();
 538         case '0':
 539         case '1':
 540         case '2':
 541         case '3':
 542         case '4':
 543         case '5':
 544         case '6':
 545         case '7':
 546                 return parse_octal_sequence(ec);
 547         case EOF:
 548                 parse_error("reached end of file while parsing escape sequence");
 549                 return EOF;
 550         default:
 551                 parse_error("unknown escape sequence");
 552                 return EOF;
 553         }
 554 }
 555
 556 const char *concat_strings(const char *s1, const char *s2)
 557 {
 558         size_t  len1   = strlen(s1);
 559         size_t  len2   = strlen(s2);
 560
 561         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 562         memcpy(concat, s1, len1);
 563         memcpy(concat + len1, s2, len2 + 1);
 564
 565         const char *result = strset_insert(&stringset, concat);
 566         if(result != concat) {
 567                 obstack_free(&symbol_obstack, concat);
 568         }
 569
 570         return result;
 571 }
 572
 573 static void parse_string_literal(void)
 574 {
 575         unsigned    start_linenr = lexer_token.source_position.linenr;
 576         char       *string;
 577         const char *result;
 578
 579         assert(c == '"');
 580         next_char();
 581
 582         int tc;
 583         while(1) {
 584                 switch(c) {
 585                 case '\\':
 586                         tc = parse_escape_sequence();
 587                         obstack_1grow(&symbol_obstack, tc);
 588                         break;
 589
 590                 case EOF:
 591                         error_prefix_at(lexer_token.source_position.input_name,
 592                                         start_linenr);
 593                         fprintf(stderr, "string has no end\n");
 594                         lexer_token.type = T_ERROR;
 595                         return;
 596
 597                 case '"':
 598                         next_char();
 599                         goto end_of_string;
 600
 601                 default:
 602                         obstack_1grow(&symbol_obstack, c);
 603                         next_char();
 604                         break;
 605                 }
 606         }
 607
 608 end_of_string:
 609
 610         /* TODO: concatenate multiple strings separated by whitespace... */
 611
 612         /* add finishing 0 to the string */
 613         obstack_1grow(&symbol_obstack, '\0');
 614         string = obstack_finish(&symbol_obstack);
 615
 616         /* check if there is already a copy of the string */
 617         result = strset_insert(&stringset, string);
 618         if(result != string) {
 619                 obstack_free(&symbol_obstack, string);
 620         }
 621
 622         lexer_token.type     = T_STRING_LITERAL;
 623         lexer_token.v.string = result;
 624 }
 625
 626 static void parse_character_constant(void)
 627 {
 628         eat('\'');
 629
 630         int found_char = 0;
 631         while(1) {
 632                 switch(c) {
 633                 case '\\':
 634                         found_char = parse_escape_sequence();
 635                         break;
 636
 637                 MATCH_NEWLINE(
 638                         parse_error("newline while parsing character constant");
 639                         break;
 640                 )
 641
 642                 case '\'':
 643                         next_char();
 644                         goto end_of_char_constant;
 645
 646                 case EOF:
 647                         parse_error("EOF while parsing character constant");
 648                         lexer_token.type = T_ERROR;
 649                         return;
 650
 651                 default:
 652                         if(found_char != 0) {
 653                                 parse_error("more than 1 characters in character "
 654                                             "constant");
 655                                 goto end_of_char_constant;
 656                         } else {
 657                                 found_char = c;
 658                                 next_char();
 659                         }
 660                         break;
 661                 }
 662         }
 663
 664 end_of_char_constant:
 665         lexer_token.type       = T_INTEGER;
 666         lexer_token.v.intvalue = found_char;
 667 }
 668
 669 static void skip_multiline_comment(void)
 670 {
 671         unsigned start_linenr = lexer_token.source_position.linenr;
 672
 673         while(1) {
 674                 switch(c) {
 675                 case '*':
 676                         next_char();
 677                         if(c == '/') {
 678                                 next_char();
 679                                 return;
 680                         }
 681                         break;
 682
 683                 MATCH_NEWLINE(break;)
 684
 685                 case EOF:
 686                         error_prefix_at(lexer_token.source_position.input_name,
 687                                         start_linenr);
 688                         fprintf(stderr, "at end of file while looking for comment end\n");
 689                         return;
 690
 691                 default:
 692                         next_char();
 693                         break;
 694                 }
 695         }
 696 }
 697
 698 static void skip_line_comment(void)
 699 {
 700         while(1) {
 701                 switch(c) {
 702                 case EOF:
 703                         return;
 704
 705                 case '\n':
 706                 case '\r':
 707                         return;
 708
 709                 default:
 710                         next_char();
 711                         break;
 712                 }
 713         }
 714 }
 715
 716 static token_t pp_token;
 717
 718 static inline void next_pp_token(void)
 719 {
 720         lexer_next_preprocessing_token();
 721         pp_token = lexer_token;
 722 }
 723
 724 static void eat_until_newline(void)
 725 {
 726         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 727                 next_pp_token();
 728         }
 729 }
 730
 731 static void error_directive(void)
 732 {
 733         error_prefix();
 734         fprintf(stderr, "#error directive: \n");
 735
 736         /* parse pp-tokens until new-line */
 737 }
 738
 739 static void define_directive(void)
 740 {
 741         lexer_next_preprocessing_token();
 742         if(lexer_token.type != T_IDENTIFIER) {
 743                 parse_error("expected identifier after #define\n");
 744                 eat_until_newline();
 745         }
 746 }
 747
 748 static void ifdef_directive(int is_ifndef)
 749 {
 750         (void) is_ifndef;
 751         lexer_next_preprocessing_token();
 752         //expect_identifier();
 753         //extect_newline();
 754 }
 755
 756 static void endif_directive(void)
 757 {
 758         //expect_newline();
 759 }
 760
 761 static void parse_line_directive(void)
 762 {
 763         if(pp_token.type != T_INTEGER) {
 764                 parse_error("expected integer");
 765         } else {
 766                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
 767                 next_pp_token();
 768         }
 769         if(pp_token.type == T_STRING_LITERAL) {
 770                 lexer_token.source_position.input_name = pp_token.v.string;
 771                 next_pp_token();
 772         }
 773
 774         eat_until_newline();
 775 }
 776
 777 static void parse_preprocessor_identifier(void)
 778 {
 779         assert(pp_token.type == T_IDENTIFIER);
 780         symbol_t *symbol = pp_token.v.symbol;
 781
 782         switch(symbol->pp_ID) {
 783         case TP_include:
 784                 printf("include - enable header name parsing!\n");
 785                 break;
 786         case TP_define:
 787                 define_directive();
 788                 break;
 789         case TP_ifdef:
 790                 ifdef_directive(0);
 791                 break;
 792         case TP_ifndef:
 793                 ifdef_directive(1);
 794                 break;
 795         case TP_endif:
 796                 endif_directive();
 797                 break;
 798         case TP_line:
 799                 next_pp_token();
 800                 parse_line_directive();
 801                 break;
 802         case TP_if:
 803         case TP_else:
 804         case TP_elif:
 805         case TP_undef:
 806         case TP_error:
 807                 error_directive();
 808                 break;
 809         case TP_pragma:
 810                 break;
 811         }
 812 }
 813
 814 static void parse_preprocessor_directive(void)
 815 {
 816         next_pp_token();
 817
 818         switch(pp_token.type) {
 819         case T_IDENTIFIER:
 820                 parse_preprocessor_identifier();
 821                 break;
 822         case T_INTEGER:
 823                 parse_line_directive();
 824                 break;
 825         default:
 826                 parse_error("invalid preprocessor directive");
 827                 eat_until_newline();
 828                 break;
 829         }
 830 }
 831
 832 #define MAYBE_PROLOG                                       \
 833                         next_char();                                   \
 834                         while(1) {                                     \
 835                                 switch(c) {
 836
 837 #define MAYBE(ch, set_type)                                \
 838                                 case ch:                                   \
 839                                         next_char();                           \
 840                                         lexer_token.type = set_type;           \
 841                                         return;
 842
 843 #define ELSE_CODE(code)                                    \
 844                                 default:                                   \
 845                                         code;                                  \
 846                                 }                                          \
 847                         } /* end of while(1) */                        \
 848                         break;
 849
 850 #define ELSE(set_type)                                     \
 851                 ELSE_CODE(                                         \
 852                         lexer_token.type = set_type;                   \
 853                         return;                                        \
 854                 )
 855
 856 void lexer_next_preprocessing_token(void)
 857 {
 858         while(1) {
 859                 switch(c) {
 860                 case ' ':
 861                 case '\t':
 862                         next_char();
 863                         break;
 864
 865                 MATCH_NEWLINE(
 866                         lexer_token.type = '\n';
 867                         return;
 868                 )
 869
 870                 SYMBOL_CHARS
 871                         parse_symbol();
 872                         /* might be a wide string ( L"string" ) */
 873                         if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
 874                            lexer_token.v.symbol == symbol_L)) {
 875                                 parse_string_literal();
 876                                 return;
 877                         }
 878                         return;
 879
 880                 DIGITS
 881                         parse_number();
 882                         return;
 883
 884                 case '"':
 885                         parse_string_literal();
 886                         return;
 887
 888                 case '\'':
 889                         parse_character_constant();
 890                         return;
 891
 892                 case '.':
 893                         MAYBE_PROLOG
 894                                 case '.':
 895                                         MAYBE_PROLOG
 896                                         MAYBE('.', T_DOTDOTDOT)
 897                                         ELSE_CODE(
 898                                                 put_back(c);
 899                                                 c = '.';
 900                                                 lexer_token.type = '.';
 901                                                 return;
 902                                         )
 903                         ELSE('.')
 904                 case '&':
 905                         MAYBE_PROLOG
 906                         MAYBE('&', T_ANDAND)
 907                         MAYBE('=', T_ANDEQUAL)
 908                         ELSE('&')
 909                 case '*':
 910                         MAYBE_PROLOG
 911                         MAYBE('=', T_ASTERISKEQUAL)
 912                         ELSE('*')
 913                 case '+':
 914                         MAYBE_PROLOG
 915                         MAYBE('+', T_PLUSPLUS)
 916                         MAYBE('=', T_PLUSEQUAL)
 917                         ELSE('+')
 918                 case '-':
 919                         MAYBE_PROLOG
 920                         MAYBE('>', T_MINUSGREATER)
 921                         MAYBE('-', T_MINUSMINUS)
 922                         MAYBE('=', T_MINUSEQUAL)
 923                         ELSE('-')
 924                 case '!':
 925                         MAYBE_PROLOG
 926                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
 927                         ELSE('!')
 928                 case '/':
 929                         MAYBE_PROLOG
 930                         MAYBE('=', T_SLASHEQUAL)
 931                                 case '*':
 932                                         next_char();
 933                                         skip_multiline_comment();
 934                                         lexer_next_preprocessing_token();
 935                                         return;
 936                                 case '/':
 937                                         next_char();
 938                                         skip_line_comment();
 939                                         lexer_next_preprocessing_token();
 940                                         return;
 941                         ELSE('/')
 942                 case '%':
 943                         MAYBE_PROLOG
 944                         MAYBE('>', T_PERCENTGREATER)
 945                         MAYBE('=', T_PERCENTEQUAL)
 946                                 case ':':
 947                                         MAYBE_PROLOG
 948                                                 case '%':
 949                                                         MAYBE_PROLOG
 950                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
 951                                                         ELSE_CODE(
 952                                                                 put_back(c);
 953                                                                 c = '%';
 954                                                                 lexer_token.type = T_PERCENTCOLON;
 955                                                                 return;
 956                                                         )
 957                                         ELSE(T_PERCENTCOLON)
 958                         ELSE('%')
 959                 case '<':
 960                         MAYBE_PROLOG
 961                         MAYBE(':', T_LESSCOLON)
 962                         MAYBE('%', T_LESSPERCENT)
 963                         MAYBE('=', T_LESSEQUAL)
 964                                 case '<':
 965                                         MAYBE_PROLOG
 966                                         MAYBE('=', T_LESSLESSEQUAL)
 967                                         ELSE(T_LESSLESS)
 968                         ELSE('<')
 969                 case '>':
 970                         MAYBE_PROLOG
 971                         MAYBE('=', T_GREATEREQUAL)
 972                                 case '>':
 973                                         MAYBE_PROLOG
 974                                         MAYBE('=', T_GREATERGREATEREQUAL)
 975                                         ELSE(T_GREATERGREATER)
 976                         ELSE('>')
 977                 case '^':
 978                         MAYBE_PROLOG
 979                         MAYBE('=', T_CARETEQUAL)
 980                         ELSE('^')
 981                 case '|':
 982                         MAYBE_PROLOG
 983                         MAYBE('=', T_PIPEEQUAL)
 984                         MAYBE('|', T_PIPEPIPE)
 985                         ELSE('|')
 986                 case ':':
 987                         MAYBE_PROLOG
 988                         MAYBE('>', T_COLONGREATER)
 989                         ELSE(':')
 990                 case '=':
 991                         MAYBE_PROLOG
 992                         MAYBE('=', T_EQUALEQUAL)
 993                         ELSE('=')
 994                 case '#':
 995                         MAYBE_PROLOG
 996                         MAYBE('#', T_HASHHASH)
 997                         ELSE('#')
 998
 999                 case '?':
1000                 case '[':
1001                 case ']':
1002                 case '(':
1003                 case ')':
1004                 case '{':
1005                 case '}':
1006                 case '~':
1007                 case ';':
1008                 case ',':
1009                 case '\\':
1010                         lexer_token.type = c;
1011                         next_char();
1012                         return;
1013
1014                 case EOF:
1015                         lexer_token.type = T_EOF;
1016                         return;
1017
1018                 default:
1019                         next_char();
1020                         error_prefix();
1021                         fprintf(stderr, "unknown character '%c' found\n", c);
1022                         lexer_token.type = T_ERROR;
1023                         return;
1024                 }
1025         }
1026 }
1027
1028 void lexer_next_token(void)
1029 {
1030         lexer_next_preprocessing_token();
1031         if(lexer_token.type != '\n')
1032                 return;
1033
1034 newline_found:
1035         do {
1036                 lexer_next_preprocessing_token();
1037         } while(lexer_token.type == '\n');
1038
1039         if(lexer_token.type == '#') {
1040                 parse_preprocessor_directive();
1041                 goto newline_found;
1042         }
1043 }
1044
1045 void init_lexer(void)
1046 {
1047         strset_init(&stringset);
1048
1049         type_int       = make_atomic_type(ATOMIC_TYPE_INT, TYPE_QUALIFIER_CONST);
1050         type_uint      = make_atomic_type(ATOMIC_TYPE_UINT, TYPE_QUALIFIER_CONST);
1051         type_long      = make_atomic_type(ATOMIC_TYPE_LONG, TYPE_QUALIFIER_CONST);
1052         type_ulong     = make_atomic_type(ATOMIC_TYPE_ULONG, TYPE_QUALIFIER_CONST);
1053         type_longlong  = make_atomic_type(ATOMIC_TYPE_LONGLONG,
1054                                           TYPE_QUALIFIER_CONST);
1055         type_ulonglong = make_atomic_type(ATOMIC_TYPE_ULONGLONG,
1056                                           TYPE_QUALIFIER_CONST);
1057
1058         type_float      = make_atomic_type(ATOMIC_TYPE_FLOAT, TYPE_QUALIFIER_CONST);
1059         type_double     = make_atomic_type(ATOMIC_TYPE_DOUBLE,
1060                                            TYPE_QUALIFIER_CONST);
1061         type_longdouble = make_atomic_type(ATOMIC_TYPE_LONG_DOUBLE,
1062                                            TYPE_QUALIFIER_CONST);
1063 }
1064
1065 void lexer_open_stream(FILE *stream, const char *input_name)
1066 {
1067         input                                  = stream;
1068         lexer_token.source_position.linenr     = 0;
1069         lexer_token.source_position.input_name = input_name;
1070
1071         symbol_L = symbol_table_insert("L");
1072
1073         /* place a virtual \n at the beginning so the lexer knows that we're
1074          * at the beginning of a line */
1075         c = '\n';
1076 }
1077
1078 void exit_lexer(void)
1079 {
1080         strset_destroy(&stringset);
1081 }
1082
1083 static __attribute__((unused))
1084 void dbg_pos(const source_position_t source_position)
1085 {
1086         fprintf(stdout, "%s:%d\n", source_position.input_name,
1087                 source_position.linenr);
1088         fflush(stdout);
1089 }