nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7 #include "adt/strset.h"
   8 #include "adt/util.h"
   9 #include "type_t.h"
  10 #include "target_architecture.h"
  11 #include "parser.h"
  12
  13 #include <assert.h>
  14 #include <errno.h>
  15 #include <string.h>
  16 #include <stdbool.h>
  17 #include <ctype.h>
  18
  19 //#define DEBUG_CHARS
  20 #define MAX_PUTBACK 3
  21
  22 #ifdef _WIN32
  23 /* No strtold on windows and no replacement yet */
  24 #define strtold(s, e) strtod(s, e)
  25 #endif
  26
  27 #if defined HAS_SIGNED_CHAR
  28 typedef signed char char_type;
  29 #elif defined HAS_UNSIGNED_CHAR
  30 typedef unsigned char char_type;
  31 #else
  32 #       error signedness of char not determined
  33 #endif
  34
  35 static int         c;
  36 token_t            lexer_token;
  37 symbol_t          *symbol_L;
  38 static FILE       *input;
  39 static char        buf[1024 + MAX_PUTBACK];
  40 static const char *bufend;
  41 static const char *bufpos;
  42 static strset_t    stringset;
  43
  44 static type_t     *type_int        = NULL;
  45 static type_t     *type_uint       = NULL;
  46 static type_t     *type_long       = NULL;
  47 static type_t     *type_ulong      = NULL;
  48 static type_t     *type_longlong   = NULL;
  49 static type_t     *type_ulonglong  = NULL;
  50 static type_t     *type_float      = NULL;
  51 static type_t     *type_double     = NULL;
  52 static type_t     *type_longdouble = NULL;
  53
  54 static void error_prefix_at(const char *input_name, unsigned linenr)
  55 {
  56         fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
  57 }
  58
  59 static void error_prefix(void)
  60 {
  61         error_prefix_at(lexer_token.source_position.input_name,
  62                         lexer_token.source_position.linenr);
  63 }
  64
  65 static void parse_error(const char *msg)
  66 {
  67         error_prefix();
  68         fprintf(stderr, "%s\n", msg);
  69 }
  70
  71 static inline void next_real_char(void)
  72 {
  73         bufpos++;
  74         if(bufpos >= bufend) {
  75                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  76                                  input);
  77                 if(s == 0) {
  78                         c = EOF;
  79                         return;
  80                 }
  81                 bufpos = buf + MAX_PUTBACK;
  82                 bufend = buf + MAX_PUTBACK + s;
  83         }
  84         c = *(bufpos);
  85 }
  86
  87 static inline void put_back(int pc)
  88 {
  89         assert(bufpos >= buf);
  90         //assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
  91
  92         char *p = buf + (bufpos - buf);
  93         *p = (char) pc;
  94
  95         /* going backwards in the buffer is legal as long as it's not more often
  96          * than MAX_PUTBACK */
  97         bufpos--;
  98
  99 #ifdef DEBUG_CHARS
 100         printf("putback '%c'\n", pc);
 101 #endif
 102 }
 103
 104 static inline void next_char(void);
 105
 106 #define MATCH_NEWLINE(code)                   \
 107         case '\r':                                \
 108                 next_char();                          \
 109                 if(c == '\n') {                       \
 110                         next_char();                      \
 111                 }                                     \
 112                 lexer_token.source_position.linenr++; \
 113                 code;                                 \
 114         case '\n':                                \
 115                 next_char();                          \
 116                 lexer_token.source_position.linenr++; \
 117                 code;
 118
 119 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 120
 121 static void maybe_concat_lines(void)
 122 {
 123         eat('\\');
 124
 125         switch(c) {
 126         MATCH_NEWLINE(return;)
 127
 128         default:
 129                 break;
 130         }
 131
 132         put_back(c);
 133         c = '\\';
 134 }
 135
 136 static inline void next_char(void)
 137 {
 138         next_real_char();
 139
 140         /* filter trigraphs */
 141         if(UNLIKELY(c == '\\')) {
 142                 maybe_concat_lines();
 143                 goto end_of_next_char;
 144         }
 145
 146         if(LIKELY(c != '?'))
 147                 goto end_of_next_char;
 148
 149         next_real_char();
 150         if(LIKELY(c != '?')) {
 151                 put_back(c);
 152                 c = '?';
 153                 goto end_of_next_char;
 154         }
 155
 156         next_real_char();
 157         switch(c) {
 158         case '=': c = '#'; break;
 159         case '(': c = '['; break;
 160         case '/': c = '\\'; maybe_concat_lines(); break;
 161         case ')': c = ']'; break;
 162         case '\'': c = '^'; break;
 163         case '<': c = '{'; break;
 164         case '!': c = '|'; break;
 165         case '>': c = '}'; break;
 166         case '-': c = '~'; break;
 167         default:
 168                 put_back('?');
 169                 put_back(c);
 170                 c = '?';
 171                 break;
 172         }
 173
 174 end_of_next_char:;
 175 #ifdef DEBUG_CHARS
 176         printf("nchar '%c'\n", c);
 177 #endif
 178 }
 179
 180 #define SYMBOL_CHARS  \
 181         case 'a':         \
 182         case 'b':         \
 183         case 'c':         \
 184         case 'd':         \
 185         case 'e':         \
 186         case 'f':         \
 187         case 'g':         \
 188         case 'h':         \
 189         case 'i':         \
 190         case 'j':         \
 191         case 'k':         \
 192         case 'l':         \
 193         case 'm':         \
 194         case 'n':         \
 195         case 'o':         \
 196         case 'p':         \
 197         case 'q':         \
 198         case 'r':         \
 199         case 's':         \
 200         case 't':         \
 201         case 'u':         \
 202         case 'v':         \
 203         case 'w':         \
 204         case 'x':         \
 205         case 'y':         \
 206         case 'z':         \
 207         case 'A':         \
 208         case 'B':         \
 209         case 'C':         \
 210         case 'D':         \
 211         case 'E':         \
 212         case 'F':         \
 213         case 'G':         \
 214         case 'H':         \
 215         case 'I':         \
 216         case 'J':         \
 217         case 'K':         \
 218         case 'L':         \
 219         case 'M':         \
 220         case 'N':         \
 221         case 'O':         \
 222         case 'P':         \
 223         case 'Q':         \
 224         case 'R':         \
 225         case 'S':         \
 226         case 'T':         \
 227         case 'U':         \
 228         case 'V':         \
 229         case 'W':         \
 230         case 'X':         \
 231         case 'Y':         \
 232         case 'Z':         \
 233         case '_':
 234
 235 #define DIGITS        \
 236         case '0':         \
 237         case '1':         \
 238         case '2':         \
 239         case '3':         \
 240         case '4':         \
 241         case '5':         \
 242         case '6':         \
 243         case '7':         \
 244         case '8':         \
 245         case '9':
 246
 247 static void parse_symbol(void)
 248 {
 249         symbol_t *symbol;
 250         char     *string;
 251
 252         obstack_1grow(&symbol_obstack, (char) c);
 253         next_char();
 254
 255         while(1) {
 256                 switch(c) {
 257                 DIGITS
 258                 SYMBOL_CHARS
 259                         obstack_1grow(&symbol_obstack, (char) c);
 260                         next_char();
 261                         break;
 262
 263                 default:
 264                         goto end_symbol;
 265                 }
 266         }
 267
 268 end_symbol:
 269         obstack_1grow(&symbol_obstack, '\0');
 270
 271         string = obstack_finish(&symbol_obstack);
 272         symbol = symbol_table_insert(string);
 273
 274         lexer_token.type     = symbol->ID;
 275         lexer_token.v.symbol = symbol;
 276
 277         if(symbol->string != string) {
 278                 obstack_free(&symbol_obstack, string);
 279         }
 280 }
 281
 282 static void parse_integer_suffix(bool is_oct_hex)
 283 {
 284         bool is_unsigned  = false;
 285         bool min_long     = false;
 286         bool min_longlong = false;
 287
 288         if(c == 'U' || c == 'u') {
 289                 is_unsigned = true;
 290                 next_char();
 291                 if(c == 'L' || c == 'l') {
 292                         min_long = true;
 293                         next_char();
 294                         if(c == 'L' || c == 'l') {
 295                                 min_longlong = true;
 296                                 next_char();
 297                         }
 298                 }
 299         } else if(c == 'l' || c == 'L') {
 300                 min_long = true;
 301                 next_char();
 302                 if(c == 'l' || c == 'L') {
 303                         min_longlong = true;
 304                         next_char();
 305                         if(c == 'u' || c == 'U') {
 306                                 is_unsigned = true;
 307                                 next_char();
 308                         }
 309                 } else if(c == 'u' || c == 'U') {
 310                         is_unsigned = true;
 311                         next_char();
 312                         lexer_token.datatype = type_ulong;
 313                 }
 314         }
 315
 316         if(!is_unsigned) {
 317                 long long v = lexer_token.v.intvalue;
 318                 if(!min_long) {
 319                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 320                                 lexer_token.datatype = type_int;
 321                                 return;
 322                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 323                                 lexer_token.datatype = type_uint;
 324                                 return;
 325                         }
 326                 }
 327                 if(!min_longlong) {
 328                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 329                                 lexer_token.datatype = type_long;
 330                                 return;
 331                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
 332                                 lexer_token.datatype = type_ulong;
 333                                 return;
 334                         }
 335                 }
 336                 unsigned long long uv = (unsigned long long) v;
 337                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 338                         lexer_token.datatype = type_ulonglong;
 339                         return;
 340                 }
 341
 342                 lexer_token.datatype = type_longlong;
 343         } else {
 344                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 345                 if(!min_long && v <= TARGET_UINT_MAX) {
 346                         lexer_token.datatype = type_uint;
 347                         return;
 348                 }
 349                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
 350                         lexer_token.datatype = type_ulong;
 351                         return;
 352                 }
 353                 lexer_token.datatype = type_ulonglong;
 354         }
 355 }
 356
 357 static void parse_floating_suffix(void)
 358 {
 359         switch(c) {
 360         /* TODO: do something usefull with the suffixes... */
 361         case 'f':
 362         case 'F':
 363                 next_char();
 364                 lexer_token.datatype = type_float;
 365                 break;
 366         case 'l':
 367         case 'L':
 368                 next_char();
 369                 lexer_token.datatype = type_longdouble;
 370                 break;
 371         default:
 372                 lexer_token.datatype = type_double;
 373                 break;
 374         }
 375 }
 376
 377 /**
 378  * A replacement for strtoull. Only those parts needed for
 379  * our parser are implemented.
 380  */
 381 static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
 382         unsigned long long v = 0;
 383
 384         switch (base) {
 385         case 16:
 386                 for (;; ++s) {
 387                         /* check for overrun */
 388                         if (v >= 0x1000000000000000ULL)
 389                                 break;
 390                         switch (tolower(*s)) {
 391                         case '0': v <<= 4; break;
 392                         case '1': v <<= 4; v |= 0x1; break;
 393                         case '2': v <<= 4; v |= 0x2; break;
 394                         case '3': v <<= 4; v |= 0x3; break;
 395                         case '4': v <<= 4; v |= 0x4; break;
 396                         case '5': v <<= 4; v |= 0x5; break;
 397                         case '6': v <<= 4; v |= 0x6; break;
 398                         case '7': v <<= 4; v |= 0x7; break;
 399                         case '8': v <<= 4; v |= 0x8; break;
 400                         case '9': v <<= 4; v |= 0x9; break;
 401                         case 'a': v <<= 4; v |= 0xa; break;
 402                         case 'b': v <<= 4; v |= 0xb; break;
 403                         case 'c': v <<= 4; v |= 0xc; break;
 404                         case 'd': v <<= 4; v |= 0xd; break;
 405                         case 'e': v <<= 4; v |= 0xe; break;
 406                         case 'f': v <<= 4; v |= 0xf; break;
 407                         default:
 408                                 goto end;
 409                         }
 410                 }
 411                 break;
 412         case 8:
 413                 for (;; ++s) {
 414                         /* check for overrun */
 415                         if (v >= 0x2000000000000000ULL)
 416                                 break;
 417                         switch (tolower(*s)) {
 418                         case '0': v <<= 3; break;
 419                         case '1': v <<= 3; v |= 1; break;
 420                         case '2': v <<= 3; v |= 2; break;
 421                         case '3': v <<= 3; v |= 3; break;
 422                         case '4': v <<= 3; v |= 4; break;
 423                         case '5': v <<= 3; v |= 5; break;
 424                         case '6': v <<= 3; v |= 6; break;
 425                         case '7': v <<= 3; v |= 7; break;
 426                         default:
 427                                 goto end;
 428                         }
 429                 }
 430                 break;
 431         case 10:
 432                 for (;; ++s) {
 433                         /* check for overrun */
 434                         if (v > 0x1999999999999999ULL)
 435                                 break;
 436                         switch (tolower(*s)) {
 437                         case '0': v *= 10; break;
 438                         case '1': v *= 10; v += 1; break;
 439                         case '2': v *= 10; v += 2; break;
 440                         case '3': v *= 10; v += 3; break;
 441                         case '4': v *= 10; v += 4; break;
 442                         case '5': v *= 10; v += 5; break;
 443                         case '6': v *= 10; v += 6; break;
 444                         case '7': v *= 10; v += 7; break;
 445                         case '8': v *= 10; v += 8; break;
 446                         case '9': v *= 10; v += 9; break;
 447                         default:
 448                                 goto end;
 449                         }
 450                 }
 451                 break;
 452         default:
 453                 assert(0);
 454                 break;
 455         }
 456 end:
 457         *endptr = s;
 458         return v;
 459 }
 460
 461 static void parse_number_hex(void)
 462 {
 463         assert(c == 'x' || c == 'X');
 464         next_char();
 465
 466         while(isxdigit(c)) {
 467                 obstack_1grow(&symbol_obstack, (char) c);
 468                 next_char();
 469         }
 470         obstack_1grow(&symbol_obstack, '\0');
 471         char *string = obstack_finish(&symbol_obstack);
 472
 473         if(c == '.' || c == 'p' || c == 'P') {
 474                 next_char();
 475                 panic("Hex floating point numbers not implemented yet");
 476         }
 477         if(*string == '\0') {
 478                 parse_error("invalid hex number");
 479                 lexer_token.type = T_ERROR;
 480         }
 481
 482         const char *endptr;
 483         lexer_token.type       = T_INTEGER;
 484         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
 485         if(*endptr != '\0') {
 486                 parse_error("hex number literal too long");
 487         }
 488
 489         obstack_free(&symbol_obstack, string);
 490         parse_integer_suffix(true);
 491 }
 492
 493 static inline bool is_octal_digit(int chr)
 494 {
 495         return '0' <= chr && chr <= '7';
 496 }
 497
 498 static void parse_number_oct(void)
 499 {
 500         while(is_octal_digit(c)) {
 501                 obstack_1grow(&symbol_obstack, (char) c);
 502                 next_char();
 503         }
 504         obstack_1grow(&symbol_obstack, '\0');
 505         char *string = obstack_finish(&symbol_obstack);
 506
 507         const char *endptr;
 508         lexer_token.type       = T_INTEGER;
 509         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 510         if(*endptr != '\0') {
 511                 parse_error("octal number literal too long");
 512         }
 513
 514         obstack_free(&symbol_obstack, string);
 515         parse_integer_suffix(true);
 516 }
 517
 518 static void parse_number_dec(void)
 519 {
 520         bool is_float = false;
 521         while(isdigit(c)) {
 522                 obstack_1grow(&symbol_obstack, (char) c);
 523                 next_char();
 524         }
 525
 526         if(c == '.') {
 527                 obstack_1grow(&symbol_obstack, '.');
 528                 next_char();
 529
 530                 while(isdigit(c)) {
 531                         obstack_1grow(&symbol_obstack, (char) c);
 532                         next_char();
 533                 }
 534                 is_float = true;
 535         }
 536         if(c == 'e' || c == 'E') {
 537                 obstack_1grow(&symbol_obstack, 'e');
 538                 next_char();
 539
 540                 if(c == '-' || c == '+') {
 541                         obstack_1grow(&symbol_obstack, (char) c);
 542                         next_char();
 543                 }
 544
 545                 while(isdigit(c)) {
 546                         obstack_1grow(&symbol_obstack, (char) c);
 547                         next_char();
 548                 }
 549                 is_float = true;
 550         }
 551
 552         obstack_1grow(&symbol_obstack, '\0');
 553         char *string = obstack_finish(&symbol_obstack);
 554
 555         if(is_float) {
 556                 char *endptr;
 557                 lexer_token.type         = T_FLOATINGPOINT;
 558                 lexer_token.v.floatvalue = strtold(string, &endptr);
 559
 560                 if(*endptr != '\0') {
 561                         parse_error("invalid number literal");
 562                 }
 563
 564                 parse_floating_suffix();
 565         } else {
 566                 const char *endptr;
 567                 lexer_token.type       = T_INTEGER;
 568                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 569
 570                 if(*endptr != '\0') {
 571                         parse_error("invalid number literal");
 572                 }
 573
 574                 parse_integer_suffix(false);
 575         }
 576         obstack_free(&symbol_obstack, string);
 577 }
 578
 579 static void parse_number(void)
 580 {
 581         if (c == '0') {
 582                 next_char();
 583                 switch (c) {
 584                         case 'X':
 585                         case 'x':
 586                                 parse_number_hex();
 587                                 break;
 588                         case '0':
 589                         case '1':
 590                         case '2':
 591                         case '3':
 592                         case '4':
 593                         case '5':
 594                         case '6':
 595                         case '7':
 596                                 parse_number_oct();
 597                                 break;
 598                         case '8':
 599                         case '9':
 600                                 next_char();
 601                                 parse_error("invalid octal number");
 602                                 lexer_token.type = T_ERROR;
 603                                 return;
 604                         case '.':
 605                         case 'e':
 606                         case 'E':
 607                         default:
 608                                 obstack_1grow(&symbol_obstack, '0');
 609                                 parse_number_dec();
 610                                 return;
 611                 }
 612         } else {
 613                 parse_number_dec();
 614         }
 615 }
 616
 617 static int parse_octal_sequence(const int first_digit)
 618 {
 619         assert(is_octal_digit(first_digit));
 620         int value = first_digit - '0';
 621         if (!is_octal_digit(c)) return value;
 622         value = 8 * value + c - '0';
 623         next_char();
 624         if (!is_octal_digit(c)) return value;
 625         value = 8 * value + c - '0';
 626         next_char();
 627         return (char_type)value;
 628 }
 629
 630 static int parse_hex_sequence(void)
 631 {
 632         int value = 0;
 633         while(1) {
 634                 if (c >= '0' && c <= '9') {
 635                         value = 16 * value + c - '0';
 636                 } else if ('A' <= c && c <= 'F') {
 637                         value = 16 * value + c - 'A' + 10;
 638                 } else if ('a' <= c && c <= 'f') {
 639                         value = 16 * value + c - 'a' + 10;
 640                 } else {
 641                         break;
 642                 }
 643                 next_char();
 644         }
 645
 646         return (char_type)value;
 647 }
 648
 649 static int parse_escape_sequence(void)
 650 {
 651         eat('\\');
 652
 653         int ec = c;
 654         next_char();
 655
 656         switch(ec) {
 657         case '"':  return '"';
 658         case '\'': return '\'';
 659         case '\\': return '\\';
 660         case '?': return '\?';
 661         case 'a': return '\a';
 662         case 'b': return '\b';
 663         case 'f': return '\f';
 664         case 'n': return '\n';
 665         case 'r': return '\r';
 666         case 't': return '\t';
 667         case 'v': return '\v';
 668         case 'x':
 669                 return parse_hex_sequence();
 670         case '0':
 671         case '1':
 672         case '2':
 673         case '3':
 674         case '4':
 675         case '5':
 676         case '6':
 677         case '7':
 678                 return parse_octal_sequence(ec);
 679         case EOF:
 680                 parse_error("reached end of file while parsing escape sequence");
 681                 return EOF;
 682         default:
 683                 parse_error("unknown escape sequence");
 684                 return EOF;
 685         }
 686 }
 687
 688 const char *concat_strings(const char *s1, const char *s2)
 689 {
 690         size_t  len1   = strlen(s1);
 691         size_t  len2   = strlen(s2);
 692
 693         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 694         memcpy(concat, s1, len1);
 695         memcpy(concat + len1, s2, len2 + 1);
 696
 697         const char *result = strset_insert(&stringset, concat);
 698         if(result != concat) {
 699                 obstack_free(&symbol_obstack, concat);
 700         }
 701
 702         return result;
 703 }
 704
 705 static void parse_string_literal(void)
 706 {
 707         unsigned    start_linenr = lexer_token.source_position.linenr;
 708         char       *string;
 709         const char *result;
 710
 711         assert(c == '"');
 712         next_char();
 713
 714         int tc;
 715         while(1) {
 716                 switch(c) {
 717                 case '\\':
 718                         tc = parse_escape_sequence();
 719                         obstack_1grow(&symbol_obstack, (char) tc);
 720                         break;
 721
 722                 case EOF:
 723                         error_prefix_at(lexer_token.source_position.input_name,
 724                                         start_linenr);
 725                         fprintf(stderr, "string has no end\n");
 726                         lexer_token.type = T_ERROR;
 727                         return;
 728
 729                 case '"':
 730                         next_char();
 731                         goto end_of_string;
 732
 733                 default:
 734                         obstack_1grow(&symbol_obstack, (char) c);
 735                         next_char();
 736                         break;
 737                 }
 738         }
 739
 740 end_of_string:
 741
 742         /* TODO: concatenate multiple strings separated by whitespace... */
 743
 744         /* add finishing 0 to the string */
 745         obstack_1grow(&symbol_obstack, '\0');
 746         string = obstack_finish(&symbol_obstack);
 747
 748         /* check if there is already a copy of the string */
 749         result = strset_insert(&stringset, string);
 750         if(result != string) {
 751                 obstack_free(&symbol_obstack, string);
 752         }
 753
 754         lexer_token.type     = T_STRING_LITERAL;
 755         lexer_token.v.string = result;
 756 }
 757
 758 static void parse_wide_character_constant(void)
 759 {
 760         eat('\'');
 761
 762         int found_char = 0;
 763         while(1) {
 764                 switch(c) {
 765                 case '\\':
 766                         found_char = parse_escape_sequence();
 767                         break;
 768
 769                 MATCH_NEWLINE(
 770                         parse_error("newline while parsing character constant");
 771                         break;
 772                 )
 773
 774                 case '\'':
 775                         next_char();
 776                         goto end_of_wide_char_constant;
 777
 778                 case EOF:
 779                         parse_error("EOF while parsing character constant");
 780                         lexer_token.type = T_ERROR;
 781                         return;
 782
 783                 default:
 784                         if(found_char != 0) {
 785                                 parse_error("more than 1 characters in character "
 786                                             "constant");
 787                                 goto end_of_wide_char_constant;
 788                         } else {
 789                                 found_char = c;
 790                                 next_char();
 791                         }
 792                         break;
 793                 }
 794         }
 795
 796 end_of_wide_char_constant:
 797         lexer_token.type       = T_INTEGER;
 798         lexer_token.v.intvalue = found_char;
 799         lexer_token.datatype   = type_wchar_t;
 800 }
 801
 802 static void parse_wide_string_literal(void)
 803 {
 804         const unsigned start_linenr = lexer_token.source_position.linenr;
 805
 806         assert(c == '"');
 807         next_char();
 808
 809         while(1) {
 810                 switch(c) {
 811                         case '\\': {
 812                                 wchar_rep_t tc = parse_escape_sequence();
 813                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 814                                 break;
 815                         }
 816
 817                         case EOF:
 818                                 error_prefix_at(lexer_token.source_position.input_name,
 819                                                 start_linenr);
 820                                 fprintf(stderr, "string has no end\n");
 821                                 lexer_token.type = T_ERROR;
 822                                 return;
 823
 824                         case '"':
 825                                 next_char();
 826                                 goto end_of_string;
 827
 828                         default: {
 829                                 wchar_rep_t tc = c;
 830                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 831                                 next_char();
 832                                 break;
 833                         }
 834                 }
 835         }
 836
 837 end_of_string:;
 838
 839         /* TODO: concatenate multiple strings separated by whitespace... */
 840
 841         /* add finishing 0 to the string */
 842         wchar_rep_t nul = L'\0';
 843         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
 844         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
 845         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
 846
 847 #if 0 /* TODO hash */
 848         /* check if there is already a copy of the string */
 849         const wchar_rep_t *const result = strset_insert(&stringset, string);
 850         if(result != string) {
 851                 obstack_free(&symbol_obstack, string);
 852         }
 853 #else
 854         const wchar_rep_t *const result = string;
 855 #endif
 856
 857         lexer_token.type                = T_WIDE_STRING_LITERAL;
 858         lexer_token.v.wide_string.begin = result;
 859         lexer_token.v.wide_string.size  = size;
 860 }
 861
 862 static void parse_character_constant(void)
 863 {
 864         eat('\'');
 865
 866         int found_char = 0;
 867         while(1) {
 868                 switch(c) {
 869                 case '\\':
 870                         found_char = parse_escape_sequence();
 871                         break;
 872
 873                 MATCH_NEWLINE(
 874                         parse_error("newline while parsing character constant");
 875                         break;
 876                 )
 877
 878                 case '\'':
 879                         next_char();
 880                         goto end_of_char_constant;
 881
 882                 case EOF:
 883                         parse_error("EOF while parsing character constant");
 884                         lexer_token.type = T_ERROR;
 885                         return;
 886
 887                 default:
 888                         if(found_char != 0) {
 889                                 parse_error("more than 1 characters in character "
 890                                             "constant");
 891                                 goto end_of_char_constant;
 892                         } else {
 893                                 found_char = c;
 894                                 next_char();
 895                         }
 896                         break;
 897                 }
 898         }
 899
 900 end_of_char_constant:
 901         lexer_token.type       = T_INTEGER;
 902         lexer_token.v.intvalue = found_char;
 903         lexer_token.datatype   = type_int;
 904 }
 905
 906 static void skip_multiline_comment(void)
 907 {
 908         unsigned start_linenr = lexer_token.source_position.linenr;
 909
 910         while(1) {
 911                 switch(c) {
 912                 case '*':
 913                         next_char();
 914                         if(c == '/') {
 915                                 next_char();
 916                                 return;
 917                         }
 918                         break;
 919
 920                 MATCH_NEWLINE(break;)
 921
 922                 case EOF:
 923                         error_prefix_at(lexer_token.source_position.input_name,
 924                                         start_linenr);
 925                         fprintf(stderr, "at end of file while looking for comment end\n");
 926                         return;
 927
 928                 default:
 929                         next_char();
 930                         break;
 931                 }
 932         }
 933 }
 934
 935 static void skip_line_comment(void)
 936 {
 937         while(1) {
 938                 switch(c) {
 939                 case EOF:
 940                         return;
 941
 942                 case '\n':
 943                 case '\r':
 944                         return;
 945
 946                 default:
 947                         next_char();
 948                         break;
 949                 }
 950         }
 951 }
 952
 953 static token_t pp_token;
 954
 955 static inline void next_pp_token(void)
 956 {
 957         lexer_next_preprocessing_token();
 958         pp_token = lexer_token;
 959 }
 960
 961 static void eat_until_newline(void)
 962 {
 963         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 964                 next_pp_token();
 965         }
 966 }
 967
 968 static void error_directive(void)
 969 {
 970         error_prefix();
 971         fprintf(stderr, "#error directive: \n");
 972
 973         /* parse pp-tokens until new-line */
 974 }
 975
 976 static void define_directive(void)
 977 {
 978         lexer_next_preprocessing_token();
 979         if(lexer_token.type != T_IDENTIFIER) {
 980                 parse_error("expected identifier after #define\n");
 981                 eat_until_newline();
 982         }
 983 }
 984
 985 static void ifdef_directive(int is_ifndef)
 986 {
 987         (void) is_ifndef;
 988         lexer_next_preprocessing_token();
 989         //expect_identifier();
 990         //extect_newline();
 991 }
 992
 993 static void endif_directive(void)
 994 {
 995         //expect_newline();
 996 }
 997
 998 static void parse_line_directive(void)
 999 {
1000         if(pp_token.type != T_INTEGER) {
1001                 parse_error("expected integer");
1002         } else {
1003                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1004                 next_pp_token();
1005         }
1006         if(pp_token.type == T_STRING_LITERAL) {
1007                 lexer_token.source_position.input_name = pp_token.v.string;
1008                 next_pp_token();
1009         }
1010
1011         eat_until_newline();
1012 }
1013
1014 static void parse_preprocessor_identifier(void)
1015 {
1016         assert(pp_token.type == T_IDENTIFIER);
1017         symbol_t *symbol = pp_token.v.symbol;
1018
1019         switch(symbol->pp_ID) {
1020         case TP_include:
1021                 printf("include - enable header name parsing!\n");
1022                 break;
1023         case TP_define:
1024                 define_directive();
1025                 break;
1026         case TP_ifdef:
1027                 ifdef_directive(0);
1028                 break;
1029         case TP_ifndef:
1030                 ifdef_directive(1);
1031                 break;
1032         case TP_endif:
1033                 endif_directive();
1034                 break;
1035         case TP_line:
1036                 next_pp_token();
1037                 parse_line_directive();
1038                 break;
1039         case TP_if:
1040         case TP_else:
1041         case TP_elif:
1042         case TP_undef:
1043         case TP_error:
1044                 error_directive();
1045                 break;
1046         case TP_pragma:
1047                 break;
1048         }
1049 }
1050
1051 static void parse_preprocessor_directive(void)
1052 {
1053         next_pp_token();
1054
1055         switch(pp_token.type) {
1056         case T_IDENTIFIER:
1057                 parse_preprocessor_identifier();
1058                 break;
1059         case T_INTEGER:
1060                 parse_line_directive();
1061                 break;
1062         default:
1063                 parse_error("invalid preprocessor directive");
1064                 eat_until_newline();
1065                 break;
1066         }
1067 }
1068
/*
 * Helper macros for lexing punctuators that may be continued by further
 * characters. They are meant to be used inside a case of a switch over c,
 * in a function returning void:
 *
 *   MAYBE_PROLOG        consumes the current character and opens a loop
 *                       with a nested switch over the following character
 *   MAYBE(chr, tok)     if the following character is chr: consume it, set
 *                       the token type to tok and return
 *   ELSE_CODE(stmts)    default branch running stmts; closes the nested
 *                       switch and the loop, then breaks out of the
 *                       construct enclosing the macro use
 *   ELSE(tok)           default branch that sets the token type to tok and
 *                       returns, leaving the following character unread
 */
#define MAYBE_PROLOG                 \
        next_char();                 \
        while(1) {                   \
                switch(c) {

#define MAYBE(chr, tok)                          \
                case chr:                        \
                        next_char();             \
                        lexer_token.type = tok;  \
                        return;

#define ELSE_CODE(stmts)             \
                default:             \
                        stmts;       \
                }                    \
        } /* end of while(1) */      \
        break;

#define ELSE(tok)                        \
        ELSE_CODE(                       \
                lexer_token.type = tok;  \
                return;                  \
        )
1092
1093 void lexer_next_preprocessing_token(void)
1094 {
1095         while(1) {
1096                 switch(c) {
1097                 case ' ':
1098                 case '\t':
1099                         next_char();
1100                         break;
1101
1102                 MATCH_NEWLINE(
1103                         lexer_token.type = '\n';
1104                         return;
1105                 )
1106
1107                 SYMBOL_CHARS
1108                         parse_symbol();
1109                         /* might be a wide string ( L"string" ) */
1110                         if(lexer_token.type == T_IDENTIFIER &&
1111                             lexer_token.v.symbol == symbol_L) {
1112                             if(c == '"') {
1113                                         parse_wide_string_literal();
1114                                 } else if(c == '\'') {
1115                                         parse_wide_character_constant();
1116                                 }
1117                         }
1118                         return;
1119
1120                 DIGITS
1121                         parse_number();
1122                         return;
1123
1124                 case '"':
1125                         parse_string_literal();
1126                         return;
1127
1128                 case '\'':
1129                         parse_character_constant();
1130                         return;
1131
1132                 case '.':
1133                         MAYBE_PROLOG
1134                                 case '.':
1135                                         MAYBE_PROLOG
1136                                         MAYBE('.', T_DOTDOTDOT)
1137                                         ELSE_CODE(
1138                                                 put_back(c);
1139                                                 c = '.';
1140                                                 lexer_token.type = '.';
1141                                                 return;
1142                                         )
1143                         ELSE('.')
1144                 case '&':
1145                         MAYBE_PROLOG
1146                         MAYBE('&', T_ANDAND)
1147                         MAYBE('=', T_ANDEQUAL)
1148                         ELSE('&')
1149                 case '*':
1150                         MAYBE_PROLOG
1151                         MAYBE('=', T_ASTERISKEQUAL)
1152                         ELSE('*')
1153                 case '+':
1154                         MAYBE_PROLOG
1155                         MAYBE('+', T_PLUSPLUS)
1156                         MAYBE('=', T_PLUSEQUAL)
1157                         ELSE('+')
1158                 case '-':
1159                         MAYBE_PROLOG
1160                         MAYBE('>', T_MINUSGREATER)
1161                         MAYBE('-', T_MINUSMINUS)
1162                         MAYBE('=', T_MINUSEQUAL)
1163                         ELSE('-')
1164                 case '!':
1165                         MAYBE_PROLOG
1166                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1167                         ELSE('!')
1168                 case '/':
1169                         MAYBE_PROLOG
1170                         MAYBE('=', T_SLASHEQUAL)
1171                                 case '*':
1172                                         next_char();
1173                                         skip_multiline_comment();
1174                                         lexer_next_preprocessing_token();
1175                                         return;
1176                                 case '/':
1177                                         next_char();
1178                                         skip_line_comment();
1179                                         lexer_next_preprocessing_token();
1180                                         return;
1181                         ELSE('/')
1182                 case '%':
1183                         MAYBE_PROLOG
1184                         MAYBE('>', T_PERCENTGREATER)
1185                         MAYBE('=', T_PERCENTEQUAL)
1186                                 case ':':
1187                                         MAYBE_PROLOG
1188                                                 case '%':
1189                                                         MAYBE_PROLOG
1190                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1191                                                         ELSE_CODE(
1192                                                                 put_back(c);
1193                                                                 c = '%';
1194                                                                 lexer_token.type = T_PERCENTCOLON;
1195                                                                 return;
1196                                                         )
1197                                         ELSE(T_PERCENTCOLON)
1198                         ELSE('%')
1199                 case '<':
1200                         MAYBE_PROLOG
1201                         MAYBE(':', T_LESSCOLON)
1202                         MAYBE('%', T_LESSPERCENT)
1203                         MAYBE('=', T_LESSEQUAL)
1204                                 case '<':
1205                                         MAYBE_PROLOG
1206                                         MAYBE('=', T_LESSLESSEQUAL)
1207                                         ELSE(T_LESSLESS)
1208                         ELSE('<')
1209                 case '>':
1210                         MAYBE_PROLOG
1211                         MAYBE('=', T_GREATEREQUAL)
1212                                 case '>':
1213                                         MAYBE_PROLOG
1214                                         MAYBE('=', T_GREATERGREATEREQUAL)
1215                                         ELSE(T_GREATERGREATER)
1216                         ELSE('>')
1217                 case '^':
1218                         MAYBE_PROLOG
1219                         MAYBE('=', T_CARETEQUAL)
1220                         ELSE('^')
1221                 case '|':
1222                         MAYBE_PROLOG
1223                         MAYBE('=', T_PIPEEQUAL)
1224                         MAYBE('|', T_PIPEPIPE)
1225                         ELSE('|')
1226                 case ':':
1227                         MAYBE_PROLOG
1228                         MAYBE('>', T_COLONGREATER)
1229                         ELSE(':')
1230                 case '=':
1231                         MAYBE_PROLOG
1232                         MAYBE('=', T_EQUALEQUAL)
1233                         ELSE('=')
1234                 case '#':
1235                         MAYBE_PROLOG
1236                         MAYBE('#', T_HASHHASH)
1237                         ELSE('#')
1238
1239                 case '?':
1240                 case '[':
1241                 case ']':
1242                 case '(':
1243                 case ')':
1244                 case '{':
1245                 case '}':
1246                 case '~':
1247                 case ';':
1248                 case ',':
1249                 case '\\':
1250                         lexer_token.type = c;
1251                         next_char();
1252                         return;
1253
1254                 case EOF:
1255                         lexer_token.type = T_EOF;
1256                         return;
1257
1258                 default:
1259                         next_char();
1260                         error_prefix();
1261                         fprintf(stderr, "unknown character '%c' found\n", c);
1262                         lexer_token.type = T_ERROR;
1263                         return;
1264                 }
1265         }
1266 }
1267
1268 void lexer_next_token(void)
1269 {
1270         lexer_next_preprocessing_token();
1271         if(lexer_token.type != '\n')
1272                 return;
1273
1274 newline_found:
1275         do {
1276                 lexer_next_preprocessing_token();
1277         } while(lexer_token.type == '\n');
1278
1279         if(lexer_token.type == '#') {
1280                 parse_preprocessor_directive();
1281                 goto newline_found;
1282         }
1283 }
1284
1285 void init_lexer(void)
1286 {
1287         strset_init(&stringset);
1288
1289         type_int       = make_atomic_type(ATOMIC_TYPE_INT, TYPE_QUALIFIER_NONE);
1290         type_uint      = make_atomic_type(ATOMIC_TYPE_UINT, TYPE_QUALIFIER_NONE);
1291         type_long      = make_atomic_type(ATOMIC_TYPE_LONG, TYPE_QUALIFIER_NONE);
1292         type_ulong     = make_atomic_type(ATOMIC_TYPE_ULONG, TYPE_QUALIFIER_NONE);
1293         type_longlong  = make_atomic_type(ATOMIC_TYPE_LONGLONG,
1294                                           TYPE_QUALIFIER_NONE);
1295         type_ulonglong = make_atomic_type(ATOMIC_TYPE_ULONGLONG,
1296                                           TYPE_QUALIFIER_NONE);
1297
1298         type_float      = make_atomic_type(ATOMIC_TYPE_FLOAT, TYPE_QUALIFIER_CONST);
1299         type_double     = make_atomic_type(ATOMIC_TYPE_DOUBLE,
1300                                            TYPE_QUALIFIER_CONST);
1301         type_longdouble = make_atomic_type(ATOMIC_TYPE_LONG_DOUBLE,
1302                                            TYPE_QUALIFIER_CONST);
1303 }
1304
1305 void lexer_open_stream(FILE *stream, const char *input_name)
1306 {
1307         input                                  = stream;
1308         lexer_token.source_position.linenr     = 0;
1309         lexer_token.source_position.input_name = input_name;
1310
1311         symbol_L = symbol_table_insert("L");
1312
1313         /* place a virtual \n at the beginning so the lexer knows that we're
1314          * at the beginning of a line */
1315         c = '\n';
1316 }
1317
1318 void exit_lexer(void)
1319 {
1320         strset_destroy(&stringset);
1321 }
1322
1323 static __attribute__((unused))
1324 void dbg_pos(const source_position_t source_position)
1325 {
1326         fprintf(stdout, "%s:%u\n", source_position.input_name,
1327                 source_position.linenr);
1328         fflush(stdout);
1329 }