nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "diagnostic.h"
   4 #include "lexer.h"
   5 #include "token_t.h"
   6 #include "symbol_table_t.h"
   7 #include "adt/error.h"
   8 #include "adt/strset.h"
   9 #include "adt/util.h"
  10 #include "types.h"
  11 #include "type_t.h"
  12 #include "target_architecture.h"
  13 #include "parser.h"
  14 #include "warning.h"
  15
  16 #include <assert.h>
  17 #include <errno.h>
  18 #include <string.h>
  19 #include <stdbool.h>
  20 #include <ctype.h>
  21
  22 //#define DEBUG_CHARS
  23 #define MAX_PUTBACK 3
  24
  25 #ifdef _WIN32
  26 /* No strtold on windows and no replacement yet */
  27 #define strtold(s, e) strtod(s, e)
  28 #endif
  29
  30 #if defined HAS_SIGNED_CHAR
  31 typedef signed char char_type;
  32 #elif defined HAS_UNSIGNED_CHAR
  33 typedef unsigned char char_type;
  34 #else
  35 #       error signedness of char not determined
  36 #endif
  37
  38 static int         c;
  39 token_t            lexer_token;
  40 symbol_t          *symbol_L;
  41 static FILE       *input;
  42 static char        buf[1024 + MAX_PUTBACK];
  43 static const char *bufend;
  44 static const char *bufpos;
  45 static strset_t    stringset;
  46
  47 /**
  48  * Print an error prefix at the given coordinates.
  49  *
  50  * @param input_name   the input file name
  51  * @param linenr       the line number
  52  */
  53 static void error_prefix_at(const char *input_name, unsigned linenr)
  54 {
  55         fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
  56 }
  57
  58 /**
  59  * Print an error prefix at the current token coordinates.
  60  */
  61 static void error_prefix(void)
  62 {
  63         error_prefix_at(lexer_token.source_position.input_name,
  64                         lexer_token.source_position.linenr);
  65 }
  66
  67 /**
  68  * Prints a parse error message at the current token.
  69  *
  70  * @param msg   the error message
  71  */
  72 static void parse_error(const char *msg)
  73 {
  74         error_prefix();
  75         fprintf(stderr, "%s\n", msg);
  76 }
  77
  78 static inline void next_real_char(void)
  79 {
  80         assert(bufpos <= bufend);
  81         if (bufpos >= bufend) {
  82                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  83                                  input);
  84                 if(s == 0) {
  85                         c = EOF;
  86                         return;
  87                 }
  88                 bufpos = buf + MAX_PUTBACK;
  89                 bufend = buf + MAX_PUTBACK + s;
  90         }
  91         c = *bufpos++;
  92 }
  93
  94 static inline void put_back(int pc)
  95 {
  96         assert(bufpos > buf);
  97         *(--bufpos - buf + buf) = (char) pc;
  98
  99 #ifdef DEBUG_CHARS
 100         printf("putback '%c'\n", pc);
 101 #endif
 102 }
 103
 104 static inline void next_char(void);
 105
 106 #define MATCH_NEWLINE(code)                   \
 107         case '\r':                                \
 108                 next_char();                          \
 109                 if(c == '\n') {                       \
 110                         next_char();                      \
 111                 }                                     \
 112                 lexer_token.source_position.linenr++; \
 113                 code                                  \
 114         case '\n':                                \
 115                 next_char();                          \
 116                 lexer_token.source_position.linenr++; \
 117                 code
 118
 119 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 120
 121 static void maybe_concat_lines(void)
 122 {
 123         eat('\\');
 124
 125         switch(c) {
 126         MATCH_NEWLINE(return;)
 127
 128         default:
 129                 break;
 130         }
 131
 132         put_back(c);
 133         c = '\\';
 134 }
 135
 136 static inline void next_char(void)
 137 {
 138         next_real_char();
 139
 140         /* filter trigraphs */
 141         if(UNLIKELY(c == '\\')) {
 142                 maybe_concat_lines();
 143                 goto end_of_next_char;
 144         }
 145
 146         if(LIKELY(c != '?'))
 147                 goto end_of_next_char;
 148
 149         next_real_char();
 150         if(LIKELY(c != '?')) {
 151                 put_back(c);
 152                 c = '?';
 153                 goto end_of_next_char;
 154         }
 155
 156         next_real_char();
 157         switch(c) {
 158         case '=': c = '#'; break;
 159         case '(': c = '['; break;
 160         case '/': c = '\\'; maybe_concat_lines(); break;
 161         case ')': c = ']'; break;
 162         case '\'': c = '^'; break;
 163         case '<': c = '{'; break;
 164         case '!': c = '|'; break;
 165         case '>': c = '}'; break;
 166         case '-': c = '~'; break;
 167         default:
 168                 put_back(c);
 169                 put_back('?');
 170                 c = '?';
 171                 break;
 172         }
 173
 174 end_of_next_char:;
 175 #ifdef DEBUG_CHARS
 176         printf("nchar '%c'\n", c);
 177 #endif
 178 }
 179
 180 #define SYMBOL_CHARS  \
 181         case 'a':         \
 182         case 'b':         \
 183         case 'c':         \
 184         case 'd':         \
 185         case 'e':         \
 186         case 'f':         \
 187         case 'g':         \
 188         case 'h':         \
 189         case 'i':         \
 190         case 'j':         \
 191         case 'k':         \
 192         case 'l':         \
 193         case 'm':         \
 194         case 'n':         \
 195         case 'o':         \
 196         case 'p':         \
 197         case 'q':         \
 198         case 'r':         \
 199         case 's':         \
 200         case 't':         \
 201         case 'u':         \
 202         case 'v':         \
 203         case 'w':         \
 204         case 'x':         \
 205         case 'y':         \
 206         case 'z':         \
 207         case 'A':         \
 208         case 'B':         \
 209         case 'C':         \
 210         case 'D':         \
 211         case 'E':         \
 212         case 'F':         \
 213         case 'G':         \
 214         case 'H':         \
 215         case 'I':         \
 216         case 'J':         \
 217         case 'K':         \
 218         case 'L':         \
 219         case 'M':         \
 220         case 'N':         \
 221         case 'O':         \
 222         case 'P':         \
 223         case 'Q':         \
 224         case 'R':         \
 225         case 'S':         \
 226         case 'T':         \
 227         case 'U':         \
 228         case 'V':         \
 229         case 'W':         \
 230         case 'X':         \
 231         case 'Y':         \
 232         case 'Z':         \
 233         case '_':
 234
 235 #define DIGITS        \
 236         case '0':         \
 237         case '1':         \
 238         case '2':         \
 239         case '3':         \
 240         case '4':         \
 241         case '5':         \
 242         case '6':         \
 243         case '7':         \
 244         case '8':         \
 245         case '9':
 246
 247 static void parse_symbol(void)
 248 {
 249         symbol_t *symbol;
 250         char     *string;
 251
 252         obstack_1grow(&symbol_obstack, (char) c);
 253         next_char();
 254
 255         while(1) {
 256                 switch(c) {
 257                 DIGITS
 258                 SYMBOL_CHARS
 259                         obstack_1grow(&symbol_obstack, (char) c);
 260                         next_char();
 261                         break;
 262
 263                 default:
 264                         goto end_symbol;
 265                 }
 266         }
 267
 268 end_symbol:
 269         obstack_1grow(&symbol_obstack, '\0');
 270
 271         string = obstack_finish(&symbol_obstack);
 272         symbol = symbol_table_insert(string);
 273
 274         lexer_token.type     = symbol->ID;
 275         lexer_token.v.symbol = symbol;
 276
 277         if(symbol->string != string) {
 278                 obstack_free(&symbol_obstack, string);
 279         }
 280 }
 281
 282 static void parse_integer_suffix(bool is_oct_hex)
 283 {
 284         bool is_unsigned  = false;
 285         bool min_long     = false;
 286         bool min_longlong = false;
 287
 288         if(c == 'U' || c == 'u') {
 289                 is_unsigned = true;
 290                 next_char();
 291                 if(c == 'L' || c == 'l') {
 292                         min_long = true;
 293                         next_char();
 294                         if(c == 'L' || c == 'l') {
 295                                 min_longlong = true;
 296                                 next_char();
 297                         }
 298                 }
 299         } else if(c == 'l' || c == 'L') {
 300                 min_long = true;
 301                 next_char();
 302                 if(c == 'l' || c == 'L') {
 303                         min_longlong = true;
 304                         next_char();
 305                         if(c == 'u' || c == 'U') {
 306                                 is_unsigned = true;
 307                                 next_char();
 308                         }
 309                 } else if(c == 'u' || c == 'U') {
 310                         is_unsigned = true;
 311                         next_char();
 312                         lexer_token.datatype = type_unsigned_long;
 313                 }
 314         }
 315
 316         if(!is_unsigned) {
 317                 long long v = lexer_token.v.intvalue;
 318                 if(!min_long) {
 319                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 320                                 lexer_token.datatype = type_int;
 321                                 return;
 322                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 323                                 lexer_token.datatype = type_unsigned_int;
 324                                 return;
 325                         }
 326                 }
 327                 if(!min_longlong) {
 328                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 329                                 lexer_token.datatype = type_long;
 330                                 return;
 331                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
 332                                 lexer_token.datatype = type_unsigned_long;
 333                                 return;
 334                         }
 335                 }
 336                 unsigned long long uv = (unsigned long long) v;
 337                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 338                         lexer_token.datatype = type_unsigned_long_long;
 339                         return;
 340                 }
 341
 342                 lexer_token.datatype = type_long_long;
 343         } else {
 344                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 345                 if(!min_long && v <= TARGET_UINT_MAX) {
 346                         lexer_token.datatype = type_unsigned_int;
 347                         return;
 348                 }
 349                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
 350                         lexer_token.datatype = type_unsigned_long;
 351                         return;
 352                 }
 353                 lexer_token.datatype = type_unsigned_long_long;
 354         }
 355 }
 356
 357 static void parse_floating_suffix(void)
 358 {
 359         switch(c) {
 360         /* TODO: do something usefull with the suffixes... */
 361         case 'f':
 362         case 'F':
 363                 next_char();
 364                 lexer_token.datatype = type_float;
 365                 break;
 366         case 'l':
 367         case 'L':
 368                 next_char();
 369                 lexer_token.datatype = type_long_double;
 370                 break;
 371         default:
 372                 lexer_token.datatype = type_double;
 373                 break;
 374         }
 375 }
 376
 377 /**
 378  * A replacement for strtoull. Only those parts needed for
 379  * our parser are implemented.
 380  */
 381 static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
 382         unsigned long long v = 0;
 383
 384         switch (base) {
 385         case 16:
 386                 for (;; ++s) {
 387                         /* check for overrun */
 388                         if (v >= 0x1000000000000000ULL)
 389                                 break;
 390                         switch (tolower(*s)) {
 391                         case '0': v <<= 4; break;
 392                         case '1': v <<= 4; v |= 0x1; break;
 393                         case '2': v <<= 4; v |= 0x2; break;
 394                         case '3': v <<= 4; v |= 0x3; break;
 395                         case '4': v <<= 4; v |= 0x4; break;
 396                         case '5': v <<= 4; v |= 0x5; break;
 397                         case '6': v <<= 4; v |= 0x6; break;
 398                         case '7': v <<= 4; v |= 0x7; break;
 399                         case '8': v <<= 4; v |= 0x8; break;
 400                         case '9': v <<= 4; v |= 0x9; break;
 401                         case 'a': v <<= 4; v |= 0xa; break;
 402                         case 'b': v <<= 4; v |= 0xb; break;
 403                         case 'c': v <<= 4; v |= 0xc; break;
 404                         case 'd': v <<= 4; v |= 0xd; break;
 405                         case 'e': v <<= 4; v |= 0xe; break;
 406                         case 'f': v <<= 4; v |= 0xf; break;
 407                         default:
 408                                 goto end;
 409                         }
 410                 }
 411                 break;
 412         case 8:
 413                 for (;; ++s) {
 414                         /* check for overrun */
 415                         if (v >= 0x2000000000000000ULL)
 416                                 break;
 417                         switch (tolower(*s)) {
 418                         case '0': v <<= 3; break;
 419                         case '1': v <<= 3; v |= 1; break;
 420                         case '2': v <<= 3; v |= 2; break;
 421                         case '3': v <<= 3; v |= 3; break;
 422                         case '4': v <<= 3; v |= 4; break;
 423                         case '5': v <<= 3; v |= 5; break;
 424                         case '6': v <<= 3; v |= 6; break;
 425                         case '7': v <<= 3; v |= 7; break;
 426                         default:
 427                                 goto end;
 428                         }
 429                 }
 430                 break;
 431         case 10:
 432                 for (;; ++s) {
 433                         /* check for overrun */
 434                         if (v > 0x1999999999999999ULL)
 435                                 break;
 436                         switch (tolower(*s)) {
 437                         case '0': v *= 10; break;
 438                         case '1': v *= 10; v += 1; break;
 439                         case '2': v *= 10; v += 2; break;
 440                         case '3': v *= 10; v += 3; break;
 441                         case '4': v *= 10; v += 4; break;
 442                         case '5': v *= 10; v += 5; break;
 443                         case '6': v *= 10; v += 6; break;
 444                         case '7': v *= 10; v += 7; break;
 445                         case '8': v *= 10; v += 8; break;
 446                         case '9': v *= 10; v += 9; break;
 447                         default:
 448                                 goto end;
 449                         }
 450                 }
 451                 break;
 452         default:
 453                 assert(0);
 454                 break;
 455         }
 456 end:
 457         *endptr = s;
 458         return v;
 459 }
 460
 461 static void parse_number_hex(void)
 462 {
 463         assert(c == 'x' || c == 'X');
 464         next_char();
 465
 466         while(isxdigit(c)) {
 467                 obstack_1grow(&symbol_obstack, (char) c);
 468                 next_char();
 469         }
 470         obstack_1grow(&symbol_obstack, '\0');
 471         char *string = obstack_finish(&symbol_obstack);
 472
 473         if(c == '.' || c == 'p' || c == 'P') {
 474                 next_char();
 475                 panic("Hex floating point numbers not implemented yet");
 476         }
 477         if(*string == '\0') {
 478                 parse_error("invalid hex number");
 479                 lexer_token.type = T_ERROR;
 480         }
 481
 482         const char *endptr;
 483         lexer_token.type       = T_INTEGER;
 484         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
 485         if(*endptr != '\0') {
 486                 parse_error("hex number literal too long");
 487         }
 488
 489         obstack_free(&symbol_obstack, string);
 490         parse_integer_suffix(true);
 491 }
 492
 493 static inline bool is_octal_digit(int chr)
 494 {
 495         return '0' <= chr && chr <= '7';
 496 }
 497
 498 static void parse_number_oct(void)
 499 {
 500         while(is_octal_digit(c)) {
 501                 obstack_1grow(&symbol_obstack, (char) c);
 502                 next_char();
 503         }
 504         obstack_1grow(&symbol_obstack, '\0');
 505         char *string = obstack_finish(&symbol_obstack);
 506
 507         const char *endptr;
 508         lexer_token.type       = T_INTEGER;
 509         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 510         if(*endptr != '\0') {
 511                 parse_error("octal number literal too long");
 512         }
 513
 514         obstack_free(&symbol_obstack, string);
 515         parse_integer_suffix(true);
 516 }
 517
 518 static void parse_number_dec(void)
 519 {
 520         bool is_float = false;
 521         while(isdigit(c)) {
 522                 obstack_1grow(&symbol_obstack, (char) c);
 523                 next_char();
 524         }
 525
 526         if(c == '.') {
 527                 obstack_1grow(&symbol_obstack, '.');
 528                 next_char();
 529
 530                 while(isdigit(c)) {
 531                         obstack_1grow(&symbol_obstack, (char) c);
 532                         next_char();
 533                 }
 534                 is_float = true;
 535         }
 536         if(c == 'e' || c == 'E') {
 537                 obstack_1grow(&symbol_obstack, 'e');
 538                 next_char();
 539
 540                 if(c == '-' || c == '+') {
 541                         obstack_1grow(&symbol_obstack, (char) c);
 542                         next_char();
 543                 }
 544
 545                 while(isdigit(c)) {
 546                         obstack_1grow(&symbol_obstack, (char) c);
 547                         next_char();
 548                 }
 549                 is_float = true;
 550         }
 551
 552         obstack_1grow(&symbol_obstack, '\0');
 553         char *string = obstack_finish(&symbol_obstack);
 554
 555         if(is_float) {
 556                 char *endptr;
 557                 lexer_token.type         = T_FLOATINGPOINT;
 558                 lexer_token.v.floatvalue = strtold(string, &endptr);
 559
 560                 if(*endptr != '\0') {
 561                         parse_error("invalid number literal");
 562                 }
 563
 564                 parse_floating_suffix();
 565         } else {
 566                 const char *endptr;
 567                 lexer_token.type       = T_INTEGER;
 568                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 569
 570                 if(*endptr != '\0') {
 571                         parse_error("invalid number literal");
 572                 }
 573
 574                 parse_integer_suffix(false);
 575         }
 576         obstack_free(&symbol_obstack, string);
 577 }
 578
 579 static void parse_number(void)
 580 {
 581         if (c == '0') {
 582                 next_char();
 583                 switch (c) {
 584                         case 'X':
 585                         case 'x':
 586                                 parse_number_hex();
 587                                 break;
 588                         case '0':
 589                         case '1':
 590                         case '2':
 591                         case '3':
 592                         case '4':
 593                         case '5':
 594                         case '6':
 595                         case '7':
 596                                 parse_number_oct();
 597                                 break;
 598                         case '8':
 599                         case '9':
 600                                 next_char();
 601                                 parse_error("invalid octal number");
 602                                 lexer_token.type = T_ERROR;
 603                                 return;
 604                         case '.':
 605                         case 'e':
 606                         case 'E':
 607                         default:
 608                                 obstack_1grow(&symbol_obstack, '0');
 609                                 parse_number_dec();
 610                                 return;
 611                 }
 612         } else {
 613                 parse_number_dec();
 614         }
 615 }
 616
 617 static int parse_octal_sequence(const int first_digit)
 618 {
 619         assert(is_octal_digit(first_digit));
 620         int value = first_digit - '0';
 621         if (!is_octal_digit(c)) return value;
 622         value = 8 * value + c - '0';
 623         next_char();
 624         if (!is_octal_digit(c)) return value;
 625         value = 8 * value + c - '0';
 626         next_char();
 627         return (char_type)value;
 628 }
 629
 630 static int parse_hex_sequence(void)
 631 {
 632         int value = 0;
 633         while(1) {
 634                 if (c >= '0' && c <= '9') {
 635                         value = 16 * value + c - '0';
 636                 } else if ('A' <= c && c <= 'F') {
 637                         value = 16 * value + c - 'A' + 10;
 638                 } else if ('a' <= c && c <= 'f') {
 639                         value = 16 * value + c - 'a' + 10;
 640                 } else {
 641                         break;
 642                 }
 643                 next_char();
 644         }
 645
 646         return (char_type)value;
 647 }
 648
 649 static int parse_escape_sequence(void)
 650 {
 651         eat('\\');
 652
 653         int ec = c;
 654         next_char();
 655
 656         switch(ec) {
 657         case '"':  return '"';
 658         case '\'': return '\'';
 659         case '\\': return '\\';
 660         case '?': return '\?';
 661         case 'a': return '\a';
 662         case 'b': return '\b';
 663         case 'f': return '\f';
 664         case 'n': return '\n';
 665         case 'r': return '\r';
 666         case 't': return '\t';
 667         case 'v': return '\v';
 668         case 'x':
 669                 return parse_hex_sequence();
 670         case '0':
 671         case '1':
 672         case '2':
 673         case '3':
 674         case '4':
 675         case '5':
 676         case '6':
 677         case '7':
 678                 return parse_octal_sequence(ec);
 679         case EOF:
 680                 parse_error("reached end of file while parsing escape sequence");
 681                 return EOF;
 682         default:
 683                 parse_error("unknown escape sequence");
 684                 return EOF;
 685         }
 686 }
 687
 688 string_t concat_strings(const string_t *const s1, const string_t *const s2)
 689 {
 690         const size_t len1 = s1->size - 1;
 691         const size_t len2 = s2->size - 1;
 692
 693         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 694         memcpy(concat, s1->begin, len1);
 695         memcpy(concat + len1, s2->begin, len2 + 1);
 696
 697 #if 0 /* TODO hash */
 698         const char *result = strset_insert(&stringset, concat);
 699         if(result != concat) {
 700                 obstack_free(&symbol_obstack, concat);
 701         }
 702
 703         return result;
 704 #else
 705         return (string_t){ concat, len1 + len2 + 1 };
 706 #endif
 707 }
 708
 709 static void parse_string_literal(void)
 710 {
 711         const unsigned start_linenr = lexer_token.source_position.linenr;
 712
 713         assert(c == '"');
 714         next_char();
 715
 716         int tc;
 717         while(1) {
 718                 switch(c) {
 719                 case '\\':
 720                         tc = parse_escape_sequence();
 721                         obstack_1grow(&symbol_obstack, (char) tc);
 722                         break;
 723
 724                 case EOF:
 725                         error_prefix_at(lexer_token.source_position.input_name,
 726                                         start_linenr);
 727                         fprintf(stderr, "string has no end\n");
 728                         lexer_token.type = T_ERROR;
 729                         return;
 730
 731                 case '"':
 732                         next_char();
 733                         goto end_of_string;
 734
 735                 default:
 736                         obstack_1grow(&symbol_obstack, (char) c);
 737                         next_char();
 738                         break;
 739                 }
 740         }
 741
 742 end_of_string:
 743
 744         /* TODO: concatenate multiple strings separated by whitespace... */
 745
 746         /* add finishing 0 to the string */
 747         obstack_1grow(&symbol_obstack, '\0');
 748         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
 749         const char *const string = obstack_finish(&symbol_obstack);
 750
 751 #if 0 /* TODO hash */
 752         /* check if there is already a copy of the string */
 753         result = strset_insert(&stringset, string);
 754         if(result != string) {
 755                 obstack_free(&symbol_obstack, string);
 756         }
 757 #else
 758         const char *const result = string;
 759 #endif
 760
 761         lexer_token.type           = T_STRING_LITERAL;
 762         lexer_token.v.string.begin = result;
 763         lexer_token.v.string.size  = size;
 764 }
 765
 766 static void parse_wide_character_constant(void)
 767 {
 768         eat('\'');
 769
 770         int found_char = 0;
 771         while(1) {
 772                 switch(c) {
 773                 case '\\':
 774                         found_char = parse_escape_sequence();
 775                         break;
 776
 777                 MATCH_NEWLINE(
 778                         parse_error("newline while parsing character constant");
 779                         break;
 780                 )
 781
 782                 case '\'':
 783                         next_char();
 784                         goto end_of_wide_char_constant;
 785
 786                 case EOF:
 787                         parse_error("EOF while parsing character constant");
 788                         lexer_token.type = T_ERROR;
 789                         return;
 790
 791                 default:
 792                         if(found_char != 0) {
 793                                 parse_error("more than 1 characters in character "
 794                                             "constant");
 795                                 goto end_of_wide_char_constant;
 796                         } else {
 797                                 found_char = c;
 798                                 next_char();
 799                         }
 800                         break;
 801                 }
 802         }
 803
 804 end_of_wide_char_constant:
 805         lexer_token.type       = T_INTEGER;
 806         lexer_token.v.intvalue = found_char;
 807         lexer_token.datatype   = type_wchar_t;
 808 }
 809
 810 static void parse_wide_string_literal(void)
 811 {
 812         const unsigned start_linenr = lexer_token.source_position.linenr;
 813
 814         assert(c == '"');
 815         next_char();
 816
 817         while(1) {
 818                 switch(c) {
 819                         case '\\': {
 820                                 wchar_rep_t tc = parse_escape_sequence();
 821                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 822                                 break;
 823                         }
 824
 825                         case EOF:
 826                                 error_prefix_at(lexer_token.source_position.input_name,
 827                                                 start_linenr);
 828                                 fprintf(stderr, "string has no end\n");
 829                                 lexer_token.type = T_ERROR;
 830                                 return;
 831
 832                         case '"':
 833                                 next_char();
 834                                 goto end_of_string;
 835
 836                         default: {
 837                                 wchar_rep_t tc = c;
 838                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 839                                 next_char();
 840                                 break;
 841                         }
 842                 }
 843         }
 844
 845 end_of_string:;
 846
 847         /* TODO: concatenate multiple strings separated by whitespace... */
 848
 849         /* add finishing 0 to the string */
 850         wchar_rep_t nul = L'\0';
 851         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
 852         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
 853         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
 854
 855 #if 0 /* TODO hash */
 856         /* check if there is already a copy of the string */
 857         const wchar_rep_t *const result = strset_insert(&stringset, string);
 858         if(result != string) {
 859                 obstack_free(&symbol_obstack, string);
 860         }
 861 #else
 862         const wchar_rep_t *const result = string;
 863 #endif
 864
 865         lexer_token.type                = T_WIDE_STRING_LITERAL;
 866         lexer_token.v.wide_string.begin = result;
 867         lexer_token.v.wide_string.size  = size;
 868 }
 869
 870 static void parse_character_constant(void)
 871 {
 872         eat('\'');
 873
 874         int found_char = 0;
 875         while(1) {
 876                 switch(c) {
 877                 case '\\':
 878                         found_char = parse_escape_sequence();
 879                         break;
 880
 881                 MATCH_NEWLINE(
 882                         parse_error("newline while parsing character constant");
 883                         break;
 884                 )
 885
 886                 case '\'':
 887                         next_char();
 888                         goto end_of_char_constant;
 889
 890                 case EOF:
 891                         parse_error("EOF while parsing character constant");
 892                         lexer_token.type = T_ERROR;
 893                         return;
 894
 895                 default:
 896                         if(found_char != 0) {
 897                                 parse_error("more than 1 characters in character "
 898                                             "constant");
 899                                 goto end_of_char_constant;
 900                         } else {
 901                                 found_char = c;
 902                                 next_char();
 903                         }
 904                         break;
 905                 }
 906         }
 907
 908 end_of_char_constant:
 909         lexer_token.type       = T_INTEGER;
 910         lexer_token.v.intvalue = found_char;
 911         lexer_token.datatype   = type_int;
 912 }
 913
 914 static void skip_multiline_comment(void)
 915 {
 916         unsigned start_linenr = lexer_token.source_position.linenr;
 917
 918         while(1) {
 919                 switch(c) {
 920                 case '*':
 921                         next_char();
 922                         if(c == '/') {
 923                                 next_char();
 924                                 return;
 925                         }
 926                         break;
 927
 928                 MATCH_NEWLINE(break;)
 929
 930                 case EOF:
 931                         error_prefix_at(lexer_token.source_position.input_name,
 932                                         start_linenr);
 933                         fprintf(stderr, "at end of file while looking for comment end\n");
 934                         return;
 935
 936                 default:
 937                         next_char();
 938                         break;
 939                 }
 940         }
 941 }
 942
 943 static void skip_line_comment(void)
 944 {
 945         while(1) {
 946                 switch(c) {
 947                 case EOF:
 948                         return;
 949
 950                 case '\n':
 951                 case '\r':
 952                         return;
 953
 954                 default:
 955                         next_char();
 956                         break;
 957                 }
 958         }
 959 }
 960
 961 static token_t pp_token;
 962
 963 static inline void next_pp_token(void)
 964 {
 965         lexer_next_preprocessing_token();
 966         pp_token = lexer_token;
 967 }
 968
 969 static void eat_until_newline(void)
 970 {
 971         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 972                 next_pp_token();
 973         }
 974 }
 975
 976 static void error_directive(void)
 977 {
 978         error_prefix();
 979         fprintf(stderr, "#error directive: \n");
 980
 981         /* parse pp-tokens until new-line */
 982 }
 983
 984 static void define_directive(void)
 985 {
 986         lexer_next_preprocessing_token();
 987         if(lexer_token.type != T_IDENTIFIER) {
 988                 parse_error("expected identifier after #define\n");
 989                 eat_until_newline();
 990         }
 991 }
 992
 993 static void ifdef_directive(int is_ifndef)
 994 {
 995         (void) is_ifndef;
 996         lexer_next_preprocessing_token();
 997         //expect_identifier();
 998         //extect_newline();
 999 }
1000
/**
 * Stub handler for #endif; currently does nothing.
 */
static void endif_directive(void)
{
        /* TODO: expect a newline */
}
1005
1006 static void parse_line_directive(void)
1007 {
1008         if(pp_token.type != T_INTEGER) {
1009                 parse_error("expected integer");
1010         } else {
1011                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1012                 next_pp_token();
1013         }
1014         if(pp_token.type == T_STRING_LITERAL) {
1015                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1016                 next_pp_token();
1017         }
1018
1019         eat_until_newline();
1020 }
1021
1022 static void parse_preprocessor_identifier(void)
1023 {
1024         assert(pp_token.type == T_IDENTIFIER);
1025         symbol_t *symbol = pp_token.v.symbol;
1026
1027         switch(symbol->pp_ID) {
1028         case TP_include:
1029                 printf("include - enable header name parsing!\n");
1030                 break;
1031         case TP_define:
1032                 define_directive();
1033                 break;
1034         case TP_ifdef:
1035                 ifdef_directive(0);
1036                 break;
1037         case TP_ifndef:
1038                 ifdef_directive(1);
1039                 break;
1040         case TP_endif:
1041                 endif_directive();
1042                 break;
1043         case TP_line:
1044                 next_pp_token();
1045                 parse_line_directive();
1046                 break;
1047         case TP_if:
1048         case TP_else:
1049         case TP_elif:
1050         case TP_undef:
1051         case TP_error:
1052                 error_directive();
1053                 break;
1054         case TP_pragma:
1055                 if (warning.unknown_pragmas) {
1056                         warningf(lexer_token.source_position, "encountered unknown #pragma");
1057                 }
1058                 eat_until_newline();
1059                 break;
1060         }
1061 }
1062
1063 static void parse_preprocessor_directive(void)
1064 {
1065         next_pp_token();
1066
1067         switch(pp_token.type) {
1068         case T_IDENTIFIER:
1069                 parse_preprocessor_identifier();
1070                 break;
1071         case T_INTEGER:
1072                 parse_line_directive();
1073                 break;
1074         default:
1075                 parse_error("invalid preprocessor directive");
1076                 eat_until_newline();
1077                 break;
1078         }
1079 }
1080
/*
 * Helper macros for lexing punctuators that may extend over several
 * characters. They are used inside the switch of
 * lexer_next_preprocessing_token() in the shape
 *
 *     case '&':
 *             MAYBE_PROLOG
 *             MAYBE('&', T_ANDAND)
 *             MAYBE('=', T_ANDEQUAL)
 *             ELSE('&')
 *
 * MAYBE_PROLOG consumes the first character and opens a nested switch on
 * the following one. MAYBE(ch, set_type) adds a case that consumes ch, sets
 * the token type and returns. ELSE_CODE(code) supplies the default branch
 * and closes what MAYBE_PROLOG opened; ELSE(set_type) is the common default
 * that sets the token type and returns without consuming anything.
 */
#define MAYBE_PROLOG \
        next_char(); \
        while(1) {   \
                switch(c) {

#define MAYBE(ch, set_type)                          \
                case ch:                             \
                        next_char();                 \
                        lexer_token.type = set_type; \
                        return;

#define ELSE_CODE(code)         \
                default:        \
                        code;   \
                }               \
        } /* end of while(1) */ \
        break;

#define ELSE(set_type)                       \
        ELSE_CODE(                           \
                lexer_token.type = set_type; \
                return;                      \
        )
1104
1105 void lexer_next_preprocessing_token(void)
1106 {
1107         while(1) {
1108                 switch(c) {
1109                 case ' ':
1110                 case '\t':
1111                         next_char();
1112                         break;
1113
1114                 MATCH_NEWLINE(
1115                         lexer_token.type = '\n';
1116                         return;
1117                 )
1118
1119                 SYMBOL_CHARS
1120                         parse_symbol();
1121                         /* might be a wide string ( L"string" ) */
1122                         if(lexer_token.type == T_IDENTIFIER &&
1123                             lexer_token.v.symbol == symbol_L) {
1124                             if(c == '"') {
1125                                         parse_wide_string_literal();
1126                                 } else if(c == '\'') {
1127                                         parse_wide_character_constant();
1128                                 }
1129                         }
1130                         return;
1131
1132                 DIGITS
1133                         parse_number();
1134                         return;
1135
1136                 case '"':
1137                         parse_string_literal();
1138                         return;
1139
1140                 case '\'':
1141                         parse_character_constant();
1142                         return;
1143
1144                 case '.':
1145                         MAYBE_PROLOG
1146                                 case '0':
1147                                 case '1':
1148                                 case '2':
1149                                 case '3':
1150                                 case '4':
1151                                 case '5':
1152                                 case '6':
1153                                 case '7':
1154                                 case '8':
1155                                 case '9':
1156                                         put_back(c);
1157                                         c = '.';
1158                                         parse_number_dec();
1159                                         return;
1160
1161                                 case '.':
1162                                         MAYBE_PROLOG
1163                                         MAYBE('.', T_DOTDOTDOT)
1164                                         ELSE_CODE(
1165                                                 put_back(c);
1166                                                 c = '.';
1167                                                 lexer_token.type = '.';
1168                                                 return;
1169                                         )
1170                         ELSE('.')
1171                 case '&':
1172                         MAYBE_PROLOG
1173                         MAYBE('&', T_ANDAND)
1174                         MAYBE('=', T_ANDEQUAL)
1175                         ELSE('&')
1176                 case '*':
1177                         MAYBE_PROLOG
1178                         MAYBE('=', T_ASTERISKEQUAL)
1179                         ELSE('*')
1180                 case '+':
1181                         MAYBE_PROLOG
1182                         MAYBE('+', T_PLUSPLUS)
1183                         MAYBE('=', T_PLUSEQUAL)
1184                         ELSE('+')
1185                 case '-':
1186                         MAYBE_PROLOG
1187                         MAYBE('>', T_MINUSGREATER)
1188                         MAYBE('-', T_MINUSMINUS)
1189                         MAYBE('=', T_MINUSEQUAL)
1190                         ELSE('-')
1191                 case '!':
1192                         MAYBE_PROLOG
1193                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1194                         ELSE('!')
1195                 case '/':
1196                         MAYBE_PROLOG
1197                         MAYBE('=', T_SLASHEQUAL)
1198                                 case '*':
1199                                         next_char();
1200                                         skip_multiline_comment();
1201                                         lexer_next_preprocessing_token();
1202                                         return;
1203                                 case '/':
1204                                         next_char();
1205                                         skip_line_comment();
1206                                         lexer_next_preprocessing_token();
1207                                         return;
1208                         ELSE('/')
1209                 case '%':
1210                         MAYBE_PROLOG
1211                         MAYBE('>', T_PERCENTGREATER)
1212                         MAYBE('=', T_PERCENTEQUAL)
1213                                 case ':':
1214                                         MAYBE_PROLOG
1215                                                 case '%':
1216                                                         MAYBE_PROLOG
1217                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1218                                                         ELSE_CODE(
1219                                                                 put_back(c);
1220                                                                 c = '%';
1221                                                                 lexer_token.type = T_PERCENTCOLON;
1222                                                                 return;
1223                                                         )
1224                                         ELSE(T_PERCENTCOLON)
1225                         ELSE('%')
1226                 case '<':
1227                         MAYBE_PROLOG
1228                         MAYBE(':', T_LESSCOLON)
1229                         MAYBE('%', T_LESSPERCENT)
1230                         MAYBE('=', T_LESSEQUAL)
1231                                 case '<':
1232                                         MAYBE_PROLOG
1233                                         MAYBE('=', T_LESSLESSEQUAL)
1234                                         ELSE(T_LESSLESS)
1235                         ELSE('<')
1236                 case '>':
1237                         MAYBE_PROLOG
1238                         MAYBE('=', T_GREATEREQUAL)
1239                                 case '>':
1240                                         MAYBE_PROLOG
1241                                         MAYBE('=', T_GREATERGREATEREQUAL)
1242                                         ELSE(T_GREATERGREATER)
1243                         ELSE('>')
1244                 case '^':
1245                         MAYBE_PROLOG
1246                         MAYBE('=', T_CARETEQUAL)
1247                         ELSE('^')
1248                 case '|':
1249                         MAYBE_PROLOG
1250                         MAYBE('=', T_PIPEEQUAL)
1251                         MAYBE('|', T_PIPEPIPE)
1252                         ELSE('|')
1253                 case ':':
1254                         MAYBE_PROLOG
1255                         MAYBE('>', T_COLONGREATER)
1256                         ELSE(':')
1257                 case '=':
1258                         MAYBE_PROLOG
1259                         MAYBE('=', T_EQUALEQUAL)
1260                         ELSE('=')
1261                 case '#':
1262                         MAYBE_PROLOG
1263                         MAYBE('#', T_HASHHASH)
1264                         ELSE('#')
1265
1266                 case '?':
1267                 case '[':
1268                 case ']':
1269                 case '(':
1270                 case ')':
1271                 case '{':
1272                 case '}':
1273                 case '~':
1274                 case ';':
1275                 case ',':
1276                 case '\\':
1277                         lexer_token.type = c;
1278                         next_char();
1279                         return;
1280
1281                 case EOF:
1282                         lexer_token.type = T_EOF;
1283                         return;
1284
1285                 default:
1286                         next_char();
1287                         error_prefix();
1288                         fprintf(stderr, "unknown character '%c' found\n", c);
1289                         lexer_token.type = T_ERROR;
1290                         return;
1291                 }
1292         }
1293 }
1294
1295 void lexer_next_token(void)
1296 {
1297         lexer_next_preprocessing_token();
1298         if(lexer_token.type != '\n')
1299                 return;
1300
1301 newline_found:
1302         do {
1303                 lexer_next_preprocessing_token();
1304         } while(lexer_token.type == '\n');
1305
1306         if(lexer_token.type == '#') {
1307                 parse_preprocessor_directive();
1308                 goto newline_found;
1309         }
1310 }
1311
1312 void init_lexer(void)
1313 {
1314         strset_init(&stringset);
1315 }
1316
1317 void lexer_open_stream(FILE *stream, const char *input_name)
1318 {
1319         input                                  = stream;
1320         lexer_token.source_position.linenr     = 0;
1321         lexer_token.source_position.input_name = input_name;
1322
1323         symbol_L = symbol_table_insert("L");
1324         bufpos = NULL;
1325         bufend = NULL;
1326
1327         /* place a virtual \n at the beginning so the lexer knows that we're
1328          * at the beginning of a line */
1329         c = '\n';
1330 }
1331
1332 void exit_lexer(void)
1333 {
1334         strset_destroy(&stringset);
1335 }
1336
1337 static __attribute__((unused))
1338 void dbg_pos(const source_position_t source_position)
1339 {
1340         fprintf(stdout, "%s:%u\n", source_position.input_name,
1341                 source_position.linenr);
1342         fflush(stdout);
1343 }