nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "diagnostic.h"
   4 #include "lexer.h"
   5 #include "token_t.h"
   6 #include "symbol_table_t.h"
   7 #include "adt/error.h"
   8 #include "adt/strset.h"
   9 #include "adt/util.h"
  10 #include "types.h"
  11 #include "type_t.h"
  12 #include "target_architecture.h"
  13 #include "parser.h"
  14 #include "warning.h"
  15
  16 #include <assert.h>
  17 #include <errno.h>
  18 #include <string.h>
  19 #include <stdbool.h>
  20 #include <ctype.h>
  21
  22 //#define DEBUG_CHARS
  23 #define MAX_PUTBACK 3
  24
  25 #ifdef _WIN32
  26 /* No strtold on windows and no replacement yet */
  27 #define strtold(s, e) strtod(s, e)
  28 #endif
  29
  30 #if defined HAS_SIGNED_CHAR
  31 typedef signed char char_type;
  32 #elif defined HAS_UNSIGNED_CHAR
  33 typedef unsigned char char_type;
  34 #else
  35 #       error signedness of char not determined
  36 #endif
  37
  38 static int         c;
  39 token_t            lexer_token;
  40 symbol_t          *symbol_L;
  41 static FILE       *input;
  42 static char        buf[1024 + MAX_PUTBACK];
  43 static const char *bufend;
  44 static const char *bufpos;
  45 static strset_t    stringset;
  46
  47 /**
  48  * Print an error prefix at the given coordinates.
  49  *
  50  * @param input_name   the input file name
  51  * @param linenr       the line number
  52  */
  53 static void error_prefix_at(const char *input_name, unsigned linenr)
  54 {
  55         fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
  56 }
  57
  58 /**
  59  * Print an error prefix at the current token coordinates.
  60  */
  61 static void error_prefix(void)
  62 {
  63         error_prefix_at(lexer_token.source_position.input_name,
  64                         lexer_token.source_position.linenr);
  65 }
  66
  67 /**
  68  * Prints a parse error message at the current token.
  69  *
  70  * @param msg   the error message
  71  */
  72 static void parse_error(const char *msg)
  73 {
  74         error_prefix();
  75         fprintf(stderr, "%s\n", msg);
  76 }
  77
  78 static inline void next_real_char(void)
  79 {
  80         assert(bufpos <= bufend);
  81         if (bufpos >= bufend) {
  82                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  83                                  input);
  84                 if(s == 0) {
  85                         c = EOF;
  86                         return;
  87                 }
  88                 bufpos = buf + MAX_PUTBACK;
  89                 bufend = buf + MAX_PUTBACK + s;
  90         }
  91         c = *bufpos++;
  92 }
  93
  94 static inline void put_back(int pc)
  95 {
  96         assert(bufpos > buf);
  97         *(--bufpos - buf + buf) = (char) pc;
  98
  99 #ifdef DEBUG_CHARS
 100         printf("putback '%c'\n", pc);
 101 #endif
 102 }
 103
 104 static inline void next_char(void);
 105
 106 #define MATCH_NEWLINE(code)                   \
 107         case '\r':                                \
 108                 next_char();                          \
 109                 if(c == '\n') {                       \
 110                         next_char();                      \
 111                 }                                     \
 112                 lexer_token.source_position.linenr++; \
 113                 code                                  \
 114         case '\n':                                \
 115                 next_char();                          \
 116                 lexer_token.source_position.linenr++; \
 117                 code
 118
 119 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 120
 121 static void maybe_concat_lines(void)
 122 {
 123         eat('\\');
 124
 125         switch(c) {
 126         MATCH_NEWLINE(return;)
 127
 128         default:
 129                 break;
 130         }
 131
 132         put_back(c);
 133         c = '\\';
 134 }
 135
 136 static inline void next_char(void)
 137 {
 138         next_real_char();
 139
 140         /* filter trigraphs */
 141         if(UNLIKELY(c == '\\')) {
 142                 maybe_concat_lines();
 143                 goto end_of_next_char;
 144         }
 145
 146         if(LIKELY(c != '?'))
 147                 goto end_of_next_char;
 148
 149         next_real_char();
 150         if(LIKELY(c != '?')) {
 151                 put_back(c);
 152                 c = '?';
 153                 goto end_of_next_char;
 154         }
 155
 156         next_real_char();
 157         switch(c) {
 158         case '=': c = '#'; break;
 159         case '(': c = '['; break;
 160         case '/': c = '\\'; maybe_concat_lines(); break;
 161         case ')': c = ']'; break;
 162         case '\'': c = '^'; break;
 163         case '<': c = '{'; break;
 164         case '!': c = '|'; break;
 165         case '>': c = '}'; break;
 166         case '-': c = '~'; break;
 167         default:
 168                 put_back(c);
 169                 put_back('?');
 170                 c = '?';
 171                 break;
 172         }
 173
 174 end_of_next_char:;
 175 #ifdef DEBUG_CHARS
 176         printf("nchar '%c'\n", c);
 177 #endif
 178 }
 179
 180 #define SYMBOL_CHARS  \
 181         case 'a':         \
 182         case 'b':         \
 183         case 'c':         \
 184         case 'd':         \
 185         case 'e':         \
 186         case 'f':         \
 187         case 'g':         \
 188         case 'h':         \
 189         case 'i':         \
 190         case 'j':         \
 191         case 'k':         \
 192         case 'l':         \
 193         case 'm':         \
 194         case 'n':         \
 195         case 'o':         \
 196         case 'p':         \
 197         case 'q':         \
 198         case 'r':         \
 199         case 's':         \
 200         case 't':         \
 201         case 'u':         \
 202         case 'v':         \
 203         case 'w':         \
 204         case 'x':         \
 205         case 'y':         \
 206         case 'z':         \
 207         case 'A':         \
 208         case 'B':         \
 209         case 'C':         \
 210         case 'D':         \
 211         case 'E':         \
 212         case 'F':         \
 213         case 'G':         \
 214         case 'H':         \
 215         case 'I':         \
 216         case 'J':         \
 217         case 'K':         \
 218         case 'L':         \
 219         case 'M':         \
 220         case 'N':         \
 221         case 'O':         \
 222         case 'P':         \
 223         case 'Q':         \
 224         case 'R':         \
 225         case 'S':         \
 226         case 'T':         \
 227         case 'U':         \
 228         case 'V':         \
 229         case 'W':         \
 230         case 'X':         \
 231         case 'Y':         \
 232         case 'Z':         \
 233         case '_':
 234
 235 #define DIGITS        \
 236         case '0':         \
 237         case '1':         \
 238         case '2':         \
 239         case '3':         \
 240         case '4':         \
 241         case '5':         \
 242         case '6':         \
 243         case '7':         \
 244         case '8':         \
 245         case '9':
 246
 247 static void parse_symbol(void)
 248 {
 249         symbol_t *symbol;
 250         char     *string;
 251
 252         obstack_1grow(&symbol_obstack, (char) c);
 253         next_char();
 254
 255         while(1) {
 256                 switch(c) {
 257                 DIGITS
 258                 SYMBOL_CHARS
 259                         obstack_1grow(&symbol_obstack, (char) c);
 260                         next_char();
 261                         break;
 262
 263                 default:
 264                         goto end_symbol;
 265                 }
 266         }
 267
 268 end_symbol:
 269         obstack_1grow(&symbol_obstack, '\0');
 270
 271         string = obstack_finish(&symbol_obstack);
 272         symbol = symbol_table_insert(string);
 273
 274         lexer_token.type     = symbol->ID;
 275         lexer_token.v.symbol = symbol;
 276
 277         if(symbol->string != string) {
 278                 obstack_free(&symbol_obstack, string);
 279         }
 280 }
 281
 282 static void parse_integer_suffix(bool is_oct_hex)
 283 {
 284         bool is_unsigned  = false;
 285         bool min_long     = false;
 286         bool min_longlong = false;
 287
 288         if(c == 'U' || c == 'u') {
 289                 is_unsigned = true;
 290                 next_char();
 291                 if(c == 'L' || c == 'l') {
 292                         min_long = true;
 293                         next_char();
 294                         if(c == 'L' || c == 'l') {
 295                                 min_longlong = true;
 296                                 next_char();
 297                         }
 298                 }
 299         } else if(c == 'l' || c == 'L') {
 300                 min_long = true;
 301                 next_char();
 302                 if(c == 'l' || c == 'L') {
 303                         min_longlong = true;
 304                         next_char();
 305                         if(c == 'u' || c == 'U') {
 306                                 is_unsigned = true;
 307                                 next_char();
 308                         }
 309                 } else if(c == 'u' || c == 'U') {
 310                         is_unsigned = true;
 311                         next_char();
 312                         lexer_token.datatype = type_unsigned_long;
 313                 }
 314         }
 315
 316         if(!is_unsigned) {
 317                 long long v = lexer_token.v.intvalue;
 318                 if(!min_long) {
 319                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 320                                 lexer_token.datatype = type_int;
 321                                 return;
 322                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 323                                 lexer_token.datatype = type_unsigned_int;
 324                                 return;
 325                         }
 326                 }
 327                 if(!min_longlong) {
 328                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 329                                 lexer_token.datatype = type_long;
 330                                 return;
 331                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
 332                                 lexer_token.datatype = type_unsigned_long;
 333                                 return;
 334                         }
 335                 }
 336                 unsigned long long uv = (unsigned long long) v;
 337                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 338                         lexer_token.datatype = type_unsigned_long_long;
 339                         return;
 340                 }
 341
 342                 lexer_token.datatype = type_long_long;
 343         } else {
 344                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 345                 if(!min_long && v <= TARGET_UINT_MAX) {
 346                         lexer_token.datatype = type_unsigned_int;
 347                         return;
 348                 }
 349                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
 350                         lexer_token.datatype = type_unsigned_long;
 351                         return;
 352                 }
 353                 lexer_token.datatype = type_unsigned_long_long;
 354         }
 355 }
 356
 357 static void parse_floating_suffix(void)
 358 {
 359         switch(c) {
 360         /* TODO: do something usefull with the suffixes... */
 361         case 'f':
 362         case 'F':
 363                 next_char();
 364                 lexer_token.datatype = type_float;
 365                 break;
 366         case 'l':
 367         case 'L':
 368                 next_char();
 369                 lexer_token.datatype = type_long_double;
 370                 break;
 371         default:
 372                 lexer_token.datatype = type_double;
 373                 break;
 374         }
 375 }
 376
 377 /**
 378  * A replacement for strtoull. Only those parts needed for
 379  * our parser are implemented.
 380  */
 381 static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
 382         unsigned long long v = 0;
 383
 384         switch (base) {
 385         case 16:
 386                 for (;; ++s) {
 387                         /* check for overrun */
 388                         if (v >= 0x1000000000000000ULL)
 389                                 break;
 390                         switch (tolower(*s)) {
 391                         case '0': v <<= 4; break;
 392                         case '1': v <<= 4; v |= 0x1; break;
 393                         case '2': v <<= 4; v |= 0x2; break;
 394                         case '3': v <<= 4; v |= 0x3; break;
 395                         case '4': v <<= 4; v |= 0x4; break;
 396                         case '5': v <<= 4; v |= 0x5; break;
 397                         case '6': v <<= 4; v |= 0x6; break;
 398                         case '7': v <<= 4; v |= 0x7; break;
 399                         case '8': v <<= 4; v |= 0x8; break;
 400                         case '9': v <<= 4; v |= 0x9; break;
 401                         case 'a': v <<= 4; v |= 0xa; break;
 402                         case 'b': v <<= 4; v |= 0xb; break;
 403                         case 'c': v <<= 4; v |= 0xc; break;
 404                         case 'd': v <<= 4; v |= 0xd; break;
 405                         case 'e': v <<= 4; v |= 0xe; break;
 406                         case 'f': v <<= 4; v |= 0xf; break;
 407                         default:
 408                                 goto end;
 409                         }
 410                 }
 411                 break;
 412         case 8:
 413                 for (;; ++s) {
 414                         /* check for overrun */
 415                         if (v >= 0x2000000000000000ULL)
 416                                 break;
 417                         switch (tolower(*s)) {
 418                         case '0': v <<= 3; break;
 419                         case '1': v <<= 3; v |= 1; break;
 420                         case '2': v <<= 3; v |= 2; break;
 421                         case '3': v <<= 3; v |= 3; break;
 422                         case '4': v <<= 3; v |= 4; break;
 423                         case '5': v <<= 3; v |= 5; break;
 424                         case '6': v <<= 3; v |= 6; break;
 425                         case '7': v <<= 3; v |= 7; break;
 426                         default:
 427                                 goto end;
 428                         }
 429                 }
 430                 break;
 431         case 10:
 432                 for (;; ++s) {
 433                         /* check for overrun */
 434                         if (v > 0x1999999999999999ULL)
 435                                 break;
 436                         switch (tolower(*s)) {
 437                         case '0': v *= 10; break;
 438                         case '1': v *= 10; v += 1; break;
 439                         case '2': v *= 10; v += 2; break;
 440                         case '3': v *= 10; v += 3; break;
 441                         case '4': v *= 10; v += 4; break;
 442                         case '5': v *= 10; v += 5; break;
 443                         case '6': v *= 10; v += 6; break;
 444                         case '7': v *= 10; v += 7; break;
 445                         case '8': v *= 10; v += 8; break;
 446                         case '9': v *= 10; v += 9; break;
 447                         default:
 448                                 goto end;
 449                         }
 450                 }
 451                 break;
 452         default:
 453                 assert(0);
 454                 break;
 455         }
 456 end:
 457         *endptr = s;
 458         return v;
 459 }
 460
 461 static void parse_number_hex(void)
 462 {
 463         assert(c == 'x' || c == 'X');
 464         next_char();
 465
 466         while(isxdigit(c)) {
 467                 obstack_1grow(&symbol_obstack, (char) c);
 468                 next_char();
 469         }
 470         obstack_1grow(&symbol_obstack, '\0');
 471         char *string = obstack_finish(&symbol_obstack);
 472
 473         if(c == '.' || c == 'p' || c == 'P') {
 474                 next_char();
 475                 panic("Hex floating point numbers not implemented yet");
 476         }
 477         if(*string == '\0') {
 478                 parse_error("invalid hex number");
 479                 lexer_token.type = T_ERROR;
 480         }
 481
 482         const char *endptr;
 483         lexer_token.type       = T_INTEGER;
 484         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
 485         if(*endptr != '\0') {
 486                 parse_error("hex number literal too long");
 487         }
 488
 489         obstack_free(&symbol_obstack, string);
 490         parse_integer_suffix(true);
 491 }
 492
 493 static inline bool is_octal_digit(int chr)
 494 {
 495         return '0' <= chr && chr <= '7';
 496 }
 497
 498 static void parse_number_oct(void)
 499 {
 500         while(is_octal_digit(c)) {
 501                 obstack_1grow(&symbol_obstack, (char) c);
 502                 next_char();
 503         }
 504         obstack_1grow(&symbol_obstack, '\0');
 505         char *string = obstack_finish(&symbol_obstack);
 506
 507         const char *endptr;
 508         lexer_token.type       = T_INTEGER;
 509         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 510         if(*endptr != '\0') {
 511                 parse_error("octal number literal too long");
 512         }
 513
 514         obstack_free(&symbol_obstack, string);
 515         parse_integer_suffix(true);
 516 }
 517
 518 static void parse_number_dec(void)
 519 {
 520         bool is_float = false;
 521         while(isdigit(c)) {
 522                 obstack_1grow(&symbol_obstack, (char) c);
 523                 next_char();
 524         }
 525
 526         if(c == '.') {
 527                 obstack_1grow(&symbol_obstack, '.');
 528                 next_char();
 529
 530                 while(isdigit(c)) {
 531                         obstack_1grow(&symbol_obstack, (char) c);
 532                         next_char();
 533                 }
 534                 is_float = true;
 535         }
 536         if(c == 'e' || c == 'E') {
 537                 obstack_1grow(&symbol_obstack, 'e');
 538                 next_char();
 539
 540                 if(c == '-' || c == '+') {
 541                         obstack_1grow(&symbol_obstack, (char) c);
 542                         next_char();
 543                 }
 544
 545                 while(isdigit(c)) {
 546                         obstack_1grow(&symbol_obstack, (char) c);
 547                         next_char();
 548                 }
 549                 is_float = true;
 550         }
 551
 552         obstack_1grow(&symbol_obstack, '\0');
 553         char *string = obstack_finish(&symbol_obstack);
 554
 555         if(is_float) {
 556                 char *endptr;
 557                 lexer_token.type         = T_FLOATINGPOINT;
 558                 lexer_token.v.floatvalue = strtold(string, &endptr);
 559
 560                 if(*endptr != '\0') {
 561                         parse_error("invalid number literal");
 562                 }
 563
 564                 parse_floating_suffix();
 565         } else {
 566                 const char *endptr;
 567                 lexer_token.type       = T_INTEGER;
 568                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 569
 570                 if(*endptr != '\0') {
 571                         parse_error("invalid number literal");
 572                 }
 573
 574                 parse_integer_suffix(false);
 575         }
 576         obstack_free(&symbol_obstack, string);
 577 }
 578
 579 static void parse_number(void)
 580 {
 581         if (c == '0') {
 582                 next_char();
 583                 switch (c) {
 584                         case 'X':
 585                         case 'x':
 586                                 parse_number_hex();
 587                                 break;
 588                         case '0':
 589                         case '1':
 590                         case '2':
 591                         case '3':
 592                         case '4':
 593                         case '5':
 594                         case '6':
 595                         case '7':
 596                                 parse_number_oct();
 597                                 break;
 598                         case '8':
 599                         case '9':
 600                                 next_char();
 601                                 parse_error("invalid octal number");
 602                                 lexer_token.type = T_ERROR;
 603                                 return;
 604                         case '.':
 605                         case 'e':
 606                         case 'E':
 607                         default:
 608                                 obstack_1grow(&symbol_obstack, '0');
 609                                 parse_number_dec();
 610                                 return;
 611                 }
 612         } else {
 613                 parse_number_dec();
 614         }
 615 }
 616
 617 static int parse_octal_sequence(const int first_digit)
 618 {
 619         assert(is_octal_digit(first_digit));
 620         int value = first_digit - '0';
 621         if (!is_octal_digit(c)) return value;
 622         value = 8 * value + c - '0';
 623         next_char();
 624         if (!is_octal_digit(c)) return value;
 625         value = 8 * value + c - '0';
 626         next_char();
 627         return (char_type)value;
 628 }
 629
 630 static int parse_hex_sequence(void)
 631 {
 632         int value = 0;
 633         while(1) {
 634                 if (c >= '0' && c <= '9') {
 635                         value = 16 * value + c - '0';
 636                 } else if ('A' <= c && c <= 'F') {
 637                         value = 16 * value + c - 'A' + 10;
 638                 } else if ('a' <= c && c <= 'f') {
 639                         value = 16 * value + c - 'a' + 10;
 640                 } else {
 641                         break;
 642                 }
 643                 next_char();
 644         }
 645
 646         return (char_type)value;
 647 }
 648
 649 static int parse_escape_sequence(void)
 650 {
 651         eat('\\');
 652
 653         int ec = c;
 654         next_char();
 655
 656         switch(ec) {
 657         case '"':  return '"';
 658         case '\'': return '\'';
 659         case '\\': return '\\';
 660         case '?': return '\?';
 661         case 'a': return '\a';
 662         case 'b': return '\b';
 663         case 'f': return '\f';
 664         case 'n': return '\n';
 665         case 'r': return '\r';
 666         case 't': return '\t';
 667         case 'v': return '\v';
 668         case 'x':
 669                 return parse_hex_sequence();
 670         case '0':
 671         case '1':
 672         case '2':
 673         case '3':
 674         case '4':
 675         case '5':
 676         case '6':
 677         case '7':
 678                 return parse_octal_sequence(ec);
 679         case EOF:
 680                 parse_error("reached end of file while parsing escape sequence");
 681                 return EOF;
 682         default:
 683                 parse_error("unknown escape sequence");
 684                 return EOF;
 685         }
 686 }
 687
 688 string_t concat_strings(const string_t *const s1, const string_t *const s2)
 689 {
 690         const size_t len1 = s1->size - 1;
 691         const size_t len2 = s2->size - 1;
 692
 693         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 694         memcpy(concat, s1->begin, len1);
 695         memcpy(concat + len1, s2->begin, len2 + 1);
 696
 697 #if 0 /* TODO hash */
 698         const char *result = strset_insert(&stringset, concat);
 699         if(result != concat) {
 700                 obstack_free(&symbol_obstack, concat);
 701         }
 702
 703         return result;
 704 #else
 705         return (string_t){ concat, len1 + len2 + 1 };
 706 #endif
 707 }
 708
 709 static void parse_string_literal(void)
 710 {
 711         const unsigned start_linenr = lexer_token.source_position.linenr;
 712
 713         assert(c == '"');
 714         next_char();
 715
 716         int tc;
 717         while(1) {
 718                 switch(c) {
 719                 case '\\':
 720                         tc = parse_escape_sequence();
 721                         obstack_1grow(&symbol_obstack, (char) tc);
 722                         break;
 723
 724                 case EOF:
 725                         error_prefix_at(lexer_token.source_position.input_name,
 726                                         start_linenr);
 727                         fprintf(stderr, "string has no end\n");
 728                         lexer_token.type = T_ERROR;
 729                         return;
 730
 731                 case '"':
 732                         next_char();
 733                         goto end_of_string;
 734
 735                 default:
 736                         obstack_1grow(&symbol_obstack, (char) c);
 737                         next_char();
 738                         break;
 739                 }
 740         }
 741
 742 end_of_string:
 743
 744         /* TODO: concatenate multiple strings separated by whitespace... */
 745
 746         /* add finishing 0 to the string */
 747         obstack_1grow(&symbol_obstack, '\0');
 748         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
 749         const char *const string = obstack_finish(&symbol_obstack);
 750
 751 #if 0 /* TODO hash */
 752         /* check if there is already a copy of the string */
 753         result = strset_insert(&stringset, string);
 754         if(result != string) {
 755                 obstack_free(&symbol_obstack, string);
 756         }
 757 #else
 758         const char *const result = string;
 759 #endif
 760
 761         lexer_token.type           = T_STRING_LITERAL;
 762         lexer_token.v.string.begin = result;
 763         lexer_token.v.string.size  = size;
 764 }
 765
 766 static void parse_wide_character_constant(void)
 767 {
 768         eat('\'');
 769
 770         int found_char = 0;
 771         while(1) {
 772                 switch(c) {
 773                 case '\\':
 774                         found_char = parse_escape_sequence();
 775                         break;
 776
 777                 MATCH_NEWLINE(
 778                         parse_error("newline while parsing character constant");
 779                         break;
 780                 )
 781
 782                 case '\'':
 783                         next_char();
 784                         goto end_of_wide_char_constant;
 785
 786                 case EOF:
 787                         parse_error("EOF while parsing character constant");
 788                         lexer_token.type = T_ERROR;
 789                         return;
 790
 791                 default:
 792                         if(found_char != 0) {
 793                                 parse_error("more than 1 characters in character "
 794                                             "constant");
 795                                 goto end_of_wide_char_constant;
 796                         } else {
 797                                 found_char = c;
 798                                 next_char();
 799                         }
 800                         break;
 801                 }
 802         }
 803
 804 end_of_wide_char_constant:
 805         lexer_token.type       = T_INTEGER;
 806         lexer_token.v.intvalue = found_char;
 807         lexer_token.datatype   = type_wchar_t;
 808 }
 809
 810 static void parse_wide_string_literal(void)
 811 {
 812         const unsigned start_linenr = lexer_token.source_position.linenr;
 813
 814         assert(c == '"');
 815         next_char();
 816
 817         while(1) {
 818                 switch(c) {
 819                         case '\\': {
 820                                 wchar_rep_t tc = parse_escape_sequence();
 821                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 822                                 break;
 823                         }
 824
 825                         case EOF:
 826                                 error_prefix_at(lexer_token.source_position.input_name,
 827                                                 start_linenr);
 828                                 fprintf(stderr, "string has no end\n");
 829                                 lexer_token.type = T_ERROR;
 830                                 return;
 831
 832                         case '"':
 833                                 next_char();
 834                                 goto end_of_string;
 835
 836                         default: {
 837                                 wchar_rep_t tc = c;
 838                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 839                                 next_char();
 840                                 break;
 841                         }
 842                 }
 843         }
 844
 845 end_of_string:;
 846
 847         /* TODO: concatenate multiple strings separated by whitespace... */
 848
 849         /* add finishing 0 to the string */
 850         wchar_rep_t nul = L'\0';
 851         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
 852         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
 853         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
 854
 855 #if 0 /* TODO hash */
 856         /* check if there is already a copy of the string */
 857         const wchar_rep_t *const result = strset_insert(&stringset, string);
 858         if(result != string) {
 859                 obstack_free(&symbol_obstack, string);
 860         }
 861 #else
 862         const wchar_rep_t *const result = string;
 863 #endif
 864
 865         lexer_token.type                = T_WIDE_STRING_LITERAL;
 866         lexer_token.v.wide_string.begin = result;
 867         lexer_token.v.wide_string.size  = size;
 868 }
 869
 870 static void parse_character_constant(void)
 871 {
 872         eat('\'');
 873
 874         int found_char = 0;
 875         while(1) {
 876                 switch(c) {
 877                 case '\\':
 878                         found_char = parse_escape_sequence();
 879                         break;
 880
 881                 MATCH_NEWLINE(
 882                         parse_error("newline while parsing character constant");
 883                         break;
 884                 )
 885
 886                 case '\'':
 887                         next_char();
 888                         goto end_of_char_constant;
 889
 890                 case EOF:
 891                         parse_error("EOF while parsing character constant");
 892                         lexer_token.type = T_ERROR;
 893                         return;
 894
 895                 default:
 896                         if(found_char != 0) {
 897                                 parse_error("more than 1 characters in character "
 898                                             "constant");
 899                                 goto end_of_char_constant;
 900                         } else {
 901                                 found_char = c;
 902                                 next_char();
 903                         }
 904                         break;
 905                 }
 906         }
 907
 908 end_of_char_constant:
 909         lexer_token.type       = T_INTEGER;
 910         lexer_token.v.intvalue = found_char;
 911         lexer_token.datatype   = type_int;
 912 }
 913
 914 static void skip_multiline_comment(void)
 915 {
 916         unsigned start_linenr = lexer_token.source_position.linenr;
 917
 918         while(1) {
 919                 switch(c) {
 920                 case '/':
 921                         next_char();
 922                         if (c == '*') {
 923                                 /* TODO: nested comment, warn here */
 924                         }
 925                         break;
 926                 case '*':
 927                         next_char();
 928                         if(c == '/') {
 929                                 next_char();
 930                                 return;
 931                         }
 932                         break;
 933
 934                 MATCH_NEWLINE(break;)
 935
 936                 case EOF:
 937                         error_prefix_at(lexer_token.source_position.input_name,
 938                                         start_linenr);
 939                         fprintf(stderr, "at end of file while looking for comment end\n");
 940                         return;
 941
 942                 default:
 943                         next_char();
 944                         break;
 945                 }
 946         }
 947 }
 948
 949 static void skip_line_comment(void)
 950 {
 951         while(1) {
 952                 switch(c) {
 953                 case EOF:
 954                         return;
 955
 956                 case '\n':
 957                 case '\r':
 958                         return;
 959
 960                 default:
 961                         next_char();
 962                         break;
 963                 }
 964         }
 965 }
 966
 967 static token_t pp_token;
 968
 969 static inline void next_pp_token(void)
 970 {
 971         lexer_next_preprocessing_token();
 972         pp_token = lexer_token;
 973 }
 974
 975 static void eat_until_newline(void)
 976 {
 977         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 978                 next_pp_token();
 979         }
 980 }
 981
 982 static void error_directive(void)
 983 {
 984         error_prefix();
 985         fprintf(stderr, "#error directive: \n");
 986
 987         /* parse pp-tokens until new-line */
 988 }
 989
 990 static void define_directive(void)
 991 {
 992         lexer_next_preprocessing_token();
 993         if(lexer_token.type != T_IDENTIFIER) {
 994                 parse_error("expected identifier after #define\n");
 995                 eat_until_newline();
 996         }
 997 }
 998
 999 static void ifdef_directive(int is_ifndef)
1000 {
1001         (void) is_ifndef;
1002         lexer_next_preprocessing_token();
1003         //expect_identifier();
1004         //extect_newline();
1005 }
1006
/**
 * Handles #endif. Not implemented yet.
 */
static void endif_directive(void)
{
	/* TODO: expect a newline here */
}
1011
1012 static void parse_line_directive(void)
1013 {
1014         if(pp_token.type != T_INTEGER) {
1015                 parse_error("expected integer");
1016         } else {
1017                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1018                 next_pp_token();
1019         }
1020         if(pp_token.type == T_STRING_LITERAL) {
1021                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1022                 next_pp_token();
1023         }
1024
1025         eat_until_newline();
1026 }
1027
1028 static void parse_preprocessor_identifier(void)
1029 {
1030         assert(pp_token.type == T_IDENTIFIER);
1031         symbol_t *symbol = pp_token.v.symbol;
1032
1033         switch(symbol->pp_ID) {
1034         case TP_include:
1035                 printf("include - enable header name parsing!\n");
1036                 break;
1037         case TP_define:
1038                 define_directive();
1039                 break;
1040         case TP_ifdef:
1041                 ifdef_directive(0);
1042                 break;
1043         case TP_ifndef:
1044                 ifdef_directive(1);
1045                 break;
1046         case TP_endif:
1047                 endif_directive();
1048                 break;
1049         case TP_line:
1050                 next_pp_token();
1051                 parse_line_directive();
1052                 break;
1053         case TP_if:
1054         case TP_else:
1055         case TP_elif:
1056         case TP_undef:
1057         case TP_error:
1058                 error_directive();
1059                 break;
1060         case TP_pragma:
1061                 if (warning.unknown_pragmas) {
1062                         warningf(lexer_token.source_position, "encountered unknown #pragma");
1063                 }
1064                 eat_until_newline();
1065                 break;
1066         }
1067 }
1068
1069 static void parse_preprocessor_directive(void)
1070 {
1071         next_pp_token();
1072
1073         switch(pp_token.type) {
1074         case T_IDENTIFIER:
1075                 parse_preprocessor_identifier();
1076                 break;
1077         case T_INTEGER:
1078                 parse_line_directive();
1079                 break;
1080         default:
1081                 parse_error("invalid preprocessor directive");
1082                 eat_until_newline();
1083                 break;
1084         }
1085 }
1086
/*
 * Helper macros for lexing punctuators that consist of more than one
 * character. They are used inside a "case" of a switch over the current
 * character c and have to be combined as
 *
 *     MAYBE_PROLOG  MAYBE(...)...  ELSE(...)   (or ELSE_CODE(...))
 *
 * MAYBE_PROLOG consumes the current character and opens a loop with a
 * nested switch over the following character.
 */
#define MAYBE_PROLOG              \
	next_char();                  \
	for(;;) {                     \
		switch(c) {

/*
 * If the following character is ch: consume it, set the token type to
 * set_type and return from the enclosing function.
 */
#define MAYBE(ch, set_type)               \
		case ch:                          \
			next_char();                  \
			lexer_token.type = set_type;  \
			return;

/*
 * Runs code for any other following character, then closes the switch and
 * the loop opened by MAYBE_PROLOG and ends the surrounding case.
 */
#define ELSE_CODE(code)                   \
		default:                          \
			code;                         \
		}                                 \
	} /* end of the loop */               \
	break;

/*
 * For any other following character: set the token type to set_type and
 * return without consuming that character. Closes the switch and the loop
 * opened by MAYBE_PROLOG and ends the surrounding case.
 */
#define ELSE(set_type)                    \
		default:                          \
			lexer_token.type = set_type;  \
			return;                       \
		}                                 \
	} /* end of the loop */               \
	break;
1110
1111 void lexer_next_preprocessing_token(void)
1112 {
1113         while(1) {
1114                 switch(c) {
1115                 case ' ':
1116                 case '\t':
1117                         next_char();
1118                         break;
1119
1120                 MATCH_NEWLINE(
1121                         lexer_token.type = '\n';
1122                         return;
1123                 )
1124
1125                 SYMBOL_CHARS
1126                         parse_symbol();
1127                         /* might be a wide string ( L"string" ) */
1128                         if(lexer_token.type == T_IDENTIFIER &&
1129                             lexer_token.v.symbol == symbol_L) {
1130                             if(c == '"') {
1131                                         parse_wide_string_literal();
1132                                 } else if(c == '\'') {
1133                                         parse_wide_character_constant();
1134                                 }
1135                         }
1136                         return;
1137
1138                 DIGITS
1139                         parse_number();
1140                         return;
1141
1142                 case '"':
1143                         parse_string_literal();
1144                         return;
1145
1146                 case '\'':
1147                         parse_character_constant();
1148                         return;
1149
1150                 case '.':
1151                         MAYBE_PROLOG
1152                                 case '0':
1153                                 case '1':
1154                                 case '2':
1155                                 case '3':
1156                                 case '4':
1157                                 case '5':
1158                                 case '6':
1159                                 case '7':
1160                                 case '8':
1161                                 case '9':
1162                                         put_back(c);
1163                                         c = '.';
1164                                         parse_number_dec();
1165                                         return;
1166
1167                                 case '.':
1168                                         MAYBE_PROLOG
1169                                         MAYBE('.', T_DOTDOTDOT)
1170                                         ELSE_CODE(
1171                                                 put_back(c);
1172                                                 c = '.';
1173                                                 lexer_token.type = '.';
1174                                                 return;
1175                                         )
1176                         ELSE('.')
1177                 case '&':
1178                         MAYBE_PROLOG
1179                         MAYBE('&', T_ANDAND)
1180                         MAYBE('=', T_ANDEQUAL)
1181                         ELSE('&')
1182                 case '*':
1183                         MAYBE_PROLOG
1184                         MAYBE('=', T_ASTERISKEQUAL)
1185                         ELSE('*')
1186                 case '+':
1187                         MAYBE_PROLOG
1188                         MAYBE('+', T_PLUSPLUS)
1189                         MAYBE('=', T_PLUSEQUAL)
1190                         ELSE('+')
1191                 case '-':
1192                         MAYBE_PROLOG
1193                         MAYBE('>', T_MINUSGREATER)
1194                         MAYBE('-', T_MINUSMINUS)
1195                         MAYBE('=', T_MINUSEQUAL)
1196                         ELSE('-')
1197                 case '!':
1198                         MAYBE_PROLOG
1199                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1200                         ELSE('!')
1201                 case '/':
1202                         MAYBE_PROLOG
1203                         MAYBE('=', T_SLASHEQUAL)
1204                                 case '*':
1205                                         next_char();
1206                                         skip_multiline_comment();
1207                                         lexer_next_preprocessing_token();
1208                                         return;
1209                                 case '/':
1210                                         next_char();
1211                                         skip_line_comment();
1212                                         lexer_next_preprocessing_token();
1213                                         return;
1214                         ELSE('/')
1215                 case '%':
1216                         MAYBE_PROLOG
1217                         MAYBE('>', T_PERCENTGREATER)
1218                         MAYBE('=', T_PERCENTEQUAL)
1219                                 case ':':
1220                                         MAYBE_PROLOG
1221                                                 case '%':
1222                                                         MAYBE_PROLOG
1223                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1224                                                         ELSE_CODE(
1225                                                                 put_back(c);
1226                                                                 c = '%';
1227                                                                 lexer_token.type = T_PERCENTCOLON;
1228                                                                 return;
1229                                                         )
1230                                         ELSE(T_PERCENTCOLON)
1231                         ELSE('%')
1232                 case '<':
1233                         MAYBE_PROLOG
1234                         MAYBE(':', T_LESSCOLON)
1235                         MAYBE('%', T_LESSPERCENT)
1236                         MAYBE('=', T_LESSEQUAL)
1237                                 case '<':
1238                                         MAYBE_PROLOG
1239                                         MAYBE('=', T_LESSLESSEQUAL)
1240                                         ELSE(T_LESSLESS)
1241                         ELSE('<')
1242                 case '>':
1243                         MAYBE_PROLOG
1244                         MAYBE('=', T_GREATEREQUAL)
1245                                 case '>':
1246                                         MAYBE_PROLOG
1247                                         MAYBE('=', T_GREATERGREATEREQUAL)
1248                                         ELSE(T_GREATERGREATER)
1249                         ELSE('>')
1250                 case '^':
1251                         MAYBE_PROLOG
1252                         MAYBE('=', T_CARETEQUAL)
1253                         ELSE('^')
1254                 case '|':
1255                         MAYBE_PROLOG
1256                         MAYBE('=', T_PIPEEQUAL)
1257                         MAYBE('|', T_PIPEPIPE)
1258                         ELSE('|')
1259                 case ':':
1260                         MAYBE_PROLOG
1261                         MAYBE('>', T_COLONGREATER)
1262                         ELSE(':')
1263                 case '=':
1264                         MAYBE_PROLOG
1265                         MAYBE('=', T_EQUALEQUAL)
1266                         ELSE('=')
1267                 case '#':
1268                         MAYBE_PROLOG
1269                         MAYBE('#', T_HASHHASH)
1270                         ELSE('#')
1271
1272                 case '?':
1273                 case '[':
1274                 case ']':
1275                 case '(':
1276                 case ')':
1277                 case '{':
1278                 case '}':
1279                 case '~':
1280                 case ';':
1281                 case ',':
1282                 case '\\':
1283                         lexer_token.type = c;
1284                         next_char();
1285                         return;
1286
1287                 case EOF:
1288                         lexer_token.type = T_EOF;
1289                         return;
1290
1291                 default:
1292                         next_char();
1293                         error_prefix();
1294                         fprintf(stderr, "unknown character '%c' found\n", c);
1295                         lexer_token.type = T_ERROR;
1296                         return;
1297                 }
1298         }
1299 }
1300
1301 void lexer_next_token(void)
1302 {
1303         lexer_next_preprocessing_token();
1304         if(lexer_token.type != '\n')
1305                 return;
1306
1307 newline_found:
1308         do {
1309                 lexer_next_preprocessing_token();
1310         } while(lexer_token.type == '\n');
1311
1312         if(lexer_token.type == '#') {
1313                 parse_preprocessor_directive();
1314                 goto newline_found;
1315         }
1316 }
1317
1318 void init_lexer(void)
1319 {
1320         strset_init(&stringset);
1321 }
1322
1323 void lexer_open_stream(FILE *stream, const char *input_name)
1324 {
1325         input                                  = stream;
1326         lexer_token.source_position.linenr     = 0;
1327         lexer_token.source_position.input_name = input_name;
1328
1329         symbol_L = symbol_table_insert("L");
1330         bufpos = NULL;
1331         bufend = NULL;
1332
1333         /* place a virtual \n at the beginning so the lexer knows that we're
1334          * at the beginning of a line */
1335         c = '\n';
1336 }
1337
1338 void exit_lexer(void)
1339 {
1340         strset_destroy(&stringset);
1341 }
1342
1343 static __attribute__((unused))
1344 void dbg_pos(const source_position_t source_position)
1345 {
1346         fprintf(stdout, "%s:%u\n", source_position.input_name,
1347                 source_position.linenr);
1348         fflush(stdout);
1349 }