nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "diagnostic.h"
   4 #include "lexer.h"
   5 #include "token_t.h"
   6 #include "symbol_table_t.h"
   7 #include "adt/error.h"
   8 #include "adt/strset.h"
   9 #include "adt/util.h"
  10 #include "types.h"
  11 #include "type_t.h"
  12 #include "target_architecture.h"
  13 #include "parser.h"
  14
  15 #include <assert.h>
  16 #include <errno.h>
  17 #include <string.h>
  18 #include <stdbool.h>
  19 #include <ctype.h>
  20
  21 //#define DEBUG_CHARS
  22 #define MAX_PUTBACK 3
  23
  24 #ifdef _WIN32
  25 /* No strtold on windows and no replacement yet */
  26 #define strtold(s, e) strtod(s, e)
  27 #endif
  28
  29 #if defined HAS_SIGNED_CHAR
  30 typedef signed char char_type;
  31 #elif defined HAS_UNSIGNED_CHAR
  32 typedef unsigned char char_type;
  33 #else
  34 #       error signedness of char not determined
  35 #endif
  36
  37 static int         c;
  38 token_t            lexer_token;
  39 symbol_t          *symbol_L;
  40 static FILE       *input;
  41 static char        buf[1024 + MAX_PUTBACK];
  42 static const char *bufend;
  43 static const char *bufpos;
  44 static strset_t    stringset;
  45
  46 static void error_prefix_at(const char *input_name, unsigned linenr)
  47 {
  48         fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
  49 }
  50
  51 static void error_prefix(void)
  52 {
  53         error_prefix_at(lexer_token.source_position.input_name,
  54                         lexer_token.source_position.linenr);
  55 }
  56
  57 static void parse_error(const char *msg)
  58 {
  59         error_prefix();
  60         fprintf(stderr, "%s\n", msg);
  61 }
  62
  63 static inline void next_real_char(void)
  64 {
  65         bufpos++;
  66         if(bufpos >= bufend) {
  67                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  68                                  input);
  69                 if(s == 0) {
  70                         c = EOF;
  71                         return;
  72                 }
  73                 bufpos = buf + MAX_PUTBACK;
  74                 bufend = buf + MAX_PUTBACK + s;
  75         }
  76         c = *(bufpos);
  77 }
  78
  79 static inline void put_back(int pc)
  80 {
  81         assert(bufpos >= buf);
  82         //assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
  83
  84         char *p = buf + (bufpos - buf);
  85         *p = (char) pc;
  86
  87         /* going backwards in the buffer is legal as long as it's not more often
  88          * than MAX_PUTBACK */
  89         bufpos--;
  90
  91 #ifdef DEBUG_CHARS
  92         printf("putback '%c'\n", pc);
  93 #endif
  94 }
  95
  96 static inline void next_char(void);
  97
  98 #define MATCH_NEWLINE(code)                   \
  99         case '\r':                                \
 100                 next_char();                          \
 101                 if(c == '\n') {                       \
 102                         next_char();                      \
 103                 }                                     \
 104                 lexer_token.source_position.linenr++; \
 105                 code;                                 \
 106         case '\n':                                \
 107                 next_char();                          \
 108                 lexer_token.source_position.linenr++; \
 109                 code;
 110
 111 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 112
 113 static void maybe_concat_lines(void)
 114 {
 115         eat('\\');
 116
 117         switch(c) {
 118         MATCH_NEWLINE(return;)
 119
 120         default:
 121                 break;
 122         }
 123
 124         put_back(c);
 125         c = '\\';
 126 }
 127
 128 static inline void next_char(void)
 129 {
 130         next_real_char();
 131
 132         /* filter trigraphs */
 133         if(UNLIKELY(c == '\\')) {
 134                 maybe_concat_lines();
 135                 goto end_of_next_char;
 136         }
 137
 138         if(LIKELY(c != '?'))
 139                 goto end_of_next_char;
 140
 141         next_real_char();
 142         if(LIKELY(c != '?')) {
 143                 put_back(c);
 144                 c = '?';
 145                 goto end_of_next_char;
 146         }
 147
 148         next_real_char();
 149         switch(c) {
 150         case '=': c = '#'; break;
 151         case '(': c = '['; break;
 152         case '/': c = '\\'; maybe_concat_lines(); break;
 153         case ')': c = ']'; break;
 154         case '\'': c = '^'; break;
 155         case '<': c = '{'; break;
 156         case '!': c = '|'; break;
 157         case '>': c = '}'; break;
 158         case '-': c = '~'; break;
 159         default:
 160                 put_back(c);
 161                 put_back('?');
 162                 c = '?';
 163                 break;
 164         }
 165
 166 end_of_next_char:;
 167 #ifdef DEBUG_CHARS
 168         printf("nchar '%c'\n", c);
 169 #endif
 170 }
 171
 172 #define SYMBOL_CHARS  \
 173         case 'a':         \
 174         case 'b':         \
 175         case 'c':         \
 176         case 'd':         \
 177         case 'e':         \
 178         case 'f':         \
 179         case 'g':         \
 180         case 'h':         \
 181         case 'i':         \
 182         case 'j':         \
 183         case 'k':         \
 184         case 'l':         \
 185         case 'm':         \
 186         case 'n':         \
 187         case 'o':         \
 188         case 'p':         \
 189         case 'q':         \
 190         case 'r':         \
 191         case 's':         \
 192         case 't':         \
 193         case 'u':         \
 194         case 'v':         \
 195         case 'w':         \
 196         case 'x':         \
 197         case 'y':         \
 198         case 'z':         \
 199         case 'A':         \
 200         case 'B':         \
 201         case 'C':         \
 202         case 'D':         \
 203         case 'E':         \
 204         case 'F':         \
 205         case 'G':         \
 206         case 'H':         \
 207         case 'I':         \
 208         case 'J':         \
 209         case 'K':         \
 210         case 'L':         \
 211         case 'M':         \
 212         case 'N':         \
 213         case 'O':         \
 214         case 'P':         \
 215         case 'Q':         \
 216         case 'R':         \
 217         case 'S':         \
 218         case 'T':         \
 219         case 'U':         \
 220         case 'V':         \
 221         case 'W':         \
 222         case 'X':         \
 223         case 'Y':         \
 224         case 'Z':         \
 225         case '_':
 226
 227 #define DIGITS        \
 228         case '0':         \
 229         case '1':         \
 230         case '2':         \
 231         case '3':         \
 232         case '4':         \
 233         case '5':         \
 234         case '6':         \
 235         case '7':         \
 236         case '8':         \
 237         case '9':
 238
 239 static void parse_symbol(void)
 240 {
 241         symbol_t *symbol;
 242         char     *string;
 243
 244         obstack_1grow(&symbol_obstack, (char) c);
 245         next_char();
 246
 247         while(1) {
 248                 switch(c) {
 249                 DIGITS
 250                 SYMBOL_CHARS
 251                         obstack_1grow(&symbol_obstack, (char) c);
 252                         next_char();
 253                         break;
 254
 255                 default:
 256                         goto end_symbol;
 257                 }
 258         }
 259
 260 end_symbol:
 261         obstack_1grow(&symbol_obstack, '\0');
 262
 263         string = obstack_finish(&symbol_obstack);
 264         symbol = symbol_table_insert(string);
 265
 266         lexer_token.type     = symbol->ID;
 267         lexer_token.v.symbol = symbol;
 268
 269         if(symbol->string != string) {
 270                 obstack_free(&symbol_obstack, string);
 271         }
 272 }
 273
 274 static void parse_integer_suffix(bool is_oct_hex)
 275 {
 276         bool is_unsigned  = false;
 277         bool min_long     = false;
 278         bool min_longlong = false;
 279
 280         if(c == 'U' || c == 'u') {
 281                 is_unsigned = true;
 282                 next_char();
 283                 if(c == 'L' || c == 'l') {
 284                         min_long = true;
 285                         next_char();
 286                         if(c == 'L' || c == 'l') {
 287                                 min_longlong = true;
 288                                 next_char();
 289                         }
 290                 }
 291         } else if(c == 'l' || c == 'L') {
 292                 min_long = true;
 293                 next_char();
 294                 if(c == 'l' || c == 'L') {
 295                         min_longlong = true;
 296                         next_char();
 297                         if(c == 'u' || c == 'U') {
 298                                 is_unsigned = true;
 299                                 next_char();
 300                         }
 301                 } else if(c == 'u' || c == 'U') {
 302                         is_unsigned = true;
 303                         next_char();
 304                         lexer_token.datatype = type_unsigned_long;
 305                 }
 306         }
 307
 308         if(!is_unsigned) {
 309                 long long v = lexer_token.v.intvalue;
 310                 if(!min_long) {
 311                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 312                                 lexer_token.datatype = type_int;
 313                                 return;
 314                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 315                                 lexer_token.datatype = type_unsigned_int;
 316                                 return;
 317                         }
 318                 }
 319                 if(!min_longlong) {
 320                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 321                                 lexer_token.datatype = type_long;
 322                                 return;
 323                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
 324                                 lexer_token.datatype = type_unsigned_long;
 325                                 return;
 326                         }
 327                 }
 328                 unsigned long long uv = (unsigned long long) v;
 329                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 330                         lexer_token.datatype = type_unsigned_long_long;
 331                         return;
 332                 }
 333
 334                 lexer_token.datatype = type_long_long;
 335         } else {
 336                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 337                 if(!min_long && v <= TARGET_UINT_MAX) {
 338                         lexer_token.datatype = type_unsigned_int;
 339                         return;
 340                 }
 341                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
 342                         lexer_token.datatype = type_unsigned_long;
 343                         return;
 344                 }
 345                 lexer_token.datatype = type_unsigned_long_long;
 346         }
 347 }
 348
 349 static void parse_floating_suffix(void)
 350 {
 351         switch(c) {
 352         /* TODO: do something usefull with the suffixes... */
 353         case 'f':
 354         case 'F':
 355                 next_char();
 356                 lexer_token.datatype = type_float;
 357                 break;
 358         case 'l':
 359         case 'L':
 360                 next_char();
 361                 lexer_token.datatype = type_long_double;
 362                 break;
 363         default:
 364                 lexer_token.datatype = type_double;
 365                 break;
 366         }
 367 }
 368
 369 /**
 370  * A replacement for strtoull. Only those parts needed for
 371  * our parser are implemented.
 372  */
 373 static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
 374         unsigned long long v = 0;
 375
 376         switch (base) {
 377         case 16:
 378                 for (;; ++s) {
 379                         /* check for overrun */
 380                         if (v >= 0x1000000000000000ULL)
 381                                 break;
 382                         switch (tolower(*s)) {
 383                         case '0': v <<= 4; break;
 384                         case '1': v <<= 4; v |= 0x1; break;
 385                         case '2': v <<= 4; v |= 0x2; break;
 386                         case '3': v <<= 4; v |= 0x3; break;
 387                         case '4': v <<= 4; v |= 0x4; break;
 388                         case '5': v <<= 4; v |= 0x5; break;
 389                         case '6': v <<= 4; v |= 0x6; break;
 390                         case '7': v <<= 4; v |= 0x7; break;
 391                         case '8': v <<= 4; v |= 0x8; break;
 392                         case '9': v <<= 4; v |= 0x9; break;
 393                         case 'a': v <<= 4; v |= 0xa; break;
 394                         case 'b': v <<= 4; v |= 0xb; break;
 395                         case 'c': v <<= 4; v |= 0xc; break;
 396                         case 'd': v <<= 4; v |= 0xd; break;
 397                         case 'e': v <<= 4; v |= 0xe; break;
 398                         case 'f': v <<= 4; v |= 0xf; break;
 399                         default:
 400                                 goto end;
 401                         }
 402                 }
 403                 break;
 404         case 8:
 405                 for (;; ++s) {
 406                         /* check for overrun */
 407                         if (v >= 0x2000000000000000ULL)
 408                                 break;
 409                         switch (tolower(*s)) {
 410                         case '0': v <<= 3; break;
 411                         case '1': v <<= 3; v |= 1; break;
 412                         case '2': v <<= 3; v |= 2; break;
 413                         case '3': v <<= 3; v |= 3; break;
 414                         case '4': v <<= 3; v |= 4; break;
 415                         case '5': v <<= 3; v |= 5; break;
 416                         case '6': v <<= 3; v |= 6; break;
 417                         case '7': v <<= 3; v |= 7; break;
 418                         default:
 419                                 goto end;
 420                         }
 421                 }
 422                 break;
 423         case 10:
 424                 for (;; ++s) {
 425                         /* check for overrun */
 426                         if (v > 0x1999999999999999ULL)
 427                                 break;
 428                         switch (tolower(*s)) {
 429                         case '0': v *= 10; break;
 430                         case '1': v *= 10; v += 1; break;
 431                         case '2': v *= 10; v += 2; break;
 432                         case '3': v *= 10; v += 3; break;
 433                         case '4': v *= 10; v += 4; break;
 434                         case '5': v *= 10; v += 5; break;
 435                         case '6': v *= 10; v += 6; break;
 436                         case '7': v *= 10; v += 7; break;
 437                         case '8': v *= 10; v += 8; break;
 438                         case '9': v *= 10; v += 9; break;
 439                         default:
 440                                 goto end;
 441                         }
 442                 }
 443                 break;
 444         default:
 445                 assert(0);
 446                 break;
 447         }
 448 end:
 449         *endptr = s;
 450         return v;
 451 }
 452
 453 static void parse_number_hex(void)
 454 {
 455         assert(c == 'x' || c == 'X');
 456         next_char();
 457
 458         while(isxdigit(c)) {
 459                 obstack_1grow(&symbol_obstack, (char) c);
 460                 next_char();
 461         }
 462         obstack_1grow(&symbol_obstack, '\0');
 463         char *string = obstack_finish(&symbol_obstack);
 464
 465         if(c == '.' || c == 'p' || c == 'P') {
 466                 next_char();
 467                 panic("Hex floating point numbers not implemented yet");
 468         }
 469         if(*string == '\0') {
 470                 parse_error("invalid hex number");
 471                 lexer_token.type = T_ERROR;
 472         }
 473
 474         const char *endptr;
 475         lexer_token.type       = T_INTEGER;
 476         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
 477         if(*endptr != '\0') {
 478                 parse_error("hex number literal too long");
 479         }
 480
 481         obstack_free(&symbol_obstack, string);
 482         parse_integer_suffix(true);
 483 }
 484
 485 static inline bool is_octal_digit(int chr)
 486 {
 487         return '0' <= chr && chr <= '7';
 488 }
 489
 490 static void parse_number_oct(void)
 491 {
 492         while(is_octal_digit(c)) {
 493                 obstack_1grow(&symbol_obstack, (char) c);
 494                 next_char();
 495         }
 496         obstack_1grow(&symbol_obstack, '\0');
 497         char *string = obstack_finish(&symbol_obstack);
 498
 499         const char *endptr;
 500         lexer_token.type       = T_INTEGER;
 501         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 502         if(*endptr != '\0') {
 503                 parse_error("octal number literal too long");
 504         }
 505
 506         obstack_free(&symbol_obstack, string);
 507         parse_integer_suffix(true);
 508 }
 509
 510 static void parse_number_dec(void)
 511 {
 512         bool is_float = false;
 513         while(isdigit(c)) {
 514                 obstack_1grow(&symbol_obstack, (char) c);
 515                 next_char();
 516         }
 517
 518         if(c == '.') {
 519                 obstack_1grow(&symbol_obstack, '.');
 520                 next_char();
 521
 522                 while(isdigit(c)) {
 523                         obstack_1grow(&symbol_obstack, (char) c);
 524                         next_char();
 525                 }
 526                 is_float = true;
 527         }
 528         if(c == 'e' || c == 'E') {
 529                 obstack_1grow(&symbol_obstack, 'e');
 530                 next_char();
 531
 532                 if(c == '-' || c == '+') {
 533                         obstack_1grow(&symbol_obstack, (char) c);
 534                         next_char();
 535                 }
 536
 537                 while(isdigit(c)) {
 538                         obstack_1grow(&symbol_obstack, (char) c);
 539                         next_char();
 540                 }
 541                 is_float = true;
 542         }
 543
 544         obstack_1grow(&symbol_obstack, '\0');
 545         char *string = obstack_finish(&symbol_obstack);
 546
 547         if(is_float) {
 548                 char *endptr;
 549                 lexer_token.type         = T_FLOATINGPOINT;
 550                 lexer_token.v.floatvalue = strtold(string, &endptr);
 551
 552                 if(*endptr != '\0') {
 553                         parse_error("invalid number literal");
 554                 }
 555
 556                 parse_floating_suffix();
 557         } else {
 558                 const char *endptr;
 559                 lexer_token.type       = T_INTEGER;
 560                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 561
 562                 if(*endptr != '\0') {
 563                         parse_error("invalid number literal");
 564                 }
 565
 566                 parse_integer_suffix(false);
 567         }
 568         obstack_free(&symbol_obstack, string);
 569 }
 570
 571 static void parse_number(void)
 572 {
 573         if (c == '0') {
 574                 next_char();
 575                 switch (c) {
 576                         case 'X':
 577                         case 'x':
 578                                 parse_number_hex();
 579                                 break;
 580                         case '0':
 581                         case '1':
 582                         case '2':
 583                         case '3':
 584                         case '4':
 585                         case '5':
 586                         case '6':
 587                         case '7':
 588                                 parse_number_oct();
 589                                 break;
 590                         case '8':
 591                         case '9':
 592                                 next_char();
 593                                 parse_error("invalid octal number");
 594                                 lexer_token.type = T_ERROR;
 595                                 return;
 596                         case '.':
 597                         case 'e':
 598                         case 'E':
 599                         default:
 600                                 obstack_1grow(&symbol_obstack, '0');
 601                                 parse_number_dec();
 602                                 return;
 603                 }
 604         } else {
 605                 parse_number_dec();
 606         }
 607 }
 608
 609 static int parse_octal_sequence(const int first_digit)
 610 {
 611         assert(is_octal_digit(first_digit));
 612         int value = first_digit - '0';
 613         if (!is_octal_digit(c)) return value;
 614         value = 8 * value + c - '0';
 615         next_char();
 616         if (!is_octal_digit(c)) return value;
 617         value = 8 * value + c - '0';
 618         next_char();
 619         return (char_type)value;
 620 }
 621
 622 static int parse_hex_sequence(void)
 623 {
 624         int value = 0;
 625         while(1) {
 626                 if (c >= '0' && c <= '9') {
 627                         value = 16 * value + c - '0';
 628                 } else if ('A' <= c && c <= 'F') {
 629                         value = 16 * value + c - 'A' + 10;
 630                 } else if ('a' <= c && c <= 'f') {
 631                         value = 16 * value + c - 'a' + 10;
 632                 } else {
 633                         break;
 634                 }
 635                 next_char();
 636         }
 637
 638         return (char_type)value;
 639 }
 640
 641 static int parse_escape_sequence(void)
 642 {
 643         eat('\\');
 644
 645         int ec = c;
 646         next_char();
 647
 648         switch(ec) {
 649         case '"':  return '"';
 650         case '\'': return '\'';
 651         case '\\': return '\\';
 652         case '?': return '\?';
 653         case 'a': return '\a';
 654         case 'b': return '\b';
 655         case 'f': return '\f';
 656         case 'n': return '\n';
 657         case 'r': return '\r';
 658         case 't': return '\t';
 659         case 'v': return '\v';
 660         case 'x':
 661                 return parse_hex_sequence();
 662         case '0':
 663         case '1':
 664         case '2':
 665         case '3':
 666         case '4':
 667         case '5':
 668         case '6':
 669         case '7':
 670                 return parse_octal_sequence(ec);
 671         case EOF:
 672                 parse_error("reached end of file while parsing escape sequence");
 673                 return EOF;
 674         default:
 675                 parse_error("unknown escape sequence");
 676                 return EOF;
 677         }
 678 }
 679
 680 const char *concat_strings(const char *s1, const char *s2)
 681 {
 682         size_t  len1   = strlen(s1);
 683         size_t  len2   = strlen(s2);
 684
 685         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 686         memcpy(concat, s1, len1);
 687         memcpy(concat + len1, s2, len2 + 1);
 688
 689         const char *result = strset_insert(&stringset, concat);
 690         if(result != concat) {
 691                 obstack_free(&symbol_obstack, concat);
 692         }
 693
 694         return result;
 695 }
 696
 697 static void parse_string_literal(void)
 698 {
 699         unsigned    start_linenr = lexer_token.source_position.linenr;
 700         char       *string;
 701         const char *result;
 702
 703         assert(c == '"');
 704         next_char();
 705
 706         int tc;
 707         while(1) {
 708                 switch(c) {
 709                 case '\\':
 710                         tc = parse_escape_sequence();
 711                         obstack_1grow(&symbol_obstack, (char) tc);
 712                         break;
 713
 714                 case EOF:
 715                         error_prefix_at(lexer_token.source_position.input_name,
 716                                         start_linenr);
 717                         fprintf(stderr, "string has no end\n");
 718                         lexer_token.type = T_ERROR;
 719                         return;
 720
 721                 case '"':
 722                         next_char();
 723                         goto end_of_string;
 724
 725                 default:
 726                         obstack_1grow(&symbol_obstack, (char) c);
 727                         next_char();
 728                         break;
 729                 }
 730         }
 731
 732 end_of_string:
 733
 734         /* TODO: concatenate multiple strings separated by whitespace... */
 735
 736         /* add finishing 0 to the string */
 737         obstack_1grow(&symbol_obstack, '\0');
 738         string = obstack_finish(&symbol_obstack);
 739
 740         /* check if there is already a copy of the string */
 741         result = strset_insert(&stringset, string);
 742         if(result != string) {
 743                 obstack_free(&symbol_obstack, string);
 744         }
 745
 746         lexer_token.type     = T_STRING_LITERAL;
 747         lexer_token.v.string = result;
 748 }
 749
 750 static void parse_wide_character_constant(void)
 751 {
 752         eat('\'');
 753
 754         int found_char = 0;
 755         while(1) {
 756                 switch(c) {
 757                 case '\\':
 758                         found_char = parse_escape_sequence();
 759                         break;
 760
 761                 MATCH_NEWLINE(
 762                         parse_error("newline while parsing character constant");
 763                         break;
 764                 )
 765
 766                 case '\'':
 767                         next_char();
 768                         goto end_of_wide_char_constant;
 769
 770                 case EOF:
 771                         parse_error("EOF while parsing character constant");
 772                         lexer_token.type = T_ERROR;
 773                         return;
 774
 775                 default:
 776                         if(found_char != 0) {
 777                                 parse_error("more than 1 characters in character "
 778                                             "constant");
 779                                 goto end_of_wide_char_constant;
 780                         } else {
 781                                 found_char = c;
 782                                 next_char();
 783                         }
 784                         break;
 785                 }
 786         }
 787
 788 end_of_wide_char_constant:
 789         lexer_token.type       = T_INTEGER;
 790         lexer_token.v.intvalue = found_char;
 791         lexer_token.datatype   = type_wchar_t;
 792 }
 793
 794 static void parse_wide_string_literal(void)
 795 {
 796         const unsigned start_linenr = lexer_token.source_position.linenr;
 797
 798         assert(c == '"');
 799         next_char();
 800
 801         while(1) {
 802                 switch(c) {
 803                         case '\\': {
 804                                 wchar_rep_t tc = parse_escape_sequence();
 805                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 806                                 break;
 807                         }
 808
 809                         case EOF:
 810                                 error_prefix_at(lexer_token.source_position.input_name,
 811                                                 start_linenr);
 812                                 fprintf(stderr, "string has no end\n");
 813                                 lexer_token.type = T_ERROR;
 814                                 return;
 815
 816                         case '"':
 817                                 next_char();
 818                                 goto end_of_string;
 819
 820                         default: {
 821                                 wchar_rep_t tc = c;
 822                                 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 823                                 next_char();
 824                                 break;
 825                         }
 826                 }
 827         }
 828
 829 end_of_string:;
 830
 831         /* TODO: concatenate multiple strings separated by whitespace... */
 832
 833         /* add finishing 0 to the string */
 834         wchar_rep_t nul = L'\0';
 835         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
 836         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
 837         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
 838
 839 #if 0 /* TODO hash */
 840         /* check if there is already a copy of the string */
 841         const wchar_rep_t *const result = strset_insert(&stringset, string);
 842         if(result != string) {
 843                 obstack_free(&symbol_obstack, string);
 844         }
 845 #else
 846         const wchar_rep_t *const result = string;
 847 #endif
 848
 849         lexer_token.type                = T_WIDE_STRING_LITERAL;
 850         lexer_token.v.wide_string.begin = result;
 851         lexer_token.v.wide_string.size  = size;
 852 }
 853
 854 static void parse_character_constant(void)
 855 {
 856         eat('\'');
 857
 858         int found_char = 0;
 859         while(1) {
 860                 switch(c) {
 861                 case '\\':
 862                         found_char = parse_escape_sequence();
 863                         break;
 864
 865                 MATCH_NEWLINE(
 866                         parse_error("newline while parsing character constant");
 867                         break;
 868                 )
 869
 870                 case '\'':
 871                         next_char();
 872                         goto end_of_char_constant;
 873
 874                 case EOF:
 875                         parse_error("EOF while parsing character constant");
 876                         lexer_token.type = T_ERROR;
 877                         return;
 878
 879                 default:
 880                         if(found_char != 0) {
 881                                 parse_error("more than 1 characters in character "
 882                                             "constant");
 883                                 goto end_of_char_constant;
 884                         } else {
 885                                 found_char = c;
 886                                 next_char();
 887                         }
 888                         break;
 889                 }
 890         }
 891
 892 end_of_char_constant:
 893         lexer_token.type       = T_INTEGER;
 894         lexer_token.v.intvalue = found_char;
 895         lexer_token.datatype   = type_int;
 896 }
 897
 898 static void skip_multiline_comment(void)
 899 {
 900         unsigned start_linenr = lexer_token.source_position.linenr;
 901
 902         while(1) {
 903                 switch(c) {
 904                 case '*':
 905                         next_char();
 906                         if(c == '/') {
 907                                 next_char();
 908                                 return;
 909                         }
 910                         break;
 911
 912                 MATCH_NEWLINE(break;)
 913
 914                 case EOF:
 915                         error_prefix_at(lexer_token.source_position.input_name,
 916                                         start_linenr);
 917                         fprintf(stderr, "at end of file while looking for comment end\n");
 918                         return;
 919
 920                 default:
 921                         next_char();
 922                         break;
 923                 }
 924         }
 925 }
 926
 927 static void skip_line_comment(void)
 928 {
 929         while(1) {
 930                 switch(c) {
 931                 case EOF:
 932                         return;
 933
 934                 case '\n':
 935                 case '\r':
 936                         return;
 937
 938                 default:
 939                         next_char();
 940                         break;
 941                 }
 942         }
 943 }
 944
 945 static token_t pp_token;
 946
 947 static inline void next_pp_token(void)
 948 {
 949         lexer_next_preprocessing_token();
 950         pp_token = lexer_token;
 951 }
 952
 953 static void eat_until_newline(void)
 954 {
 955         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
 956                 next_pp_token();
 957         }
 958 }
 959
 960 static void error_directive(void)
 961 {
 962         error_prefix();
 963         fprintf(stderr, "#error directive: \n");
 964
 965         /* parse pp-tokens until new-line */
 966 }
 967
 968 static void define_directive(void)
 969 {
 970         lexer_next_preprocessing_token();
 971         if(lexer_token.type != T_IDENTIFIER) {
 972                 parse_error("expected identifier after #define\n");
 973                 eat_until_newline();
 974         }
 975 }
 976
 977 static void ifdef_directive(int is_ifndef)
 978 {
 979         (void) is_ifndef;
 980         lexer_next_preprocessing_token();
 981         //expect_identifier();
 982         //extect_newline();
 983 }
 984
 985 static void endif_directive(void)
 986 {
 987         //expect_newline();
 988 }
 989
 990 static void parse_line_directive(void)
 991 {
 992         if(pp_token.type != T_INTEGER) {
 993                 parse_error("expected integer");
 994         } else {
 995                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
 996                 next_pp_token();
 997         }
 998         if(pp_token.type == T_STRING_LITERAL) {
 999                 lexer_token.source_position.input_name = pp_token.v.string;
1000                 next_pp_token();
1001         }
1002
1003         eat_until_newline();
1004 }
1005
1006 static void parse_preprocessor_identifier(void)
1007 {
1008         assert(pp_token.type == T_IDENTIFIER);
1009         symbol_t *symbol = pp_token.v.symbol;
1010
1011         switch(symbol->pp_ID) {
1012         case TP_include:
1013                 printf("include - enable header name parsing!\n");
1014                 break;
1015         case TP_define:
1016                 define_directive();
1017                 break;
1018         case TP_ifdef:
1019                 ifdef_directive(0);
1020                 break;
1021         case TP_ifndef:
1022                 ifdef_directive(1);
1023                 break;
1024         case TP_endif:
1025                 endif_directive();
1026                 break;
1027         case TP_line:
1028                 next_pp_token();
1029                 parse_line_directive();
1030                 break;
1031         case TP_if:
1032         case TP_else:
1033         case TP_elif:
1034         case TP_undef:
1035         case TP_error:
1036                 error_directive();
1037                 break;
1038         case TP_pragma:
1039                 warningf(lexer_token.source_position, "encountered unknown #pragma");
1040                 eat_until_newline();
1041                 break;
1042         }
1043 }
1044
1045 static void parse_preprocessor_directive(void)
1046 {
1047         next_pp_token();
1048
1049         switch(pp_token.type) {
1050         case T_IDENTIFIER:
1051                 parse_preprocessor_identifier();
1052                 break;
1053         case T_INTEGER:
1054                 parse_line_directive();
1055                 break;
1056         default:
1057                 parse_error("invalid preprocessor directive");
1058                 eat_until_newline();
1059                 break;
1060         }
1061 }
1062
/*
 * Helper macros for lexing punctuators that share a first character.
 * They are meant to be used together inside a case of an outer switch(c):
 *
 *     case '&':
 *             MAYBE_PROLOG
 *             MAYBE('&', T_ANDAND)
 *             MAYBE('=', T_ANDEQUAL)
 *             ELSE('&')
 */

/* Consumes the first character and opens a nested while/switch on the
 * character that follows it. */
#define MAYBE_PROLOG     \
        next_char();     \
        while(1) {       \
                switch(c) {

/* If the lookahead is `ch`: consume it, set the token type, and return. */
#define MAYBE(ch, set_type)                  \
        case ch:                             \
                next_char();                 \
                lexer_token.type = set_type; \
                return;

/* Runs `code` for any other lookahead, then closes the switch and loop
 * opened by MAYBE_PROLOG and breaks out of the enclosing case. */
#define ELSE_CODE(code)                 \
        default:                        \
                code;                   \
                }                       \
        } /* end of while(1) */         \
        break;

/* Fallback: set the token type without consuming the lookahead, return. */
#define ELSE(set_type)                       \
        ELSE_CODE(                           \
                lexer_token.type = set_type; \
                return;                      \
        )
1086
1087 void lexer_next_preprocessing_token(void)
1088 {
1089         while(1) {
1090                 switch(c) {
1091                 case ' ':
1092                 case '\t':
1093                         next_char();
1094                         break;
1095
1096                 MATCH_NEWLINE(
1097                         lexer_token.type = '\n';
1098                         return;
1099                 )
1100
1101                 SYMBOL_CHARS
1102                         parse_symbol();
1103                         /* might be a wide string ( L"string" ) */
1104                         if(lexer_token.type == T_IDENTIFIER &&
1105                             lexer_token.v.symbol == symbol_L) {
1106                             if(c == '"') {
1107                                         parse_wide_string_literal();
1108                                 } else if(c == '\'') {
1109                                         parse_wide_character_constant();
1110                                 }
1111                         }
1112                         return;
1113
1114                 DIGITS
1115                         parse_number();
1116                         return;
1117
1118                 case '"':
1119                         parse_string_literal();
1120                         return;
1121
1122                 case '\'':
1123                         parse_character_constant();
1124                         return;
1125
1126                 case '.':
1127                         MAYBE_PROLOG
1128                                 case '0':
1129                                 case '1':
1130                                 case '2':
1131                                 case '3':
1132                                 case '4':
1133                                 case '5':
1134                                 case '6':
1135                                 case '7':
1136                                 case '8':
1137                                 case '9':
1138                                         put_back(c);
1139                                         c = '.';
1140                                         parse_number_dec();
1141                                         return;
1142
1143                                 case '.':
1144                                         MAYBE_PROLOG
1145                                         MAYBE('.', T_DOTDOTDOT)
1146                                         ELSE_CODE(
1147                                                 put_back(c);
1148                                                 c = '.';
1149                                                 lexer_token.type = '.';
1150                                                 return;
1151                                         )
1152                         ELSE('.')
1153                 case '&':
1154                         MAYBE_PROLOG
1155                         MAYBE('&', T_ANDAND)
1156                         MAYBE('=', T_ANDEQUAL)
1157                         ELSE('&')
1158                 case '*':
1159                         MAYBE_PROLOG
1160                         MAYBE('=', T_ASTERISKEQUAL)
1161                         ELSE('*')
1162                 case '+':
1163                         MAYBE_PROLOG
1164                         MAYBE('+', T_PLUSPLUS)
1165                         MAYBE('=', T_PLUSEQUAL)
1166                         ELSE('+')
1167                 case '-':
1168                         MAYBE_PROLOG
1169                         MAYBE('>', T_MINUSGREATER)
1170                         MAYBE('-', T_MINUSMINUS)
1171                         MAYBE('=', T_MINUSEQUAL)
1172                         ELSE('-')
1173                 case '!':
1174                         MAYBE_PROLOG
1175                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1176                         ELSE('!')
1177                 case '/':
1178                         MAYBE_PROLOG
1179                         MAYBE('=', T_SLASHEQUAL)
1180                                 case '*':
1181                                         next_char();
1182                                         skip_multiline_comment();
1183                                         lexer_next_preprocessing_token();
1184                                         return;
1185                                 case '/':
1186                                         next_char();
1187                                         skip_line_comment();
1188                                         lexer_next_preprocessing_token();
1189                                         return;
1190                         ELSE('/')
1191                 case '%':
1192                         MAYBE_PROLOG
1193                         MAYBE('>', T_PERCENTGREATER)
1194                         MAYBE('=', T_PERCENTEQUAL)
1195                                 case ':':
1196                                         MAYBE_PROLOG
1197                                                 case '%':
1198                                                         MAYBE_PROLOG
1199                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1200                                                         ELSE_CODE(
1201                                                                 put_back(c);
1202                                                                 c = '%';
1203                                                                 lexer_token.type = T_PERCENTCOLON;
1204                                                                 return;
1205                                                         )
1206                                         ELSE(T_PERCENTCOLON)
1207                         ELSE('%')
1208                 case '<':
1209                         MAYBE_PROLOG
1210                         MAYBE(':', T_LESSCOLON)
1211                         MAYBE('%', T_LESSPERCENT)
1212                         MAYBE('=', T_LESSEQUAL)
1213                                 case '<':
1214                                         MAYBE_PROLOG
1215                                         MAYBE('=', T_LESSLESSEQUAL)
1216                                         ELSE(T_LESSLESS)
1217                         ELSE('<')
1218                 case '>':
1219                         MAYBE_PROLOG
1220                         MAYBE('=', T_GREATEREQUAL)
1221                                 case '>':
1222                                         MAYBE_PROLOG
1223                                         MAYBE('=', T_GREATERGREATEREQUAL)
1224                                         ELSE(T_GREATERGREATER)
1225                         ELSE('>')
1226                 case '^':
1227                         MAYBE_PROLOG
1228                         MAYBE('=', T_CARETEQUAL)
1229                         ELSE('^')
1230                 case '|':
1231                         MAYBE_PROLOG
1232                         MAYBE('=', T_PIPEEQUAL)
1233                         MAYBE('|', T_PIPEPIPE)
1234                         ELSE('|')
1235                 case ':':
1236                         MAYBE_PROLOG
1237                         MAYBE('>', T_COLONGREATER)
1238                         ELSE(':')
1239                 case '=':
1240                         MAYBE_PROLOG
1241                         MAYBE('=', T_EQUALEQUAL)
1242                         ELSE('=')
1243                 case '#':
1244                         MAYBE_PROLOG
1245                         MAYBE('#', T_HASHHASH)
1246                         ELSE('#')
1247
1248                 case '?':
1249                 case '[':
1250                 case ']':
1251                 case '(':
1252                 case ')':
1253                 case '{':
1254                 case '}':
1255                 case '~':
1256                 case ';':
1257                 case ',':
1258                 case '\\':
1259                         lexer_token.type = c;
1260                         next_char();
1261                         return;
1262
1263                 case EOF:
1264                         lexer_token.type = T_EOF;
1265                         return;
1266
1267                 default:
1268                         next_char();
1269                         error_prefix();
1270                         fprintf(stderr, "unknown character '%c' found\n", c);
1271                         lexer_token.type = T_ERROR;
1272                         return;
1273                 }
1274         }
1275 }
1276
1277 void lexer_next_token(void)
1278 {
1279         lexer_next_preprocessing_token();
1280         if(lexer_token.type != '\n')
1281                 return;
1282
1283 newline_found:
1284         do {
1285                 lexer_next_preprocessing_token();
1286         } while(lexer_token.type == '\n');
1287
1288         if(lexer_token.type == '#') {
1289                 parse_preprocessor_directive();
1290                 goto newline_found;
1291         }
1292 }
1293
1294 void init_lexer(void)
1295 {
1296         strset_init(&stringset);
1297 }
1298
1299 void lexer_open_stream(FILE *stream, const char *input_name)
1300 {
1301         input                                  = stream;
1302         lexer_token.source_position.linenr     = 0;
1303         lexer_token.source_position.input_name = input_name;
1304
1305         symbol_L = symbol_table_insert("L");
1306
1307         /* place a virtual \n at the beginning so the lexer knows that we're
1308          * at the beginning of a line */
1309         c = '\n';
1310 }
1311
1312 void exit_lexer(void)
1313 {
1314         strset_destroy(&stringset);
1315 }
1316
1317 static __attribute__((unused))
1318 void dbg_pos(const source_position_t source_position)
1319 {
1320         fprintf(stdout, "%s:%u\n", source_position.input_name,
1321                 source_position.linenr);
1322         fflush(stdout);
1323 }