nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "diagnostic.h"
   4 #include "lexer.h"
   5 #include "token_t.h"
   6 #include "symbol_table_t.h"
   7 #include "adt/error.h"
   8 #include "adt/strset.h"
   9 #include "adt/util.h"
  10 #include "types.h"
  11 #include "type_t.h"
  12 #include "target_architecture.h"
  13 #include "parser.h"
  14 #include "warning.h"
  15
  16 #include <assert.h>
  17 #include <errno.h>
  18 #include <string.h>
  19 #include <stdbool.h>
  20 #include <ctype.h>
  21
  22 //#define DEBUG_CHARS
  23 #define MAX_PUTBACK 3
  24
  25 #ifdef _WIN32
  26 /* No strtold on windows and no replacement yet */
  27 #define strtold(s, e) strtod(s, e)
  28 #endif
  29
  30 #if defined HAS_SIGNED_CHAR
  31 typedef signed char char_type;
  32 #elif defined HAS_UNSIGNED_CHAR
  33 typedef unsigned char char_type;
  34 #else
  35 #       error signedness of char not determined
  36 #endif
  37
  38 static int         c;
  39 token_t            lexer_token;
  40 symbol_t          *symbol_L;
  41 static FILE       *input;
  42 static char        buf[1024 + MAX_PUTBACK];
  43 static const char *bufend;
  44 static const char *bufpos;
  45 static strset_t    stringset;
  46
  47 /**
  48  * Prints a parse error message at the current token.
  49  *
  50  * @param msg   the error message
  51  */
  52 static void parse_error(const char *msg)
  53 {
  54         errorf(lexer_token.source_position,  "%s", msg);
  55 }
  56
  57 static inline void next_real_char(void)
  58 {
  59         assert(bufpos <= bufend);
  60         if (bufpos >= bufend) {
  61                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  62                                  input);
  63                 if(s == 0) {
  64                         c = EOF;
  65                         return;
  66                 }
  67                 bufpos = buf + MAX_PUTBACK;
  68                 bufend = buf + MAX_PUTBACK + s;
  69         }
  70         c = *bufpos++;
  71 }
  72
  73 static inline void put_back(int pc)
  74 {
  75         assert(bufpos > buf);
  76         *(--bufpos - buf + buf) = (char) pc;
  77
  78 #ifdef DEBUG_CHARS
  79         printf("putback '%c'\n", pc);
  80 #endif
  81 }
  82
  83 static inline void next_char(void);
  84
  85 #define MATCH_NEWLINE(code)                   \
  86         case '\r':                                \
  87                 next_char();                          \
  88                 if(c == '\n') {                       \
  89                         next_char();                      \
  90                 }                                     \
  91                 lexer_token.source_position.linenr++; \
  92                 code                                  \
  93         case '\n':                                \
  94                 next_char();                          \
  95                 lexer_token.source_position.linenr++; \
  96                 code
  97
  98 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
  99
 100 static void maybe_concat_lines(void)
 101 {
 102         eat('\\');
 103
 104         switch(c) {
 105         MATCH_NEWLINE(return;)
 106
 107         default:
 108                 break;
 109         }
 110
 111         put_back(c);
 112         c = '\\';
 113 }
 114
 115 static inline void next_char(void)
 116 {
 117         next_real_char();
 118
 119         /* filter trigraphs */
 120         if(UNLIKELY(c == '\\')) {
 121                 maybe_concat_lines();
 122                 goto end_of_next_char;
 123         }
 124
 125         if(LIKELY(c != '?'))
 126                 goto end_of_next_char;
 127
 128         next_real_char();
 129         if(LIKELY(c != '?')) {
 130                 put_back(c);
 131                 c = '?';
 132                 goto end_of_next_char;
 133         }
 134
 135         next_real_char();
 136         switch(c) {
 137         case '=': c = '#'; break;
 138         case '(': c = '['; break;
 139         case '/': c = '\\'; maybe_concat_lines(); break;
 140         case ')': c = ']'; break;
 141         case '\'': c = '^'; break;
 142         case '<': c = '{'; break;
 143         case '!': c = '|'; break;
 144         case '>': c = '}'; break;
 145         case '-': c = '~'; break;
 146         default:
 147                 put_back(c);
 148                 put_back('?');
 149                 c = '?';
 150                 break;
 151         }
 152
 153 end_of_next_char:;
 154 #ifdef DEBUG_CHARS
 155         printf("nchar '%c'\n", c);
 156 #endif
 157 }
 158
 159 #define SYMBOL_CHARS  \
 160         case 'a':         \
 161         case 'b':         \
 162         case 'c':         \
 163         case 'd':         \
 164         case 'e':         \
 165         case 'f':         \
 166         case 'g':         \
 167         case 'h':         \
 168         case 'i':         \
 169         case 'j':         \
 170         case 'k':         \
 171         case 'l':         \
 172         case 'm':         \
 173         case 'n':         \
 174         case 'o':         \
 175         case 'p':         \
 176         case 'q':         \
 177         case 'r':         \
 178         case 's':         \
 179         case 't':         \
 180         case 'u':         \
 181         case 'v':         \
 182         case 'w':         \
 183         case 'x':         \
 184         case 'y':         \
 185         case 'z':         \
 186         case 'A':         \
 187         case 'B':         \
 188         case 'C':         \
 189         case 'D':         \
 190         case 'E':         \
 191         case 'F':         \
 192         case 'G':         \
 193         case 'H':         \
 194         case 'I':         \
 195         case 'J':         \
 196         case 'K':         \
 197         case 'L':         \
 198         case 'M':         \
 199         case 'N':         \
 200         case 'O':         \
 201         case 'P':         \
 202         case 'Q':         \
 203         case 'R':         \
 204         case 'S':         \
 205         case 'T':         \
 206         case 'U':         \
 207         case 'V':         \
 208         case 'W':         \
 209         case 'X':         \
 210         case 'Y':         \
 211         case 'Z':         \
 212         case '_':
 213
 214 #define DIGITS        \
 215         case '0':         \
 216         case '1':         \
 217         case '2':         \
 218         case '3':         \
 219         case '4':         \
 220         case '5':         \
 221         case '6':         \
 222         case '7':         \
 223         case '8':         \
 224         case '9':
 225
 226 static void parse_symbol(void)
 227 {
 228         symbol_t *symbol;
 229         char     *string;
 230
 231         obstack_1grow(&symbol_obstack, (char) c);
 232         next_char();
 233
 234         while(1) {
 235                 switch(c) {
 236                 DIGITS
 237                 SYMBOL_CHARS
 238                         obstack_1grow(&symbol_obstack, (char) c);
 239                         next_char();
 240                         break;
 241
 242                 default:
 243                         goto end_symbol;
 244                 }
 245         }
 246
 247 end_symbol:
 248         obstack_1grow(&symbol_obstack, '\0');
 249
 250         string = obstack_finish(&symbol_obstack);
 251         symbol = symbol_table_insert(string);
 252
 253         lexer_token.type     = symbol->ID;
 254         lexer_token.v.symbol = symbol;
 255
 256         if(symbol->string != string) {
 257                 obstack_free(&symbol_obstack, string);
 258         }
 259 }
 260
 261 static void parse_integer_suffix(bool is_oct_hex)
 262 {
 263         bool is_unsigned  = false;
 264         bool min_long     = false;
 265         bool min_longlong = false;
 266
 267         if(c == 'U' || c == 'u') {
 268                 is_unsigned = true;
 269                 next_char();
 270                 if(c == 'L' || c == 'l') {
 271                         min_long = true;
 272                         next_char();
 273                         if(c == 'L' || c == 'l') {
 274                                 min_longlong = true;
 275                                 next_char();
 276                         }
 277                 }
 278         } else if(c == 'l' || c == 'L') {
 279                 min_long = true;
 280                 next_char();
 281                 if(c == 'l' || c == 'L') {
 282                         min_longlong = true;
 283                         next_char();
 284                         if(c == 'u' || c == 'U') {
 285                                 is_unsigned = true;
 286                                 next_char();
 287                         }
 288                 } else if(c == 'u' || c == 'U') {
 289                         is_unsigned = true;
 290                         next_char();
 291                         lexer_token.datatype = type_unsigned_long;
 292                 }
 293         }
 294
 295         if(!is_unsigned) {
 296                 long long v = lexer_token.v.intvalue;
 297                 if(!min_long) {
 298                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 299                                 lexer_token.datatype = type_int;
 300                                 return;
 301                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 302                                 lexer_token.datatype = type_unsigned_int;
 303                                 return;
 304                         }
 305                 }
 306                 if(!min_longlong) {
 307                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 308                                 lexer_token.datatype = type_long;
 309                                 return;
 310                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
 311                                 lexer_token.datatype = type_unsigned_long;
 312                                 return;
 313                         }
 314                 }
 315                 unsigned long long uv = (unsigned long long) v;
 316                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 317                         lexer_token.datatype = type_unsigned_long_long;
 318                         return;
 319                 }
 320
 321                 lexer_token.datatype = type_long_long;
 322         } else {
 323                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 324                 if(!min_long && v <= TARGET_UINT_MAX) {
 325                         lexer_token.datatype = type_unsigned_int;
 326                         return;
 327                 }
 328                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
 329                         lexer_token.datatype = type_unsigned_long;
 330                         return;
 331                 }
 332                 lexer_token.datatype = type_unsigned_long_long;
 333         }
 334 }
 335
 336 static void parse_floating_suffix(void)
 337 {
 338         switch(c) {
 339         /* TODO: do something usefull with the suffixes... */
 340         case 'f':
 341         case 'F':
 342                 next_char();
 343                 lexer_token.datatype = type_float;
 344                 break;
 345         case 'l':
 346         case 'L':
 347                 next_char();
 348                 lexer_token.datatype = type_long_double;
 349                 break;
 350         default:
 351                 lexer_token.datatype = type_double;
 352                 break;
 353         }
 354 }
 355
 356 /**
 357  * A replacement for strtoull. Only those parts needed for
 358  * our parser are implemented.
 359  */
 360 static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
 361         unsigned long long v = 0;
 362
 363         switch (base) {
 364         case 16:
 365                 for (;; ++s) {
 366                         /* check for overrun */
 367                         if (v >= 0x1000000000000000ULL)
 368                                 break;
 369                         switch (tolower(*s)) {
 370                         case '0': v <<= 4; break;
 371                         case '1': v <<= 4; v |= 0x1; break;
 372                         case '2': v <<= 4; v |= 0x2; break;
 373                         case '3': v <<= 4; v |= 0x3; break;
 374                         case '4': v <<= 4; v |= 0x4; break;
 375                         case '5': v <<= 4; v |= 0x5; break;
 376                         case '6': v <<= 4; v |= 0x6; break;
 377                         case '7': v <<= 4; v |= 0x7; break;
 378                         case '8': v <<= 4; v |= 0x8; break;
 379                         case '9': v <<= 4; v |= 0x9; break;
 380                         case 'a': v <<= 4; v |= 0xa; break;
 381                         case 'b': v <<= 4; v |= 0xb; break;
 382                         case 'c': v <<= 4; v |= 0xc; break;
 383                         case 'd': v <<= 4; v |= 0xd; break;
 384                         case 'e': v <<= 4; v |= 0xe; break;
 385                         case 'f': v <<= 4; v |= 0xf; break;
 386                         default:
 387                                 goto end;
 388                         }
 389                 }
 390                 break;
 391         case 8:
 392                 for (;; ++s) {
 393                         /* check for overrun */
 394                         if (v >= 0x2000000000000000ULL)
 395                                 break;
 396                         switch (tolower(*s)) {
 397                         case '0': v <<= 3; break;
 398                         case '1': v <<= 3; v |= 1; break;
 399                         case '2': v <<= 3; v |= 2; break;
 400                         case '3': v <<= 3; v |= 3; break;
 401                         case '4': v <<= 3; v |= 4; break;
 402                         case '5': v <<= 3; v |= 5; break;
 403                         case '6': v <<= 3; v |= 6; break;
 404                         case '7': v <<= 3; v |= 7; break;
 405                         default:
 406                                 goto end;
 407                         }
 408                 }
 409                 break;
 410         case 10:
 411                 for (;; ++s) {
 412                         /* check for overrun */
 413                         if (v > 0x1999999999999999ULL)
 414                                 break;
 415                         switch (tolower(*s)) {
 416                         case '0': v *= 10; break;
 417                         case '1': v *= 10; v += 1; break;
 418                         case '2': v *= 10; v += 2; break;
 419                         case '3': v *= 10; v += 3; break;
 420                         case '4': v *= 10; v += 4; break;
 421                         case '5': v *= 10; v += 5; break;
 422                         case '6': v *= 10; v += 6; break;
 423                         case '7': v *= 10; v += 7; break;
 424                         case '8': v *= 10; v += 8; break;
 425                         case '9': v *= 10; v += 9; break;
 426                         default:
 427                                 goto end;
 428                         }
 429                 }
 430                 break;
 431         default:
 432                 assert(0);
 433                 break;
 434         }
 435 end:
 436         *endptr = s;
 437         return v;
 438 }
 439
 440 static void parse_number_hex(void)
 441 {
 442         assert(c == 'x' || c == 'X');
 443         next_char();
 444
 445         while(isxdigit(c)) {
 446                 obstack_1grow(&symbol_obstack, (char) c);
 447                 next_char();
 448         }
 449         obstack_1grow(&symbol_obstack, '\0');
 450         char *string = obstack_finish(&symbol_obstack);
 451
 452         if(c == '.' || c == 'p' || c == 'P') {
 453                 next_char();
 454                 panic("Hex floating point numbers not implemented yet");
 455         }
 456         if(*string == '\0') {
 457                 parse_error("invalid hex number");
 458                 lexer_token.type = T_ERROR;
 459         }
 460
 461         const char *endptr;
 462         lexer_token.type       = T_INTEGER;
 463         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
 464         if(*endptr != '\0') {
 465                 parse_error("hex number literal too long");
 466         }
 467
 468         obstack_free(&symbol_obstack, string);
 469         parse_integer_suffix(true);
 470 }
 471
 472 static inline bool is_octal_digit(int chr)
 473 {
 474         return '0' <= chr && chr <= '7';
 475 }
 476
 477 static void parse_number_oct(void)
 478 {
 479         while(is_octal_digit(c)) {
 480                 obstack_1grow(&symbol_obstack, (char) c);
 481                 next_char();
 482         }
 483         obstack_1grow(&symbol_obstack, '\0');
 484         char *string = obstack_finish(&symbol_obstack);
 485
 486         const char *endptr;
 487         lexer_token.type       = T_INTEGER;
 488         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 489         if(*endptr != '\0') {
 490                 parse_error("octal number literal too long");
 491         }
 492
 493         obstack_free(&symbol_obstack, string);
 494         parse_integer_suffix(true);
 495 }
 496
 497 static void parse_number_dec(void)
 498 {
 499         bool is_float = false;
 500         while(isdigit(c)) {
 501                 obstack_1grow(&symbol_obstack, (char) c);
 502                 next_char();
 503         }
 504
 505         if(c == '.') {
 506                 obstack_1grow(&symbol_obstack, '.');
 507                 next_char();
 508
 509                 while(isdigit(c)) {
 510                         obstack_1grow(&symbol_obstack, (char) c);
 511                         next_char();
 512                 }
 513                 is_float = true;
 514         }
 515         if(c == 'e' || c == 'E') {
 516                 obstack_1grow(&symbol_obstack, 'e');
 517                 next_char();
 518
 519                 if(c == '-' || c == '+') {
 520                         obstack_1grow(&symbol_obstack, (char) c);
 521                         next_char();
 522                 }
 523
 524                 while(isdigit(c)) {
 525                         obstack_1grow(&symbol_obstack, (char) c);
 526                         next_char();
 527                 }
 528                 is_float = true;
 529         }
 530
 531         obstack_1grow(&symbol_obstack, '\0');
 532         char *string = obstack_finish(&symbol_obstack);
 533
 534         if(is_float) {
 535                 char *endptr;
 536                 lexer_token.type         = T_FLOATINGPOINT;
 537                 lexer_token.v.floatvalue = strtold(string, &endptr);
 538
 539                 if(*endptr != '\0') {
 540                         parse_error("invalid number literal");
 541                 }
 542
 543                 parse_floating_suffix();
 544         } else {
 545                 const char *endptr;
 546                 lexer_token.type       = T_INTEGER;
 547                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 548
 549                 if(*endptr != '\0') {
 550                         parse_error("invalid number literal");
 551                 }
 552
 553                 parse_integer_suffix(false);
 554         }
 555         obstack_free(&symbol_obstack, string);
 556 }
 557
 558 static void parse_number(void)
 559 {
 560         if (c == '0') {
 561                 next_char();
 562                 switch (c) {
 563                         case 'X':
 564                         case 'x':
 565                                 parse_number_hex();
 566                                 break;
 567                         case '0':
 568                         case '1':
 569                         case '2':
 570                         case '3':
 571                         case '4':
 572                         case '5':
 573                         case '6':
 574                         case '7':
 575                                 parse_number_oct();
 576                                 break;
 577                         case '8':
 578                         case '9':
 579                                 next_char();
 580                                 parse_error("invalid octal number");
 581                                 lexer_token.type = T_ERROR;
 582                                 return;
 583                         case '.':
 584                         case 'e':
 585                         case 'E':
 586                         default:
 587                                 obstack_1grow(&symbol_obstack, '0');
 588                                 parse_number_dec();
 589                                 return;
 590                 }
 591         } else {
 592                 parse_number_dec();
 593         }
 594 }
 595
 596 static int parse_octal_sequence(const int first_digit)
 597 {
 598         assert(is_octal_digit(first_digit));
 599         int value = first_digit - '0';
 600         if (!is_octal_digit(c)) return value;
 601         value = 8 * value + c - '0';
 602         next_char();
 603         if (!is_octal_digit(c)) return value;
 604         value = 8 * value + c - '0';
 605         next_char();
 606         return (char_type)value;
 607 }
 608
 609 static int parse_hex_sequence(void)
 610 {
 611         int value = 0;
 612         while(1) {
 613                 if (c >= '0' && c <= '9') {
 614                         value = 16 * value + c - '0';
 615                 } else if ('A' <= c && c <= 'F') {
 616                         value = 16 * value + c - 'A' + 10;
 617                 } else if ('a' <= c && c <= 'f') {
 618                         value = 16 * value + c - 'a' + 10;
 619                 } else {
 620                         break;
 621                 }
 622                 next_char();
 623         }
 624
 625         return (char_type)value;
 626 }
 627
 628 static int parse_escape_sequence(void)
 629 {
 630         eat('\\');
 631
 632         int ec = c;
 633         next_char();
 634
 635         switch(ec) {
 636         case '"':  return '"';
 637         case '\'': return '\'';
 638         case '\\': return '\\';
 639         case '?': return '\?';
 640         case 'a': return '\a';
 641         case 'b': return '\b';
 642         case 'f': return '\f';
 643         case 'n': return '\n';
 644         case 'r': return '\r';
 645         case 't': return '\t';
 646         case 'v': return '\v';
 647         case 'x':
 648                 return parse_hex_sequence();
 649         case '0':
 650         case '1':
 651         case '2':
 652         case '3':
 653         case '4':
 654         case '5':
 655         case '6':
 656         case '7':
 657                 return parse_octal_sequence(ec);
 658         case EOF:
 659                 parse_error("reached end of file while parsing escape sequence");
 660                 return EOF;
 661         default:
 662                 parse_error("unknown escape sequence");
 663                 return EOF;
 664         }
 665 }
 666
 667 string_t concat_strings(const string_t *const s1, const string_t *const s2)
 668 {
 669         const size_t len1 = s1->size - 1;
 670         const size_t len2 = s2->size - 1;
 671
 672         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 673         memcpy(concat, s1->begin, len1);
 674         memcpy(concat + len1, s2->begin, len2 + 1);
 675
 676 #if 0 /* TODO hash */
 677         const char *result = strset_insert(&stringset, concat);
 678         if(result != concat) {
 679                 obstack_free(&symbol_obstack, concat);
 680         }
 681
 682         return result;
 683 #else
 684         return (string_t){ concat, len1 + len2 + 1 };
 685 #endif
 686 }
 687
 688 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
 689 {
 690         const size_t len1 = s1->size - 1;
 691         const size_t len2 = s2->size - 1;
 692
 693         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
 694         const char *const src = s1->begin;
 695         for (size_t i = 0; i != len1; ++i) {
 696                 concat[i] = src[i];
 697         }
 698         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
 699
 700         return (wide_string_t){ concat, len1 + len2 + 1 };
 701 }
 702
 703 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
 704 {
 705         const size_t len1 = s1->size - 1;
 706         const size_t len2 = s2->size - 1;
 707
 708         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
 709         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
 710         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
 711
 712         return (wide_string_t){ concat, len1 + len2 + 1 };
 713 }
 714
 715 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
 716 {
 717         const size_t len1 = s1->size - 1;
 718         const size_t len2 = s2->size - 1;
 719
 720         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
 721         memcpy(concat, s1->begin, len1 * sizeof(*concat));
 722         const char *const src = s2->begin;
 723         for (size_t i = 0; i != len2 + 1; ++i) {
 724                 concat[i] = src[i];
 725         }
 726
 727         return (wide_string_t){ concat, len1 + len2 + 1 };
 728 }
 729
 730 static void parse_string_literal(void)
 731 {
 732         const unsigned start_linenr = lexer_token.source_position.linenr;
 733
 734         eat('"');
 735
 736         int tc;
 737         while(1) {
 738                 switch(c) {
 739                 case '\\':
 740                         tc = parse_escape_sequence();
 741                         obstack_1grow(&symbol_obstack, (char) tc);
 742                         break;
 743
 744                 case EOF: {
 745                         source_position_t source_position;
 746                         source_position.input_name = lexer_token.source_position.input_name;
 747                         source_position.linenr     = start_linenr;
 748                         errorf(source_position, "string has no end");
 749                         lexer_token.type = T_ERROR;
 750                         return;
 751                 }
 752
 753                 case '"':
 754                         next_char();
 755                         goto end_of_string;
 756
 757                 default:
 758                         obstack_1grow(&symbol_obstack, (char) c);
 759                         next_char();
 760                         break;
 761                 }
 762         }
 763
 764 end_of_string:
 765
 766         /* TODO: concatenate multiple strings separated by whitespace... */
 767
 768         /* add finishing 0 to the string */
 769         obstack_1grow(&symbol_obstack, '\0');
 770         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
 771         const char *const string = obstack_finish(&symbol_obstack);
 772
 773 #if 0 /* TODO hash */
 774         /* check if there is already a copy of the string */
 775         result = strset_insert(&stringset, string);
 776         if(result != string) {
 777                 obstack_free(&symbol_obstack, string);
 778         }
 779 #else
 780         const char *const result = string;
 781 #endif
 782
 783         lexer_token.type           = T_STRING_LITERAL;
 784         lexer_token.v.string.begin = result;
 785         lexer_token.v.string.size  = size;
 786 }
 787
 788 static void parse_wide_character_constant(void)
 789 {
 790         eat('\'');
 791
 792         int found_char = 0;
 793         while(1) {
 794                 switch(c) {
 795                 case '\\':
 796                         found_char = parse_escape_sequence();
 797                         break;
 798
 799                 MATCH_NEWLINE(
 800                         parse_error("newline while parsing character constant");
 801                         break;
 802                 )
 803
 804                 case '\'':
 805                         next_char();
 806                         goto end_of_wide_char_constant;
 807
 808                 case EOF:
 809                         parse_error("EOF while parsing character constant");
 810                         lexer_token.type = T_ERROR;
 811                         return;
 812
 813                 default:
 814                         if(found_char != 0) {
 815                                 parse_error("more than 1 characters in character "
 816                                             "constant");
 817                                 goto end_of_wide_char_constant;
 818                         } else {
 819                                 found_char = c;
 820                                 next_char();
 821                         }
 822                         break;
 823                 }
 824         }
 825
 826 end_of_wide_char_constant:
 827         lexer_token.type       = T_INTEGER;
 828         lexer_token.v.intvalue = found_char;
 829         lexer_token.datatype   = type_wchar_t;
 830 }
 831
 832 static void parse_wide_string_literal(void)
 833 {
 834         const unsigned start_linenr = lexer_token.source_position.linenr;
 835
 836         assert(c == '"');
 837         next_char();
 838
 839         while(1) {
 840                 switch(c) {
 841                 case '\\': {
 842                         wchar_rep_t tc = parse_escape_sequence();
 843                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 844                         break;
 845                 }
 846
 847                 case EOF: {
 848                         source_position_t source_position;
 849                         source_position.input_name = lexer_token.source_position.input_name;
 850                         source_position.linenr     = start_linenr;
 851                         errorf(source_position, "string has no end");
 852                         lexer_token.type = T_ERROR;
 853                         return;
 854                 }
 855
 856                 case '"':
 857                         next_char();
 858                         goto end_of_string;
 859
 860                 default: {
 861                         wchar_rep_t tc = c;
 862                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 863                         next_char();
 864                         break;
 865                 }
 866                 }
 867         }
 868
 869 end_of_string:;
 870
 871         /* TODO: concatenate multiple strings separated by whitespace... */
 872
 873         /* add finishing 0 to the string */
 874         wchar_rep_t nul = L'\0';
 875         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
 876         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
 877         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
 878
 879 #if 0 /* TODO hash */
 880         /* check if there is already a copy of the string */
 881         const wchar_rep_t *const result = strset_insert(&stringset, string);
 882         if(result != string) {
 883                 obstack_free(&symbol_obstack, string);
 884         }
 885 #else
 886         const wchar_rep_t *const result = string;
 887 #endif
 888
 889         lexer_token.type                = T_WIDE_STRING_LITERAL;
 890         lexer_token.v.wide_string.begin = result;
 891         lexer_token.v.wide_string.size  = size;
 892 }
 893
 894 static void parse_character_constant(void)
 895 {
 896         const unsigned start_linenr = lexer_token.source_position.linenr;
 897
 898         eat('\'');
 899
 900         int tc;
 901         while(1) {
 902                 switch(c) {
 903                 case '\\':
 904                         tc = parse_escape_sequence();
 905                         obstack_1grow(&symbol_obstack, (char) tc);
 906                         break;
 907
 908                 MATCH_NEWLINE(
 909                         parse_error("newline while parsing character constant");
 910                         break;
 911                 )
 912
 913                 case EOF: {
 914                         source_position_t source_position;
 915                         source_position.input_name = lexer_token.source_position.input_name;
 916                         source_position.linenr     = start_linenr;
 917                         errorf(source_position, "EOF while parsing character constant");
 918                         lexer_token.type = T_ERROR;
 919                         return;
 920                 }
 921
 922                 case '\'':
 923                         next_char();
 924                         goto end_of_char_constant;
 925
 926                 default:
 927                         obstack_1grow(&symbol_obstack, (char) c);
 928                         next_char();
 929                         break;
 930
 931                 }
 932         }
 933
 934 end_of_char_constant:;
 935         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
 936         const char *const string = obstack_finish(&symbol_obstack);
 937
 938         lexer_token.type           = T_CHARS;
 939         lexer_token.v.string.begin = string;
 940         lexer_token.v.string.size  = size;
 941         lexer_token.datatype       = type_int;
 942 }
 943
 944 static void skip_multiline_comment(void)
 945 {
 946         unsigned start_linenr = lexer_token.source_position.linenr;
 947
 948         while(1) {
 949                 switch(c) {
 950                 case '/':
 951                         next_char();
 952                         if (c == '*') {
 953                                 /* TODO: nested comment, warn here */
 954                         }
 955                         break;
 956                 case '*':
 957                         next_char();
 958                         if(c == '/') {
 959                                 next_char();
 960                                 return;
 961                         }
 962                         break;
 963
 964                 MATCH_NEWLINE(break;)
 965
 966                 case EOF: {
 967                         source_position_t source_position;
 968                         source_position.input_name = lexer_token.source_position.input_name;
 969                         source_position.linenr     = start_linenr;
 970                         errorf(source_position, "at end of file while looking for comment end");
 971                         return;
 972                 }
 973
 974                 default:
 975                         next_char();
 976                         break;
 977                 }
 978         }
 979 }
 980
 981 static void skip_line_comment(void)
 982 {
 983         while(1) {
 984                 switch(c) {
 985                 case EOF:
 986                         return;
 987
 988                 case '\n':
 989                 case '\r':
 990                         return;
 991
 992                 default:
 993                         next_char();
 994                         break;
 995                 }
 996         }
 997 }
 998
 999 static token_t pp_token;
1000
1001 static inline void next_pp_token(void)
1002 {
1003         lexer_next_preprocessing_token();
1004         pp_token = lexer_token;
1005 }
1006
1007 static void eat_until_newline(void)
1008 {
1009         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1010                 next_pp_token();
1011         }
1012 }
1013
1014 static void define_directive(void)
1015 {
1016         lexer_next_preprocessing_token();
1017         if(lexer_token.type != T_IDENTIFIER) {
1018                 parse_error("expected identifier after #define\n");
1019                 eat_until_newline();
1020         }
1021 }
1022
1023 static void ifdef_directive(int is_ifndef)
1024 {
1025         (void) is_ifndef;
1026         lexer_next_preprocessing_token();
1027         //expect_identifier();
1028         //extect_newline();
1029 }
1030
1031 static void endif_directive(void)
1032 {
1033         //expect_newline();
1034 }
1035
1036 static void parse_line_directive(void)
1037 {
1038         if(pp_token.type != T_INTEGER) {
1039                 parse_error("expected integer");
1040         } else {
1041                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1042                 next_pp_token();
1043         }
1044         if(pp_token.type == T_STRING_LITERAL) {
1045                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1046                 next_pp_token();
1047         }
1048
1049         eat_until_newline();
1050 }
1051
1052 static void parse_preprocessor_identifier(void)
1053 {
1054         assert(pp_token.type == T_IDENTIFIER);
1055         symbol_t *symbol = pp_token.v.symbol;
1056
1057         switch(symbol->pp_ID) {
1058         case TP_include:
1059                 printf("include - enable header name parsing!\n");
1060                 break;
1061         case TP_define:
1062                 define_directive();
1063                 break;
1064         case TP_ifdef:
1065                 ifdef_directive(0);
1066                 break;
1067         case TP_ifndef:
1068                 ifdef_directive(1);
1069                 break;
1070         case TP_endif:
1071                 endif_directive();
1072                 break;
1073         case TP_line:
1074                 next_pp_token();
1075                 parse_line_directive();
1076                 break;
1077         case TP_if:
1078         case TP_else:
1079         case TP_elif:
1080         case TP_undef:
1081         case TP_error:
1082                 /* TODO; output the rest of the line */
1083                 parse_error("#error directive: ");
1084                 break;
1085         case TP_pragma:
1086                 if (warning.unknown_pragmas) {
1087                         warningf(lexer_token.source_position, "encountered unknown #pragma");
1088                 }
1089                 eat_until_newline();
1090                 break;
1091         }
1092 }
1093
1094 static void parse_preprocessor_directive(void)
1095 {
1096         next_pp_token();
1097
1098         switch(pp_token.type) {
1099         case T_IDENTIFIER:
1100                 parse_preprocessor_identifier();
1101                 break;
1102         case T_INTEGER:
1103                 parse_line_directive();
1104                 break;
1105         default:
1106                 parse_error("invalid preprocessor directive");
1107                 eat_until_newline();
1108                 break;
1109         }
1110 }
1111
1112 #define MAYBE_PROLOG                                       \
1113                         next_char();                                   \
1114                         while(1) {                                     \
1115                                 switch(c) {
1116
1117 #define MAYBE(ch, set_type)                                \
1118                                 case ch:                                   \
1119                                         next_char();                           \
1120                                         lexer_token.type = set_type;           \
1121                                         return;
1122
1123 #define ELSE_CODE(code)                                    \
1124                                 default:                                   \
1125                                         code;                                  \
1126                                 }                                          \
1127                         } /* end of while(1) */                        \
1128                         break;
1129
1130 #define ELSE(set_type)                                     \
1131                 ELSE_CODE(                                         \
1132                         lexer_token.type = set_type;                   \
1133                         return;                                        \
1134                 )
1135
1136 void lexer_next_preprocessing_token(void)
1137 {
1138         while(1) {
1139                 switch(c) {
1140                 case ' ':
1141                 case '\t':
1142                         next_char();
1143                         break;
1144
1145                 MATCH_NEWLINE(
1146                         lexer_token.type = '\n';
1147                         return;
1148                 )
1149
1150                 SYMBOL_CHARS
1151                         parse_symbol();
1152                         /* might be a wide string ( L"string" ) */
1153                         if(lexer_token.type == T_IDENTIFIER &&
1154                             lexer_token.v.symbol == symbol_L) {
1155                             if(c == '"') {
1156                                         parse_wide_string_literal();
1157                                 } else if(c == '\'') {
1158                                         parse_wide_character_constant();
1159                                 }
1160                         }
1161                         return;
1162
1163                 DIGITS
1164                         parse_number();
1165                         return;
1166
1167                 case '"':
1168                         parse_string_literal();
1169                         return;
1170
1171                 case '\'':
1172                         parse_character_constant();
1173                         return;
1174
1175                 case '.':
1176                         MAYBE_PROLOG
1177                                 case '0':
1178                                 case '1':
1179                                 case '2':
1180                                 case '3':
1181                                 case '4':
1182                                 case '5':
1183                                 case '6':
1184                                 case '7':
1185                                 case '8':
1186                                 case '9':
1187                                         put_back(c);
1188                                         c = '.';
1189                                         parse_number_dec();
1190                                         return;
1191
1192                                 case '.':
1193                                         MAYBE_PROLOG
1194                                         MAYBE('.', T_DOTDOTDOT)
1195                                         ELSE_CODE(
1196                                                 put_back(c);
1197                                                 c = '.';
1198                                                 lexer_token.type = '.';
1199                                                 return;
1200                                         )
1201                         ELSE('.')
1202                 case '&':
1203                         MAYBE_PROLOG
1204                         MAYBE('&', T_ANDAND)
1205                         MAYBE('=', T_ANDEQUAL)
1206                         ELSE('&')
1207                 case '*':
1208                         MAYBE_PROLOG
1209                         MAYBE('=', T_ASTERISKEQUAL)
1210                         ELSE('*')
1211                 case '+':
1212                         MAYBE_PROLOG
1213                         MAYBE('+', T_PLUSPLUS)
1214                         MAYBE('=', T_PLUSEQUAL)
1215                         ELSE('+')
1216                 case '-':
1217                         MAYBE_PROLOG
1218                         MAYBE('>', T_MINUSGREATER)
1219                         MAYBE('-', T_MINUSMINUS)
1220                         MAYBE('=', T_MINUSEQUAL)
1221                         ELSE('-')
1222                 case '!':
1223                         MAYBE_PROLOG
1224                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1225                         ELSE('!')
1226                 case '/':
1227                         MAYBE_PROLOG
1228                         MAYBE('=', T_SLASHEQUAL)
1229                                 case '*':
1230                                         next_char();
1231                                         skip_multiline_comment();
1232                                         lexer_next_preprocessing_token();
1233                                         return;
1234                                 case '/':
1235                                         next_char();
1236                                         skip_line_comment();
1237                                         lexer_next_preprocessing_token();
1238                                         return;
1239                         ELSE('/')
1240                 case '%':
1241                         MAYBE_PROLOG
1242                         MAYBE('>', T_PERCENTGREATER)
1243                         MAYBE('=', T_PERCENTEQUAL)
1244                                 case ':':
1245                                         MAYBE_PROLOG
1246                                                 case '%':
1247                                                         MAYBE_PROLOG
1248                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1249                                                         ELSE_CODE(
1250                                                                 put_back(c);
1251                                                                 c = '%';
1252                                                                 lexer_token.type = T_PERCENTCOLON;
1253                                                                 return;
1254                                                         )
1255                                         ELSE(T_PERCENTCOLON)
1256                         ELSE('%')
1257                 case '<':
1258                         MAYBE_PROLOG
1259                         MAYBE(':', T_LESSCOLON)
1260                         MAYBE('%', T_LESSPERCENT)
1261                         MAYBE('=', T_LESSEQUAL)
1262                                 case '<':
1263                                         MAYBE_PROLOG
1264                                         MAYBE('=', T_LESSLESSEQUAL)
1265                                         ELSE(T_LESSLESS)
1266                         ELSE('<')
1267                 case '>':
1268                         MAYBE_PROLOG
1269                         MAYBE('=', T_GREATEREQUAL)
1270                                 case '>':
1271                                         MAYBE_PROLOG
1272                                         MAYBE('=', T_GREATERGREATEREQUAL)
1273                                         ELSE(T_GREATERGREATER)
1274                         ELSE('>')
1275                 case '^':
1276                         MAYBE_PROLOG
1277                         MAYBE('=', T_CARETEQUAL)
1278                         ELSE('^')
1279                 case '|':
1280                         MAYBE_PROLOG
1281                         MAYBE('=', T_PIPEEQUAL)
1282                         MAYBE('|', T_PIPEPIPE)
1283                         ELSE('|')
1284                 case ':':
1285                         MAYBE_PROLOG
1286                         MAYBE('>', T_COLONGREATER)
1287                         ELSE(':')
1288                 case '=':
1289                         MAYBE_PROLOG
1290                         MAYBE('=', T_EQUALEQUAL)
1291                         ELSE('=')
1292                 case '#':
1293                         MAYBE_PROLOG
1294                         MAYBE('#', T_HASHHASH)
1295                         ELSE('#')
1296
1297                 case '?':
1298                 case '[':
1299                 case ']':
1300                 case '(':
1301                 case ')':
1302                 case '{':
1303                 case '}':
1304                 case '~':
1305                 case ';':
1306                 case ',':
1307                 case '\\':
1308                         lexer_token.type = c;
1309                         next_char();
1310                         return;
1311
1312                 case EOF:
1313                         lexer_token.type = T_EOF;
1314                         return;
1315
1316                 default:
1317                         next_char();
1318                         errorf(lexer_token.source_position, "unknown character '%c' found\n", c);
1319                         lexer_token.type = T_ERROR;
1320                         return;
1321                 }
1322         }
1323 }
1324
1325 void lexer_next_token(void)
1326 {
1327         lexer_next_preprocessing_token();
1328         if(lexer_token.type != '\n')
1329                 return;
1330
1331 newline_found:
1332         do {
1333                 lexer_next_preprocessing_token();
1334         } while(lexer_token.type == '\n');
1335
1336         if(lexer_token.type == '#') {
1337                 parse_preprocessor_directive();
1338                 goto newline_found;
1339         }
1340 }
1341
1342 void init_lexer(void)
1343 {
1344         strset_init(&stringset);
1345 }
1346
1347 void lexer_open_stream(FILE *stream, const char *input_name)
1348 {
1349         input                                  = stream;
1350         lexer_token.source_position.linenr     = 0;
1351         lexer_token.source_position.input_name = input_name;
1352
1353         symbol_L = symbol_table_insert("L");
1354         bufpos = NULL;
1355         bufend = NULL;
1356
1357         /* place a virtual \n at the beginning so the lexer knows that we're
1358          * at the beginning of a line */
1359         c = '\n';
1360 }
1361
1362 void exit_lexer(void)
1363 {
1364         strset_destroy(&stringset);
1365 }
1366
1367 static __attribute__((unused))
1368 void dbg_pos(const source_position_t source_position)
1369 {
1370         fprintf(stdout, "%s:%u\n", source_position.input_name,
1371                 source_position.linenr);
1372         fflush(stdout);
1373 }