nsz Git - cparser/blob - lexer.c

   1 /*
   2  * This file is part of cparser.
   3  * Copyright (C) 2007-2008 Matthias Braun <matze@braunis.de>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18  * 02111-1307, USA.
  19  */
  20 #include <config.h>
  21
  22 #include "diagnostic.h"
  23 #include "lexer.h"
  24 #include "token_t.h"
  25 #include "symbol_table_t.h"
  26 #include "adt/error.h"
  27 #include "adt/strset.h"
  28 #include "adt/util.h"
  29 #include "types.h"
  30 #include "type_t.h"
  31 #include "target_architecture.h"
  32 #include "parser.h"
  33 #include "warning.h"
  34 #include "lang_features.h"
  35
  36 #include <assert.h>
  37 #include <errno.h>
  38 #include <string.h>
  39 #include <stdbool.h>
  40 #include <ctype.h>
  41
  42 //#define DEBUG_CHARS
  43 #define MAX_PUTBACK 3
  44
  45 #ifdef _WIN32
  46 /* No strtold on windows and no replacement yet */
  47 #define strtold(s, e) strtod(s, e)
  48 #endif
  49
  50 static int         c;
  51 token_t            lexer_token;
  52 symbol_t          *symbol_L;
  53 static FILE       *input;
  54 static char        buf[1024 + MAX_PUTBACK];
  55 static const char *bufend;
  56 static const char *bufpos;
  57 static strset_t    stringset;
  58
  59 /**
  60  * Prints a parse error message at the current token.
  61  *
  62  * @param msg   the error message
  63  */
  64 static void parse_error(const char *msg)
  65 {
  66         errorf(lexer_token.source_position,  "%s", msg);
  67 }
  68
  69 static inline void next_real_char(void)
  70 {
  71         assert(bufpos <= bufend);
  72         if (bufpos >= bufend) {
  73                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
  74                                  input);
  75                 if(s == 0) {
  76                         c = EOF;
  77                         return;
  78                 }
  79                 bufpos = buf + MAX_PUTBACK;
  80                 bufend = buf + MAX_PUTBACK + s;
  81         }
  82         c = *bufpos++;
  83 }
  84
  85 /**
  86  * Put a character back into the buffer.
  87  *
  88  * @param pc  the character to put back
  89  */
  90 static inline void put_back(int pc)
  91 {
  92         assert(bufpos > buf);
  93         *(--bufpos - buf + buf) = (char) pc;
  94
  95 #ifdef DEBUG_CHARS
  96         printf("putback '%c'\n", pc);
  97 #endif
  98 }
  99
 100 static inline void next_char(void);
 101
 102 #define MATCH_NEWLINE(code)                   \
 103         case '\r':                                \
 104                 next_char();                          \
 105                 if(c == '\n') {                       \
 106                         next_char();                      \
 107                 }                                     \
 108                 lexer_token.source_position.linenr++; \
 109                 code                                  \
 110         case '\n':                                \
 111                 next_char();                          \
 112                 lexer_token.source_position.linenr++; \
 113                 code
 114
 115 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 116
 117 static void maybe_concat_lines(void)
 118 {
 119         eat('\\');
 120
 121         switch(c) {
 122         MATCH_NEWLINE(return;)
 123
 124         default:
 125                 break;
 126         }
 127
 128         put_back(c);
 129         c = '\\';
 130 }
 131
 132 /**
 133  * Set c to the next input character, ie.
 134  * after expanding trigraphs.
 135  */
 136 static inline void next_char(void)
 137 {
 138         next_real_char();
 139
 140         /* filter trigraphs */
 141         if(UNLIKELY(c == '\\')) {
 142                 maybe_concat_lines();
 143                 goto end_of_next_char;
 144         }
 145
 146         if(LIKELY(c != '?'))
 147                 goto end_of_next_char;
 148
 149         next_real_char();
 150         if(LIKELY(c != '?')) {
 151                 put_back(c);
 152                 c = '?';
 153                 goto end_of_next_char;
 154         }
 155
 156         next_real_char();
 157         switch(c) {
 158         case '=': c = '#'; break;
 159         case '(': c = '['; break;
 160         case '/': c = '\\'; maybe_concat_lines(); break;
 161         case ')': c = ']'; break;
 162         case '\'': c = '^'; break;
 163         case '<': c = '{'; break;
 164         case '!': c = '|'; break;
 165         case '>': c = '}'; break;
 166         case '-': c = '~'; break;
 167         default:
 168                 put_back(c);
 169                 put_back('?');
 170                 c = '?';
 171                 break;
 172         }
 173
 174 end_of_next_char:;
 175 #ifdef DEBUG_CHARS
 176         printf("nchar '%c'\n", c);
 177 #endif
 178 }
 179
 180 #define SYMBOL_CHARS  \
 181         case 'a':         \
 182         case 'b':         \
 183         case 'c':         \
 184         case 'd':         \
 185         case 'e':         \
 186         case 'f':         \
 187         case 'g':         \
 188         case 'h':         \
 189         case 'i':         \
 190         case 'j':         \
 191         case 'k':         \
 192         case 'l':         \
 193         case 'm':         \
 194         case 'n':         \
 195         case 'o':         \
 196         case 'p':         \
 197         case 'q':         \
 198         case 'r':         \
 199         case 's':         \
 200         case 't':         \
 201         case 'u':         \
 202         case 'v':         \
 203         case 'w':         \
 204         case 'x':         \
 205         case 'y':         \
 206         case 'z':         \
 207         case 'A':         \
 208         case 'B':         \
 209         case 'C':         \
 210         case 'D':         \
 211         case 'E':         \
 212         case 'F':         \
 213         case 'G':         \
 214         case 'H':         \
 215         case 'I':         \
 216         case 'J':         \
 217         case 'K':         \
 218         case 'L':         \
 219         case 'M':         \
 220         case 'N':         \
 221         case 'O':         \
 222         case 'P':         \
 223         case 'Q':         \
 224         case 'R':         \
 225         case 'S':         \
 226         case 'T':         \
 227         case 'U':         \
 228         case 'V':         \
 229         case 'W':         \
 230         case 'X':         \
 231         case 'Y':         \
 232         case 'Z':         \
 233         case '_':
 234
 235 #define DIGITS        \
 236         case '0':         \
 237         case '1':         \
 238         case '2':         \
 239         case '3':         \
 240         case '4':         \
 241         case '5':         \
 242         case '6':         \
 243         case '7':         \
 244         case '8':         \
 245         case '9':
 246
 247 /**
 248  * Read a symbol from the input and build
 249  * the lexer_token.
 250  */
 251 static void parse_symbol(void)
 252 {
 253         symbol_t *symbol;
 254         char     *string;
 255
 256         obstack_1grow(&symbol_obstack, (char) c);
 257         next_char();
 258
 259         while(1) {
 260                 switch(c) {
 261                 DIGITS
 262                 SYMBOL_CHARS
 263                         obstack_1grow(&symbol_obstack, (char) c);
 264                         next_char();
 265                         break;
 266
 267                 default:
 268                         goto end_symbol;
 269                 }
 270         }
 271
 272 end_symbol:
 273         obstack_1grow(&symbol_obstack, '\0');
 274
 275         string = obstack_finish(&symbol_obstack);
 276         symbol = symbol_table_insert(string);
 277
 278         lexer_token.type     = symbol->ID;
 279         lexer_token.v.symbol = symbol;
 280
 281         if(symbol->string != string) {
 282                 obstack_free(&symbol_obstack, string);
 283         }
 284 }
 285
 286 static void parse_integer_suffix(bool is_oct_hex)
 287 {
 288         bool is_unsigned  = false;
 289         bool min_long     = false;
 290         bool min_longlong = false;
 291
 292         if(c == 'U' || c == 'u') {
 293                 is_unsigned = true;
 294                 next_char();
 295                 if(c == 'L' || c == 'l') {
 296                         min_long = true;
 297                         next_char();
 298                         if(c == 'L' || c == 'l') {
 299                                 min_longlong = true;
 300                                 next_char();
 301                         }
 302                 }
 303         } else if(c == 'l' || c == 'L') {
 304                 min_long = true;
 305                 next_char();
 306                 if(c == 'l' || c == 'L') {
 307                         min_longlong = true;
 308                         next_char();
 309                         if(c == 'u' || c == 'U') {
 310                                 is_unsigned = true;
 311                                 next_char();
 312                         }
 313                 } else if(c == 'u' || c == 'U') {
 314                         is_unsigned = true;
 315                         next_char();
 316                         lexer_token.datatype = type_unsigned_long;
 317                 }
 318         }
 319
 320         if(!is_unsigned) {
 321                 long long v = lexer_token.v.intvalue;
 322                 if(!min_long) {
 323                         if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 324                                 lexer_token.datatype = type_int;
 325                                 return;
 326                         } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 327                                 lexer_token.datatype = type_unsigned_int;
 328                                 return;
 329                         }
 330                 }
 331                 if(!min_longlong) {
 332                         if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 333                                 lexer_token.datatype = type_long;
 334                                 return;
 335                         } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
 336                                 lexer_token.datatype = type_unsigned_long;
 337                                 return;
 338                         }
 339                 }
 340                 unsigned long long uv = (unsigned long long) v;
 341                 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 342                         lexer_token.datatype = type_unsigned_long_long;
 343                         return;
 344                 }
 345
 346                 lexer_token.datatype = type_long_long;
 347         } else {
 348                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 349                 if(!min_long && v <= TARGET_UINT_MAX) {
 350                         lexer_token.datatype = type_unsigned_int;
 351                         return;
 352                 }
 353                 if(!min_longlong && v <= TARGET_ULONG_MAX) {
 354                         lexer_token.datatype = type_unsigned_long;
 355                         return;
 356                 }
 357                 lexer_token.datatype = type_unsigned_long_long;
 358         }
 359 }
 360
 361 static void parse_floating_suffix(void)
 362 {
 363         switch(c) {
 364         /* TODO: do something useful with the suffixes... */
 365         case 'f':
 366         case 'F':
 367                 next_char();
 368                 lexer_token.datatype = type_float;
 369                 break;
 370         case 'l':
 371         case 'L':
 372                 next_char();
 373                 lexer_token.datatype = type_long_double;
 374                 break;
 375         default:
 376                 lexer_token.datatype = type_double;
 377                 break;
 378         }
 379 }
 380
 381 /**
 382  * A replacement for strtoull. Only those parts needed for
 383  * our parser are implemented.
 384  */
 385 static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
 386         unsigned long long v = 0;
 387
 388         switch (base) {
 389         case 16:
 390                 for (;; ++s) {
 391                         /* check for overrun */
 392                         if (v >= 0x1000000000000000ULL)
 393                                 break;
 394                         switch (tolower(*s)) {
 395                         case '0': v <<= 4; break;
 396                         case '1': v <<= 4; v |= 0x1; break;
 397                         case '2': v <<= 4; v |= 0x2; break;
 398                         case '3': v <<= 4; v |= 0x3; break;
 399                         case '4': v <<= 4; v |= 0x4; break;
 400                         case '5': v <<= 4; v |= 0x5; break;
 401                         case '6': v <<= 4; v |= 0x6; break;
 402                         case '7': v <<= 4; v |= 0x7; break;
 403                         case '8': v <<= 4; v |= 0x8; break;
 404                         case '9': v <<= 4; v |= 0x9; break;
 405                         case 'a': v <<= 4; v |= 0xa; break;
 406                         case 'b': v <<= 4; v |= 0xb; break;
 407                         case 'c': v <<= 4; v |= 0xc; break;
 408                         case 'd': v <<= 4; v |= 0xd; break;
 409                         case 'e': v <<= 4; v |= 0xe; break;
 410                         case 'f': v <<= 4; v |= 0xf; break;
 411                         default:
 412                                 goto end;
 413                         }
 414                 }
 415                 break;
 416         case 8:
 417                 for (;; ++s) {
 418                         /* check for overrun */
 419                         if (v >= 0x2000000000000000ULL)
 420                                 break;
 421                         switch (tolower(*s)) {
 422                         case '0': v <<= 3; break;
 423                         case '1': v <<= 3; v |= 1; break;
 424                         case '2': v <<= 3; v |= 2; break;
 425                         case '3': v <<= 3; v |= 3; break;
 426                         case '4': v <<= 3; v |= 4; break;
 427                         case '5': v <<= 3; v |= 5; break;
 428                         case '6': v <<= 3; v |= 6; break;
 429                         case '7': v <<= 3; v |= 7; break;
 430                         default:
 431                                 goto end;
 432                         }
 433                 }
 434                 break;
 435         case 10:
 436                 for (;; ++s) {
 437                         /* check for overrun */
 438                         if (v > 0x1999999999999999ULL)
 439                                 break;
 440                         switch (tolower(*s)) {
 441                         case '0': v *= 10; break;
 442                         case '1': v *= 10; v += 1; break;
 443                         case '2': v *= 10; v += 2; break;
 444                         case '3': v *= 10; v += 3; break;
 445                         case '4': v *= 10; v += 4; break;
 446                         case '5': v *= 10; v += 5; break;
 447                         case '6': v *= 10; v += 6; break;
 448                         case '7': v *= 10; v += 7; break;
 449                         case '8': v *= 10; v += 8; break;
 450                         case '9': v *= 10; v += 9; break;
 451                         default:
 452                                 goto end;
 453                         }
 454                 }
 455                 break;
 456         default:
 457                 assert(0);
 458                 break;
 459         }
 460 end:
 461         *endptr = s;
 462         return v;
 463 }
 464
 465 /**
 466  * Parses a hex number including hex floats and set the
 467  * lexer_token.
 468  */
 469 static void parse_number_hex(void)
 470 {
 471         assert(c == 'x' || c == 'X');
 472         next_char();
 473
 474         while(isxdigit(c)) {
 475                 obstack_1grow(&symbol_obstack, (char) c);
 476                 next_char();
 477         }
 478         obstack_1grow(&symbol_obstack, '\0');
 479         char *string = obstack_finish(&symbol_obstack);
 480
 481         if(c == '.' || c == 'p' || c == 'P') {
 482                 next_char();
 483                 panic("Hex floating point numbers not implemented yet");
 484         }
 485         if(*string == '\0') {
 486                 parse_error("invalid hex number");
 487                 lexer_token.type = T_ERROR;
 488         }
 489
 490         const char *endptr;
 491         lexer_token.type       = T_INTEGER;
 492         lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
 493         if(*endptr != '\0') {
 494                 parse_error("hex number literal too long");
 495         }
 496
 497         obstack_free(&symbol_obstack, string);
 498         parse_integer_suffix(true);
 499 }
 500
 501 /**
 502  * Returns true if the given char is a octal digit.
 503  *
 504  * @param char  the character to check
 505  */
 506 static inline bool is_octal_digit(int chr)
 507 {
 508         switch(chr) {
 509         case '0':
 510         case '1':
 511         case '2':
 512         case '3':
 513         case '4':
 514         case '5':
 515         case '6':
 516         case '7':
 517                 return true;
 518         default:
 519                 return false;
 520         }
 521 }
 522
 523 /**
 524  * Parses a octal number and set the lexer_token.
 525  */
 526 static void parse_number_oct(void)
 527 {
 528         while(is_octal_digit(c)) {
 529                 obstack_1grow(&symbol_obstack, (char) c);
 530                 next_char();
 531         }
 532         obstack_1grow(&symbol_obstack, '\0');
 533         char *string = obstack_finish(&symbol_obstack);
 534
 535         const char *endptr;
 536         lexer_token.type       = T_INTEGER;
 537         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 538         if(*endptr != '\0') {
 539                 parse_error("octal number literal too long");
 540         }
 541
 542         obstack_free(&symbol_obstack, string);
 543         parse_integer_suffix(true);
 544 }
 545
 546 /**
 547  * Parses a decimal including float number and set the
 548  * lexer_token.
 549  */
 550 static void parse_number_dec(void)
 551 {
 552         bool is_float = false;
 553         while(isdigit(c)) {
 554                 obstack_1grow(&symbol_obstack, (char) c);
 555                 next_char();
 556         }
 557
 558         if(c == '.') {
 559                 obstack_1grow(&symbol_obstack, '.');
 560                 next_char();
 561
 562                 while(isdigit(c)) {
 563                         obstack_1grow(&symbol_obstack, (char) c);
 564                         next_char();
 565                 }
 566                 is_float = true;
 567         }
 568         if(c == 'e' || c == 'E') {
 569                 obstack_1grow(&symbol_obstack, 'e');
 570                 next_char();
 571
 572                 if(c == '-' || c == '+') {
 573                         obstack_1grow(&symbol_obstack, (char) c);
 574                         next_char();
 575                 }
 576
 577                 while(isdigit(c)) {
 578                         obstack_1grow(&symbol_obstack, (char) c);
 579                         next_char();
 580                 }
 581                 is_float = true;
 582         }
 583
 584         obstack_1grow(&symbol_obstack, '\0');
 585         char *string = obstack_finish(&symbol_obstack);
 586
 587         if(is_float) {
 588                 char *endptr;
 589                 lexer_token.type         = T_FLOATINGPOINT;
 590                 lexer_token.v.floatvalue = strtold(string, &endptr);
 591
 592                 if(*endptr != '\0') {
 593                         parse_error("invalid number literal");
 594                 }
 595
 596                 parse_floating_suffix();
 597         } else {
 598                 const char *endptr;
 599                 lexer_token.type       = T_INTEGER;
 600                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 601
 602                 if(*endptr != '\0') {
 603                         parse_error("invalid number literal");
 604                 }
 605
 606                 parse_integer_suffix(false);
 607         }
 608         obstack_free(&symbol_obstack, string);
 609 }
 610
 611 /**
 612  * Parses a number and sets the lexer_token.
 613  */
 614 static void parse_number(void)
 615 {
 616         if (c == '0') {
 617                 next_char();
 618                 switch (c) {
 619                         case 'X':
 620                         case 'x':
 621                                 parse_number_hex();
 622                                 break;
 623                         case '0':
 624                         case '1':
 625                         case '2':
 626                         case '3':
 627                         case '4':
 628                         case '5':
 629                         case '6':
 630                         case '7':
 631                                 parse_number_oct();
 632                                 break;
 633                         case '8':
 634                         case '9':
 635                                 next_char();
 636                                 parse_error("invalid octal number");
 637                                 lexer_token.type = T_ERROR;
 638                                 return;
 639                         case '.':
 640                         case 'e':
 641                         case 'E':
 642                         default:
 643                                 obstack_1grow(&symbol_obstack, '0');
 644                                 parse_number_dec();
 645                                 return;
 646                 }
 647         } else {
 648                 parse_number_dec();
 649         }
 650 }
 651
 652 /**
 653  * Returns the value of a digit.
 654  * The only portable way to do it ...
 655  */
 656 static int digit_value(int digit) {
 657         switch (digit) {
 658         case '0': return 0;
 659         case '1': return 1;
 660         case '2': return 2;
 661         case '3': return 3;
 662         case '4': return 4;
 663         case '5': return 5;
 664         case '6': return 6;
 665         case '7': return 7;
 666         case '8': return 8;
 667         case '9': return 9;
 668         case 'a':
 669         case 'A': return 10;
 670         case 'b':
 671         case 'B': return 11;
 672         case 'c':
 673         case 'C': return 12;
 674         case 'd':
 675         case 'D': return 13;
 676         case 'e':
 677         case 'E': return 14;
 678         case 'f':
 679         case 'F': return 15;
 680         default:
 681                 panic("wrong character given");
 682         }
 683 }
 684
 685 /**
 686  * Parses an octal character sequence.
 687  *
 688  * @param first_digit  the already read first digit
 689  */
 690 static int parse_octal_sequence(const int first_digit)
 691 {
 692         assert(is_octal_digit(first_digit));
 693         int value = digit_value(first_digit);
 694         if (!is_octal_digit(c)) return value;
 695         value = 8 * value + digit_value(c);
 696         next_char();
 697         if (!is_octal_digit(c)) return value;
 698         value = 8 * value + digit_value(c);
 699         next_char();
 700
 701         if(char_is_signed) {
 702                 return (signed char) value;
 703         } else {
 704                 return (unsigned char) value;
 705         }
 706 }
 707
 708 /**
 709  * Parses a hex character sequence.
 710  */
 711 static int parse_hex_sequence(void)
 712 {
 713         int value = 0;
 714         while(isxdigit(c)) {
 715                 value = 16 * value + digit_value(c);
 716                 next_char();
 717         }
 718
 719         if(char_is_signed) {
 720                 return (signed char) value;
 721         } else {
 722                 return (unsigned char) value;
 723         }
 724 }
 725
 726 /**
 727  * Parse an escape sequence.
 728  */
 729 static int parse_escape_sequence(void)
 730 {
 731         eat('\\');
 732
 733         int ec = c;
 734         next_char();
 735
 736         switch(ec) {
 737         case '"':  return '"';
 738         case '\'': return '\'';
 739         case '\\': return '\\';
 740         case '?': return '\?';
 741         case 'a': return '\a';
 742         case 'b': return '\b';
 743         case 'f': return '\f';
 744         case 'n': return '\n';
 745         case 'r': return '\r';
 746         case 't': return '\t';
 747         case 'v': return '\v';
 748         case 'x':
 749                 return parse_hex_sequence();
 750         case '0':
 751         case '1':
 752         case '2':
 753         case '3':
 754         case '4':
 755         case '5':
 756         case '6':
 757         case '7':
 758                 return parse_octal_sequence(ec);
 759         case EOF:
 760                 parse_error("reached end of file while parsing escape sequence");
 761                 return EOF;
 762         default:
 763                 parse_error("unknown escape sequence");
 764                 return EOF;
 765         }
 766 }
 767
 768 /**
 769  * Concatenate two strings.
 770  */
 771 string_t concat_strings(const string_t *const s1, const string_t *const s2)
 772 {
 773         const size_t len1 = s1->size - 1;
 774         const size_t len2 = s2->size - 1;
 775
 776         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 777         memcpy(concat, s1->begin, len1);
 778         memcpy(concat + len1, s2->begin, len2 + 1);
 779
 780 #if 0 /* TODO hash */
 781         const char *result = strset_insert(&stringset, concat);
 782         if(result != concat) {
 783                 obstack_free(&symbol_obstack, concat);
 784         }
 785
 786         return result;
 787 #else
 788         return (string_t){ concat, len1 + len2 + 1 };
 789 #endif
 790 }
 791
 792 /**
 793  * Concatenate a string and a wide string.
 794  */
 795 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
 796 {
 797         const size_t len1 = s1->size - 1;
 798         const size_t len2 = s2->size - 1;
 799
 800         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
 801         const char *const src = s1->begin;
 802         for (size_t i = 0; i != len1; ++i) {
 803                 concat[i] = src[i];
 804         }
 805         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
 806
 807         return (wide_string_t){ concat, len1 + len2 + 1 };
 808 }
 809
 810 /**
 811  * Concatenate two wide strings.
 812  */
 813 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
 814 {
 815         const size_t len1 = s1->size - 1;
 816         const size_t len2 = s2->size - 1;
 817
 818         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
 819         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
 820         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
 821
 822         return (wide_string_t){ concat, len1 + len2 + 1 };
 823 }
 824
 825 /**
 826  * Concatenate a wide string and a string.
 827  */
 828 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
 829 {
 830         const size_t len1 = s1->size - 1;
 831         const size_t len2 = s2->size - 1;
 832
 833         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
 834         memcpy(concat, s1->begin, len1 * sizeof(*concat));
 835         const char *const src = s2->begin;
 836         for (size_t i = 0; i != len2 + 1; ++i) {
 837                 concat[i] = src[i];
 838         }
 839
 840         return (wide_string_t){ concat, len1 + len2 + 1 };
 841 }
 842
 843 /**
 844  * Parse a string literal and set lexer_token.
 845  */
 846 static void parse_string_literal(void)
 847 {
 848         const unsigned start_linenr = lexer_token.source_position.linenr;
 849
 850         eat('"');
 851
 852         int tc;
 853         while(1) {
 854                 switch(c) {
 855                 case '\\':
 856                         tc = parse_escape_sequence();
 857                         obstack_1grow(&symbol_obstack, (char) tc);
 858                         break;
 859
 860                 case EOF: {
 861                         source_position_t source_position;
 862                         source_position.input_name = lexer_token.source_position.input_name;
 863                         source_position.linenr     = start_linenr;
 864                         errorf(source_position, "string has no end");
 865                         lexer_token.type = T_ERROR;
 866                         return;
 867                 }
 868
 869                 case '"':
 870                         next_char();
 871                         goto end_of_string;
 872
 873                 default:
 874                         obstack_1grow(&symbol_obstack, (char) c);
 875                         next_char();
 876                         break;
 877                 }
 878         }
 879
 880 end_of_string:
 881
 882         /* TODO: concatenate multiple strings separated by whitespace... */
 883
 884         /* add finishing 0 to the string */
 885         obstack_1grow(&symbol_obstack, '\0');
 886         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
 887         const char *const string = obstack_finish(&symbol_obstack);
 888
 889 #if 0 /* TODO hash */
 890         /* check if there is already a copy of the string */
 891         result = strset_insert(&stringset, string);
 892         if(result != string) {
 893                 obstack_free(&symbol_obstack, string);
 894         }
 895 #else
 896         const char *const result = string;
 897 #endif
 898
 899         lexer_token.type           = T_STRING_LITERAL;
 900         lexer_token.v.string.begin = result;
 901         lexer_token.v.string.size  = size;
 902 }
 903
 904 /**
 905  * Parse a wide character constant and set lexer_token.
 906  */
 907 static void parse_wide_character_constant(void)
 908 {
 909         eat('\'');
 910
 911         int found_char = 0;
 912         while(1) {
 913                 switch(c) {
 914                 case '\\':
 915                         found_char = parse_escape_sequence();
 916                         break;
 917
 918                 MATCH_NEWLINE(
 919                         parse_error("newline while parsing character constant");
 920                         break;
 921                 )
 922
 923                 case '\'':
 924                         next_char();
 925                         goto end_of_wide_char_constant;
 926
 927                 case EOF:
 928                         parse_error("EOF while parsing character constant");
 929                         lexer_token.type = T_ERROR;
 930                         return;
 931
 932                 default:
 933                         if(found_char != 0) {
 934                                 parse_error("more than 1 characters in character "
 935                                             "constant");
 936                                 goto end_of_wide_char_constant;
 937                         } else {
 938                                 found_char = c;
 939                                 next_char();
 940                         }
 941                         break;
 942                 }
 943         }
 944
 945 end_of_wide_char_constant:
 946         lexer_token.type       = T_INTEGER;
 947         lexer_token.v.intvalue = found_char;
 948         lexer_token.datatype   = type_wchar_t;
 949 }
 950
 951 /**
 952  * Parse a wide string literal and set lexer_token.
 953  */
 954 static void parse_wide_string_literal(void)
 955 {
 956         const unsigned start_linenr = lexer_token.source_position.linenr;
 957
 958         assert(c == '"');
 959         next_char();
 960
 961         while(1) {
 962                 switch(c) {
 963                 case '\\': {
 964                         wchar_rep_t tc = parse_escape_sequence();
 965                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 966                         break;
 967                 }
 968
 969                 case EOF: {
 970                         source_position_t source_position;
 971                         source_position.input_name = lexer_token.source_position.input_name;
 972                         source_position.linenr     = start_linenr;
 973                         errorf(source_position, "string has no end");
 974                         lexer_token.type = T_ERROR;
 975                         return;
 976                 }
 977
 978                 case '"':
 979                         next_char();
 980                         goto end_of_string;
 981
 982                 default: {
 983                         wchar_rep_t tc = c;
 984                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
 985                         next_char();
 986                         break;
 987                 }
 988                 }
 989         }
 990
 991 end_of_string:;
 992
 993         /* TODO: concatenate multiple strings separated by whitespace... */
 994
 995         /* add finishing 0 to the string */
 996         wchar_rep_t nul = L'\0';
 997         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
 998         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
 999         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1000
1001 #if 0 /* TODO hash */
1002         /* check if there is already a copy of the string */
1003         const wchar_rep_t *const result = strset_insert(&stringset, string);
1004         if(result != string) {
1005                 obstack_free(&symbol_obstack, string);
1006         }
1007 #else
1008         const wchar_rep_t *const result = string;
1009 #endif
1010
1011         lexer_token.type                = T_WIDE_STRING_LITERAL;
1012         lexer_token.v.wide_string.begin = result;
1013         lexer_token.v.wide_string.size  = size;
1014 }
1015
1016 /**
1017  * Parse a character constant and set lexer_token.
1018  */
1019 static void parse_character_constant(void)
1020 {
1021         const unsigned start_linenr = lexer_token.source_position.linenr;
1022
1023         eat('\'');
1024
1025         int tc;
1026         while(1) {
1027                 switch(c) {
1028                 case '\\':
1029                         tc = parse_escape_sequence();
1030                         obstack_1grow(&symbol_obstack, (char) tc);
1031                         break;
1032
1033                 MATCH_NEWLINE(
1034                         parse_error("newline while parsing character constant");
1035                         break;
1036                 )
1037
1038                 case EOF: {
1039                         source_position_t source_position;
1040                         source_position.input_name = lexer_token.source_position.input_name;
1041                         source_position.linenr     = start_linenr;
1042                         errorf(source_position, "EOF while parsing character constant");
1043                         lexer_token.type = T_ERROR;
1044                         return;
1045                 }
1046
1047                 case '\'':
1048                         next_char();
1049                         goto end_of_char_constant;
1050
1051                 default:
1052                         obstack_1grow(&symbol_obstack, (char) c);
1053                         next_char();
1054                         break;
1055
1056                 }
1057         }
1058
1059 end_of_char_constant:;
1060         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1061         const char *const string = obstack_finish(&symbol_obstack);
1062
1063         lexer_token.type           = T_CHARS;
1064         lexer_token.v.string.begin = string;
1065         lexer_token.v.string.size  = size;
1066         lexer_token.datatype       = type_int;
1067 }
1068
1069 /**
1070  * Skip a multiline comment.
1071  */
1072 static void skip_multiline_comment(void)
1073 {
1074         unsigned start_linenr = lexer_token.source_position.linenr;
1075
1076         while(1) {
1077                 switch(c) {
1078                 case '/':
1079                         next_char();
1080                         if (c == '*') {
1081                                 /* TODO: nested comment, warn here */
1082                         }
1083                         break;
1084                 case '*':
1085                         next_char();
1086                         if(c == '/') {
1087                                 next_char();
1088                                 return;
1089                         }
1090                         break;
1091
1092                 MATCH_NEWLINE(break;)
1093
1094                 case EOF: {
1095                         source_position_t source_position;
1096                         source_position.input_name = lexer_token.source_position.input_name;
1097                         source_position.linenr     = start_linenr;
1098                         errorf(source_position, "at end of file while looking for comment end");
1099                         return;
1100                 }
1101
1102                 default:
1103                         next_char();
1104                         break;
1105                 }
1106         }
1107 }
1108
1109 /**
1110  * Skip a single line comment.
1111  */
1112 static void skip_line_comment(void)
1113 {
1114         while(1) {
1115                 switch(c) {
1116                 case EOF:
1117                         return;
1118
1119                 case '\n':
1120                 case '\r':
1121                         return;
1122
1123                 default:
1124                         next_char();
1125                         break;
1126                 }
1127         }
1128 }
1129
1130 /** The current preprocessor token. */
1131 static token_t pp_token;
1132
1133 /**
1134  * Read the next preprocessor token.
1135  */
1136 static inline void next_pp_token(void)
1137 {
1138         lexer_next_preprocessing_token();
1139         pp_token = lexer_token;
1140 }
1141
1142 /**
1143  * Eat all preprocessor tokens until newline.
1144  */
1145 static void eat_until_newline(void)
1146 {
1147         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1148                 next_pp_token();
1149         }
1150 }
1151
1152 /**
1153  * Handle the define directive.
1154  */
1155 static void define_directive(void)
1156 {
1157         lexer_next_preprocessing_token();
1158         if(lexer_token.type != T_IDENTIFIER) {
1159                 parse_error("expected identifier after #define\n");
1160                 eat_until_newline();
1161         }
1162 }
1163
1164 /**
1165  * Handle the ifdef directive.
1166  */
1167 static void ifdef_directive(int is_ifndef)
1168 {
1169         (void) is_ifndef;
1170         lexer_next_preprocessing_token();
1171         //expect_identifier();
1172         //extect_newline();
1173 }
1174
1175 /**
1176  * Handle the endif directive.
1177  */
1178 static void endif_directive(void)
1179 {
1180         //expect_newline();
1181 }
1182
1183 /**
1184  * Parse the line directive.
1185  */
1186 static void parse_line_directive(void)
1187 {
1188         if(pp_token.type != T_INTEGER) {
1189                 parse_error("expected integer");
1190         } else {
1191                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1192                 next_pp_token();
1193         }
1194         if(pp_token.type == T_STRING_LITERAL) {
1195                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1196                 next_pp_token();
1197         }
1198
1199         eat_until_newline();
1200 }
1201
1202 /**
1203  * STDC pragmas.
1204  */
1205 typedef enum {
1206         STDC_UNKNOWN,
1207         STDC_FP_CONTRACT,
1208         STDC_FENV_ACCESS,
1209         STDC_CX_LIMITED_RANGE
1210 } stdc_pragma_kind_t;
1211
1212 /**
1213  * STDC pragma values.
1214  */
1215 typedef enum {
1216         STDC_VALUE_UNKNOWN,
1217         STDC_VALUE_ON,
1218         STDC_VALUE_OFF,
1219         STDC_VALUE_DEFAULT
1220 } stdc_pragma_value_kind_t;
1221
1222 /**
1223  * Parse a pragma directive.
1224  */
1225 static void parse_pragma(void) {
1226         bool unknown_pragma = true;
1227
1228         next_pp_token();
1229         if (pp_token.v.symbol->pp_ID == TP_STDC) {
1230                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1231                 /* a STDC pragma */
1232                 if (c_mode & _C99) {
1233                         next_pp_token();
1234
1235                         switch (pp_token.v.symbol->pp_ID) {
1236                         case TP_FP_CONTRACT:
1237                                 kind = STDC_FP_CONTRACT;
1238                                 break;
1239                         case TP_FENV_ACCESS:
1240                                 kind = STDC_FENV_ACCESS;
1241                                 break;
1242                         case TP_CX_LIMITED_RANGE:
1243                                 kind = STDC_CX_LIMITED_RANGE;
1244                                 break;
1245                         default:
1246                                 break;
1247                         }
1248                         if (kind != STDC_UNKNOWN) {
1249                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1250                                 next_pp_token();
1251                                 switch (pp_token.v.symbol->pp_ID) {
1252                                 case TP_ON:
1253                                         value = STDC_VALUE_ON;
1254                                         break;
1255                                 case TP_OFF:
1256                                         value = STDC_VALUE_OFF;
1257                                         break;
1258                                 case TP_DEFAULT:
1259                                         value = STDC_VALUE_DEFAULT;
1260                                         break;
1261                                 default:
1262                                         break;
1263                                 }
1264                                 if (value != STDC_VALUE_UNKNOWN) {
1265                                         unknown_pragma = false;
1266                                 } else {
1267                                         errorf(pp_token.source_position, "bad STDC pragma argument");
1268                                 }
1269                         }
1270                 }
1271         } else {
1272                 unknown_pragma = true;
1273         }
1274         eat_until_newline();
1275         if (unknown_pragma && warning.unknown_pragmas) {
1276                 warningf(pp_token.source_position, "encountered unknown #pragma");
1277         }
1278 }
1279
1280 /**
1281  * Parse a preprocessor non-null directive.
1282  */
1283 static void parse_preprocessor_identifier(void)
1284 {
1285         assert(pp_token.type == T_IDENTIFIER);
1286         symbol_t *symbol = pp_token.v.symbol;
1287
1288         switch(symbol->pp_ID) {
1289         case TP_include:
1290                 printf("include - enable header name parsing!\n");
1291                 break;
1292         case TP_define:
1293                 define_directive();
1294                 break;
1295         case TP_ifdef:
1296                 ifdef_directive(0);
1297                 break;
1298         case TP_ifndef:
1299                 ifdef_directive(1);
1300                 break;
1301         case TP_endif:
1302                 endif_directive();
1303                 break;
1304         case TP_line:
1305                 next_pp_token();
1306                 parse_line_directive();
1307                 break;
1308         case TP_if:
1309         case TP_else:
1310         case TP_elif:
1311         case TP_undef:
1312         case TP_error:
1313                 /* TODO; output the rest of the line */
1314                 parse_error("#error directive: ");
1315                 break;
1316         case TP_pragma:
1317                 parse_pragma();
1318                 break;
1319         }
1320 }
1321
1322 /**
1323  * Parse a preprocessor directive.
1324  */
1325 static void parse_preprocessor_directive(void)
1326 {
1327         next_pp_token();
1328
1329         switch(pp_token.type) {
1330         case T_IDENTIFIER:
1331                 parse_preprocessor_identifier();
1332                 break;
1333         case T_INTEGER:
1334                 parse_line_directive();
1335                 break;
1336         case '\n':
1337                 /* NULL directive, see § 6.10.7 */
1338                 break;
1339         default:
1340                 parse_error("invalid preprocessor directive");
1341                 eat_until_newline();
1342                 break;
1343         }
1344 }
1345
1346 #define MAYBE_PROLOG                                       \
1347                         next_char();                                   \
1348                         while(1) {                                     \
1349                                 switch(c) {
1350
1351 #define MAYBE(ch, set_type)                                \
1352                                 case ch:                                   \
1353                                         next_char();                           \
1354                                         lexer_token.type = set_type;           \
1355                                         return;
1356
1357 #define ELSE_CODE(code)                                    \
1358                                 default:                                   \
1359                                         code;                                  \
1360                                 }                                          \
1361                         } /* end of while(1) */                        \
1362                         break;
1363
1364 #define ELSE(set_type)                                     \
1365                 ELSE_CODE(                                         \
1366                         lexer_token.type = set_type;                   \
1367                         return;                                        \
1368                 )
1369
1370 void lexer_next_preprocessing_token(void)
1371 {
1372         while(1) {
1373                 switch(c) {
1374                 case ' ':
1375                 case '\t':
1376                         next_char();
1377                         break;
1378
1379                 MATCH_NEWLINE(
1380                         lexer_token.type = '\n';
1381                         return;
1382                 )
1383
1384                 SYMBOL_CHARS
1385                         parse_symbol();
1386                         /* might be a wide string ( L"string" ) */
1387                         if(lexer_token.type == T_IDENTIFIER &&
1388                             lexer_token.v.symbol == symbol_L) {
1389                             if(c == '"') {
1390                                         parse_wide_string_literal();
1391                                 } else if(c == '\'') {
1392                                         parse_wide_character_constant();
1393                                 }
1394                         }
1395                         return;
1396
1397                 DIGITS
1398                         parse_number();
1399                         return;
1400
1401                 case '"':
1402                         parse_string_literal();
1403                         return;
1404
1405                 case '\'':
1406                         parse_character_constant();
1407                         return;
1408
1409                 case '.':
1410                         MAYBE_PROLOG
1411                                 case '0':
1412                                 case '1':
1413                                 case '2':
1414                                 case '3':
1415                                 case '4':
1416                                 case '5':
1417                                 case '6':
1418                                 case '7':
1419                                 case '8':
1420                                 case '9':
1421                                         put_back(c);
1422                                         c = '.';
1423                                         parse_number_dec();
1424                                         return;
1425
1426                                 case '.':
1427                                         MAYBE_PROLOG
1428                                         MAYBE('.', T_DOTDOTDOT)
1429                                         ELSE_CODE(
1430                                                 put_back(c);
1431                                                 c = '.';
1432                                                 lexer_token.type = '.';
1433                                                 return;
1434                                         )
1435                         ELSE('.')
1436                 case '&':
1437                         MAYBE_PROLOG
1438                         MAYBE('&', T_ANDAND)
1439                         MAYBE('=', T_ANDEQUAL)
1440                         ELSE('&')
1441                 case '*':
1442                         MAYBE_PROLOG
1443                         MAYBE('=', T_ASTERISKEQUAL)
1444                         ELSE('*')
1445                 case '+':
1446                         MAYBE_PROLOG
1447                         MAYBE('+', T_PLUSPLUS)
1448                         MAYBE('=', T_PLUSEQUAL)
1449                         ELSE('+')
1450                 case '-':
1451                         MAYBE_PROLOG
1452                         MAYBE('>', T_MINUSGREATER)
1453                         MAYBE('-', T_MINUSMINUS)
1454                         MAYBE('=', T_MINUSEQUAL)
1455                         ELSE('-')
1456                 case '!':
1457                         MAYBE_PROLOG
1458                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1459                         ELSE('!')
1460                 case '/':
1461                         MAYBE_PROLOG
1462                         MAYBE('=', T_SLASHEQUAL)
1463                                 case '*':
1464                                         next_char();
1465                                         skip_multiline_comment();
1466                                         lexer_next_preprocessing_token();
1467                                         return;
1468                                 case '/':
1469                                         next_char();
1470                                         skip_line_comment();
1471                                         lexer_next_preprocessing_token();
1472                                         return;
1473                         ELSE('/')
1474                 case '%':
1475                         MAYBE_PROLOG
1476                         MAYBE('>', T_PERCENTGREATER)
1477                         MAYBE('=', T_PERCENTEQUAL)
1478                                 case ':':
1479                                         MAYBE_PROLOG
1480                                                 case '%':
1481                                                         MAYBE_PROLOG
1482                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1483                                                         ELSE_CODE(
1484                                                                 put_back(c);
1485                                                                 c = '%';
1486                                                                 lexer_token.type = T_PERCENTCOLON;
1487                                                                 return;
1488                                                         )
1489                                         ELSE(T_PERCENTCOLON)
1490                         ELSE('%')
1491                 case '<':
1492                         MAYBE_PROLOG
1493                         MAYBE(':', T_LESSCOLON)
1494                         MAYBE('%', T_LESSPERCENT)
1495                         MAYBE('=', T_LESSEQUAL)
1496                                 case '<':
1497                                         MAYBE_PROLOG
1498                                         MAYBE('=', T_LESSLESSEQUAL)
1499                                         ELSE(T_LESSLESS)
1500                         ELSE('<')
1501                 case '>':
1502                         MAYBE_PROLOG
1503                         MAYBE('=', T_GREATEREQUAL)
1504                                 case '>':
1505                                         MAYBE_PROLOG
1506                                         MAYBE('=', T_GREATERGREATEREQUAL)
1507                                         ELSE(T_GREATERGREATER)
1508                         ELSE('>')
1509                 case '^':
1510                         MAYBE_PROLOG
1511                         MAYBE('=', T_CARETEQUAL)
1512                         ELSE('^')
1513                 case '|':
1514                         MAYBE_PROLOG
1515                         MAYBE('=', T_PIPEEQUAL)
1516                         MAYBE('|', T_PIPEPIPE)
1517                         ELSE('|')
1518                 case ':':
1519                         MAYBE_PROLOG
1520                         MAYBE('>', T_COLONGREATER)
1521                         ELSE(':')
1522                 case '=':
1523                         MAYBE_PROLOG
1524                         MAYBE('=', T_EQUALEQUAL)
1525                         ELSE('=')
1526                 case '#':
1527                         MAYBE_PROLOG
1528                         MAYBE('#', T_HASHHASH)
1529                         ELSE('#')
1530
1531                 case '?':
1532                 case '[':
1533                 case ']':
1534                 case '(':
1535                 case ')':
1536                 case '{':
1537                 case '}':
1538                 case '~':
1539                 case ';':
1540                 case ',':
1541                 case '\\':
1542                         lexer_token.type = c;
1543                         next_char();
1544                         return;
1545
1546                 case EOF:
1547                         lexer_token.type = T_EOF;
1548                         return;
1549
1550                 default:
1551                         next_char();
1552                         errorf(lexer_token.source_position, "unknown character '%c' found\n", c);
1553                         lexer_token.type = T_ERROR;
1554                         return;
1555                 }
1556         }
1557 }
1558
1559 void lexer_next_token(void)
1560 {
1561         lexer_next_preprocessing_token();
1562         if(lexer_token.type != '\n')
1563                 return;
1564
1565 newline_found:
1566         do {
1567                 lexer_next_preprocessing_token();
1568         } while(lexer_token.type == '\n');
1569
1570         if(lexer_token.type == '#') {
1571                 parse_preprocessor_directive();
1572                 goto newline_found;
1573         }
1574 }
1575
1576 void init_lexer(void)
1577 {
1578         strset_init(&stringset);
1579 }
1580
1581 void lexer_open_stream(FILE *stream, const char *input_name)
1582 {
1583         input                                  = stream;
1584         lexer_token.source_position.linenr     = 0;
1585         lexer_token.source_position.input_name = input_name;
1586
1587         symbol_L = symbol_table_insert("L");
1588         bufpos = NULL;
1589         bufend = NULL;
1590
1591         /* place a virtual \n at the beginning so the lexer knows that we're
1592          * at the beginning of a line */
1593         c = '\n';
1594 }
1595
1596 void exit_lexer(void)
1597 {
1598         strset_destroy(&stringset);
1599 }
1600
1601 static __attribute__((unused))
1602 void dbg_pos(const source_position_t source_position)
1603 {
1604         fprintf(stdout, "%s:%u\n", source_position.input_name,
1605                 source_position.linenr);
1606         fflush(stdout);
1607 }