nsz Git - cparser/blob - lexer.c

   1 /*
   2  * This file is part of cparser.
   3  * Copyright (C) 2007-2008 Matthias Braun <matze@braunis.de>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18  * 02111-1307, USA.
  19  */
  20 #include <config.h>
  21
  22 #include "diagnostic.h"
  23 #include "lexer.h"
  24 #include "symbol_t.h"
  25 #include "token_t.h"
  26 #include "symbol_table_t.h"
  27 #include "adt/error.h"
  28 #include "adt/strset.h"
  29 #include "adt/util.h"
  30 #include "types.h"
  31 #include "type_t.h"
  32 #include "target_architecture.h"
  33 #include "parser.h"
  34 #include "warning.h"
  35 #include "lang_features.h"
  36
  37 #include <assert.h>
  38 #include <errno.h>
  39 #include <string.h>
  40 #include <strings.h>
  41 #include <stdbool.h>
  42 #include <ctype.h>
  43
/* Uncomment to make put_back() and next_char() print every character
 * they handle. */
//#define DEBUG_CHARS
/* Number of slots kept free at the start of buf: the decoders store
 * decoded input from buf + MAX_PUTBACK onwards, which leaves room for
 * put_back() to push characters back in front of it. */
#define MAX_PUTBACK 3
/* Size in bytes of the raw blocks the decoders read from the input. */
#define BUF_SIZE    1024

#if defined(_WIN32) || defined(__CYGWIN__)
/* No strtold on windows and no replacement yet */
#define strtold(s, e) strtod(s, e)
#endif

/* One decoded input character (the UTF-8 decoder stores Unicode code
 * points in it). */
typedef unsigned int utf32;

/* Current input character; set by next_real_char() / next_char(). */
static utf32        c;
/* The token the lexer is currently building. */
token_t             lexer_token;
/* NOTE(review): not referenced in this part of the file; presumably the
 * symbol for the wide literal prefix 'L' -- confirm. */
symbol_t           *symbol_L;
/* Stream the decoders read from; NULL makes next_real_char() yield EOF
 * once the buffer is drained. */
static FILE        *input;
/* Decoded characters, preceded by MAX_PUTBACK put-back slots. */
static utf32        buf[BUF_SIZE + MAX_PUTBACK];
/* One past the last valid character in buf. */
static const utf32 *bufend;
/* Next character of buf to be read. */
static const utf32 *bufpos;
/* NOTE(review): not referenced in this part of the file. */
static strset_t     stringset;
/* When false, SYMBOL_CHARS does not treat '$' as a symbol character. */
bool                allow_dollar_in_symbol = true;
  64
  65 /**
  66  * Prints a parse error message at the current token.
  67  *
  68  * @param msg   the error message
  69  */
  70 static void parse_error(const char *msg)
  71 {
  72         errorf(&lexer_token.source_position, "%s", msg);
  73 }
  74
  75 /**
  76  * Prints an internal error message at the current token.
  77  *
  78  * @param msg   the error message
  79  */
  80 static NORETURN internal_error(const char *msg)
  81 {
  82         internal_errorf(&lexer_token.source_position, "%s", msg);
  83 }
  84
  85 static size_t read_block(unsigned char *const read_buf, size_t const n)
  86 {
  87         size_t const s = fread(read_buf, 1, n, input);
  88         if (s == 0) {
  89                 if (ferror(input))
  90                         parse_error("read from input failed");
  91                 buf[MAX_PUTBACK] = EOF;
  92                 bufpos           = buf + MAX_PUTBACK;
  93                 bufend           = buf + MAX_PUTBACK + 1;
  94         }
  95         return s;
  96 }
  97
  98 static void decode_iso_8859_1(void)
  99 {
 100         unsigned char read_buf[BUF_SIZE];
 101         size_t const s = read_block(read_buf, sizeof(read_buf));
 102         if (s == 0)
 103                 return;
 104
 105         unsigned char const *src = read_buf;
 106         unsigned char const *end = read_buf + s;
 107         utf32               *dst = buf + MAX_PUTBACK;
 108         while (src != end)
 109                 *dst++ = *src++;
 110
 111         bufpos = buf + MAX_PUTBACK;
 112         bufend = dst;
 113 }
 114
 115 static void decode_iso_8859_15(void)
 116 {
 117         unsigned char read_buf[BUF_SIZE];
 118         size_t const s = read_block(read_buf, sizeof(read_buf));
 119         if (s == 0)
 120                 return;
 121
 122         unsigned char const *src = read_buf;
 123         unsigned char const *end = read_buf + s;
 124         utf32               *dst = buf + MAX_PUTBACK;
 125         while (src != end) {
 126                 utf32 tc = *src++;
 127                 switch (tc) {
 128                         case 0xA4: tc = 0x20AC; break; // €
 129                         case 0xA6: tc = 0x0160; break; // Š
 130                         case 0xA8: tc = 0x0161; break; // š
 131                         case 0xB4: tc = 0x017D; break; // Ž
 132                         case 0xB8: tc = 0x017E; break; // ž
 133                         case 0xBC: tc = 0x0152; break; // Œ
 134                         case 0xBD: tc = 0x0153; break; // œ
 135                         case 0xBE: tc = 0x0178; break; // Ÿ
 136                 }
 137                 *dst++ = tc;
 138         }
 139
 140         bufpos = buf + MAX_PUTBACK;
 141         bufend = dst;
 142 }
 143
 144 static void decode_utf8(void)
 145 {
 146         static utf32  part_decoded_min_code;
 147         static utf32  part_decoded_char;
 148         static size_t part_decoded_rest_len;
 149
 150         do {
 151                 unsigned char read_buf[BUF_SIZE];
 152                 size_t const s = read_block(read_buf, sizeof(read_buf));
 153                 if (s == 0) {
 154                         if (part_decoded_rest_len > 0)
 155                                 parse_error("incomplete input char at end of input");
 156                         return;
 157                 }
 158
 159                 unsigned char const *src = read_buf;
 160                 unsigned char const *end = read_buf + s;
 161                 utf32               *dst = buf + MAX_PUTBACK;
 162                 utf32                decoded;
 163                 utf32                min_code;
 164
 165                 if (part_decoded_rest_len != 0) {
 166                         min_code              = part_decoded_min_code;
 167                         decoded               = part_decoded_char;
 168                         size_t const rest_len = part_decoded_rest_len;
 169                         part_decoded_rest_len = 0;
 170                         switch (rest_len) {
 171                                 case 4:  goto realign;
 172                                 case 3:  goto three_more;
 173                                 case 2:  goto two_more;
 174                                 default: goto one_more;
 175                         }
 176                 }
 177
 178                 while (src != end) {
 179                         if ((*src & 0x80) == 0) {
 180                                 decoded = *src++;
 181                         } else if ((*src & 0xE0) == 0xC0) {
 182                                 min_code = 0x80;
 183                                 decoded  = *src++ & 0x1F;
 184 one_more:
 185                                 if (src == end) {
 186                                         part_decoded_min_code = min_code;
 187                                         part_decoded_char     = decoded;
 188                                         part_decoded_rest_len = 1;
 189                                         break;
 190                                 }
 191                                 if ((*src & 0xC0) == 0x80) {
 192                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 193                                 } else {
 194                                         goto invalid_char;
 195                                 }
 196                                 if (decoded < min_code                      ||
 197                                                 decoded > 0x10FFFF                      ||
 198                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
 199                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
 200                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
 201                                         parse_error("invalid byte sequence in input");
 202                                 }
 203                         } else if ((*src & 0xF0) == 0xE0) {
 204                                 min_code = 0x800;
 205                                 decoded  = *src++ & 0x0F;
 206 two_more:
 207                                 if (src == end) {
 208                                         part_decoded_min_code = min_code;
 209                                         part_decoded_char     = decoded;
 210                                         part_decoded_rest_len = 2;
 211                                         break;
 212                                 }
 213                                 if ((*src & 0xC0) == 0x80) {
 214                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 215                                 } else {
 216                                         goto invalid_char;
 217                                 }
 218                                 goto one_more;
 219                         } else if ((*src & 0xF8) == 0xF0) {
 220                                 min_code = 0x10000;
 221                                 decoded  = *src++ & 0x07;
 222 three_more:
 223                                 if (src == end) {
 224                                         part_decoded_min_code = min_code;
 225                                         part_decoded_char     = decoded;
 226                                         part_decoded_rest_len = 3;
 227                                         break;
 228                                 }
 229                                 if ((*src & 0xC0) == 0x80) {
 230                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 231                                 } else {
 232                                         goto invalid_char;
 233                                 }
 234                                 goto two_more;
 235                         } else {
 236 invalid_char:
 237                                 parse_error("invalid byte sequence in input");
 238 realign:
 239                                 do {
 240                                         ++src;
 241                                         if (src == end) {
 242                                                 part_decoded_rest_len = 4;
 243                                                 break;
 244                                         }
 245                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
 246                                 continue;
 247                         }
 248                         *dst++ = decoded;
 249                 }
 250
 251                 bufpos = buf + MAX_PUTBACK;
 252                 bufend = dst;
 253         } while (bufpos == bufend);
 254 }
 255
/* A decoder refills the character buffer from the input stream. */
typedef void (*decoder_t)(void);

/* The decoder used by next_real_char(); UTF-8 unless another one is
 * chosen with select_input_encoding(). */
static decoder_t decoder = decode_utf8;

/* Associates an encoding name with the decoder that handles it. */
typedef struct named_decoder_t {
	char const *name;
	decoder_t   decoder;
} named_decoder_t;

/* All supported encoding names; the table is terminated by an entry
 * whose name is NULL. Names are compared case-insensitively. */
static named_decoder_t const decoders[] = {
	{ "CP819",           decode_iso_8859_1  }, // official alias
	{ "IBM819",          decode_iso_8859_1  }, // official alias
	{ "ISO-8859-1",      decode_iso_8859_1  }, // official alias
	{ "ISO-8859-15",     decode_iso_8859_15 }, // official name
	{ "ISO8859-1",       decode_iso_8859_1  },
	{ "ISO8859-15",      decode_iso_8859_15 },
	{ "ISO_8859-1",      decode_iso_8859_1  }, // official alias
	{ "ISO_8859-15",     decode_iso_8859_15 }, // official alias
	{ "ISO_8859-1:1987", decode_iso_8859_1  }, // official name
	{ "Latin-9",         decode_iso_8859_15 }, // official alias
	{ "UTF-8",           decode_utf8        }, // official name
	{ "csISOLatin1",     decode_iso_8859_1  }, // official alias
	{ "iso-ir-100",      decode_iso_8859_1  }, // official alias
	{ "l1",              decode_iso_8859_1  }, // official alias
	{ "latin1",          decode_iso_8859_1  }, // official alias

	{ NULL,              NULL               }
};
 284
 285 void select_input_encoding(char const* const encoding)
 286 {
 287         for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
 288                 if (strcasecmp(encoding, i->name) != 0)
 289                         continue;
 290                 decoder = i->decoder;
 291                 return;
 292         }
 293         fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
 294 }
 295
 296 static inline void next_real_char(void)
 297 {
 298         assert(bufpos <= bufend);
 299         if (bufpos >= bufend) {
 300                 if (input == NULL) {
 301                         c = EOF;
 302                         return;
 303                 }
 304                 decoder();
 305         }
 306         c = *bufpos++;
 307 }
 308
 309 /**
 310  * Put a character back into the buffer.
 311  *
 312  * @param pc  the character to put back
 313  */
 314 static inline void put_back(utf32 const pc)
 315 {
 316         assert(bufpos > buf);
 317         *(--bufpos - buf + buf) = pc;
 318
 319 #ifdef DEBUG_CHARS
 320         printf("putback '%lc'\n", pc);
 321 #endif
 322 }
 323
/* Defined further down; needed by the macros and maybe_concat_lines(). */
static inline void next_char(void);

/*
 * Expands to the two switch cases that handle a line ending: '\r'
 * (optionally followed by '\n', i.e. "\r\n" counts as one line ending)
 * and '\n'. Each case consumes the line ending, increments the line
 * number of lexer_token.source_position and then executes `code`.
 * Must be used inside a switch on c. `code` has to leave the case itself
 * (break, return, goto, ...), otherwise control falls through into the
 * following case.
 */
#define MATCH_NEWLINE(code)                   \
	case '\r':                                \
		next_char();                          \
		if(c == '\n') {                       \
			next_char();                      \
		}                                     \
		lexer_token.source_position.linenr++; \
		code                                  \
	case '\n':                                \
		next_char();                          \
		lexer_token.source_position.linenr++; \
		code

/* Consume the current character, asserting that it is c_type. */
#define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 340
 341 static void maybe_concat_lines(void)
 342 {
 343         eat('\\');
 344
 345         switch(c) {
 346         MATCH_NEWLINE(return;)
 347
 348         default:
 349                 break;
 350         }
 351
 352         put_back(c);
 353         c = '\\';
 354 }
 355
 356 /**
 357  * Set c to the next input character, ie.
 358  * after expanding trigraphs.
 359  */
 360 static inline void next_char(void)
 361 {
 362         next_real_char();
 363
 364         /* filter trigraphs */
 365         if(UNLIKELY(c == '\\')) {
 366                 maybe_concat_lines();
 367                 goto end_of_next_char;
 368         }
 369
 370         if(LIKELY(c != '?'))
 371                 goto end_of_next_char;
 372
 373         next_real_char();
 374         if(LIKELY(c != '?')) {
 375                 put_back(c);
 376                 c = '?';
 377                 goto end_of_next_char;
 378         }
 379
 380         next_real_char();
 381         switch(c) {
 382         case '=': c = '#'; break;
 383         case '(': c = '['; break;
 384         case '/': c = '\\'; maybe_concat_lines(); break;
 385         case ')': c = ']'; break;
 386         case '\'': c = '^'; break;
 387         case '<': c = '{'; break;
 388         case '!': c = '|'; break;
 389         case '>': c = '}'; break;
 390         case '-': c = '~'; break;
 391         default:
 392                 put_back(c);
 393                 put_back('?');
 394                 c = '?';
 395                 break;
 396         }
 397
 398 end_of_next_char:;
 399 #ifdef DEBUG_CHARS
 400         printf("nchar '%c'\n", c);
 401 #endif
 402 }
 403
/*
 * Expands to the switch cases for every character that may start a
 * symbol: the letters a-z and A-Z, '_' and -- if allow_dollar_in_symbol
 * is set -- '$'. When '$' is not allowed the '$' case jumps to the label
 * `dollar_sign`, which therefore has to exist in every function that
 * uses this macro; otherwise it falls through into the letter cases.
 */
#define SYMBOL_CHARS  \
	case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
	case 'a':         \
	case 'b':         \
	case 'c':         \
	case 'd':         \
	case 'e':         \
	case 'f':         \
	case 'g':         \
	case 'h':         \
	case 'i':         \
	case 'j':         \
	case 'k':         \
	case 'l':         \
	case 'm':         \
	case 'n':         \
	case 'o':         \
	case 'p':         \
	case 'q':         \
	case 'r':         \
	case 's':         \
	case 't':         \
	case 'u':         \
	case 'v':         \
	case 'w':         \
	case 'x':         \
	case 'y':         \
	case 'z':         \
	case 'A':         \
	case 'B':         \
	case 'C':         \
	case 'D':         \
	case 'E':         \
	case 'F':         \
	case 'G':         \
	case 'H':         \
	case 'I':         \
	case 'J':         \
	case 'K':         \
	case 'L':         \
	case 'M':         \
	case 'N':         \
	case 'O':         \
	case 'P':         \
	case 'Q':         \
	case 'R':         \
	case 'S':         \
	case 'T':         \
	case 'U':         \
	case 'V':         \
	case 'W':         \
	case 'X':         \
	case 'Y':         \
	case 'Z':         \
	case '_':

/* Expands to the switch cases for the decimal digits '0' to '9'. */
#define DIGITS        \
	case '0':         \
	case '1':         \
	case '2':         \
	case '3':         \
	case '4':         \
	case '5':         \
	case '6':         \
	case '7':         \
	case '8':         \
	case '9':
 471
 472 /**
 473  * Read a symbol from the input and build
 474  * the lexer_token.
 475  */
 476 static void parse_symbol(void)
 477 {
 478         symbol_t *symbol;
 479         char     *string;
 480
 481         obstack_1grow(&symbol_obstack, (char) c);
 482         next_char();
 483
 484         while(1) {
 485                 switch(c) {
 486                 DIGITS
 487                 SYMBOL_CHARS
 488                         obstack_1grow(&symbol_obstack, (char) c);
 489                         next_char();
 490                         break;
 491
 492                 default:
 493 dollar_sign:
 494                         goto end_symbol;
 495                 }
 496         }
 497
 498 end_symbol:
 499         obstack_1grow(&symbol_obstack, '\0');
 500
 501         string = obstack_finish(&symbol_obstack);
 502         symbol = symbol_table_insert(string);
 503
 504         lexer_token.type     = symbol->ID;
 505         lexer_token.v.symbol = symbol;
 506
 507         if(symbol->string != string) {
 508                 obstack_free(&symbol_obstack, string);
 509         }
 510 }
 511
 512 static void parse_integer_suffix(bool is_oct_hex)
 513 {
 514         bool is_unsigned     = false;
 515         bool min_long        = false;
 516         bool min_longlong    = false;
 517         bool not_traditional = false;
 518         int  pos             = 0;
 519         char suffix[4];
 520
 521         if (c == 'U' || c == 'u') {
 522                 not_traditional = true;
 523                 suffix[pos++]   = toupper(c);
 524                 is_unsigned     = true;
 525                 next_char();
 526                 if (c == 'L' || c == 'l') {
 527                         suffix[pos++] = toupper(c);
 528                         min_long = true;
 529                         next_char();
 530                         if (c == 'L' || c == 'l') {
 531                                 suffix[pos++] = toupper(c);
 532                                 min_longlong = true;
 533                                 next_char();
 534                         }
 535                 }
 536         } else if (c == 'l' || c == 'L') {
 537                 suffix[pos++] = toupper(c);
 538                 min_long = true;
 539                 next_char();
 540                 if (c == 'l' || c == 'L') {
 541                         not_traditional = true;
 542                         suffix[pos++]   = toupper(c);
 543                         min_longlong    = true;
 544                         next_char();
 545                         if (c == 'u' || c == 'U') {
 546                                 suffix[pos++] = toupper(c);
 547                                 is_unsigned   = true;
 548                                 next_char();
 549                         }
 550                 } else if (c == 'u' || c == 'U') {
 551                         not_traditional = true;
 552                         suffix[pos++]   = toupper(c);
 553                         is_unsigned     = true;
 554                         next_char();
 555                         lexer_token.datatype = type_unsigned_long;
 556                 }
 557         }
 558
 559         if (warning.traditional && not_traditional) {
 560                 suffix[pos] = '\0';
 561                 warningf(&lexer_token.source_position,
 562                         "traditional C rejects the '%s' suffix", suffix);
 563         }
 564         if (!is_unsigned) {
 565                 long long v = lexer_token.v.intvalue;
 566                 if (!min_long) {
 567                         if (v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 568                                 lexer_token.datatype = type_int;
 569                                 return;
 570                         } else if (is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 571                                 lexer_token.datatype = type_unsigned_int;
 572                                 return;
 573                         }
 574                 }
 575                 if (!min_longlong) {
 576                         if (v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 577                                 lexer_token.datatype = type_long;
 578                                 return;
 579                         } else if (is_oct_hex && v >= 0 && (unsigned long long)v <= (unsigned long long)TARGET_ULONG_MAX) {
 580                                 lexer_token.datatype = type_unsigned_long;
 581                                 return;
 582                         }
 583                 }
 584                 unsigned long long uv = (unsigned long long) v;
 585                 if (is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 586                         lexer_token.datatype = type_unsigned_long_long;
 587                         return;
 588                 }
 589
 590                 lexer_token.datatype = type_long_long;
 591         } else {
 592                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 593                 if (!min_long && v <= TARGET_UINT_MAX) {
 594                         lexer_token.datatype = type_unsigned_int;
 595                         return;
 596                 }
 597                 if (!min_longlong && v <= TARGET_ULONG_MAX) {
 598                         lexer_token.datatype = type_unsigned_long;
 599                         return;
 600                 }
 601                 lexer_token.datatype = type_unsigned_long_long;
 602         }
 603 }
 604
 605 static void parse_floating_suffix(void)
 606 {
 607         switch(c) {
 608         /* TODO: do something useful with the suffixes... */
 609         case 'f':
 610         case 'F':
 611                 if (warning.traditional) {
 612                         warningf(&lexer_token.source_position,
 613                                 "traditional C rejects the 'F' suffix");
 614                 }
 615                 next_char();
 616                 lexer_token.datatype = type_float;
 617                 break;
 618         case 'l':
 619         case 'L':
 620                 if (warning.traditional) {
 621                         warningf(&lexer_token.source_position,
 622                                 "traditional C rejects the 'F' suffix");
 623                 }
 624                 next_char();
 625                 lexer_token.datatype = type_long_double;
 626                 break;
 627         default:
 628                 lexer_token.datatype = type_double;
 629                 break;
 630         }
 631 }
 632
 633 /**
 634  * A replacement for strtoull. Only those parts needed for
 635  * our parser are implemented.
 636  */
 637 static unsigned long long parse_int_string(const char *s, const char **endptr, int base)
 638 {
 639         unsigned long long v = 0;
 640
 641         switch (base) {
 642         case 16:
 643                 for (;; ++s) {
 644                         /* check for overrun */
 645                         if (v >= 0x1000000000000000ULL)
 646                                 break;
 647                         switch (tolower(*s)) {
 648                         case '0': v <<= 4; break;
 649                         case '1': v <<= 4; v |= 0x1; break;
 650                         case '2': v <<= 4; v |= 0x2; break;
 651                         case '3': v <<= 4; v |= 0x3; break;
 652                         case '4': v <<= 4; v |= 0x4; break;
 653                         case '5': v <<= 4; v |= 0x5; break;
 654                         case '6': v <<= 4; v |= 0x6; break;
 655                         case '7': v <<= 4; v |= 0x7; break;
 656                         case '8': v <<= 4; v |= 0x8; break;
 657                         case '9': v <<= 4; v |= 0x9; break;
 658                         case 'a': v <<= 4; v |= 0xa; break;
 659                         case 'b': v <<= 4; v |= 0xb; break;
 660                         case 'c': v <<= 4; v |= 0xc; break;
 661                         case 'd': v <<= 4; v |= 0xd; break;
 662                         case 'e': v <<= 4; v |= 0xe; break;
 663                         case 'f': v <<= 4; v |= 0xf; break;
 664                         default:
 665                                 goto end;
 666                         }
 667                 }
 668                 break;
 669         case 8:
 670                 for (;; ++s) {
 671                         /* check for overrun */
 672                         if (v >= 0x2000000000000000ULL)
 673                                 break;
 674                         switch (tolower(*s)) {
 675                         case '0': v <<= 3; break;
 676                         case '1': v <<= 3; v |= 1; break;
 677                         case '2': v <<= 3; v |= 2; break;
 678                         case '3': v <<= 3; v |= 3; break;
 679                         case '4': v <<= 3; v |= 4; break;
 680                         case '5': v <<= 3; v |= 5; break;
 681                         case '6': v <<= 3; v |= 6; break;
 682                         case '7': v <<= 3; v |= 7; break;
 683                         default:
 684                                 goto end;
 685                         }
 686                 }
 687                 break;
 688         case 10:
 689                 for (;; ++s) {
 690                         /* check for overrun */
 691                         if (v > 0x1999999999999999ULL)
 692                                 break;
 693                         switch (tolower(*s)) {
 694                         case '0': v *= 10; break;
 695                         case '1': v *= 10; v += 1; break;
 696                         case '2': v *= 10; v += 2; break;
 697                         case '3': v *= 10; v += 3; break;
 698                         case '4': v *= 10; v += 4; break;
 699                         case '5': v *= 10; v += 5; break;
 700                         case '6': v *= 10; v += 6; break;
 701                         case '7': v *= 10; v += 7; break;
 702                         case '8': v *= 10; v += 8; break;
 703                         case '9': v *= 10; v += 9; break;
 704                         default:
 705                                 goto end;
 706                         }
 707                 }
 708                 break;
 709         default:
 710                 assert(0);
 711                 break;
 712         }
 713 end:
 714         *endptr = s;
 715         return v;
 716 }
 717
 718 /**
 719  * Parses a hex number including hex floats and set the
 720  * lexer_token.
 721  */
 722 static void parse_number_hex(void)
 723 {
 724         bool is_float = false;
 725         assert(c == 'x' || c == 'X');
 726         next_char();
 727
 728         obstack_1grow(&symbol_obstack, '0');
 729         obstack_1grow(&symbol_obstack, 'x');
 730
 731         while(isxdigit(c)) {
 732                 obstack_1grow(&symbol_obstack, (char) c);
 733                 next_char();
 734         }
 735
 736         if (c == '.') {
 737                 obstack_1grow(&symbol_obstack, (char) c);
 738                 next_char();
 739
 740                 while (isxdigit(c)) {
 741                         obstack_1grow(&symbol_obstack, (char) c);
 742                         next_char();
 743                 }
 744                 is_float = true;
 745         }
 746         if (c == 'p' || c == 'P') {
 747                 obstack_1grow(&symbol_obstack, (char) c);
 748                 next_char();
 749
 750                 if (c == '-' || c == '+') {
 751                         obstack_1grow(&symbol_obstack, (char) c);
 752                         next_char();
 753                 }
 754
 755                 while (isxdigit(c)) {
 756                         obstack_1grow(&symbol_obstack, (char) c);
 757                         next_char();
 758                 }
 759                 is_float = true;
 760         }
 761
 762         obstack_1grow(&symbol_obstack, '\0');
 763         char *string = obstack_finish(&symbol_obstack);
 764         if(*string == '\0') {
 765                 parse_error("invalid hex number");
 766                 lexer_token.type = T_ERROR;
 767                 obstack_free(&symbol_obstack, string);
 768                 return;
 769         }
 770
 771         if (is_float) {
 772                 char *endptr;
 773                 lexer_token.type         = T_FLOATINGPOINT;
 774                 lexer_token.v.floatvalue = strtold(string, &endptr);
 775
 776                 if(*endptr != '\0') {
 777                         parse_error("invalid hex float literal");
 778                 }
 779
 780                 parse_floating_suffix();
 781         } else {
 782                 const char *endptr;
 783                 lexer_token.type       = T_INTEGER;
 784                 lexer_token.v.intvalue = parse_int_string(string + 2, &endptr, 16);
 785                 if(*endptr != '\0') {
 786                         parse_error("hex number literal too long");
 787                 }
 788                 parse_integer_suffix(true);
 789         }
 790
 791         obstack_free(&symbol_obstack, string);
 792 }
 793
 794 /**
 795  * Returns true if the given char is a octal digit.
 796  *
 797  * @param char  the character to check
 798  */
 799 static inline bool is_octal_digit(utf32 chr)
 800 {
 801         switch(chr) {
 802         case '0':
 803         case '1':
 804         case '2':
 805         case '3':
 806         case '4':
 807         case '5':
 808         case '6':
 809         case '7':
 810                 return true;
 811         default:
 812                 return false;
 813         }
 814 }
 815
 816 /**
 817  * Parses a octal number and set the lexer_token.
 818  */
 819 static void parse_number_oct(void)
 820 {
 821         while(is_octal_digit(c)) {
 822                 obstack_1grow(&symbol_obstack, (char) c);
 823                 next_char();
 824         }
 825         obstack_1grow(&symbol_obstack, '\0');
 826         char *string = obstack_finish(&symbol_obstack);
 827
 828         const char *endptr;
 829         lexer_token.type       = T_INTEGER;
 830         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 831         if(*endptr != '\0') {
 832                 parse_error("octal number literal too long");
 833         }
 834
 835         obstack_free(&symbol_obstack, string);
 836         parse_integer_suffix(true);
 837 }
 838
 839 /**
 840  * Parses a decimal including float number and set the
 841  * lexer_token.
 842  */
 843 static void parse_number_dec(void)
 844 {
 845         bool is_float = false;
 846         while (isdigit(c)) {
 847                 obstack_1grow(&symbol_obstack, (char) c);
 848                 next_char();
 849         }
 850
 851         if (c == '.') {
 852                 obstack_1grow(&symbol_obstack, '.');
 853                 next_char();
 854
 855                 while (isdigit(c)) {
 856                         obstack_1grow(&symbol_obstack, (char) c);
 857                         next_char();
 858                 }
 859                 is_float = true;
 860         }
 861         if(c == 'e' || c == 'E') {
 862                 obstack_1grow(&symbol_obstack, (char) c);
 863                 next_char();
 864
 865                 if(c == '-' || c == '+') {
 866                         obstack_1grow(&symbol_obstack, (char) c);
 867                         next_char();
 868                 }
 869
 870                 while(isdigit(c)) {
 871                         obstack_1grow(&symbol_obstack, (char) c);
 872                         next_char();
 873                 }
 874                 is_float = true;
 875         }
 876
 877         obstack_1grow(&symbol_obstack, '\0');
 878         char *string = obstack_finish(&symbol_obstack);
 879
 880         if(is_float) {
 881                 char *endptr;
 882                 lexer_token.type         = T_FLOATINGPOINT;
 883                 lexer_token.v.floatvalue = strtold(string, &endptr);
 884
 885                 if(*endptr != '\0') {
 886                         parse_error("invalid number literal");
 887                 }
 888
 889                 parse_floating_suffix();
 890         } else {
 891                 const char *endptr;
 892                 lexer_token.type       = T_INTEGER;
 893                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 894
 895                 if(*endptr != '\0') {
 896                         parse_error("invalid number literal");
 897                 }
 898
 899                 parse_integer_suffix(false);
 900         }
 901         obstack_free(&symbol_obstack, string);
 902 }
 903
 904 /**
 905  * Parses a number and sets the lexer_token.
 906  */
 907 static void parse_number(void)
 908 {
 909         if (c == '0') {
 910                 next_char();
 911                 switch (c) {
 912                         case 'X':
 913                         case 'x':
 914                                 parse_number_hex();
 915                                 break;
 916                         case '0':
 917                         case '1':
 918                         case '2':
 919                         case '3':
 920                         case '4':
 921                         case '5':
 922                         case '6':
 923                         case '7':
 924                                 parse_number_oct();
 925                                 break;
 926                         case '8':
 927                         case '9':
 928                                 next_char();
 929                                 parse_error("invalid octal number");
 930                                 lexer_token.type = T_ERROR;
 931                                 return;
 932                         case '.':
 933                         case 'e':
 934                         case 'E':
 935                         default:
 936                                 obstack_1grow(&symbol_obstack, '0');
 937                                 parse_number_dec();
 938                                 return;
 939                 }
 940         } else {
 941                 parse_number_dec();
 942         }
 943 }
 944
 945 /**
 946  * Returns the value of a digit.
 947  * The only portable way to do it ...
 948  */
 949 static int digit_value(utf32 const digit)
 950 {
 951         switch (digit) {
 952         case '0': return 0;
 953         case '1': return 1;
 954         case '2': return 2;
 955         case '3': return 3;
 956         case '4': return 4;
 957         case '5': return 5;
 958         case '6': return 6;
 959         case '7': return 7;
 960         case '8': return 8;
 961         case '9': return 9;
 962         case 'a':
 963         case 'A': return 10;
 964         case 'b':
 965         case 'B': return 11;
 966         case 'c':
 967         case 'C': return 12;
 968         case 'd':
 969         case 'D': return 13;
 970         case 'e':
 971         case 'E': return 14;
 972         case 'f':
 973         case 'F': return 15;
 974         default:
 975                 internal_error("wrong character given");
 976         }
 977 }
 978
 979 /**
 980  * Parses an octal character sequence.
 981  *
 982  * @param first_digit  the already read first digit
 983  */
 984 static utf32 parse_octal_sequence(utf32 const first_digit)
 985 {
 986         assert(is_octal_digit(first_digit));
 987         utf32 value = digit_value(first_digit);
 988         if (!is_octal_digit(c)) return value;
 989         value = 8 * value + digit_value(c);
 990         next_char();
 991         if (!is_octal_digit(c)) return value;
 992         value = 8 * value + digit_value(c);
 993         next_char();
 994         return value;
 995 }
 996
 997 /**
 998  * Parses a hex character sequence.
 999  */
1000 static utf32 parse_hex_sequence(void)
1001 {
1002         utf32 value = 0;
1003         while(isxdigit(c)) {
1004                 value = 16 * value + digit_value(c);
1005                 next_char();
1006         }
1007         return value;
1008 }
1009
1010 /**
1011  * Parse an escape sequence.
1012  */
1013 static utf32 parse_escape_sequence(void)
1014 {
1015         eat('\\');
1016
1017         utf32 const ec = c;
1018         next_char();
1019
1020         switch (ec) {
1021         case '"':  return '"';
1022         case '\'': return '\'';
1023         case '\\': return '\\';
1024         case '?': return '\?';
1025         case 'a': return '\a';
1026         case 'b': return '\b';
1027         case 'f': return '\f';
1028         case 'n': return '\n';
1029         case 'r': return '\r';
1030         case 't': return '\t';
1031         case 'v': return '\v';
1032         case 'x':
1033                 return parse_hex_sequence();
1034         case '0':
1035         case '1':
1036         case '2':
1037         case '3':
1038         case '4':
1039         case '5':
1040         case '6':
1041         case '7':
1042                 return parse_octal_sequence(ec);
1043         case EOF:
1044                 parse_error("reached end of file while parsing escape sequence");
1045                 return EOF;
1046         /* \E is not documented, but handled, by GCC.  It is acceptable according
1047          * to §6.11.4, whereas \e is not. */
1048         case 'E':
1049         case 'e':
1050                 if (c_mode & _GNUC)
1051                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
1052                 /* FALLTHROUGH */
1053         default:
1054                 /* §6.4.4.4:8 footnote 64 */
1055                 parse_error("unknown escape sequence");
1056                 return EOF;
1057         }
1058 }
1059
1060 /**
1061  * Concatenate two strings.
1062  */
1063 string_t concat_strings(const string_t *const s1, const string_t *const s2)
1064 {
1065         const size_t len1 = s1->size - 1;
1066         const size_t len2 = s2->size - 1;
1067
1068         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
1069         memcpy(concat, s1->begin, len1);
1070         memcpy(concat + len1, s2->begin, len2 + 1);
1071
1072         if (warning.traditional) {
1073                 warningf(&lexer_token.source_position,
1074                         "traditional C rejects string constant concatenation");
1075         }
1076 #if 0 /* TODO hash */
1077         const char *result = strset_insert(&stringset, concat);
1078         if(result != concat) {
1079                 obstack_free(&symbol_obstack, concat);
1080         }
1081
1082         return result;
1083 #else
1084         return (string_t){ concat, len1 + len2 + 1 };
1085 #endif
1086 }
1087
1088 /**
1089  * Concatenate a string and a wide string.
1090  */
1091 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
1092 {
1093         const size_t len1 = s1->size - 1;
1094         const size_t len2 = s2->size - 1;
1095
1096         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1097         const char *const src = s1->begin;
1098         for (size_t i = 0; i != len1; ++i) {
1099                 concat[i] = src[i];
1100         }
1101         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1102         if (warning.traditional) {
1103                 warningf(&lexer_token.source_position,
1104                         "traditional C rejects string constant concatenation");
1105         }
1106
1107         return (wide_string_t){ concat, len1 + len2 + 1 };
1108 }
1109
1110 /**
1111  * Concatenate two wide strings.
1112  */
1113 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
1114 {
1115         const size_t len1 = s1->size - 1;
1116         const size_t len2 = s2->size - 1;
1117
1118         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1119         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
1120         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1121         if (warning.traditional) {
1122                 warningf(&lexer_token.source_position,
1123                         "traditional C rejects string constant concatenation");
1124         }
1125
1126         return (wide_string_t){ concat, len1 + len2 + 1 };
1127 }
1128
1129 /**
1130  * Concatenate a wide string and a string.
1131  */
1132 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
1133 {
1134         const size_t len1 = s1->size - 1;
1135         const size_t len2 = s2->size - 1;
1136
1137         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1138         memcpy(concat, s1->begin, len1 * sizeof(*concat));
1139         const char  *const src = s2->begin;
1140         wchar_rep_t *const dst = concat + len1;
1141         for (size_t i = 0; i != len2 + 1; ++i) {
1142                 dst[i] = src[i];
1143         }
1144         if (warning.traditional) {
1145                 warningf(&lexer_token.source_position,
1146                         "traditional C rejects string constant concatenation");
1147         }
1148
1149         return (wide_string_t){ concat, len1 + len2 + 1 };
1150 }
1151
1152 static void grow_symbol(utf32 const tc)
1153 {
1154         struct obstack *const o  = &symbol_obstack;
1155         if (tc < 0x80U) {
1156                 obstack_1grow(o, tc);
1157         } else if (tc < 0x800) {
1158                 obstack_1grow(o, 0xC0 | (tc >> 6));
1159                 obstack_1grow(o, 0x80 | (tc & 0x3F));
1160         } else if (tc < 0x10000) {
1161                 obstack_1grow(o, 0xE0 | ( tc >> 12));
1162                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1163                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1164         } else {
1165                 obstack_1grow(o, 0xF0 | ( tc >> 18));
1166                 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
1167                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1168                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1169         }
1170 }
1171
1172 /**
1173  * Parse a string literal and set lexer_token.
1174  */
1175 static void parse_string_literal(void)
1176 {
1177         const unsigned start_linenr = lexer_token.source_position.linenr;
1178
1179         eat('"');
1180
1181         while(1) {
1182                 switch(c) {
1183                 case '\\': {
1184                         utf32 const tc = parse_escape_sequence();
1185                         if (tc >= 0x100) {
1186                                 warningf(&lexer_token.source_position,
1187                                                 "escape sequence out of range");
1188                         }
1189                         obstack_1grow(&symbol_obstack, tc);
1190                         break;
1191                 }
1192
1193                 case EOF: {
1194                         source_position_t source_position;
1195                         source_position.input_name = lexer_token.source_position.input_name;
1196                         source_position.linenr     = start_linenr;
1197                         errorf(&source_position, "string has no end");
1198                         lexer_token.type = T_ERROR;
1199                         return;
1200                 }
1201
1202                 case '"':
1203                         next_char();
1204                         goto end_of_string;
1205
1206                 default:
1207                         grow_symbol(c);
1208                         next_char();
1209                         break;
1210                 }
1211         }
1212
1213 end_of_string:
1214
1215         /* TODO: concatenate multiple strings separated by whitespace... */
1216
1217         /* add finishing 0 to the string */
1218         obstack_1grow(&symbol_obstack, '\0');
1219         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1220         const char *const string = obstack_finish(&symbol_obstack);
1221
1222 #if 0 /* TODO hash */
1223         /* check if there is already a copy of the string */
1224         result = strset_insert(&stringset, string);
1225         if(result != string) {
1226                 obstack_free(&symbol_obstack, string);
1227         }
1228 #else
1229         const char *const result = string;
1230 #endif
1231
1232         lexer_token.type           = T_STRING_LITERAL;
1233         lexer_token.v.string.begin = result;
1234         lexer_token.v.string.size  = size;
1235 }
1236
1237 /**
1238  * Parse a wide character constant and set lexer_token.
1239  */
1240 static void parse_wide_character_constant(void)
1241 {
1242         const unsigned start_linenr = lexer_token.source_position.linenr;
1243
1244         eat('\'');
1245
1246         while(1) {
1247                 switch(c) {
1248                 case '\\': {
1249                         wchar_rep_t tc = parse_escape_sequence();
1250                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1251                         break;
1252                 }
1253
1254                 MATCH_NEWLINE(
1255                         parse_error("newline while parsing character constant");
1256                         break;
1257                 )
1258
1259                 case '\'':
1260                         next_char();
1261                         goto end_of_wide_char_constant;
1262
1263                 case EOF: {
1264                         source_position_t source_position = lexer_token.source_position;
1265                         source_position.linenr = start_linenr;
1266                         errorf(&source_position, "EOF while parsing character constant");
1267                         lexer_token.type = T_ERROR;
1268                         return;
1269                 }
1270
1271                 default: {
1272                         wchar_rep_t tc = (wchar_rep_t) c;
1273                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1274                         next_char();
1275                         break;
1276                 }
1277                 }
1278         }
1279
1280 end_of_wide_char_constant:;
1281         size_t             size   = (size_t) obstack_object_size(&symbol_obstack);
1282         assert(size % sizeof(wchar_rep_t) == 0);
1283         size /= sizeof(wchar_rep_t);
1284
1285         const wchar_rep_t *string = obstack_finish(&symbol_obstack);
1286
1287         lexer_token.type                = T_WIDE_CHARACTER_CONSTANT;
1288         lexer_token.v.wide_string.begin = string;
1289         lexer_token.v.wide_string.size  = size;
1290         lexer_token.datatype            = type_wchar_t;
1291 }
1292
1293 /**
1294  * Parse a wide string literal and set lexer_token.
1295  */
1296 static void parse_wide_string_literal(void)
1297 {
1298         const unsigned start_linenr = lexer_token.source_position.linenr;
1299
1300         assert(c == '"');
1301         next_char();
1302
1303         while(1) {
1304                 switch(c) {
1305                 case '\\': {
1306                         wchar_rep_t tc = parse_escape_sequence();
1307                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1308                         break;
1309                 }
1310
1311                 case EOF: {
1312                         source_position_t source_position;
1313                         source_position.input_name = lexer_token.source_position.input_name;
1314                         source_position.linenr     = start_linenr;
1315                         errorf(&source_position, "string has no end");
1316                         lexer_token.type = T_ERROR;
1317                         return;
1318                 }
1319
1320                 case '"':
1321                         next_char();
1322                         goto end_of_string;
1323
1324                 default: {
1325                         wchar_rep_t tc = c;
1326                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1327                         next_char();
1328                         break;
1329                 }
1330                 }
1331         }
1332
1333 end_of_string:;
1334
1335         /* TODO: concatenate multiple strings separated by whitespace... */
1336
1337         /* add finishing 0 to the string */
1338         wchar_rep_t nul = L'\0';
1339         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
1340         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
1341         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1342
1343 #if 0 /* TODO hash */
1344         /* check if there is already a copy of the string */
1345         const wchar_rep_t *const result = strset_insert(&stringset, string);
1346         if(result != string) {
1347                 obstack_free(&symbol_obstack, string);
1348         }
1349 #else
1350         const wchar_rep_t *const result = string;
1351 #endif
1352
1353         lexer_token.type                = T_WIDE_STRING_LITERAL;
1354         lexer_token.v.wide_string.begin = result;
1355         lexer_token.v.wide_string.size  = size;
1356 }
1357
1358 /**
1359  * Parse a character constant and set lexer_token.
1360  */
1361 static void parse_character_constant(void)
1362 {
1363         const unsigned start_linenr = lexer_token.source_position.linenr;
1364
1365         eat('\'');
1366
1367         while(1) {
1368                 switch(c) {
1369                 case '\\': {
1370                         utf32 const tc = parse_escape_sequence();
1371                         if (tc >= 0x100) {
1372                                 warningf(&lexer_token.source_position,
1373                                                 "escape sequence out of range");
1374                         }
1375                         obstack_1grow(&symbol_obstack, tc);
1376                         break;
1377                 }
1378
1379                 MATCH_NEWLINE(
1380                         parse_error("newline while parsing character constant");
1381                         break;
1382                 )
1383
1384                 case '\'':
1385                         next_char();
1386                         goto end_of_char_constant;
1387
1388                 case EOF: {
1389                         source_position_t source_position;
1390                         source_position.input_name = lexer_token.source_position.input_name;
1391                         source_position.linenr     = start_linenr;
1392                         errorf(&source_position, "EOF while parsing character constant");
1393                         lexer_token.type = T_ERROR;
1394                         return;
1395                 }
1396
1397                 default:
1398                         grow_symbol(c);
1399                         next_char();
1400                         break;
1401
1402                 }
1403         }
1404
1405 end_of_char_constant:;
1406         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1407         const char *const string = obstack_finish(&symbol_obstack);
1408
1409         lexer_token.type           = T_CHARACTER_CONSTANT;
1410         lexer_token.v.string.begin = string;
1411         lexer_token.v.string.size  = size;
1412         lexer_token.datatype       = c_mode & _CXX && size == 1 ? type_char : type_int;
1413 }
1414
1415 /**
1416  * Skip a multiline comment.
1417  */
1418 static void skip_multiline_comment(void)
1419 {
1420         unsigned start_linenr = lexer_token.source_position.linenr;
1421
1422         while(1) {
1423                 switch(c) {
1424                 case '/':
1425                         next_char();
1426                         if (c == '*') {
1427                                 /* nested comment, warn here */
1428                                 if (warning.comment) {
1429                                         warningf(&lexer_token.source_position, "'/*' within comment");
1430                                 }
1431                         }
1432                         break;
1433                 case '*':
1434                         next_char();
1435                         if(c == '/') {
1436                                 next_char();
1437                                 return;
1438                         }
1439                         break;
1440
1441                 MATCH_NEWLINE(break;)
1442
1443                 case EOF: {
1444                         source_position_t source_position;
1445                         source_position.input_name = lexer_token.source_position.input_name;
1446                         source_position.linenr     = start_linenr;
1447                         errorf(&source_position, "at end of file while looking for comment end");
1448                         return;
1449                 }
1450
1451                 default:
1452                         next_char();
1453                         break;
1454                 }
1455         }
1456 }
1457
1458 /**
1459  * Skip a single line comment.
1460  */
1461 static void skip_line_comment(void)
1462 {
1463         while(1) {
1464                 switch(c) {
1465                 case EOF:
1466                         return;
1467
1468                 case '\n':
1469                 case '\r':
1470                         return;
1471
1472                 case '\\':
1473                         next_char();
1474                         if (c == '\n' || c == '\r') {
1475                                 if (warning.comment)
1476                                         warningf(&lexer_token.source_position, "multi-line comment");
1477                                 return;
1478                         }
1479                         break;
1480
1481                 default:
1482                         next_char();
1483                         break;
1484                 }
1485         }
1486 }
1487
1488 /** The current preprocessor token. */
1489 static token_t pp_token;
1490
1491 /**
1492  * Read the next preprocessor token.
1493  */
1494 static inline void next_pp_token(void)
1495 {
1496         lexer_next_preprocessing_token();
1497         pp_token = lexer_token;
1498 }
1499
1500 /**
1501  * Eat all preprocessor tokens until newline.
1502  */
1503 static void eat_until_newline(void)
1504 {
1505         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1506                 next_pp_token();
1507         }
1508 }
1509
1510 /**
1511  * Handle the define directive.
1512  */
1513 static void define_directive(void)
1514 {
1515         lexer_next_preprocessing_token();
1516         if(lexer_token.type != T_IDENTIFIER) {
1517                 parse_error("expected identifier after #define\n");
1518                 eat_until_newline();
1519         }
1520 }
1521
/**
 * Handle the ifdef and ifndef directives.
 *
 * Only reads the token following the directive name for now.
 *
 * @param is_ifndef  non-zero for ifndef, zero for ifdef; currently unused
 */
static void ifdef_directive(int is_ifndef)
{
	/* the flag is not evaluated yet */
	(void) is_ifndef;

	lexer_next_preprocessing_token();
	/* TODO: expect an identifier followed by a newline */
}
1532
/**
 * Handle the endif directive.
 *
 * Currently a placeholder that does nothing.
 */
static void endif_directive(void)
{
	/* TODO: expect a newline after the directive */
}
1540
1541 /**
1542  * Parse the line directive.
1543  */
1544 static void parse_line_directive(void)
1545 {
1546         if(pp_token.type != T_INTEGER) {
1547                 parse_error("expected integer");
1548         } else {
1549                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1550                 next_pp_token();
1551         }
1552         if(pp_token.type == T_STRING_LITERAL) {
1553                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1554                 next_pp_token();
1555         }
1556
1557         eat_until_newline();
1558 }
1559
/**
 * The kinds of STDC pragmas.
 */
typedef enum stdc_pragma_kind_t {
	STDC_UNKNOWN          = 0,  /**< no known STDC pragma */
	STDC_FP_CONTRACT      = 1,  /**< FP_CONTRACT */
	STDC_FENV_ACCESS      = 2,  /**< FENV_ACCESS */
	STDC_CX_LIMITED_RANGE = 3   /**< CX_LIMITED_RANGE */
} stdc_pragma_kind_t;
1569
/**
 * The values a STDC pragma can be set to.
 */
typedef enum stdc_pragma_value_kind_t {
	STDC_VALUE_UNKNOWN = 0,  /**< no known value */
	STDC_VALUE_ON      = 1,  /**< ON */
	STDC_VALUE_OFF     = 2,  /**< OFF */
	STDC_VALUE_DEFAULT = 3   /**< DEFAULT */
} stdc_pragma_value_kind_t;
1579
1580 /**
1581  * Parse a pragma directive.
1582  */
1583 static void parse_pragma(void)
1584 {
1585         bool unknown_pragma = true;
1586
1587         next_pp_token();
1588         if (pp_token.v.symbol->pp_ID == TP_STDC) {
1589                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1590                 /* a STDC pragma */
1591                 if (c_mode & _C99) {
1592                         next_pp_token();
1593
1594                         switch (pp_token.v.symbol->pp_ID) {
1595                         case TP_FP_CONTRACT:
1596                                 kind = STDC_FP_CONTRACT;
1597                                 break;
1598                         case TP_FENV_ACCESS:
1599                                 kind = STDC_FENV_ACCESS;
1600                                 break;
1601                         case TP_CX_LIMITED_RANGE:
1602                                 kind = STDC_CX_LIMITED_RANGE;
1603                                 break;
1604                         default:
1605                                 break;
1606                         }
1607                         if (kind != STDC_UNKNOWN) {
1608                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1609                                 next_pp_token();
1610                                 switch (pp_token.v.symbol->pp_ID) {
1611                                 case TP_ON:
1612                                         value = STDC_VALUE_ON;
1613                                         break;
1614                                 case TP_OFF:
1615                                         value = STDC_VALUE_OFF;
1616                                         break;
1617                                 case TP_DEFAULT:
1618                                         value = STDC_VALUE_DEFAULT;
1619                                         break;
1620                                 default:
1621                                         break;
1622                                 }
1623                                 if (value != STDC_VALUE_UNKNOWN) {
1624                                         unknown_pragma = false;
1625                                 } else {
1626                                         errorf(&pp_token.source_position, "bad STDC pragma argument");
1627                                 }
1628                         }
1629                 }
1630         } else {
1631                 unknown_pragma = true;
1632         }
1633         eat_until_newline();
1634         if (unknown_pragma && warning.unknown_pragmas) {
1635                 warningf(&pp_token.source_position, "encountered unknown #pragma");
1636         }
1637 }
1638
1639 /**
1640  * Parse a preprocessor non-null directive.
1641  */
1642 static void parse_preprocessor_identifier(void)
1643 {
1644         assert(pp_token.type == T_IDENTIFIER);
1645         symbol_t *symbol = pp_token.v.symbol;
1646
1647         switch(symbol->pp_ID) {
1648         case TP_include:
1649                 printf("include - enable header name parsing!\n");
1650                 break;
1651         case TP_define:
1652                 define_directive();
1653                 break;
1654         case TP_ifdef:
1655                 ifdef_directive(0);
1656                 break;
1657         case TP_ifndef:
1658                 ifdef_directive(1);
1659                 break;
1660         case TP_endif:
1661                 endif_directive();
1662                 break;
1663         case TP_line:
1664                 next_pp_token();
1665                 parse_line_directive();
1666                 break;
1667         case TP_if:
1668         case TP_else:
1669         case TP_elif:
1670         case TP_undef:
1671         case TP_error:
1672                 /* TODO; output the rest of the line */
1673                 parse_error("#error directive: ");
1674                 break;
1675         case TP_pragma:
1676                 parse_pragma();
1677                 break;
1678         }
1679 }
1680
1681 /**
1682  * Parse a preprocessor directive.
1683  */
1684 static void parse_preprocessor_directive(void)
1685 {
1686         next_pp_token();
1687
1688         switch(pp_token.type) {
1689         case T_IDENTIFIER:
1690                 parse_preprocessor_identifier();
1691                 break;
1692         case T_INTEGER:
1693                 parse_line_directive();
1694                 break;
1695         case '\n':
1696                 /* NULL directive, see § 6.10.7 */
1697                 break;
1698         default:
1699                 parse_error("invalid preprocessor directive");
1700                 eat_until_newline();
1701                 break;
1702         }
1703 }
1704
/*
 * Building blocks for tokens that share their first character.  They only
 * form complete statements when used in sequence inside a switch case of a
 * function returning void: MAYBE_PROLOG, any number of MAYBE(), and finally
 * ELSE() or ELSE_CODE().
 */

/** Consumes the current character and opens a switch over the next one. */
#define MAYBE_PROLOG  \
	next_char();      \
	while(1) {        \
		switch(c) {

/** Adds a case: on character ch, consume it, set the token type and return. */
#define MAYBE(ch, set_type)           \
		case ch:                      \
			next_char();              \
			lexer_token.type = set_type; \
			return;

/** Adds the default case running code, closes what MAYBE_PROLOG opened. */
#define ELSE_CODE(code)               \
		default:                      \
			code                      \
		}                             \
	} /* end of while(1) */           \
	break;

/** Default case that sets the token type and returns. */
#define ELSE(set_type)                \
	ELSE_CODE(                        \
		lexer_token.type = set_type;  \
		return;                       \
	)
1728
/**
 * Read the next preprocessing token from the input into lexer_token.
 *
 * Spaces and tabs are skipped.  Line ends are NOT skipped: they are reported
 * as a token of type '\n'.  Comments are skipped and the function then calls
 * itself to deliver the token that follows the comment.
 *
 * Multi-character punctuators, including the digraphs <: :> <% %> %: and
 * %:%:, are recognised with the MAYBE_PROLOG / MAYBE / ELSE / ELSE_CODE
 * macros, which expand to nested while/switch statements on the current
 * character c.
 *
 * At end of input the token type is T_EOF.  A character that starts no known
 * token is reported through errorf() and produces a T_ERROR token.
 */
1729 void lexer_next_preprocessing_token(void)
1730 {
1731         while(1) {
1732                 switch(c) {
                     /* horizontal whitespace: drop it and look at the next char */
1733                 case ' ':
1734                 case '\t':
1735                         next_char();
1736                         break;
1737
                     /* MATCH_NEWLINE is defined elsewhere in this file; the code
                      * handed to it reports the line end as a '\n' token */
1738                 MATCH_NEWLINE(
1739                         lexer_token.type = '\n';
1740                         return;
1741                 )
1742
1743                 SYMBOL_CHARS
1744                         parse_symbol();
1745                         /* might be a wide string ( L"string" ) */
1746                         if(lexer_token.type == T_IDENTIFIER &&
1747                             lexer_token.v.symbol == symbol_L) {
1748                             if(c == '"') {
1749                                         parse_wide_string_literal();
1750                                 } else if(c == '\'') {
1751                                         parse_wide_character_constant();
1752                                 }
1753                         }
1754                         return;
1755
1756                 DIGITS
1757                         parse_number();
1758                         return;
1759
1760                 case '"':
1761                         parse_string_literal();
1762                         return;
1763
1764                 case '\'':
1765                         parse_character_constant();
1766                         return;
1767
                     /* '.' can start a number (".5"), be part of "..." or stand
                      * alone */
1768                 case '.':
1769                         MAYBE_PROLOG
1770                                 DIGITS
                                             /* '.' followed by a digit: push the digit
                                              * back and make '.' the current character
                                              * again before calling parse_number_dec() */
1771                                         put_back(c);
1772                                         c = '.';
1773                                         parse_number_dec();
1774                                         return;
1775
1776                                 case '.':
1777                                         MAYBE_PROLOG
1778                                         MAYBE('.', T_DOTDOTDOT)
1779                                         ELSE_CODE(
                                                     /* only two dots: push back the char
                                                      * after them, leave the second '.'
                                                      * as current char and report a
                                                      * single '.' */
1780                                                 put_back(c);
1781                                                 c = '.';
1782                                                 lexer_token.type = '.';
1783                                                 return;
1784                                         )
1785                         ELSE('.')
1786                 case '&':
1787                         MAYBE_PROLOG
1788                         MAYBE('&', T_ANDAND)
1789                         MAYBE('=', T_ANDEQUAL)
1790                         ELSE('&')
1791                 case '*':
1792                         MAYBE_PROLOG
1793                         MAYBE('=', T_ASTERISKEQUAL)
1794                         ELSE('*')
1795                 case '+':
1796                         MAYBE_PROLOG
1797                         MAYBE('+', T_PLUSPLUS)
1798                         MAYBE('=', T_PLUSEQUAL)
1799                         ELSE('+')
1800                 case '-':
1801                         MAYBE_PROLOG
1802                         MAYBE('>', T_MINUSGREATER)
1803                         MAYBE('-', T_MINUSMINUS)
1804                         MAYBE('=', T_MINUSEQUAL)
1805                         ELSE('-')
1806                 case '!':
1807                         MAYBE_PROLOG
1808                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1809                         ELSE('!')
                     /* '/' is either an operator or the start of a comment; after
                      * a comment the function recurses to fetch the real token */
1810                 case '/':
1811                         MAYBE_PROLOG
1812                         MAYBE('=', T_SLASHEQUAL)
1813                                 case '*':
1814                                         next_char();
1815                                         skip_multiline_comment();
1816                                         lexer_next_preprocessing_token();
1817                                         return;
1818                                 case '/':
1819                                         next_char();
1820                                         skip_line_comment();
1821                                         lexer_next_preprocessing_token();
1822                                         return;
1823                         ELSE('/')
                     /* '%' also starts the digraphs %> ('}'), %: ('#') and
                      * %:%: ('##') */
1824                 case '%':
1825                         MAYBE_PROLOG
1826                         MAYBE('>', '}')
1827                         MAYBE('=', T_PERCENTEQUAL)
1828                                 case ':':
1829                                         MAYBE_PROLOG
1830                                                 case '%':
1831                                                         MAYBE_PROLOG
1832                                                         MAYBE(':', T_HASHHASH)
1833                                                         ELSE_CODE(
                                                                     /* "%:%" without a closing
                                                                      * ':': keep the second '%'
                                                                      * as current char and
                                                                      * report '#' for "%:" */
1834                                                                 put_back(c);
1835                                                                 c = '%';
1836                                                                 lexer_token.type = '#';
1837                                                                 return;
1838                                                         )
1839                                         ELSE('#')
1840                         ELSE('%')
                     /* '<' also starts the digraphs <: ('[') and <% ('{') */
1841                 case '<':
1842                         MAYBE_PROLOG
1843                         MAYBE(':', '[')
1844                         MAYBE('%', '{')
1845                         MAYBE('=', T_LESSEQUAL)
1846                                 case '<':
1847                                         MAYBE_PROLOG
1848                                         MAYBE('=', T_LESSLESSEQUAL)
1849                                         ELSE(T_LESSLESS)
1850                         ELSE('<')
1851                 case '>':
1852                         MAYBE_PROLOG
1853                         MAYBE('=', T_GREATEREQUAL)
1854                                 case '>':
1855                                         MAYBE_PROLOG
1856                                         MAYBE('=', T_GREATERGREATEREQUAL)
1857                                         ELSE(T_GREATERGREATER)
1858                         ELSE('>')
1859                 case '^':
1860                         MAYBE_PROLOG
1861                         MAYBE('=', T_CARETEQUAL)
1862                         ELSE('^')
1863                 case '|':
1864                         MAYBE_PROLOG
1865                         MAYBE('=', T_PIPEEQUAL)
1866                         MAYBE('|', T_PIPEPIPE)
1867                         ELSE('|')
                     /* ":>" is the digraph for ']' */
1868                 case ':':
1869                         MAYBE_PROLOG
1870                         MAYBE('>', ']')
1871                         ELSE(':')
1872                 case '=':
1873                         MAYBE_PROLOG
1874                         MAYBE('=', T_EQUALEQUAL)
1875                         ELSE('=')
1876                 case '#':
1877                         MAYBE_PROLOG
1878                         MAYBE('#', T_HASHHASH)
1879                         ELSE('#')
1880
                     /* single-character tokens: the character itself is the
                      * token type */
1881                 case '?':
1882                 case '[':
1883                 case ']':
1884                 case '(':
1885                 case ')':
1886                 case '{':
1887                 case '}':
1888                 case '~':
1889                 case ';':
1890                 case ',':
1891                 case '\\':
1892                         lexer_token.type = c;
1893                         next_char();
1894                         return;
1895
                     /* c is left untouched here, so it still holds EOF when the
                      * function is called again */
1896                 case EOF:
1897                         lexer_token.type = T_EOF;
1898                         return;
1899
1900                 default:
                     /* NOTE(review): no goto to this label is visible in this
                      * function; presumably one of the macros defined elsewhere
                      * (e.g. SYMBOL_CHARS) jumps here -- confirm before removing */
1901 dollar_sign:
1902                         errorf(&lexer_token.source_position, "unknown character '%c' found", c);
1903                         next_char();
1904                         lexer_token.type = T_ERROR;
1905                         return;
1906                 }
1907         }
1908 }
1909
1910 void lexer_next_token(void)
1911 {
1912         lexer_next_preprocessing_token();
1913
1914         while (lexer_token.type == '\n') {
1915 newline_found:
1916                 lexer_next_preprocessing_token();
1917         }
1918
1919         if (lexer_token.type == '#') {
1920                 parse_preprocessor_directive();
1921                 goto newline_found;
1922         }
1923 }
1924
1925 void init_lexer(void)
1926 {
1927         strset_init(&stringset);
1928         symbol_L = symbol_table_insert("L");
1929 }
1930
1931 void lexer_open_stream(FILE *stream, const char *input_name)
1932 {
1933         input                                  = stream;
1934         lexer_token.source_position.linenr     = 0;
1935         lexer_token.source_position.input_name = input_name;
1936
1937         bufpos = NULL;
1938         bufend = NULL;
1939
1940         /* place a virtual \n at the beginning so the lexer knows that we're
1941          * at the beginning of a line */
1942         c = '\n';
1943 }
1944
1945 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
1946 {
1947         input                                  = NULL;
1948         lexer_token.source_position.linenr     = 0;
1949         lexer_token.source_position.input_name = input_name;
1950
1951 #if 0 // TODO
1952         bufpos = buffer;
1953         bufend = buffer + len;
1954 #else
1955         (void)buffer;
1956         (void)len;
1957         panic("builtin lexing not done yet");
1958 #endif
1959
1960         /* place a virtual \n at the beginning so the lexer knows that we're
1961          * at the beginning of a line */
1962         c = '\n';
1963 }
1964
1965 void exit_lexer(void)
1966 {
1967         strset_destroy(&stringset);
1968 }
1969
1970 static __attribute__((unused))
1971 void dbg_pos(const source_position_t source_position)
1972 {
1973         fprintf(stdout, "%s:%u\n", source_position.input_name,
1974                 source_position.linenr);
1975         fflush(stdout);
1976 }