nsz Git - cparser/blob - lexer.c

   1 /*
   2  * This file is part of cparser.
   3  * Copyright (C) 2007-2008 Matthias Braun <matze@braunis.de>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18  * 02111-1307, USA.
  19  */
  20 #include <config.h>
  21
  22 #include "diagnostic.h"
  23 #include "lexer.h"
  24 #include "symbol_t.h"
  25 #include "token_t.h"
  26 #include "symbol_table_t.h"
  27 #include "adt/error.h"
  28 #include "adt/strset.h"
  29 #include "adt/util.h"
  30 #include "types.h"
  31 #include "type_t.h"
  32 #include "target_architecture.h"
  33 #include "parser.h"
  34 #include "warning.h"
  35 #include "lang_features.h"
  36
  37 #include <assert.h>
  38 #include <errno.h>
  39 #include <string.h>
  40 #include <strings.h>
  41 #include <stdbool.h>
  42 #include <ctype.h>
  43
/* Uncomment to print every character that is read or put back (debug aid). */
//#define DEBUG_CHARS
/* Number of slots kept free at the start of buf so put_back() has room. */
#define MAX_PUTBACK 3
/* Number of raw bytes requested from the input per decoder call. */
#define BUF_SIZE    1024

#if defined(_WIN32) || defined(__CYGWIN__)
/* No strtold on Windows and no replacement yet: fall back to strtod, so
 * long double literals are only parsed with double precision there. */
#define strtold(s, e) strtod(s, e)
#endif
  52
/* One decoded input character as stored in buf by the decoders. */
typedef unsigned int utf32;

/* The current character; updated by next_real_char() and next_char(). */
static utf32        c;
/* The token being built; its source_position is used for diagnostics. */
token_t             lexer_token;
/* Not referenced in this part of the file; presumably the symbol for the
 * identifier "L" (wide literal prefix) -- TODO confirm. */
symbol_t           *symbol_L;
/* Stream the decoders read from; NULL makes next_real_char() yield EOF. */
static FILE        *input;
/* Decoded characters; the first MAX_PUTBACK slots are head room for
 * put_back(), decoders store data starting at buf + MAX_PUTBACK. */
static utf32        buf[BUF_SIZE + MAX_PUTBACK];
/* One past the last valid character in buf. */
static const utf32 *bufend;
/* Next character in buf to be handed out by next_real_char(). */
static const utf32 *bufpos;
/* Not referenced in this part of the file. */
static strset_t     stringset;
/* If false, SYMBOL_CHARS jumps to the dollar_sign label instead of
 * accepting '$' as a symbol character. */
bool                allow_dollar_in_symbol = true;
  64
/**
 * Reports an error through errorf(), located at the source position
 * stored in lexer_token.
 *
 * @param msg   the error message; passed as a "%s" argument, so it is
 *              never interpreted as a format string
 */
static void parse_error(const char *msg)
{
        errorf(&lexer_token.source_position, "%s", msg);
}
  74
/**
 * Reports an internal error through internal_errorf(), located at the
 * source position stored in lexer_token.
 *
 * NOTE(review): NORETURN is defined outside this file; it presumably
 * expands to a no-return annotated "void" and internal_errorf() is
 * expected not to return -- confirm.
 *
 * @param msg   the error message; passed as a "%s" argument
 */
static NORETURN internal_error(const char *msg)
{
        internal_errorf(&lexer_token.source_position, "%s", msg);
}
  84
  85 static size_t read_block(unsigned char *const read_buf, size_t const n)
  86 {
  87         size_t const s = fread(read_buf, 1, n, input);
  88         if (s == 0) {
  89                 if (ferror(input))
  90                         parse_error("read from input failed");
  91                 buf[MAX_PUTBACK] = EOF;
  92                 bufpos           = buf + MAX_PUTBACK;
  93                 bufend           = buf + MAX_PUTBACK + 1;
  94         }
  95         return s;
  96 }
  97
  98 static void decode_iso_8859_1(void)
  99 {
 100         unsigned char read_buf[BUF_SIZE];
 101         size_t const s = read_block(read_buf, sizeof(read_buf));
 102         if (s == 0)
 103                 return;
 104
 105         unsigned char const *src = read_buf;
 106         unsigned char const *end = read_buf + s;
 107         utf32               *dst = buf + MAX_PUTBACK;
 108         while (src != end)
 109                 *dst++ = *src++;
 110
 111         bufpos = buf + MAX_PUTBACK;
 112         bufend = dst;
 113 }
 114
 115 static void decode_iso_8859_15(void)
 116 {
 117         unsigned char read_buf[BUF_SIZE];
 118         size_t const s = read_block(read_buf, sizeof(read_buf));
 119         if (s == 0)
 120                 return;
 121
 122         unsigned char const *src = read_buf;
 123         unsigned char const *end = read_buf + s;
 124         utf32               *dst = buf + MAX_PUTBACK;
 125         while (src != end) {
 126                 utf32 tc = *src++;
 127                 switch (tc) {
 128                         case 0xA4: tc = 0x20AC; break; // €
 129                         case 0xA6: tc = 0x0160; break; // Š
 130                         case 0xA8: tc = 0x0161; break; // š
 131                         case 0xB4: tc = 0x017D; break; // Ž
 132                         case 0xB8: tc = 0x017E; break; // ž
 133                         case 0xBC: tc = 0x0152; break; // Œ
 134                         case 0xBD: tc = 0x0153; break; // œ
 135                         case 0xBE: tc = 0x0178; break; // Ÿ
 136                 }
 137                 *dst++ = tc;
 138         }
 139
 140         bufpos = buf + MAX_PUTBACK;
 141         bufend = dst;
 142 }
 143
 144 static void decode_utf8(void)
 145 {
 146         static utf32  part_decoded_min_code;
 147         static utf32  part_decoded_char;
 148         static size_t part_decoded_rest_len;
 149
 150         do {
 151                 unsigned char read_buf[BUF_SIZE];
 152                 size_t const s = read_block(read_buf, sizeof(read_buf));
 153                 if (s == 0) {
 154                         if (part_decoded_rest_len > 0)
 155                                 parse_error("incomplete input char at end of input");
 156                         return;
 157                 }
 158
 159                 unsigned char const *src = read_buf;
 160                 unsigned char const *end = read_buf + s;
 161                 utf32               *dst = buf + MAX_PUTBACK;
 162                 utf32                decoded;
 163                 utf32                min_code;
 164
 165                 if (part_decoded_rest_len != 0) {
 166                         min_code              = part_decoded_min_code;
 167                         decoded               = part_decoded_char;
 168                         size_t const rest_len = part_decoded_rest_len;
 169                         part_decoded_rest_len = 0;
 170                         switch (rest_len) {
 171                                 case 4:  goto realign;
 172                                 case 3:  goto three_more;
 173                                 case 2:  goto two_more;
 174                                 default: goto one_more;
 175                         }
 176                 }
 177
 178                 while (src != end) {
 179                         if ((*src & 0x80) == 0) {
 180                                 decoded = *src++;
 181                         } else if ((*src & 0xE0) == 0xC0) {
 182                                 min_code = 0x80;
 183                                 decoded  = *src++ & 0x1F;
 184 one_more:
 185                                 if (src == end) {
 186                                         part_decoded_min_code = min_code;
 187                                         part_decoded_char     = decoded;
 188                                         part_decoded_rest_len = 1;
 189                                         break;
 190                                 }
 191                                 if ((*src & 0xC0) == 0x80) {
 192                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 193                                 } else {
 194                                         goto invalid_char;
 195                                 }
 196                                 if (decoded < min_code                      ||
 197                                                 decoded > 0x10FFFF                      ||
 198                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
 199                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
 200                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
 201                                         parse_error("invalid byte sequence in input");
 202                                 }
 203                         } else if ((*src & 0xF0) == 0xE0) {
 204                                 min_code = 0x800;
 205                                 decoded  = *src++ & 0x0F;
 206 two_more:
 207                                 if (src == end) {
 208                                         part_decoded_min_code = min_code;
 209                                         part_decoded_char     = decoded;
 210                                         part_decoded_rest_len = 2;
 211                                         break;
 212                                 }
 213                                 if ((*src & 0xC0) == 0x80) {
 214                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 215                                 } else {
 216                                         goto invalid_char;
 217                                 }
 218                                 goto one_more;
 219                         } else if ((*src & 0xF8) == 0xF0) {
 220                                 min_code = 0x10000;
 221                                 decoded  = *src++ & 0x07;
 222 three_more:
 223                                 if (src == end) {
 224                                         part_decoded_min_code = min_code;
 225                                         part_decoded_char     = decoded;
 226                                         part_decoded_rest_len = 3;
 227                                         break;
 228                                 }
 229                                 if ((*src & 0xC0) == 0x80) {
 230                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 231                                 } else {
 232                                         goto invalid_char;
 233                                 }
 234                                 goto two_more;
 235                         } else {
 236 invalid_char:
 237                                 parse_error("invalid byte sequence in input");
 238 realign:
 239                                 do {
 240                                         ++src;
 241                                         if (src == end) {
 242                                                 part_decoded_rest_len = 4;
 243                                                 break;
 244                                         }
 245                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
 246                                 continue;
 247                         }
 248                         *dst++ = decoded;
 249                 }
 250
 251                 bufpos = buf + MAX_PUTBACK;
 252                 bufend = dst;
 253         } while (bufpos == bufend);
 254 }
 255
/* A decoder refills buf with decoded characters and resets bufpos/bufend. */
typedef void (*decoder_t)(void);

/* The decoder called by next_real_char(); UTF-8 unless changed by
 * select_input_encoding(). */
static decoder_t decoder = decode_utf8;

/* Associates an encoding name with the decoder that handles it. */
typedef struct named_decoder_t {
        char const *name;
        decoder_t   decoder;
} named_decoder_t;
 264
/* Supported input encodings. select_input_encoding() compares names
 * case-insensitively; the table is terminated by a NULL name. */
static named_decoder_t const decoders[] = {
        { "CP819",           decode_iso_8859_1  }, // official alias
        { "IBM819",          decode_iso_8859_1  }, // official alias
        { "ISO-8859-1",      decode_iso_8859_1  }, // official alias
        { "ISO-8859-15",     decode_iso_8859_15 }, // official name
        { "ISO8859-1",       decode_iso_8859_1  },
        { "ISO8859-15",      decode_iso_8859_15 },
        { "ISO_8859-1",      decode_iso_8859_1  }, // official alias
        { "ISO_8859-15",     decode_iso_8859_15 }, // official alias
        { "ISO_8859-1:1987", decode_iso_8859_1  }, // official name
        { "Latin-9",         decode_iso_8859_15 }, // official alias
        { "UTF-8",           decode_utf8        }, // official name
        { "csISOLatin1",     decode_iso_8859_1  }, // official alias
        { "iso-ir-100",      decode_iso_8859_1  }, // official alias
        { "l1",              decode_iso_8859_1  }, // official alias
        { "latin1",          decode_iso_8859_1  }, // official alias

        { NULL,              NULL               }
};
 284
 285 void select_input_encoding(char const* const encoding)
 286 {
 287         for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
 288                 if (strcasecmp(encoding, i->name) != 0)
 289                         continue;
 290                 decoder = i->decoder;
 291                 return;
 292         }
 293         fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
 294 }
 295
 296 static inline void next_real_char(void)
 297 {
 298         assert(bufpos <= bufend);
 299         if (bufpos >= bufend) {
 300                 if (input == NULL) {
 301                         c = EOF;
 302                         return;
 303                 }
 304                 decoder();
 305         }
 306         c = *bufpos++;
 307 }
 308
 309 /**
 310  * Put a character back into the buffer.
 311  *
 312  * @param pc  the character to put back
 313  */
 314 static inline void put_back(utf32 const pc)
 315 {
 316         assert(bufpos > buf);
 317         *(--bufpos - buf + buf) = pc;
 318
 319 #ifdef DEBUG_CHARS
 320         printf("putback '%lc'\n", pc);
 321 #endif
 322 }
 323
 324 static inline void next_char(void);
 325
/*
 * Expands to the "case '\r':" and "case '\n':" labels of a switch over c.
 * Each case consumes the line break ("\r", "\r\n" or "\n"), increments the
 * line number of lexer_token and then runs `code`.
 * `code` has to leave the switch itself (break, return, goto, ...);
 * otherwise the '\r' case falls through into the '\n' case.
 */
#define MATCH_NEWLINE(code)                   \
        case '\r':                                \
                next_char();                          \
                if(c == '\n') {                       \
                        next_char();                      \
                }                                     \
                lexer_token.source_position.linenr++; \
                code                                  \
        case '\n':                                \
                next_char();                          \
                lexer_token.source_position.linenr++; \
                code
 338
/* Asserts that the current character is c_type, then advances to the next one. */
#define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 340
 341 static void maybe_concat_lines(void)
 342 {
 343         eat('\\');
 344
 345         switch(c) {
 346         MATCH_NEWLINE(return;)
 347
 348         default:
 349                 break;
 350         }
 351
 352         put_back(c);
 353         c = '\\';
 354 }
 355
 356 /**
 357  * Set c to the next input character, ie.
 358  * after expanding trigraphs.
 359  */
 360 static inline void next_char(void)
 361 {
 362         next_real_char();
 363
 364         /* filter trigraphs */
 365         if(UNLIKELY(c == '\\')) {
 366                 maybe_concat_lines();
 367                 goto end_of_next_char;
 368         }
 369
 370         if(LIKELY(c != '?'))
 371                 goto end_of_next_char;
 372
 373         next_real_char();
 374         if(LIKELY(c != '?')) {
 375                 put_back(c);
 376                 c = '?';
 377                 goto end_of_next_char;
 378         }
 379
 380         next_real_char();
 381         switch(c) {
 382         case '=': c = '#'; break;
 383         case '(': c = '['; break;
 384         case '/': c = '\\'; maybe_concat_lines(); break;
 385         case ')': c = ']'; break;
 386         case '\'': c = '^'; break;
 387         case '<': c = '{'; break;
 388         case '!': c = '|'; break;
 389         case '>': c = '}'; break;
 390         case '-': c = '~'; break;
 391         default:
 392                 put_back(c);
 393                 put_back('?');
 394                 c = '?';
 395                 break;
 396         }
 397
 398 end_of_next_char:;
 399 #ifdef DEBUG_CHARS
 400         printf("nchar '%c'\n", c);
 401 #endif
 402 }
 403
/*
 * Case labels for the characters accepted in a symbol besides digits:
 * '$', the letters a-z and A-Z, and '_'.
 * The '$' case jumps to a label named dollar_sign, which the enclosing
 * function has to provide, when allow_dollar_in_symbol is false;
 * otherwise it falls through to the statements following the macro.
 */
#define SYMBOL_CHARS  \
        case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
        case 'a':         \
        case 'b':         \
        case 'c':         \
        case 'd':         \
        case 'e':         \
        case 'f':         \
        case 'g':         \
        case 'h':         \
        case 'i':         \
        case 'j':         \
        case 'k':         \
        case 'l':         \
        case 'm':         \
        case 'n':         \
        case 'o':         \
        case 'p':         \
        case 'q':         \
        case 'r':         \
        case 's':         \
        case 't':         \
        case 'u':         \
        case 'v':         \
        case 'w':         \
        case 'x':         \
        case 'y':         \
        case 'z':         \
        case 'A':         \
        case 'B':         \
        case 'C':         \
        case 'D':         \
        case 'E':         \
        case 'F':         \
        case 'G':         \
        case 'H':         \
        case 'I':         \
        case 'J':         \
        case 'K':         \
        case 'L':         \
        case 'M':         \
        case 'N':         \
        case 'O':         \
        case 'P':         \
        case 'Q':         \
        case 'R':         \
        case 'S':         \
        case 'T':         \
        case 'U':         \
        case 'V':         \
        case 'W':         \
        case 'X':         \
        case 'Y':         \
        case 'Z':         \
        case '_':
 459
/* Case labels for the decimal digits '0' to '9'. */
#define DIGITS        \
        case '0':         \
        case '1':         \
        case '2':         \
        case '3':         \
        case '4':         \
        case '5':         \
        case '6':         \
        case '7':         \
        case '8':         \
        case '9':
 471
 472 /**
 473  * Read a symbol from the input and build
 474  * the lexer_token.
 475  */
 476 static void parse_symbol(void)
 477 {
 478         symbol_t *symbol;
 479         char     *string;
 480
 481         obstack_1grow(&symbol_obstack, (char) c);
 482         next_char();
 483
 484         while(1) {
 485                 switch(c) {
 486                 DIGITS
 487                 SYMBOL_CHARS
 488                         obstack_1grow(&symbol_obstack, (char) c);
 489                         next_char();
 490                         break;
 491
 492                 default:
 493 dollar_sign:
 494                         goto end_symbol;
 495                 }
 496         }
 497
 498 end_symbol:
 499         obstack_1grow(&symbol_obstack, '\0');
 500
 501         string = obstack_finish(&symbol_obstack);
 502         symbol = symbol_table_insert(string);
 503
 504         lexer_token.type     = symbol->ID;
 505         lexer_token.v.symbol = symbol;
 506
 507         if(symbol->string != string) {
 508                 obstack_free(&symbol_obstack, string);
 509         }
 510 }
 511
 512 static void parse_integer_suffix(bool is_oct_hex)
 513 {
 514         bool is_unsigned     = false;
 515         bool min_long        = false;
 516         bool min_longlong    = false;
 517         bool not_traditional = false;
 518         int  pos             = 0;
 519         char suffix[4];
 520
 521         if (c == 'U' || c == 'u') {
 522                 not_traditional = true;
 523                 suffix[pos++]   = toupper(c);
 524                 is_unsigned     = true;
 525                 next_char();
 526                 if (c == 'L' || c == 'l') {
 527                         suffix[pos++] = toupper(c);
 528                         min_long = true;
 529                         next_char();
 530                         if (c == 'L' || c == 'l') {
 531                                 suffix[pos++] = toupper(c);
 532                                 min_longlong = true;
 533                                 next_char();
 534                         }
 535                 }
 536         } else if (c == 'l' || c == 'L') {
 537                 suffix[pos++] = toupper(c);
 538                 min_long = true;
 539                 next_char();
 540                 if (c == 'l' || c == 'L') {
 541                         not_traditional = true;
 542                         suffix[pos++]   = toupper(c);
 543                         min_longlong    = true;
 544                         next_char();
 545                         if (c == 'u' || c == 'U') {
 546                                 suffix[pos++] = toupper(c);
 547                                 is_unsigned   = true;
 548                                 next_char();
 549                         }
 550                 } else if (c == 'u' || c == 'U') {
 551                         not_traditional = true;
 552                         suffix[pos++]   = toupper(c);
 553                         is_unsigned     = true;
 554                         next_char();
 555                         lexer_token.datatype = type_unsigned_long;
 556                 }
 557         }
 558
 559         if (warning.traditional && not_traditional) {
 560                 suffix[pos] = '\0';
 561                 warningf(&lexer_token.source_position,
 562                         "traditional C rejects the '%s' suffix", suffix);
 563         }
 564         if (!is_unsigned) {
 565                 long long v = lexer_token.v.intvalue;
 566                 if (!min_long) {
 567                         if (v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 568                                 lexer_token.datatype = type_int;
 569                                 return;
 570                         } else if (is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 571                                 lexer_token.datatype = type_unsigned_int;
 572                                 return;
 573                         }
 574                 }
 575                 if (!min_longlong) {
 576                         if (v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 577                                 lexer_token.datatype = type_long;
 578                                 return;
 579                         } else if (is_oct_hex && v >= 0 && (unsigned long long)v <= (unsigned long long)TARGET_ULONG_MAX) {
 580                                 lexer_token.datatype = type_unsigned_long;
 581                                 return;
 582                         }
 583                 }
 584                 unsigned long long uv = (unsigned long long) v;
 585                 if (is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 586                         lexer_token.datatype = type_unsigned_long_long;
 587                         return;
 588                 }
 589
 590                 lexer_token.datatype = type_long_long;
 591         } else {
 592                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 593                 if (!min_long && v <= TARGET_UINT_MAX) {
 594                         lexer_token.datatype = type_unsigned_int;
 595                         return;
 596                 }
 597                 if (!min_longlong && v <= TARGET_ULONG_MAX) {
 598                         lexer_token.datatype = type_unsigned_long;
 599                         return;
 600                 }
 601                 lexer_token.datatype = type_unsigned_long_long;
 602         }
 603 }
 604
 605 static void parse_floating_suffix(void)
 606 {
 607         switch(c) {
 608         /* TODO: do something useful with the suffixes... */
 609         case 'f':
 610         case 'F':
 611                 if (warning.traditional) {
 612                         warningf(&lexer_token.source_position,
 613                                 "traditional C rejects the 'F' suffix");
 614                 }
 615                 next_char();
 616                 lexer_token.datatype = type_float;
 617                 break;
 618         case 'l':
 619         case 'L':
 620                 if (warning.traditional) {
 621                         warningf(&lexer_token.source_position,
 622                                 "traditional C rejects the 'F' suffix");
 623                 }
 624                 next_char();
 625                 lexer_token.datatype = type_long_double;
 626                 break;
 627         default:
 628                 lexer_token.datatype = type_double;
 629                 break;
 630         }
 631 }
 632
 633 /**
 634  * A replacement for strtoull. Only those parts needed for
 635  * our parser are implemented.
 636  */
 637 static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
 638         unsigned long long v = 0;
 639
 640         switch (base) {
 641         case 16:
 642                 for (;; ++s) {
 643                         /* check for overrun */
 644                         if (v >= 0x1000000000000000ULL)
 645                                 break;
 646                         switch (tolower(*s)) {
 647                         case '0': v <<= 4; break;
 648                         case '1': v <<= 4; v |= 0x1; break;
 649                         case '2': v <<= 4; v |= 0x2; break;
 650                         case '3': v <<= 4; v |= 0x3; break;
 651                         case '4': v <<= 4; v |= 0x4; break;
 652                         case '5': v <<= 4; v |= 0x5; break;
 653                         case '6': v <<= 4; v |= 0x6; break;
 654                         case '7': v <<= 4; v |= 0x7; break;
 655                         case '8': v <<= 4; v |= 0x8; break;
 656                         case '9': v <<= 4; v |= 0x9; break;
 657                         case 'a': v <<= 4; v |= 0xa; break;
 658                         case 'b': v <<= 4; v |= 0xb; break;
 659                         case 'c': v <<= 4; v |= 0xc; break;
 660                         case 'd': v <<= 4; v |= 0xd; break;
 661                         case 'e': v <<= 4; v |= 0xe; break;
 662                         case 'f': v <<= 4; v |= 0xf; break;
 663                         default:
 664                                 goto end;
 665                         }
 666                 }
 667                 break;
 668         case 8:
 669                 for (;; ++s) {
 670                         /* check for overrun */
 671                         if (v >= 0x2000000000000000ULL)
 672                                 break;
 673                         switch (tolower(*s)) {
 674                         case '0': v <<= 3; break;
 675                         case '1': v <<= 3; v |= 1; break;
 676                         case '2': v <<= 3; v |= 2; break;
 677                         case '3': v <<= 3; v |= 3; break;
 678                         case '4': v <<= 3; v |= 4; break;
 679                         case '5': v <<= 3; v |= 5; break;
 680                         case '6': v <<= 3; v |= 6; break;
 681                         case '7': v <<= 3; v |= 7; break;
 682                         default:
 683                                 goto end;
 684                         }
 685                 }
 686                 break;
 687         case 10:
 688                 for (;; ++s) {
 689                         /* check for overrun */
 690                         if (v > 0x1999999999999999ULL)
 691                                 break;
 692                         switch (tolower(*s)) {
 693                         case '0': v *= 10; break;
 694                         case '1': v *= 10; v += 1; break;
 695                         case '2': v *= 10; v += 2; break;
 696                         case '3': v *= 10; v += 3; break;
 697                         case '4': v *= 10; v += 4; break;
 698                         case '5': v *= 10; v += 5; break;
 699                         case '6': v *= 10; v += 6; break;
 700                         case '7': v *= 10; v += 7; break;
 701                         case '8': v *= 10; v += 8; break;
 702                         case '9': v *= 10; v += 9; break;
 703                         default:
 704                                 goto end;
 705                         }
 706                 }
 707                 break;
 708         default:
 709                 assert(0);
 710                 break;
 711         }
 712 end:
 713         *endptr = s;
 714         return v;
 715 }
 716
 717 /**
 718  * Parses a hex number including hex floats and set the
 719  * lexer_token.
 720  */
 721 static void parse_number_hex(void)
 722 {
 723         bool is_float = false;
 724         assert(c == 'x' || c == 'X');
 725         next_char();
 726
 727         obstack_1grow(&symbol_obstack, '0');
 728         obstack_1grow(&symbol_obstack, 'x');
 729
 730         while(isxdigit(c)) {
 731                 obstack_1grow(&symbol_obstack, (char) c);
 732                 next_char();
 733         }
 734
 735         if (c == '.') {
 736                 obstack_1grow(&symbol_obstack, (char) c);
 737                 next_char();
 738
 739                 while (isxdigit(c)) {
 740                         obstack_1grow(&symbol_obstack, (char) c);
 741                         next_char();
 742                 }
 743                 is_float = true;
 744         }
 745         if (c == 'p' || c == 'P') {
 746                 obstack_1grow(&symbol_obstack, (char) c);
 747                 next_char();
 748
 749                 if (c == '-' || c == '+') {
 750                         obstack_1grow(&symbol_obstack, (char) c);
 751                         next_char();
 752                 }
 753
 754                 while (isxdigit(c)) {
 755                         obstack_1grow(&symbol_obstack, (char) c);
 756                         next_char();
 757                 }
 758                 is_float = true;
 759         }
 760
 761         obstack_1grow(&symbol_obstack, '\0');
 762         char *string = obstack_finish(&symbol_obstack);
 763         if(*string == '\0') {
 764                 parse_error("invalid hex number");
 765                 lexer_token.type = T_ERROR;
 766                 obstack_free(&symbol_obstack, string);
 767                 return;
 768         }
 769
 770         if (is_float) {
 771                 char *endptr;
 772                 lexer_token.type         = T_FLOATINGPOINT;
 773                 lexer_token.v.floatvalue = strtold(string, &endptr);
 774
 775                 if(*endptr != '\0') {
 776                         parse_error("invalid hex float literal");
 777                 }
 778
 779                 parse_floating_suffix();
 780         } else {
 781                 const char *endptr;
 782                 lexer_token.type       = T_INTEGER;
 783                 lexer_token.v.intvalue = parse_int_string(string + 2, &endptr, 16);
 784                 if(*endptr != '\0') {
 785                         parse_error("hex number literal too long");
 786                 }
 787                 parse_integer_suffix(true);
 788         }
 789
 790         obstack_free(&symbol_obstack, string);
 791 }
 792
 793 /**
 794  * Returns true if the given char is a octal digit.
 795  *
 796  * @param char  the character to check
 797  */
 798 static inline bool is_octal_digit(utf32 chr)
 799 {
 800         switch(chr) {
 801         case '0':
 802         case '1':
 803         case '2':
 804         case '3':
 805         case '4':
 806         case '5':
 807         case '6':
 808         case '7':
 809                 return true;
 810         default:
 811                 return false;
 812         }
 813 }
 814
 815 /**
 816  * Parses a octal number and set the lexer_token.
 817  */
 818 static void parse_number_oct(void)
 819 {
 820         while(is_octal_digit(c)) {
 821                 obstack_1grow(&symbol_obstack, (char) c);
 822                 next_char();
 823         }
 824         obstack_1grow(&symbol_obstack, '\0');
 825         char *string = obstack_finish(&symbol_obstack);
 826
 827         const char *endptr;
 828         lexer_token.type       = T_INTEGER;
 829         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 830         if(*endptr != '\0') {
 831                 parse_error("octal number literal too long");
 832         }
 833
 834         obstack_free(&symbol_obstack, string);
 835         parse_integer_suffix(true);
 836 }
 837
 838 /**
 839  * Parses a decimal including float number and set the
 840  * lexer_token.
 841  */
 842 static void parse_number_dec(void)
 843 {
 844         bool is_float = false;
 845         while (isdigit(c)) {
 846                 obstack_1grow(&symbol_obstack, (char) c);
 847                 next_char();
 848         }
 849
 850         if (c == '.') {
 851                 obstack_1grow(&symbol_obstack, '.');
 852                 next_char();
 853
 854                 while (isdigit(c)) {
 855                         obstack_1grow(&symbol_obstack, (char) c);
 856                         next_char();
 857                 }
 858                 is_float = true;
 859         }
 860         if(c == 'e' || c == 'E') {
 861                 obstack_1grow(&symbol_obstack, (char) c);
 862                 next_char();
 863
 864                 if(c == '-' || c == '+') {
 865                         obstack_1grow(&symbol_obstack, (char) c);
 866                         next_char();
 867                 }
 868
 869                 while(isdigit(c)) {
 870                         obstack_1grow(&symbol_obstack, (char) c);
 871                         next_char();
 872                 }
 873                 is_float = true;
 874         }
 875
 876         obstack_1grow(&symbol_obstack, '\0');
 877         char *string = obstack_finish(&symbol_obstack);
 878
 879         if(is_float) {
 880                 char *endptr;
 881                 lexer_token.type         = T_FLOATINGPOINT;
 882                 lexer_token.v.floatvalue = strtold(string, &endptr);
 883
 884                 if(*endptr != '\0') {
 885                         parse_error("invalid number literal");
 886                 }
 887
 888                 parse_floating_suffix();
 889         } else {
 890                 const char *endptr;
 891                 lexer_token.type       = T_INTEGER;
 892                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 893
 894                 if(*endptr != '\0') {
 895                         parse_error("invalid number literal");
 896                 }
 897
 898                 parse_integer_suffix(false);
 899         }
 900         obstack_free(&symbol_obstack, string);
 901 }
 902
 903 /**
 904  * Parses a number and sets the lexer_token.
 905  */
 906 static void parse_number(void)
 907 {
 908         if (c == '0') {
 909                 next_char();
 910                 switch (c) {
 911                         case 'X':
 912                         case 'x':
 913                                 parse_number_hex();
 914                                 break;
 915                         case '0':
 916                         case '1':
 917                         case '2':
 918                         case '3':
 919                         case '4':
 920                         case '5':
 921                         case '6':
 922                         case '7':
 923                                 parse_number_oct();
 924                                 break;
 925                         case '8':
 926                         case '9':
 927                                 next_char();
 928                                 parse_error("invalid octal number");
 929                                 lexer_token.type = T_ERROR;
 930                                 return;
 931                         case '.':
 932                         case 'e':
 933                         case 'E':
 934                         default:
 935                                 obstack_1grow(&symbol_obstack, '0');
 936                                 parse_number_dec();
 937                                 return;
 938                 }
 939         } else {
 940                 parse_number_dec();
 941         }
 942 }
 943
 944 /**
 945  * Returns the value of a digit.
 946  * The only portable way to do it ...
 947  */
 948 static int digit_value(utf32 const digit)
 949 {
 950         switch (digit) {
 951         case '0': return 0;
 952         case '1': return 1;
 953         case '2': return 2;
 954         case '3': return 3;
 955         case '4': return 4;
 956         case '5': return 5;
 957         case '6': return 6;
 958         case '7': return 7;
 959         case '8': return 8;
 960         case '9': return 9;
 961         case 'a':
 962         case 'A': return 10;
 963         case 'b':
 964         case 'B': return 11;
 965         case 'c':
 966         case 'C': return 12;
 967         case 'd':
 968         case 'D': return 13;
 969         case 'e':
 970         case 'E': return 14;
 971         case 'f':
 972         case 'F': return 15;
 973         default:
 974                 internal_error("wrong character given");
 975         }
 976 }
 977
 978 /**
 979  * Parses an octal character sequence.
 980  *
 981  * @param first_digit  the already read first digit
 982  */
 983 static utf32 parse_octal_sequence(utf32 const first_digit)
 984 {
 985         assert(is_octal_digit(first_digit));
 986         utf32 value = digit_value(first_digit);
 987         if (!is_octal_digit(c)) return value;
 988         value = 8 * value + digit_value(c);
 989         next_char();
 990         if (!is_octal_digit(c)) return value;
 991         value = 8 * value + digit_value(c);
 992         next_char();
 993         return value;
 994 }
 995
 996 /**
 997  * Parses a hex character sequence.
 998  */
 999 static utf32 parse_hex_sequence(void)
1000 {
1001         utf32 value = 0;
1002         while(isxdigit(c)) {
1003                 value = 16 * value + digit_value(c);
1004                 next_char();
1005         }
1006         return value;
1007 }
1008
1009 /**
1010  * Parse an escape sequence.
1011  */
1012 static utf32 parse_escape_sequence(void)
1013 {
1014         eat('\\');
1015
1016         utf32 const ec = c;
1017         next_char();
1018
1019         switch (ec) {
1020         case '"':  return '"';
1021         case '\'': return '\'';
1022         case '\\': return '\\';
1023         case '?': return '\?';
1024         case 'a': return '\a';
1025         case 'b': return '\b';
1026         case 'f': return '\f';
1027         case 'n': return '\n';
1028         case 'r': return '\r';
1029         case 't': return '\t';
1030         case 'v': return '\v';
1031         case 'x':
1032                 return parse_hex_sequence();
1033         case '0':
1034         case '1':
1035         case '2':
1036         case '3':
1037         case '4':
1038         case '5':
1039         case '6':
1040         case '7':
1041                 return parse_octal_sequence(ec);
1042         case EOF:
1043                 parse_error("reached end of file while parsing escape sequence");
1044                 return EOF;
1045         /* \E is not documented, but handled, by GCC.  It is acceptable according
1046          * to §6.11.4, whereas \e is not. */
1047         case 'E':
1048         case 'e':
1049                 if (c_mode & _GNUC)
1050                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
1051                 /* FALLTHROUGH */
1052         default:
1053                 /* §6.4.4.4:8 footnote 64 */
1054                 parse_error("unknown escape sequence");
1055                 return EOF;
1056         }
1057 }
1058
1059 /**
1060  * Concatenate two strings.
1061  */
1062 string_t concat_strings(const string_t *const s1, const string_t *const s2)
1063 {
1064         const size_t len1 = s1->size - 1;
1065         const size_t len2 = s2->size - 1;
1066
1067         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
1068         memcpy(concat, s1->begin, len1);
1069         memcpy(concat + len1, s2->begin, len2 + 1);
1070
1071         if (warning.traditional) {
1072                 warningf(&lexer_token.source_position,
1073                         "traditional C rejects string constant concatenation");
1074         }
1075 #if 0 /* TODO hash */
1076         const char *result = strset_insert(&stringset, concat);
1077         if(result != concat) {
1078                 obstack_free(&symbol_obstack, concat);
1079         }
1080
1081         return result;
1082 #else
1083         return (string_t){ concat, len1 + len2 + 1 };
1084 #endif
1085 }
1086
1087 /**
1088  * Concatenate a string and a wide string.
1089  */
1090 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
1091 {
1092         const size_t len1 = s1->size - 1;
1093         const size_t len2 = s2->size - 1;
1094
1095         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1096         const char *const src = s1->begin;
1097         for (size_t i = 0; i != len1; ++i) {
1098                 concat[i] = src[i];
1099         }
1100         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1101         if (warning.traditional) {
1102                 warningf(&lexer_token.source_position,
1103                         "traditional C rejects string constant concatenation");
1104         }
1105
1106         return (wide_string_t){ concat, len1 + len2 + 1 };
1107 }
1108
1109 /**
1110  * Concatenate two wide strings.
1111  */
1112 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
1113 {
1114         const size_t len1 = s1->size - 1;
1115         const size_t len2 = s2->size - 1;
1116
1117         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1118         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
1119         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1120         if (warning.traditional) {
1121                 warningf(&lexer_token.source_position,
1122                         "traditional C rejects string constant concatenation");
1123         }
1124
1125         return (wide_string_t){ concat, len1 + len2 + 1 };
1126 }
1127
1128 /**
1129  * Concatenate a wide string and a string.
1130  */
1131 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
1132 {
1133         const size_t len1 = s1->size - 1;
1134         const size_t len2 = s2->size - 1;
1135
1136         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1137         memcpy(concat, s1->begin, len1 * sizeof(*concat));
1138         const char  *const src = s2->begin;
1139         wchar_rep_t *const dst = concat + len1;
1140         for (size_t i = 0; i != len2 + 1; ++i) {
1141                 dst[i] = src[i];
1142         }
1143         if (warning.traditional) {
1144                 warningf(&lexer_token.source_position,
1145                         "traditional C rejects string constant concatenation");
1146         }
1147
1148         return (wide_string_t){ concat, len1 + len2 + 1 };
1149 }
1150
1151 static void grow_symbol(utf32 const tc)
1152 {
1153         struct obstack *const o  = &symbol_obstack;
1154         if (tc < 0x80U) {
1155                 obstack_1grow(o, tc);
1156         } else if (tc < 0x800) {
1157                 obstack_1grow(o, 0xC0 | (tc >> 6));
1158                 obstack_1grow(o, 0x80 | (tc & 0x3F));
1159         } else if (tc < 0x10000) {
1160                 obstack_1grow(o, 0xE0 | ( tc >> 12));
1161                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1162                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1163         } else {
1164                 obstack_1grow(o, 0xF0 | ( tc >> 18));
1165                 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
1166                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1167                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1168         }
1169 }
1170
1171 /**
1172  * Parse a string literal and set lexer_token.
1173  */
1174 static void parse_string_literal(void)
1175 {
1176         const unsigned start_linenr = lexer_token.source_position.linenr;
1177
1178         eat('"');
1179
1180         while(1) {
1181                 switch(c) {
1182                 case '\\': {
1183                         utf32 const tc = parse_escape_sequence();
1184                         if (tc >= 0x100) {
1185                                 warningf(&lexer_token.source_position,
1186                                                 "escape sequence out of range");
1187                         }
1188                         obstack_1grow(&symbol_obstack, tc);
1189                         break;
1190                 }
1191
1192                 case EOF: {
1193                         source_position_t source_position;
1194                         source_position.input_name = lexer_token.source_position.input_name;
1195                         source_position.linenr     = start_linenr;
1196                         errorf(&source_position, "string has no end");
1197                         lexer_token.type = T_ERROR;
1198                         return;
1199                 }
1200
1201                 case '"':
1202                         next_char();
1203                         goto end_of_string;
1204
1205                 default:
1206                         grow_symbol(c);
1207                         next_char();
1208                         break;
1209                 }
1210         }
1211
1212 end_of_string:
1213
1214         /* TODO: concatenate multiple strings separated by whitespace... */
1215
1216         /* add finishing 0 to the string */
1217         obstack_1grow(&symbol_obstack, '\0');
1218         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1219         const char *const string = obstack_finish(&symbol_obstack);
1220
1221 #if 0 /* TODO hash */
1222         /* check if there is already a copy of the string */
1223         result = strset_insert(&stringset, string);
1224         if(result != string) {
1225                 obstack_free(&symbol_obstack, string);
1226         }
1227 #else
1228         const char *const result = string;
1229 #endif
1230
1231         lexer_token.type           = T_STRING_LITERAL;
1232         lexer_token.v.string.begin = result;
1233         lexer_token.v.string.size  = size;
1234 }
1235
1236 /**
1237  * Parse a wide character constant and set lexer_token.
1238  */
1239 static void parse_wide_character_constant(void)
1240 {
1241         const unsigned start_linenr = lexer_token.source_position.linenr;
1242
1243         eat('\'');
1244
1245         while(1) {
1246                 switch(c) {
1247                 case '\\': {
1248                         wchar_rep_t tc = parse_escape_sequence();
1249                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1250                         break;
1251                 }
1252
1253                 MATCH_NEWLINE(
1254                         parse_error("newline while parsing character constant");
1255                         break;
1256                 )
1257
1258                 case '\'':
1259                         next_char();
1260                         goto end_of_wide_char_constant;
1261
1262                 case EOF: {
1263                         source_position_t source_position = lexer_token.source_position;
1264                         source_position.linenr = start_linenr;
1265                         errorf(&source_position, "EOF while parsing character constant");
1266                         lexer_token.type = T_ERROR;
1267                         return;
1268                 }
1269
1270                 default: {
1271                         wchar_rep_t tc = (wchar_rep_t) c;
1272                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1273                         next_char();
1274                         break;
1275                 }
1276                 }
1277         }
1278
1279 end_of_wide_char_constant:;
1280         size_t             size   = (size_t) obstack_object_size(&symbol_obstack);
1281         assert(size % sizeof(wchar_rep_t) == 0);
1282         size /= sizeof(wchar_rep_t);
1283
1284         const wchar_rep_t *string = obstack_finish(&symbol_obstack);
1285
1286         lexer_token.type                = T_WIDE_CHARACTER_CONSTANT;
1287         lexer_token.v.wide_string.begin = string;
1288         lexer_token.v.wide_string.size  = size;
1289         lexer_token.datatype            = type_wchar_t;
1290 }
1291
1292 /**
1293  * Parse a wide string literal and set lexer_token.
1294  */
1295 static void parse_wide_string_literal(void)
1296 {
1297         const unsigned start_linenr = lexer_token.source_position.linenr;
1298
1299         assert(c == '"');
1300         next_char();
1301
1302         while(1) {
1303                 switch(c) {
1304                 case '\\': {
1305                         wchar_rep_t tc = parse_escape_sequence();
1306                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1307                         break;
1308                 }
1309
1310                 case EOF: {
1311                         source_position_t source_position;
1312                         source_position.input_name = lexer_token.source_position.input_name;
1313                         source_position.linenr     = start_linenr;
1314                         errorf(&source_position, "string has no end");
1315                         lexer_token.type = T_ERROR;
1316                         return;
1317                 }
1318
1319                 case '"':
1320                         next_char();
1321                         goto end_of_string;
1322
1323                 default: {
1324                         wchar_rep_t tc = c;
1325                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1326                         next_char();
1327                         break;
1328                 }
1329                 }
1330         }
1331
1332 end_of_string:;
1333
1334         /* TODO: concatenate multiple strings separated by whitespace... */
1335
1336         /* add finishing 0 to the string */
1337         wchar_rep_t nul = L'\0';
1338         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
1339         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
1340         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1341
1342 #if 0 /* TODO hash */
1343         /* check if there is already a copy of the string */
1344         const wchar_rep_t *const result = strset_insert(&stringset, string);
1345         if(result != string) {
1346                 obstack_free(&symbol_obstack, string);
1347         }
1348 #else
1349         const wchar_rep_t *const result = string;
1350 #endif
1351
1352         lexer_token.type                = T_WIDE_STRING_LITERAL;
1353         lexer_token.v.wide_string.begin = result;
1354         lexer_token.v.wide_string.size  = size;
1355 }
1356
1357 /**
1358  * Parse a character constant and set lexer_token.
1359  */
1360 static void parse_character_constant(void)
1361 {
1362         const unsigned start_linenr = lexer_token.source_position.linenr;
1363
1364         eat('\'');
1365
1366         while(1) {
1367                 switch(c) {
1368                 case '\\': {
1369                         utf32 const tc = parse_escape_sequence();
1370                         if (tc >= 0x100) {
1371                                 warningf(&lexer_token.source_position,
1372                                                 "escape sequence out of range");
1373                         }
1374                         obstack_1grow(&symbol_obstack, tc);
1375                         break;
1376                 }
1377
1378                 MATCH_NEWLINE(
1379                         parse_error("newline while parsing character constant");
1380                         break;
1381                 )
1382
1383                 case '\'':
1384                         next_char();
1385                         goto end_of_char_constant;
1386
1387                 case EOF: {
1388                         source_position_t source_position;
1389                         source_position.input_name = lexer_token.source_position.input_name;
1390                         source_position.linenr     = start_linenr;
1391                         errorf(&source_position, "EOF while parsing character constant");
1392                         lexer_token.type = T_ERROR;
1393                         return;
1394                 }
1395
1396                 default:
1397                         grow_symbol(c);
1398                         next_char();
1399                         break;
1400
1401                 }
1402         }
1403
1404 end_of_char_constant:;
1405         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1406         const char *const string = obstack_finish(&symbol_obstack);
1407
1408         lexer_token.type           = T_CHARACTER_CONSTANT;
1409         lexer_token.v.string.begin = string;
1410         lexer_token.v.string.size  = size;
1411         lexer_token.datatype       = c_mode & _CXX && size == 1 ? type_char : type_int;
1412 }
1413
1414 /**
1415  * Skip a multiline comment.
1416  */
1417 static void skip_multiline_comment(void)
1418 {
1419         unsigned start_linenr = lexer_token.source_position.linenr;
1420
1421         while(1) {
1422                 switch(c) {
1423                 case '/':
1424                         next_char();
1425                         if (c == '*') {
1426                                 /* nested comment, warn here */
1427                                 if (warning.comment) {
1428                                         warningf(&lexer_token.source_position, "'/*' within comment");
1429                                 }
1430                         }
1431                         break;
1432                 case '*':
1433                         next_char();
1434                         if(c == '/') {
1435                                 next_char();
1436                                 return;
1437                         }
1438                         break;
1439
1440                 MATCH_NEWLINE(break;)
1441
1442                 case EOF: {
1443                         source_position_t source_position;
1444                         source_position.input_name = lexer_token.source_position.input_name;
1445                         source_position.linenr     = start_linenr;
1446                         errorf(&source_position, "at end of file while looking for comment end");
1447                         return;
1448                 }
1449
1450                 default:
1451                         next_char();
1452                         break;
1453                 }
1454         }
1455 }
1456
1457 /**
1458  * Skip a single line comment.
1459  */
1460 static void skip_line_comment(void)
1461 {
1462         while(1) {
1463                 switch(c) {
1464                 case EOF:
1465                         return;
1466
1467                 case '\n':
1468                 case '\r':
1469                         return;
1470
1471                 case '\\':
1472                         next_char();
1473                         if (c == '\n' || c == '\r') {
1474                                 if (warning.comment)
1475                                         warningf(&lexer_token.source_position, "multi-line comment");
1476                                 return;
1477                         }
1478                         break;
1479
1480                 default:
1481                         next_char();
1482                         break;
1483                 }
1484         }
1485 }
1486
/**
 * The current preprocessor token.
 * next_pp_token() stores a copy of lexer_token here; the directive parsers
 * inspect it.
 */
static token_t pp_token;
1489
/**
 * Read the next preprocessor token.
 *
 * Lexes one token with lexer_next_preprocessing_token() and stores a copy
 * of the resulting lexer_token in pp_token.
 */
static inline void next_pp_token(void)
{
        lexer_next_preprocessing_token();
        /* the copy must be taken after lexing, lexer_token holds the result */
        pp_token = lexer_token;
}
1498
1499 /**
1500  * Eat all preprocessor tokens until newline.
1501  */
1502 static void eat_until_newline(void)
1503 {
1504         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1505                 next_pp_token();
1506         }
1507 }
1508
1509 /**
1510  * Handle the define directive.
1511  */
1512 static void define_directive(void)
1513 {
1514         lexer_next_preprocessing_token();
1515         if(lexer_token.type != T_IDENTIFIER) {
1516                 parse_error("expected identifier after #define\n");
1517                 eat_until_newline();
1518         }
1519 }
1520
/**
 * Handle the ifdef and ifndef directives.
 *
 * Currently only the token following the directive name is read; the
 * condition is not evaluated.
 *
 * @param is_ifndef  non-zero for ifndef, zero for ifdef (unused so far)
 */
static void ifdef_directive(int is_ifndef)
{
        lexer_next_preprocessing_token();
        /* TODO: expect an identifier followed by a newline */
        (void) is_ifndef;
}
1531
/**
 * Handle the endif directive.
 *
 * Currently a no-op: nothing is read or checked.
 */
static void endif_directive(void)
{
        /* TODO: expect_newline(); */
}
1539
1540 /**
1541  * Parse the line directive.
1542  */
1543 static void parse_line_directive(void)
1544 {
1545         if(pp_token.type != T_INTEGER) {
1546                 parse_error("expected integer");
1547         } else {
1548                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1549                 next_pp_token();
1550         }
1551         if(pp_token.type == T_STRING_LITERAL) {
1552                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1553                 next_pp_token();
1554         }
1555
1556         eat_until_newline();
1557 }
1558
/**
 * STDC pragmas.
 * Identifies which "#pragma STDC" name parse_pragma() has recognised.
 */
typedef enum stdc_pragma_kind_t {
        STDC_UNKNOWN,          /* no known pragma name was found */
        STDC_FP_CONTRACT,      /* selected by the FP_CONTRACT symbol */
        STDC_FENV_ACCESS,      /* selected by the FENV_ACCESS symbol */
        STDC_CX_LIMITED_RANGE  /* selected by the CX_LIMITED_RANGE symbol */
} stdc_pragma_kind_t;
1568
/**
 * STDC pragma values.
 * Identifies which argument of a "#pragma STDC" parse_pragma() has
 * recognised.
 */
typedef enum stdc_pragma_value_kind_t {
        STDC_VALUE_UNKNOWN,  /* no known argument was found */
        STDC_VALUE_ON,       /* selected by the ON symbol */
        STDC_VALUE_OFF,      /* selected by the OFF symbol */
        STDC_VALUE_DEFAULT   /* selected by the DEFAULT symbol */
} stdc_pragma_value_kind_t;
1578
1579 /**
1580  * Parse a pragma directive.
1581  */
1582 static void parse_pragma(void) {
1583         bool unknown_pragma = true;
1584
1585         next_pp_token();
1586         if (pp_token.v.symbol->pp_ID == TP_STDC) {
1587                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1588                 /* a STDC pragma */
1589                 if (c_mode & _C99) {
1590                         next_pp_token();
1591
1592                         switch (pp_token.v.symbol->pp_ID) {
1593                         case TP_FP_CONTRACT:
1594                                 kind = STDC_FP_CONTRACT;
1595                                 break;
1596                         case TP_FENV_ACCESS:
1597                                 kind = STDC_FENV_ACCESS;
1598                                 break;
1599                         case TP_CX_LIMITED_RANGE:
1600                                 kind = STDC_CX_LIMITED_RANGE;
1601                                 break;
1602                         default:
1603                                 break;
1604                         }
1605                         if (kind != STDC_UNKNOWN) {
1606                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1607                                 next_pp_token();
1608                                 switch (pp_token.v.symbol->pp_ID) {
1609                                 case TP_ON:
1610                                         value = STDC_VALUE_ON;
1611                                         break;
1612                                 case TP_OFF:
1613                                         value = STDC_VALUE_OFF;
1614                                         break;
1615                                 case TP_DEFAULT:
1616                                         value = STDC_VALUE_DEFAULT;
1617                                         break;
1618                                 default:
1619                                         break;
1620                                 }
1621                                 if (value != STDC_VALUE_UNKNOWN) {
1622                                         unknown_pragma = false;
1623                                 } else {
1624                                         errorf(&pp_token.source_position, "bad STDC pragma argument");
1625                                 }
1626                         }
1627                 }
1628         } else {
1629                 unknown_pragma = true;
1630         }
1631         eat_until_newline();
1632         if (unknown_pragma && warning.unknown_pragmas) {
1633                 warningf(&pp_token.source_position, "encountered unknown #pragma");
1634         }
1635 }
1636
1637 /**
1638  * Parse a preprocessor non-null directive.
1639  */
1640 static void parse_preprocessor_identifier(void)
1641 {
1642         assert(pp_token.type == T_IDENTIFIER);
1643         symbol_t *symbol = pp_token.v.symbol;
1644
1645         switch(symbol->pp_ID) {
1646         case TP_include:
1647                 printf("include - enable header name parsing!\n");
1648                 break;
1649         case TP_define:
1650                 define_directive();
1651                 break;
1652         case TP_ifdef:
1653                 ifdef_directive(0);
1654                 break;
1655         case TP_ifndef:
1656                 ifdef_directive(1);
1657                 break;
1658         case TP_endif:
1659                 endif_directive();
1660                 break;
1661         case TP_line:
1662                 next_pp_token();
1663                 parse_line_directive();
1664                 break;
1665         case TP_if:
1666         case TP_else:
1667         case TP_elif:
1668         case TP_undef:
1669         case TP_error:
1670                 /* TODO; output the rest of the line */
1671                 parse_error("#error directive: ");
1672                 break;
1673         case TP_pragma:
1674                 parse_pragma();
1675                 break;
1676         }
1677 }
1678
1679 /**
1680  * Parse a preprocessor directive.
1681  */
1682 static void parse_preprocessor_directive(void)
1683 {
1684         next_pp_token();
1685
1686         switch(pp_token.type) {
1687         case T_IDENTIFIER:
1688                 parse_preprocessor_identifier();
1689                 break;
1690         case T_INTEGER:
1691                 parse_line_directive();
1692                 break;
1693         case '\n':
1694                 /* NULL directive, see § 6.10.7 */
1695                 break;
1696         default:
1697                 parse_error("invalid preprocessor directive");
1698                 eat_until_newline();
1699                 break;
1700         }
1701 }
1702
/*
 * Helper macros for tokens that depend on the following character(s).
 * They are meant to be used together: MAYBE_PROLOG, then any number of
 * MAYBE() entries, then a closing ELSE() or ELSE_CODE().
 * NOTE(review): the macros are used outside of this excerpt - presumably
 * inside a case of a switch over the current character; confirm there.
 */

/* Consumes the current character and opens a loop containing a switch over
 * the next character.  Both are left open for the macros below. */
#define MAYBE_PROLOG                                       \
                        next_char();                                   \
                        while(1) {                                     \
                                switch(c) {

/* Case for character ch: consumes it, sets the token type to set_type and
 * returns from the enclosing function. */
#define MAYBE(ch, set_type)                                \
                                case ch:                                   \
                                        next_char();                           \
                                        lexer_token.type = set_type;           \
                                        return;

/* Default case executing code; closes the switch and the loop opened by
 * MAYBE_PROLOG and ends with a break statement. */
#define ELSE_CODE(code)                                    \
                                default:                                   \
                                        code                                   \
                                }                                          \
                        } /* end of while(1) */                        \
                        break;

/* Default case that sets the token type to set_type and returns from the
 * enclosing function. */
#define ELSE(set_type)                                     \
                ELSE_CODE(                                         \
                        lexer_token.type = set_type;                   \
                        return;                                        \
                )
1726
/**
 * Scan the next preprocessing token and store it in the global lexer_token.
 *
 * The function dispatches on the current character `c`:
 *  - spaces and tabs are skipped,
 *  - a line break is reported as a token of type '\n',
 *  - identifiers, numbers, string literals and character constants are
 *    delegated to the corresponding parse_*() helpers,
 *  - operators and punctuators (including the digraphs <: :> <% %> %: %:%:)
 *    are recognised via the MAYBE_PROLOG / MAYBE / ELSE macros,
 *  - comments are skipped and the token following them is returned,
 *  - EOF yields T_EOF,
 *  - any other character is reported with errorf() and yields T_ERROR.
 */
1727 void lexer_next_preprocessing_token(void)
1728 {
1729         while(1) {
1730                 switch(c) {
                     /* horizontal whitespace: consume it and look again */
1731                 case ' ':
1732                 case '\t':
1733                         next_char();
1734                         break;
1735
                     /* MATCH_NEWLINE is defined elsewhere in this file; the code
                      * passed to it reports the line break as a '\n' token */
1736                 MATCH_NEWLINE(
1737                         lexer_token.type = '\n';
1738                         return;
1739                 )
1740
                     /* SYMBOL_CHARS is defined elsewhere; presumably it expands to
                      * the case labels of characters that can start a symbol */
1741                 SYMBOL_CHARS
1742                         parse_symbol();
1743                         /* might be a wide string ( L"string" ) */
1744                         if(lexer_token.type == T_IDENTIFIER &&
1745                             lexer_token.v.symbol == symbol_L) {
1746                             if(c == '"') {
1747                                         parse_wide_string_literal();
1748                                 } else if(c == '\'') {
1749                                         parse_wide_character_constant();
1750                                 }
1751                         }
1752                         return;
1753
                     /* DIGITS is defined elsewhere; presumably the case labels
                      * for the decimal digits */
1754                 DIGITS
1755                         parse_number();
1756                         return;
1757
1758                 case '"':
1759                         parse_string_literal();
1760                         return;
1761
1762                 case '\'':
1763                         parse_character_constant();
1764                         return;
1765
                     /* '.' may start a number (".5"), the ellipsis "..." or be a
                      * plain '.'. When the lookahead does not pan out, the
                      * character just read is pushed back with put_back() and
                      * `c` is reset to '.' before continuing. */
1766                 case '.':
1767                         MAYBE_PROLOG
1768                                 DIGITS
1769                                         put_back(c);
1770                                         c = '.';
1771                                         parse_number_dec();
1772                                         return;
1773
1774                                 case '.':
1775                                         MAYBE_PROLOG
1776                                         MAYBE('.', T_DOTDOTDOT)
1777                                         ELSE_CODE(
1778                                                 put_back(c);
1779                                                 c = '.';
1780                                                 lexer_token.type = '.';
1781                                                 return;
1782                                         )
1783                         ELSE('.')
1784                 case '&':
1785                         MAYBE_PROLOG
1786                         MAYBE('&', T_ANDAND)
1787                         MAYBE('=', T_ANDEQUAL)
1788                         ELSE('&')
1789                 case '*':
1790                         MAYBE_PROLOG
1791                         MAYBE('=', T_ASTERISKEQUAL)
1792                         ELSE('*')
1793                 case '+':
1794                         MAYBE_PROLOG
1795                         MAYBE('+', T_PLUSPLUS)
1796                         MAYBE('=', T_PLUSEQUAL)
1797                         ELSE('+')
1798                 case '-':
1799                         MAYBE_PROLOG
1800                         MAYBE('>', T_MINUSGREATER)
1801                         MAYBE('-', T_MINUSMINUS)
1802                         MAYBE('=', T_MINUSEQUAL)
1803                         ELSE('-')
1804                 case '!':
1805                         MAYBE_PROLOG
1806                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1807                         ELSE('!')
                     /* '/' is either "/=", the start of a comment or a plain '/'.
                      * After a comment has been skipped the function calls itself
                      * to deliver the token that follows the comment. */
1808                 case '/':
1809                         MAYBE_PROLOG
1810                         MAYBE('=', T_SLASHEQUAL)
1811                                 case '*':
1812                                         next_char();
1813                                         skip_multiline_comment();
1814                                         lexer_next_preprocessing_token();
1815                                         return;
1816                                 case '/':
1817                                         next_char();
1818                                         skip_line_comment();
1819                                         lexer_next_preprocessing_token();
1820                                         return;
1821                         ELSE('/')
                     /* '%' covers "%=", and the digraphs "%>" for '}', "%:" for
                      * '#' and "%:%:" for "##". If "%:%" is not followed by ':',
                      * '#' is returned and `c` is reset to '%' so that the second
                      * '%' is scanned again on the next call. */
1822                 case '%':
1823                         MAYBE_PROLOG
1824                         MAYBE('>', '}')
1825                         MAYBE('=', T_PERCENTEQUAL)
1826                                 case ':':
1827                                         MAYBE_PROLOG
1828                                                 case '%':
1829                                                         MAYBE_PROLOG
1830                                                         MAYBE(':', T_HASHHASH)
1831                                                         ELSE_CODE(
1832                                                                 put_back(c);
1833                                                                 c = '%';
1834                                                                 lexer_token.type = '#';
1835                                                                 return;
1836                                                         )
1837                                         ELSE('#')
1838                         ELSE('%')
                     /* '<' also covers the digraphs "<:" for '[' and "<%" for '{' */
1839                 case '<':
1840                         MAYBE_PROLOG
1841                         MAYBE(':', '[')
1842                         MAYBE('%', '{')
1843                         MAYBE('=', T_LESSEQUAL)
1844                                 case '<':
1845                                         MAYBE_PROLOG
1846                                         MAYBE('=', T_LESSLESSEQUAL)
1847                                         ELSE(T_LESSLESS)
1848                         ELSE('<')
1849                 case '>':
1850                         MAYBE_PROLOG
1851                         MAYBE('=', T_GREATEREQUAL)
1852                                 case '>':
1853                                         MAYBE_PROLOG
1854                                         MAYBE('=', T_GREATERGREATEREQUAL)
1855                                         ELSE(T_GREATERGREATER)
1856                         ELSE('>')
1857                 case '^':
1858                         MAYBE_PROLOG
1859                         MAYBE('=', T_CARETEQUAL)
1860                         ELSE('^')
1861                 case '|':
1862                         MAYBE_PROLOG
1863                         MAYBE('=', T_PIPEEQUAL)
1864                         MAYBE('|', T_PIPEPIPE)
1865                         ELSE('|')
                     /* ':' also covers the digraph ":>" for ']' */
1866                 case ':':
1867                         MAYBE_PROLOG
1868                         MAYBE('>', ']')
1869                         ELSE(':')
1870                 case '=':
1871                         MAYBE_PROLOG
1872                         MAYBE('=', T_EQUALEQUAL)
1873                         ELSE('=')
1874                 case '#':
1875                         MAYBE_PROLOG
1876                         MAYBE('#', T_HASHHASH)
1877                         ELSE('#')
1878
                     /* single-character tokens: the character value itself is
                      * used as the token type */
1879                 case '?':
1880                 case '[':
1881                 case ']':
1882                 case '(':
1883                 case ')':
1884                 case '{':
1885                 case '}':
1886                 case '~':
1887                 case ';':
1888                 case ',':
1889                 case '\\':
1890                         lexer_token.type = c;
1891                         next_char();
1892                         return;
1893
                     /* end of input: `c` is not advanced */
1894                 case EOF:
1895                         lexer_token.type = T_EOF;
1896                         return;
1897
                     /* Anything else is an error. NOTE(review): no goto to
                      * dollar_sign is visible in this function's text; it is
                      * presumably reached from the SYMBOL_CHARS expansion --
                      * confirm before touching the label. */
1898                 default:
1899 dollar_sign:
1900                         errorf(&lexer_token.source_position, "unknown character '%c' found", c);
1901                         next_char();
1902                         lexer_token.type = T_ERROR;
1903                         return;
1904                 }
1905         }
1906 }
1907
1908 void lexer_next_token(void)
1909 {
1910         lexer_next_preprocessing_token();
1911
1912         while (lexer_token.type == '\n') {
1913 newline_found:
1914                 lexer_next_preprocessing_token();
1915         }
1916
1917         if (lexer_token.type == '#') {
1918                 parse_preprocessor_directive();
1919                 goto newline_found;
1920         }
1921 }
1922
1923 void init_lexer(void)
1924 {
1925         strset_init(&stringset);
1926         symbol_L = symbol_table_insert("L");
1927 }
1928
1929 void lexer_open_stream(FILE *stream, const char *input_name)
1930 {
1931         input                                  = stream;
1932         lexer_token.source_position.linenr     = 0;
1933         lexer_token.source_position.input_name = input_name;
1934
1935         bufpos = NULL;
1936         bufend = NULL;
1937
1938         /* place a virtual \n at the beginning so the lexer knows that we're
1939          * at the beginning of a line */
1940         c = '\n';
1941 }
1942
1943 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
1944 {
1945         input                                  = NULL;
1946         lexer_token.source_position.linenr     = 0;
1947         lexer_token.source_position.input_name = input_name;
1948
1949 #if 0 // TODO
1950         bufpos = buffer;
1951         bufend = buffer + len;
1952 #else
1953         (void)buffer;
1954         (void)len;
1955         panic("builtin lexing not done yet");
1956 #endif
1957
1958         /* place a virtual \n at the beginning so the lexer knows that we're
1959          * at the beginning of a line */
1960         c = '\n';
1961 }
1962
1963 void exit_lexer(void)
1964 {
1965         strset_destroy(&stringset);
1966 }
1967
1968 static __attribute__((unused))
1969 void dbg_pos(const source_position_t source_position)
1970 {
1971         fprintf(stdout, "%s:%u\n", source_position.input_name,
1972                 source_position.linenr);
1973         fflush(stdout);
1974 }