nsz Git - cparser/blob - lexer.c

   1 /*
   2  * This file is part of cparser.
   3  * Copyright (C) 2007-2008 Matthias Braun <matze@braunis.de>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18  * 02111-1307, USA.
  19  */
  20 #include <config.h>
  21
  22 #include "diagnostic.h"
  23 #include "lexer.h"
  24 #include "symbol_t.h"
  25 #include "token_t.h"
  26 #include "symbol_table_t.h"
  27 #include "adt/error.h"
  28 #include "adt/strset.h"
  29 #include "adt/util.h"
  30 #include "types.h"
  31 #include "type_t.h"
  32 #include "target_architecture.h"
  33 #include "parser.h"
  34 #include "warning.h"
  35 #include "lang_features.h"
  36
  37 #include <assert.h>
  38 #include <errno.h>
  39 #include <string.h>
  40 #include <stdbool.h>
  41 #include <ctype.h>
  42
/* Uncomment to make put_back() and next_char() trace characters via printf. */
//#define DEBUG_CHARS
/* Number of slots kept free at the front of buf; the decoders start writing
 * at buf + MAX_PUTBACK so that put_back() can move bufpos backwards. */
#define MAX_PUTBACK 3
/* Number of raw bytes requested from the input per read_block() call. */
#define BUF_SIZE    1024

#if defined(_WIN32) || defined(__CYGWIN__)
/* No strtold on windows and no replacement yet */
#define strtold(s, e) strtod(s, e)
#endif

/* One decoded input character; also used to hold the EOF marker. */
typedef unsigned int utf32;

/* Current input character, set by next_real_char()/next_char(). */
static utf32        c;
/* The token currently being built; also supplies the position for diagnostics. */
token_t             lexer_token;
/* NOTE(review): not referenced in this part of the file -- presumably the
 * symbol for the "L" wide-literal prefix; confirm at its use site. */
symbol_t           *symbol_L;
/* Stream the decoders read from; next_real_char() yields EOF when it is NULL. */
static FILE        *input;
/* Decoded characters, preceded by MAX_PUTBACK put-back slots. */
static utf32        buf[BUF_SIZE + MAX_PUTBACK];
/* One past the last valid character in buf. */
static const utf32 *bufend;
/* Next character in buf to be delivered. */
static const utf32 *bufpos;
/* NOTE(review): not referenced in this part of the file. */
static strset_t     stringset;
/* When false, SYMBOL_CHARS does not treat '$' as an identifier character. */
bool                allow_dollar_in_symbol = true;
  63
/**
 * Reports an error through errorf() at the source position stored in
 * lexer_token.
 *
 * The text is passed as the argument of a "%s" format, so any '%' it
 * contains is printed literally and never interpreted as a conversion.
 *
 * @param msg   the error message
 */
static void parse_error(const char *msg)
{
        errorf(&lexer_token.source_position, "%s", msg);
}
  73
/**
 * Reports an internal error through internal_errorf() at the source position
 * stored in lexer_token.
 *
 * NOTE(review): NORETURN is defined outside this file; it presumably expands
 * to the return type plus a "does not return" attribute -- confirm.
 *
 * @param msg   the error message (passed as the argument of a "%s" format)
 */
static NORETURN internal_error(const char *msg)
{
        internal_errorf(&lexer_token.source_position, "%s", msg);
}
  83
  84 static size_t read_block(unsigned char *const read_buf, size_t const n)
  85 {
  86         size_t const s = fread(read_buf, 1, n, input);
  87         if (s == 0) {
  88                 if (ferror(input))
  89                         parse_error("read from input failed");
  90                 buf[MAX_PUTBACK] = EOF;
  91                 bufpos           = buf + MAX_PUTBACK;
  92                 bufend           = buf + MAX_PUTBACK + 1;
  93         }
  94         return s;
  95 }
  96
  97 static void decode_iso_8859_1(void)
  98 {
  99         unsigned char read_buf[BUF_SIZE];
 100         size_t const s = read_block(read_buf, sizeof(read_buf));
 101         if (s == 0)
 102                 return;
 103
 104         unsigned char const *src = read_buf;
 105         unsigned char const *end = read_buf + s;
 106         utf32               *dst = buf + MAX_PUTBACK;
 107         while (src != end)
 108                 *dst++ = *src++;
 109
 110         bufpos = buf + MAX_PUTBACK;
 111         bufend = dst;
 112 }
 113
 114 static void decode_iso_8859_15(void)
 115 {
 116         unsigned char read_buf[BUF_SIZE];
 117         size_t const s = read_block(read_buf, sizeof(read_buf));
 118         if (s == 0)
 119                 return;
 120
 121         unsigned char const *src = read_buf;
 122         unsigned char const *end = read_buf + s;
 123         utf32               *dst = buf + MAX_PUTBACK;
 124         while (src != end) {
 125                 utf32 tc = *src++;
 126                 switch (tc) {
 127                         case 0xA4: tc = 0x20AC; break; // €
 128                         case 0xA6: tc = 0x0160; break; // Š
 129                         case 0xA8: tc = 0x0161; break; // š
 130                         case 0xB4: tc = 0x017D; break; // Ž
 131                         case 0xB8: tc = 0x017E; break; // ž
 132                         case 0xBC: tc = 0x0152; break; // Œ
 133                         case 0xBD: tc = 0x0153; break; // œ
 134                         case 0xBE: tc = 0x0178; break; // Ÿ
 135                 }
 136                 *dst++ = tc;
 137         }
 138
 139         bufpos = buf + MAX_PUTBACK;
 140         bufend = dst;
 141 }
 142
 143 static void decode_utf8(void)
 144 {
 145         static utf32  part_decoded_min_code;
 146         static utf32  part_decoded_char;
 147         static size_t part_decoded_rest_len;
 148
 149         do {
 150                 unsigned char read_buf[BUF_SIZE];
 151                 size_t const s = read_block(read_buf, sizeof(read_buf));
 152                 if (s == 0) {
 153                         if (part_decoded_rest_len > 0)
 154                                 parse_error("incomplete input char at end of input");
 155                         return;
 156                 }
 157
 158                 unsigned char const *src = read_buf;
 159                 unsigned char const *end = read_buf + s;
 160                 utf32               *dst = buf + MAX_PUTBACK;
 161                 utf32                decoded;
 162                 utf32                min_code;
 163
 164                 if (part_decoded_rest_len != 0) {
 165                         min_code = part_decoded_min_code;
 166                         decoded  = part_decoded_char;
 167                         switch (part_decoded_rest_len) {
 168                                 case 4:  goto realign;
 169                                 case 3:  goto three_more;
 170                                 case 2:  goto two_more;
 171                                 default: goto one_more;
 172                         }
 173                 }
 174
 175                 while (src != end) {
 176                         if ((*src & 0x80) == 0) {
 177                                 decoded = *src++;
 178                         } else if ((*src & 0xE0) == 0xC0) {
 179                                 min_code = 0x80;
 180                                 decoded  = *src++ & 0x1F;
 181 one_more:
 182                                 if (src == end) {
 183                                         part_decoded_min_code = min_code;
 184                                         part_decoded_char     = decoded;
 185                                         part_decoded_rest_len = 1;
 186                                         break;
 187                                 }
 188                                 if ((*src & 0xC0) == 0x80) {
 189                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 190                                 } else {
 191                                         goto invalid_char;
 192                                 }
 193                                 if (decoded < min_code                      ||
 194                                                 decoded > 0x10FFFF                      ||
 195                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
 196                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
 197                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
 198                                         parse_error("invalid byte sequence in input");
 199                                 }
 200                         } else if ((*src & 0xF0) == 0xE0) {
 201                                 min_code = 0x800;
 202                                 decoded  = *src++ & 0x0F;
 203 two_more:
 204                                 if (src == end) {
 205                                         part_decoded_min_code = min_code;
 206                                         part_decoded_char     = decoded;
 207                                         part_decoded_rest_len = 2;
 208                                         break;
 209                                 }
 210                                 if ((*src & 0xC0) == 0x80) {
 211                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 212                                 } else {
 213                                         goto invalid_char;
 214                                 }
 215                                 goto one_more;
 216                         } else if ((*src & 0xF8) == 0xF0) {
 217                                 min_code = 0x10000;
 218                                 decoded  = *src++ & 0x07;
 219 three_more:
 220                                 if (src == end) {
 221                                         part_decoded_min_code = min_code;
 222                                         part_decoded_char     = decoded;
 223                                         part_decoded_rest_len = 3;
 224                                         break;
 225                                 }
 226                                 if ((*src & 0xC0) == 0x80) {
 227                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 228                                 } else {
 229                                         goto invalid_char;
 230                                 }
 231                                 goto two_more;
 232                         } else {
 233 invalid_char:
 234                                 parse_error("invalid byte sequence in input");
 235 realign:
 236                                 do {
 237                                         ++src;
 238                                         if (src == end) {
 239                                                 part_decoded_rest_len = 4;
 240                                                 break;
 241                                         }
 242                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
 243                                 continue;
 244                         }
 245                         *dst++ = decoded;
 246                 }
 247
 248                 bufpos = buf + MAX_PUTBACK;
 249                 bufend = dst;
 250         } while (bufpos == bufend);
 251 }
 252
/* A decoder refills the character buffer (bufpos/bufend) from the input. */
typedef void (*decoder_t)(void);

/* The decoder used by next_real_char(); UTF-8 unless changed through
 * select_input_encoding(). */
static decoder_t decoder = decode_utf8;

/* Associates an encoding name with the decoder that handles it. */
typedef struct named_decoder_t {
        char const *name;
        decoder_t   decoder;
} named_decoder_t;

/* Known encoding names, terminated by an entry whose name is NULL.
 * select_input_encoding() compares names case-insensitively. */
static named_decoder_t const decoders[] = {
        { "CP819",           decode_iso_8859_1  }, // official alias
        { "IBM819",          decode_iso_8859_1  }, // official alias
        { "ISO-8859-1",      decode_iso_8859_1  }, // official alias
        { "ISO-8859-15",     decode_iso_8859_15 }, // official name
        { "ISO8859-1",       decode_iso_8859_1  },
        { "ISO8859-15",      decode_iso_8859_15 },
        { "ISO_8859-1",      decode_iso_8859_1  }, // official alias
        { "ISO_8859-15",     decode_iso_8859_15 }, // official alias
        { "ISO_8859-1:1987", decode_iso_8859_1  }, // official name
        { "Latin-9",         decode_iso_8859_15 }, // official alias
        { "UTF-8",           decode_utf8        }, // official name
        { "csISOLatin1",     decode_iso_8859_1  }, // official alias
        { "iso-ir-100",      decode_iso_8859_1  }, // official alias
        { "l1",              decode_iso_8859_1  }, // official alias
        { "latin1",          decode_iso_8859_1  }, // official alias

        { NULL,              NULL               }
};
 281
 282 void select_input_encoding(char const* const encoding)
 283 {
 284         for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
 285                 if (strcasecmp(encoding, i->name) != 0)
 286                         continue;
 287                 decoder = i->decoder;
 288                 return;
 289         }
 290         fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
 291 }
 292
 293 static inline void next_real_char(void)
 294 {
 295         assert(bufpos <= bufend);
 296         if (bufpos >= bufend) {
 297                 if (input == NULL) {
 298                         c = EOF;
 299                         return;
 300                 }
 301                 decoder();
 302         }
 303         c = *bufpos++;
 304 }
 305
 306 /**
 307  * Put a character back into the buffer.
 308  *
 309  * @param pc  the character to put back
 310  */
 311 static inline void put_back(utf32 const pc)
 312 {
 313         assert(bufpos > buf);
 314         *(--bufpos - buf + buf) = pc;
 315
 316 #ifdef DEBUG_CHARS
 317         printf("putback '%lc'\n", pc);
 318 #endif
 319 }
 320
/* Declared early because the macros and maybe_concat_lines() below use
 * next_char() before its definition. */
static inline void next_char(void);

/*
 * Expands to two switch cases, one for '\r' (optionally followed by '\n')
 * and one for '\n'. Each case consumes the line ending, increments the line
 * number of lexer_token.source_position and then executes `code`.
 * `code` must leave the case itself (return/break/goto), otherwise the
 * '\r' case falls through into the '\n' case.
 */
#define MATCH_NEWLINE(code)                   \
        case '\r':                                \
                next_char();                          \
                if(c == '\n') {                       \
                        next_char();                      \
                }                                     \
                lexer_token.source_position.linenr++; \
                code                                  \
        case '\n':                                \
                next_char();                          \
                lexer_token.source_position.linenr++; \
                code

/* Asserts that the current character is c_type and advances past it. */
#define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 337
 338 static void maybe_concat_lines(void)
 339 {
 340         eat('\\');
 341
 342         switch(c) {
 343         MATCH_NEWLINE(return;)
 344
 345         default:
 346                 break;
 347         }
 348
 349         put_back(c);
 350         c = '\\';
 351 }
 352
 353 /**
 354  * Set c to the next input character, ie.
 355  * after expanding trigraphs.
 356  */
 357 static inline void next_char(void)
 358 {
 359         next_real_char();
 360
 361         /* filter trigraphs */
 362         if(UNLIKELY(c == '\\')) {
 363                 maybe_concat_lines();
 364                 goto end_of_next_char;
 365         }
 366
 367         if(LIKELY(c != '?'))
 368                 goto end_of_next_char;
 369
 370         next_real_char();
 371         if(LIKELY(c != '?')) {
 372                 put_back(c);
 373                 c = '?';
 374                 goto end_of_next_char;
 375         }
 376
 377         next_real_char();
 378         switch(c) {
 379         case '=': c = '#'; break;
 380         case '(': c = '['; break;
 381         case '/': c = '\\'; maybe_concat_lines(); break;
 382         case ')': c = ']'; break;
 383         case '\'': c = '^'; break;
 384         case '<': c = '{'; break;
 385         case '!': c = '|'; break;
 386         case '>': c = '}'; break;
 387         case '-': c = '~'; break;
 388         default:
 389                 put_back(c);
 390                 put_back('?');
 391                 c = '?';
 392                 break;
 393         }
 394
 395 end_of_next_char:;
 396 #ifdef DEBUG_CHARS
 397         printf("nchar '%c'\n", c);
 398 #endif
 399 }
 400
/*
 * Switch cases for every character that may start an identifier: '$',
 * the letters a-z and A-Z, and '_'.
 *
 * The '$' case jumps to a label named `dollar_sign` when
 * allow_dollar_in_symbol is false, so every switch that uses this macro has
 * to provide that label; otherwise '$' falls through to the letter cases.
 */
#define SYMBOL_CHARS  \
        case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
        case 'a':         \
        case 'b':         \
        case 'c':         \
        case 'd':         \
        case 'e':         \
        case 'f':         \
        case 'g':         \
        case 'h':         \
        case 'i':         \
        case 'j':         \
        case 'k':         \
        case 'l':         \
        case 'm':         \
        case 'n':         \
        case 'o':         \
        case 'p':         \
        case 'q':         \
        case 'r':         \
        case 's':         \
        case 't':         \
        case 'u':         \
        case 'v':         \
        case 'w':         \
        case 'x':         \
        case 'y':         \
        case 'z':         \
        case 'A':         \
        case 'B':         \
        case 'C':         \
        case 'D':         \
        case 'E':         \
        case 'F':         \
        case 'G':         \
        case 'H':         \
        case 'I':         \
        case 'J':         \
        case 'K':         \
        case 'L':         \
        case 'M':         \
        case 'N':         \
        case 'O':         \
        case 'P':         \
        case 'Q':         \
        case 'R':         \
        case 'S':         \
        case 'T':         \
        case 'U':         \
        case 'V':         \
        case 'W':         \
        case 'X':         \
        case 'Y':         \
        case 'Z':         \
        case '_':

/* Switch cases for the decimal digits '0' to '9'. */
#define DIGITS        \
        case '0':         \
        case '1':         \
        case '2':         \
        case '3':         \
        case '4':         \
        case '5':         \
        case '6':         \
        case '7':         \
        case '8':         \
        case '9':
 468
 469 /**
 470  * Read a symbol from the input and build
 471  * the lexer_token.
 472  */
 473 static void parse_symbol(void)
 474 {
 475         symbol_t *symbol;
 476         char     *string;
 477
 478         obstack_1grow(&symbol_obstack, (char) c);
 479         next_char();
 480
 481         while(1) {
 482                 switch(c) {
 483                 DIGITS
 484                 SYMBOL_CHARS
 485                         obstack_1grow(&symbol_obstack, (char) c);
 486                         next_char();
 487                         break;
 488
 489                 default:
 490 dollar_sign:
 491                         goto end_symbol;
 492                 }
 493         }
 494
 495 end_symbol:
 496         obstack_1grow(&symbol_obstack, '\0');
 497
 498         string = obstack_finish(&symbol_obstack);
 499         symbol = symbol_table_insert(string);
 500
 501         lexer_token.type     = symbol->ID;
 502         lexer_token.v.symbol = symbol;
 503
 504         if(symbol->string != string) {
 505                 obstack_free(&symbol_obstack, string);
 506         }
 507 }
 508
/**
 * Consumes an optional integer suffix (U, L, LL and their combinations, in
 * either case) and sets lexer_token.datatype for the integer value already
 * stored in lexer_token.v.intvalue.
 *
 * The suffix determines the minimum rank (long / long long) and whether the
 * type is unsigned; the value then selects the first type in the candidate
 * list that can hold it. Octal and hexadecimal literals without a U suffix
 * may additionally become unsigned.
 *
 * @param is_oct_hex  true if the literal was written in octal or hexadecimal
 */
static void parse_integer_suffix(bool is_oct_hex)
{
        bool is_unsigned     = false;
        bool min_long        = false;
        bool min_longlong    = false;
        bool not_traditional = false;  /* set for the suffixes warned about below */
        int  pos             = 0;
        char suffix[4];                /* at most three suffix letters plus '\0' */

        if (c == 'U' || c == 'u') {
                /* U, UL, ULL */
                not_traditional = true;
                suffix[pos++]   = toupper(c);
                is_unsigned     = true;
                next_char();
                if (c == 'L' || c == 'l') {
                        suffix[pos++] = toupper(c);
                        min_long = true;
                        next_char();
                        if (c == 'L' || c == 'l') {
                                suffix[pos++] = toupper(c);
                                min_longlong = true;
                                next_char();
                        }
                }
        } else if (c == 'l' || c == 'L') {
                /* L, LL, LLU, LU */
                suffix[pos++] = toupper(c);
                min_long = true;
                next_char();
                if (c == 'l' || c == 'L') {
                        not_traditional = true;
                        suffix[pos++]   = toupper(c);
                        min_longlong    = true;
                        next_char();
                        if (c == 'u' || c == 'U') {
                                suffix[pos++] = toupper(c);
                                is_unsigned   = true;
                                next_char();
                        }
                } else if (c == 'u' || c == 'U') {
                        not_traditional = true;
                        suffix[pos++]   = toupper(c);
                        is_unsigned     = true;
                        next_char();
                        /* NOTE(review): this assignment looks redundant -- the
                         * unsigned branch at the end of the function assigns
                         * datatype again on every path. */
                        lexer_token.datatype = type_unsigned_long;
                }
        }

        if (warning.traditional && not_traditional) {
                suffix[pos] = '\0';
                warningf(&lexer_token.source_position,
                        "traditional C rejects the '%s' suffix", suffix);
        }
        if (!is_unsigned) {
                /* No U suffix: try int, long, long long in that order; octal and
                 * hex literals may also take the unsigned type of the same rank. */
                long long v = lexer_token.v.intvalue;
                if (!min_long) {
                        if (v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
                                lexer_token.datatype = type_int;
                                return;
                        } else if (is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
                                lexer_token.datatype = type_unsigned_int;
                                return;
                        }
                }
                if (!min_longlong) {
                        if (v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
                                lexer_token.datatype = type_long;
                                return;
                        } else if (is_oct_hex && v >= 0 && (unsigned long long)v <= (unsigned long long)TARGET_ULONG_MAX) {
                                lexer_token.datatype = type_unsigned_long;
                                return;
                        }
                }
                unsigned long long uv = (unsigned long long) v;
                if (is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
                        lexer_token.datatype = type_unsigned_long_long;
                        return;
                }

                lexer_token.datatype = type_long_long;
        } else {
                /* U suffix: smallest unsigned type of at least the requested
                 * rank that holds the value. */
                unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
                if (!min_long && v <= TARGET_UINT_MAX) {
                        lexer_token.datatype = type_unsigned_int;
                        return;
                }
                if (!min_longlong && v <= TARGET_ULONG_MAX) {
                        lexer_token.datatype = type_unsigned_long;
                        return;
                }
                lexer_token.datatype = type_unsigned_long_long;
        }
}
 601
 602 static void parse_floating_suffix(void)
 603 {
 604         switch(c) {
 605         /* TODO: do something useful with the suffixes... */
 606         case 'f':
 607         case 'F':
 608                 if (warning.traditional) {
 609                         warningf(&lexer_token.source_position,
 610                                 "traditional C rejects the 'F' suffix");
 611                 }
 612                 next_char();
 613                 lexer_token.datatype = type_float;
 614                 break;
 615         case 'l':
 616         case 'L':
 617                 if (warning.traditional) {
 618                         warningf(&lexer_token.source_position,
 619                                 "traditional C rejects the 'F' suffix");
 620                 }
 621                 next_char();
 622                 lexer_token.datatype = type_long_double;
 623                 break;
 624         default:
 625                 lexer_token.datatype = type_double;
 626                 break;
 627         }
 628 }
 629
 630 /**
 631  * A replacement for strtoull. Only those parts needed for
 632  * our parser are implemented.
 633  */
 634 static unsigned long long parse_int_string(const char *s, const char **endptr, int base) {
 635         unsigned long long v = 0;
 636
 637         switch (base) {
 638         case 16:
 639                 for (;; ++s) {
 640                         /* check for overrun */
 641                         if (v >= 0x1000000000000000ULL)
 642                                 break;
 643                         switch (tolower(*s)) {
 644                         case '0': v <<= 4; break;
 645                         case '1': v <<= 4; v |= 0x1; break;
 646                         case '2': v <<= 4; v |= 0x2; break;
 647                         case '3': v <<= 4; v |= 0x3; break;
 648                         case '4': v <<= 4; v |= 0x4; break;
 649                         case '5': v <<= 4; v |= 0x5; break;
 650                         case '6': v <<= 4; v |= 0x6; break;
 651                         case '7': v <<= 4; v |= 0x7; break;
 652                         case '8': v <<= 4; v |= 0x8; break;
 653                         case '9': v <<= 4; v |= 0x9; break;
 654                         case 'a': v <<= 4; v |= 0xa; break;
 655                         case 'b': v <<= 4; v |= 0xb; break;
 656                         case 'c': v <<= 4; v |= 0xc; break;
 657                         case 'd': v <<= 4; v |= 0xd; break;
 658                         case 'e': v <<= 4; v |= 0xe; break;
 659                         case 'f': v <<= 4; v |= 0xf; break;
 660                         default:
 661                                 goto end;
 662                         }
 663                 }
 664                 break;
 665         case 8:
 666                 for (;; ++s) {
 667                         /* check for overrun */
 668                         if (v >= 0x2000000000000000ULL)
 669                                 break;
 670                         switch (tolower(*s)) {
 671                         case '0': v <<= 3; break;
 672                         case '1': v <<= 3; v |= 1; break;
 673                         case '2': v <<= 3; v |= 2; break;
 674                         case '3': v <<= 3; v |= 3; break;
 675                         case '4': v <<= 3; v |= 4; break;
 676                         case '5': v <<= 3; v |= 5; break;
 677                         case '6': v <<= 3; v |= 6; break;
 678                         case '7': v <<= 3; v |= 7; break;
 679                         default:
 680                                 goto end;
 681                         }
 682                 }
 683                 break;
 684         case 10:
 685                 for (;; ++s) {
 686                         /* check for overrun */
 687                         if (v > 0x1999999999999999ULL)
 688                                 break;
 689                         switch (tolower(*s)) {
 690                         case '0': v *= 10; break;
 691                         case '1': v *= 10; v += 1; break;
 692                         case '2': v *= 10; v += 2; break;
 693                         case '3': v *= 10; v += 3; break;
 694                         case '4': v *= 10; v += 4; break;
 695                         case '5': v *= 10; v += 5; break;
 696                         case '6': v *= 10; v += 6; break;
 697                         case '7': v *= 10; v += 7; break;
 698                         case '8': v *= 10; v += 8; break;
 699                         case '9': v *= 10; v += 9; break;
 700                         default:
 701                                 goto end;
 702                         }
 703                 }
 704                 break;
 705         default:
 706                 assert(0);
 707                 break;
 708         }
 709 end:
 710         *endptr = s;
 711         return v;
 712 }
 713
 714 /**
 715  * Parses a hex number including hex floats and set the
 716  * lexer_token.
 717  */
 718 static void parse_number_hex(void)
 719 {
 720         bool is_float = false;
 721         assert(c == 'x' || c == 'X');
 722         next_char();
 723
 724         obstack_1grow(&symbol_obstack, '0');
 725         obstack_1grow(&symbol_obstack, 'x');
 726
 727         while(isxdigit(c)) {
 728                 obstack_1grow(&symbol_obstack, (char) c);
 729                 next_char();
 730         }
 731
 732         if (c == '.') {
 733                 obstack_1grow(&symbol_obstack, (char) c);
 734                 next_char();
 735
 736                 while (isxdigit(c)) {
 737                         obstack_1grow(&symbol_obstack, (char) c);
 738                         next_char();
 739                 }
 740                 is_float = true;
 741         }
 742         if (c == 'p' || c == 'P') {
 743                 obstack_1grow(&symbol_obstack, (char) c);
 744                 next_char();
 745
 746                 if (c == '-' || c == '+') {
 747                         obstack_1grow(&symbol_obstack, (char) c);
 748                         next_char();
 749                 }
 750
 751                 while (isxdigit(c)) {
 752                         obstack_1grow(&symbol_obstack, (char) c);
 753                         next_char();
 754                 }
 755                 is_float = true;
 756         }
 757
 758         obstack_1grow(&symbol_obstack, '\0');
 759         char *string = obstack_finish(&symbol_obstack);
 760         if(*string == '\0') {
 761                 parse_error("invalid hex number");
 762                 lexer_token.type = T_ERROR;
 763                 obstack_free(&symbol_obstack, string);
 764                 return;
 765         }
 766
 767         if (is_float) {
 768                 char *endptr;
 769                 lexer_token.type         = T_FLOATINGPOINT;
 770                 lexer_token.v.floatvalue = strtold(string, &endptr);
 771
 772                 if(*endptr != '\0') {
 773                         parse_error("invalid hex float literal");
 774                 }
 775
 776                 parse_floating_suffix();
 777         } else {
 778                 const char *endptr;
 779                 lexer_token.type       = T_INTEGER;
 780                 lexer_token.v.intvalue = parse_int_string(string + 2, &endptr, 16);
 781                 if(*endptr != '\0') {
 782                         parse_error("hex number literal too long");
 783                 }
 784                 parse_integer_suffix(true);
 785         }
 786
 787         obstack_free(&symbol_obstack, string);
 788 }
 789
 790 /**
 791  * Returns true if the given char is a octal digit.
 792  *
 793  * @param char  the character to check
 794  */
 795 static inline bool is_octal_digit(utf32 chr)
 796 {
 797         switch(chr) {
 798         case '0':
 799         case '1':
 800         case '2':
 801         case '3':
 802         case '4':
 803         case '5':
 804         case '6':
 805         case '7':
 806                 return true;
 807         default:
 808                 return false;
 809         }
 810 }
 811
 812 /**
 813  * Parses a octal number and set the lexer_token.
 814  */
 815 static void parse_number_oct(void)
 816 {
 817         while(is_octal_digit(c)) {
 818                 obstack_1grow(&symbol_obstack, (char) c);
 819                 next_char();
 820         }
 821         obstack_1grow(&symbol_obstack, '\0');
 822         char *string = obstack_finish(&symbol_obstack);
 823
 824         const char *endptr;
 825         lexer_token.type       = T_INTEGER;
 826         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 827         if(*endptr != '\0') {
 828                 parse_error("octal number literal too long");
 829         }
 830
 831         obstack_free(&symbol_obstack, string);
 832         parse_integer_suffix(true);
 833 }
 834
 835 /**
 836  * Parses a decimal including float number and set the
 837  * lexer_token.
 838  */
 839 static void parse_number_dec(void)
 840 {
 841         bool is_float = false;
 842         while (isdigit(c)) {
 843                 obstack_1grow(&symbol_obstack, (char) c);
 844                 next_char();
 845         }
 846
 847         if (c == '.') {
 848                 obstack_1grow(&symbol_obstack, '.');
 849                 next_char();
 850
 851                 while (isdigit(c)) {
 852                         obstack_1grow(&symbol_obstack, (char) c);
 853                         next_char();
 854                 }
 855                 is_float = true;
 856         }
 857         if(c == 'e' || c == 'E') {
 858                 obstack_1grow(&symbol_obstack, (char) c);
 859                 next_char();
 860
 861                 if(c == '-' || c == '+') {
 862                         obstack_1grow(&symbol_obstack, (char) c);
 863                         next_char();
 864                 }
 865
 866                 while(isdigit(c)) {
 867                         obstack_1grow(&symbol_obstack, (char) c);
 868                         next_char();
 869                 }
 870                 is_float = true;
 871         }
 872
 873         obstack_1grow(&symbol_obstack, '\0');
 874         char *string = obstack_finish(&symbol_obstack);
 875
 876         if(is_float) {
 877                 char *endptr;
 878                 lexer_token.type         = T_FLOATINGPOINT;
 879                 lexer_token.v.floatvalue = strtold(string, &endptr);
 880
 881                 if(*endptr != '\0') {
 882                         parse_error("invalid number literal");
 883                 }
 884
 885                 parse_floating_suffix();
 886         } else {
 887                 const char *endptr;
 888                 lexer_token.type       = T_INTEGER;
 889                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 890
 891                 if(*endptr != '\0') {
 892                         parse_error("invalid number literal");
 893                 }
 894
 895                 parse_integer_suffix(false);
 896         }
 897         obstack_free(&symbol_obstack, string);
 898 }
 899
 900 /**
 901  * Parses a number and sets the lexer_token.
 902  */
 903 static void parse_number(void)
 904 {
 905         if (c == '0') {
 906                 next_char();
 907                 switch (c) {
 908                         case 'X':
 909                         case 'x':
 910                                 parse_number_hex();
 911                                 break;
 912                         case '0':
 913                         case '1':
 914                         case '2':
 915                         case '3':
 916                         case '4':
 917                         case '5':
 918                         case '6':
 919                         case '7':
 920                                 parse_number_oct();
 921                                 break;
 922                         case '8':
 923                         case '9':
 924                                 next_char();
 925                                 parse_error("invalid octal number");
 926                                 lexer_token.type = T_ERROR;
 927                                 return;
 928                         case '.':
 929                         case 'e':
 930                         case 'E':
 931                         default:
 932                                 obstack_1grow(&symbol_obstack, '0');
 933                                 parse_number_dec();
 934                                 return;
 935                 }
 936         } else {
 937                 parse_number_dec();
 938         }
 939 }
 940
 941 /**
 942  * Returns the value of a digit.
 943  * The only portable way to do it ...
 944  */
 945 static int digit_value(utf32 const digit)
 946 {
 947         switch (digit) {
 948         case '0': return 0;
 949         case '1': return 1;
 950         case '2': return 2;
 951         case '3': return 3;
 952         case '4': return 4;
 953         case '5': return 5;
 954         case '6': return 6;
 955         case '7': return 7;
 956         case '8': return 8;
 957         case '9': return 9;
 958         case 'a':
 959         case 'A': return 10;
 960         case 'b':
 961         case 'B': return 11;
 962         case 'c':
 963         case 'C': return 12;
 964         case 'd':
 965         case 'D': return 13;
 966         case 'e':
 967         case 'E': return 14;
 968         case 'f':
 969         case 'F': return 15;
 970         default:
 971                 internal_error("wrong character given");
 972         }
 973 }
 974
 975 /**
 976  * Parses an octal character sequence.
 977  *
 978  * @param first_digit  the already read first digit
 979  */
 980 static utf32 parse_octal_sequence(utf32 const first_digit)
 981 {
 982         assert(is_octal_digit(first_digit));
 983         utf32 value = digit_value(first_digit);
 984         if (!is_octal_digit(c)) return value;
 985         value = 8 * value + digit_value(c);
 986         next_char();
 987         if (!is_octal_digit(c)) return value;
 988         value = 8 * value + digit_value(c);
 989         next_char();
 990         return value;
 991 }
 992
 993 /**
 994  * Parses a hex character sequence.
 995  */
 996 static utf32 parse_hex_sequence(void)
 997 {
 998         utf32 value = 0;
 999         while(isxdigit(c)) {
1000                 value = 16 * value + digit_value(c);
1001                 next_char();
1002         }
1003         return value;
1004 }
1005
1006 /**
1007  * Parse an escape sequence.
1008  */
1009 static utf32 parse_escape_sequence(void)
1010 {
1011         eat('\\');
1012
1013         utf32 const ec = c;
1014         next_char();
1015
1016         switch (ec) {
1017         case '"':  return '"';
1018         case '\'': return '\'';
1019         case '\\': return '\\';
1020         case '?': return '\?';
1021         case 'a': return '\a';
1022         case 'b': return '\b';
1023         case 'f': return '\f';
1024         case 'n': return '\n';
1025         case 'r': return '\r';
1026         case 't': return '\t';
1027         case 'v': return '\v';
1028         case 'x':
1029                 return parse_hex_sequence();
1030         case '0':
1031         case '1':
1032         case '2':
1033         case '3':
1034         case '4':
1035         case '5':
1036         case '6':
1037         case '7':
1038                 return parse_octal_sequence(ec);
1039         case EOF:
1040                 parse_error("reached end of file while parsing escape sequence");
1041                 return EOF;
1042         /* \E is not documented, but handled, by GCC.  It is acceptable according
1043          * to §6.11.4, whereas \e is not. */
1044         case 'E':
1045         case 'e':
1046                 if (c_mode & _GNUC)
1047                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
1048                 /* FALLTHROUGH */
1049         default:
1050                 /* §6.4.4.4:8 footnote 64 */
1051                 parse_error("unknown escape sequence");
1052                 return EOF;
1053         }
1054 }
1055
1056 /**
1057  * Concatenate two strings.
1058  */
1059 string_t concat_strings(const string_t *const s1, const string_t *const s2)
1060 {
1061         const size_t len1 = s1->size - 1;
1062         const size_t len2 = s2->size - 1;
1063
1064         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
1065         memcpy(concat, s1->begin, len1);
1066         memcpy(concat + len1, s2->begin, len2 + 1);
1067
1068         if (warning.traditional) {
1069                 warningf(&lexer_token.source_position,
1070                         "traditional C rejects string constant concatenation");
1071         }
1072 #if 0 /* TODO hash */
1073         const char *result = strset_insert(&stringset, concat);
1074         if(result != concat) {
1075                 obstack_free(&symbol_obstack, concat);
1076         }
1077
1078         return result;
1079 #else
1080         return (string_t){ concat, len1 + len2 + 1 };
1081 #endif
1082 }
1083
1084 /**
1085  * Concatenate a string and a wide string.
1086  */
1087 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
1088 {
1089         const size_t len1 = s1->size - 1;
1090         const size_t len2 = s2->size - 1;
1091
1092         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1093         const char *const src = s1->begin;
1094         for (size_t i = 0; i != len1; ++i) {
1095                 concat[i] = src[i];
1096         }
1097         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1098         if (warning.traditional) {
1099                 warningf(&lexer_token.source_position,
1100                         "traditional C rejects string constant concatenation");
1101         }
1102
1103         return (wide_string_t){ concat, len1 + len2 + 1 };
1104 }
1105
1106 /**
1107  * Concatenate two wide strings.
1108  */
1109 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
1110 {
1111         const size_t len1 = s1->size - 1;
1112         const size_t len2 = s2->size - 1;
1113
1114         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1115         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
1116         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1117         if (warning.traditional) {
1118                 warningf(&lexer_token.source_position,
1119                         "traditional C rejects string constant concatenation");
1120         }
1121
1122         return (wide_string_t){ concat, len1 + len2 + 1 };
1123 }
1124
1125 /**
1126  * Concatenate a wide string and a string.
1127  */
1128 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
1129 {
1130         const size_t len1 = s1->size - 1;
1131         const size_t len2 = s2->size - 1;
1132
1133         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1134         memcpy(concat, s1->begin, len1 * sizeof(*concat));
1135         const char  *const src = s2->begin;
1136         wchar_rep_t *const dst = concat + len1;
1137         for (size_t i = 0; i != len2 + 1; ++i) {
1138                 dst[i] = src[i];
1139         }
1140         if (warning.traditional) {
1141                 warningf(&lexer_token.source_position,
1142                         "traditional C rejects string constant concatenation");
1143         }
1144
1145         return (wide_string_t){ concat, len1 + len2 + 1 };
1146 }
1147
1148 static void grow_symbol(utf32 const tc)
1149 {
1150         struct obstack *const o  = &symbol_obstack;
1151         if (tc < 0x80U) {
1152                 obstack_1grow(o, tc);
1153         } else if (tc < 0x800) {
1154                 obstack_1grow(o, 0xC0 | (tc >> 6));
1155                 obstack_1grow(o, 0x80 | (tc & 0x3F));
1156         } else if (tc < 0x10000) {
1157                 obstack_1grow(o, 0xE0 | ( tc >> 12));
1158                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1159                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1160         } else {
1161                 obstack_1grow(o, 0xF0 | ( tc >> 18));
1162                 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
1163                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1164                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1165         }
1166 }
1167
1168 /**
1169  * Parse a string literal and set lexer_token.
1170  */
1171 static void parse_string_literal(void)
1172 {
1173         const unsigned start_linenr = lexer_token.source_position.linenr;
1174
1175         eat('"');
1176
1177         while(1) {
1178                 switch(c) {
1179                 case '\\': {
1180                         utf32 const tc = parse_escape_sequence();
1181                         if (tc >= 0x100) {
1182                                 warningf(&lexer_token.source_position,
1183                                                 "escape sequence out of range");
1184                         }
1185                         obstack_1grow(&symbol_obstack, tc);
1186                         break;
1187                 }
1188
1189                 case EOF: {
1190                         source_position_t source_position;
1191                         source_position.input_name = lexer_token.source_position.input_name;
1192                         source_position.linenr     = start_linenr;
1193                         errorf(&source_position, "string has no end");
1194                         lexer_token.type = T_ERROR;
1195                         return;
1196                 }
1197
1198                 case '"':
1199                         next_char();
1200                         goto end_of_string;
1201
1202                 default:
1203                         grow_symbol(c);
1204                         next_char();
1205                         break;
1206                 }
1207         }
1208
1209 end_of_string:
1210
1211         /* TODO: concatenate multiple strings separated by whitespace... */
1212
1213         /* add finishing 0 to the string */
1214         obstack_1grow(&symbol_obstack, '\0');
1215         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1216         const char *const string = obstack_finish(&symbol_obstack);
1217
1218 #if 0 /* TODO hash */
1219         /* check if there is already a copy of the string */
1220         result = strset_insert(&stringset, string);
1221         if(result != string) {
1222                 obstack_free(&symbol_obstack, string);
1223         }
1224 #else
1225         const char *const result = string;
1226 #endif
1227
1228         lexer_token.type           = T_STRING_LITERAL;
1229         lexer_token.v.string.begin = result;
1230         lexer_token.v.string.size  = size;
1231 }
1232
1233 /**
1234  * Parse a wide character constant and set lexer_token.
1235  */
1236 static void parse_wide_character_constant(void)
1237 {
1238         const unsigned start_linenr = lexer_token.source_position.linenr;
1239
1240         eat('\'');
1241
1242         while(1) {
1243                 switch(c) {
1244                 case '\\': {
1245                         wchar_rep_t tc = parse_escape_sequence();
1246                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1247                         break;
1248                 }
1249
1250                 MATCH_NEWLINE(
1251                         parse_error("newline while parsing character constant");
1252                         break;
1253                 )
1254
1255                 case '\'':
1256                         next_char();
1257                         goto end_of_wide_char_constant;
1258
1259                 case EOF: {
1260                         source_position_t source_position = lexer_token.source_position;
1261                         source_position.linenr = start_linenr;
1262                         errorf(&source_position, "EOF while parsing character constant");
1263                         lexer_token.type = T_ERROR;
1264                         return;
1265                 }
1266
1267                 default: {
1268                         wchar_rep_t tc = (wchar_rep_t) c;
1269                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1270                         next_char();
1271                         break;
1272                 }
1273                 }
1274         }
1275
1276 end_of_wide_char_constant:;
1277         size_t             size   = (size_t) obstack_object_size(&symbol_obstack);
1278         assert(size % sizeof(wchar_rep_t) == 0);
1279         size /= sizeof(wchar_rep_t);
1280
1281         const wchar_rep_t *string = obstack_finish(&symbol_obstack);
1282
1283         lexer_token.type                = T_WIDE_CHARACTER_CONSTANT;
1284         lexer_token.v.wide_string.begin = string;
1285         lexer_token.v.wide_string.size  = size;
1286         lexer_token.datatype            = type_wchar_t;
1287 }
1288
1289 /**
1290  * Parse a wide string literal and set lexer_token.
1291  */
1292 static void parse_wide_string_literal(void)
1293 {
1294         const unsigned start_linenr = lexer_token.source_position.linenr;
1295
1296         assert(c == '"');
1297         next_char();
1298
1299         while(1) {
1300                 switch(c) {
1301                 case '\\': {
1302                         wchar_rep_t tc = parse_escape_sequence();
1303                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1304                         break;
1305                 }
1306
1307                 case EOF: {
1308                         source_position_t source_position;
1309                         source_position.input_name = lexer_token.source_position.input_name;
1310                         source_position.linenr     = start_linenr;
1311                         errorf(&source_position, "string has no end");
1312                         lexer_token.type = T_ERROR;
1313                         return;
1314                 }
1315
1316                 case '"':
1317                         next_char();
1318                         goto end_of_string;
1319
1320                 default: {
1321                         wchar_rep_t tc = c;
1322                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1323                         next_char();
1324                         break;
1325                 }
1326                 }
1327         }
1328
1329 end_of_string:;
1330
1331         /* TODO: concatenate multiple strings separated by whitespace... */
1332
1333         /* add finishing 0 to the string */
1334         wchar_rep_t nul = L'\0';
1335         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
1336         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
1337         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1338
1339 #if 0 /* TODO hash */
1340         /* check if there is already a copy of the string */
1341         const wchar_rep_t *const result = strset_insert(&stringset, string);
1342         if(result != string) {
1343                 obstack_free(&symbol_obstack, string);
1344         }
1345 #else
1346         const wchar_rep_t *const result = string;
1347 #endif
1348
1349         lexer_token.type                = T_WIDE_STRING_LITERAL;
1350         lexer_token.v.wide_string.begin = result;
1351         lexer_token.v.wide_string.size  = size;
1352 }
1353
1354 /**
1355  * Parse a character constant and set lexer_token.
1356  */
1357 static void parse_character_constant(void)
1358 {
1359         const unsigned start_linenr = lexer_token.source_position.linenr;
1360
1361         eat('\'');
1362
1363         while(1) {
1364                 switch(c) {
1365                 case '\\': {
1366                         utf32 const tc = parse_escape_sequence();
1367                         if (tc >= 0x100) {
1368                                 warningf(&lexer_token.source_position,
1369                                                 "escape sequence out of range");
1370                         }
1371                         obstack_1grow(&symbol_obstack, tc);
1372                         break;
1373                 }
1374
1375                 MATCH_NEWLINE(
1376                         parse_error("newline while parsing character constant");
1377                         break;
1378                 )
1379
1380                 case '\'':
1381                         next_char();
1382                         goto end_of_char_constant;
1383
1384                 case EOF: {
1385                         source_position_t source_position;
1386                         source_position.input_name = lexer_token.source_position.input_name;
1387                         source_position.linenr     = start_linenr;
1388                         errorf(&source_position, "EOF while parsing character constant");
1389                         lexer_token.type = T_ERROR;
1390                         return;
1391                 }
1392
1393                 default:
1394                         grow_symbol(c);
1395                         next_char();
1396                         break;
1397
1398                 }
1399         }
1400
1401 end_of_char_constant:;
1402         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1403         const char *const string = obstack_finish(&symbol_obstack);
1404
1405         lexer_token.type           = T_CHARACTER_CONSTANT;
1406         lexer_token.v.string.begin = string;
1407         lexer_token.v.string.size  = size;
1408         lexer_token.datatype       = c_mode & _CXX && size == 1 ? type_char : type_int;
1409 }
1410
1411 /**
1412  * Skip a multiline comment.
1413  */
1414 static void skip_multiline_comment(void)
1415 {
1416         unsigned start_linenr = lexer_token.source_position.linenr;
1417
1418         while(1) {
1419                 switch(c) {
1420                 case '/':
1421                         next_char();
1422                         if (c == '*') {
1423                                 /* nested comment, warn here */
1424                                 if (warning.comment) {
1425                                         warningf(&lexer_token.source_position, "'/*' within comment");
1426                                 }
1427                         }
1428                         break;
1429                 case '*':
1430                         next_char();
1431                         if(c == '/') {
1432                                 next_char();
1433                                 return;
1434                         }
1435                         break;
1436
1437                 MATCH_NEWLINE(break;)
1438
1439                 case EOF: {
1440                         source_position_t source_position;
1441                         source_position.input_name = lexer_token.source_position.input_name;
1442                         source_position.linenr     = start_linenr;
1443                         errorf(&source_position, "at end of file while looking for comment end");
1444                         return;
1445                 }
1446
1447                 default:
1448                         next_char();
1449                         break;
1450                 }
1451         }
1452 }
1453
1454 /**
1455  * Skip a single line comment.
1456  */
1457 static void skip_line_comment(void)
1458 {
1459         while(1) {
1460                 switch(c) {
1461                 case EOF:
1462                         return;
1463
1464                 case '\n':
1465                 case '\r':
1466                         return;
1467
1468                 case '\\':
1469                         next_char();
1470                         if (c == '\n' || c == '\r') {
1471                                 if (warning.comment)
1472                                         warningf(&lexer_token.source_position, "multi-line comment");
1473                                 return;
1474                         }
1475                         break;
1476
1477                 default:
1478                         next_char();
1479                         break;
1480                 }
1481         }
1482 }
1483
1484 /** The current preprocessor token. */
1485 static token_t pp_token;
1486
1487 /**
1488  * Read the next preprocessor token.
1489  */
1490 static inline void next_pp_token(void)
1491 {
1492         lexer_next_preprocessing_token();
1493         pp_token = lexer_token;
1494 }
1495
1496 /**
1497  * Eat all preprocessor tokens until newline.
1498  */
1499 static void eat_until_newline(void)
1500 {
1501         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1502                 next_pp_token();
1503         }
1504 }
1505
1506 /**
1507  * Handle the define directive.
1508  */
1509 static void define_directive(void)
1510 {
1511         lexer_next_preprocessing_token();
1512         if(lexer_token.type != T_IDENTIFIER) {
1513                 parse_error("expected identifier after #define\n");
1514                 eat_until_newline();
1515         }
1516 }
1517
/**
 * Handles the ifdef and ifndef directives.
 *
 * Currently a stub: it reads one preprocessing token and does not evaluate
 * anything.
 *
 * @param is_ifndef  non-zero for ifndef, zero for ifdef (unused so far)
 */
static void ifdef_directive(int is_ifndef)
{
	(void) is_ifndef;

	lexer_next_preprocessing_token();
	/* TODO: expect an identifier followed by a newline */
}
1528
/**
 * Handles the endif directive.
 *
 * Currently a stub that does nothing.
 */
static void endif_directive(void)
{
	/* TODO: expect a newline */
}
1536
1537 /**
1538  * Parse the line directive.
1539  */
1540 static void parse_line_directive(void)
1541 {
1542         if(pp_token.type != T_INTEGER) {
1543                 parse_error("expected integer");
1544         } else {
1545                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1546                 next_pp_token();
1547         }
1548         if(pp_token.type == T_STRING_LITERAL) {
1549                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1550                 next_pp_token();
1551         }
1552
1553         eat_until_newline();
1554 }
1555
/**
 * The kinds of STDC pragmas recognized by parse_pragma().
 */
typedef enum stdc_pragma_kind_t {
	STDC_UNKNOWN          = 0,  /**< not (yet) recognized */
	STDC_FP_CONTRACT      = 1,  /**< FP_CONTRACT */
	STDC_FENV_ACCESS      = 2,  /**< FENV_ACCESS */
	STDC_CX_LIMITED_RANGE = 3   /**< CX_LIMITED_RANGE */
} stdc_pragma_kind_t;
1565
/**
 * The argument values of STDC pragmas recognized by parse_pragma().
 */
typedef enum stdc_pragma_value_kind_t {
	STDC_VALUE_UNKNOWN = 0,  /**< not (yet) recognized */
	STDC_VALUE_ON      = 1,  /**< ON */
	STDC_VALUE_OFF     = 2,  /**< OFF */
	STDC_VALUE_DEFAULT = 3   /**< DEFAULT */
} stdc_pragma_value_kind_t;
1575
1576 /**
1577  * Parse a pragma directive.
1578  */
1579 static void parse_pragma(void) {
1580         bool unknown_pragma = true;
1581
1582         next_pp_token();
1583         if (pp_token.v.symbol->pp_ID == TP_STDC) {
1584                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1585                 /* a STDC pragma */
1586                 if (c_mode & _C99) {
1587                         next_pp_token();
1588
1589                         switch (pp_token.v.symbol->pp_ID) {
1590                         case TP_FP_CONTRACT:
1591                                 kind = STDC_FP_CONTRACT;
1592                                 break;
1593                         case TP_FENV_ACCESS:
1594                                 kind = STDC_FENV_ACCESS;
1595                                 break;
1596                         case TP_CX_LIMITED_RANGE:
1597                                 kind = STDC_CX_LIMITED_RANGE;
1598                                 break;
1599                         default:
1600                                 break;
1601                         }
1602                         if (kind != STDC_UNKNOWN) {
1603                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1604                                 next_pp_token();
1605                                 switch (pp_token.v.symbol->pp_ID) {
1606                                 case TP_ON:
1607                                         value = STDC_VALUE_ON;
1608                                         break;
1609                                 case TP_OFF:
1610                                         value = STDC_VALUE_OFF;
1611                                         break;
1612                                 case TP_DEFAULT:
1613                                         value = STDC_VALUE_DEFAULT;
1614                                         break;
1615                                 default:
1616                                         break;
1617                                 }
1618                                 if (value != STDC_VALUE_UNKNOWN) {
1619                                         unknown_pragma = false;
1620                                 } else {
1621                                         errorf(&pp_token.source_position, "bad STDC pragma argument");
1622                                 }
1623                         }
1624                 }
1625         } else {
1626                 unknown_pragma = true;
1627         }
1628         eat_until_newline();
1629         if (unknown_pragma && warning.unknown_pragmas) {
1630                 warningf(&pp_token.source_position, "encountered unknown #pragma");
1631         }
1632 }
1633
1634 /**
1635  * Parse a preprocessor non-null directive.
1636  */
1637 static void parse_preprocessor_identifier(void)
1638 {
1639         assert(pp_token.type == T_IDENTIFIER);
1640         symbol_t *symbol = pp_token.v.symbol;
1641
1642         switch(symbol->pp_ID) {
1643         case TP_include:
1644                 printf("include - enable header name parsing!\n");
1645                 break;
1646         case TP_define:
1647                 define_directive();
1648                 break;
1649         case TP_ifdef:
1650                 ifdef_directive(0);
1651                 break;
1652         case TP_ifndef:
1653                 ifdef_directive(1);
1654                 break;
1655         case TP_endif:
1656                 endif_directive();
1657                 break;
1658         case TP_line:
1659                 next_pp_token();
1660                 parse_line_directive();
1661                 break;
1662         case TP_if:
1663         case TP_else:
1664         case TP_elif:
1665         case TP_undef:
1666         case TP_error:
1667                 /* TODO; output the rest of the line */
1668                 parse_error("#error directive: ");
1669                 break;
1670         case TP_pragma:
1671                 parse_pragma();
1672                 break;
1673         }
1674 }
1675
1676 /**
1677  * Parse a preprocessor directive.
1678  */
1679 static void parse_preprocessor_directive(void)
1680 {
1681         next_pp_token();
1682
1683         switch(pp_token.type) {
1684         case T_IDENTIFIER:
1685                 parse_preprocessor_identifier();
1686                 break;
1687         case T_INTEGER:
1688                 parse_line_directive();
1689                 break;
1690         case '\n':
1691                 /* NULL directive, see § 6.10.7 */
1692                 break;
1693         default:
1694                 parse_error("invalid preprocessor directive");
1695                 eat_until_newline();
1696                 break;
1697         }
1698 }
1699
/*
 * Building blocks for a "look at the next character" decision inside a
 * switch case of a void function. They only work in combination:
 *
 *   MAYBE_PROLOG         consumes the current character and opens
 *                        "while(1) { switch(c) {"
 *   MAYBE(ch, type)      if the next character is ch: consumes it, sets
 *                        lexer_token.type to type and returns
 *   ELSE_CODE(code)      default branch running code; closes the switch and
 *                        the loop opened by MAYBE_PROLOG and ends with a
 *                        break for the enclosing case
 *   ELSE(type)           ELSE_CODE that sets lexer_token.type to type and
 *                        returns
 */
#define MAYBE_PROLOG \
	next_char();     \
	while(1) {       \
		switch(c) {

#define MAYBE(ch, set_type)              \
		case ch:                         \
			next_char();                 \
			lexer_token.type = set_type; \
			return;

#define ELSE_CODE(code)        \
		default:               \
			code               \
		}                      \
	} /* end of while(1) */    \
	break;

#define ELSE(set_type)               \
	ELSE_CODE(                       \
		lexer_token.type = set_type; \
		return;                      \
	)
1723
/**
 * Scan the next preprocessing token from the current character `c` and store
 * its type in lexer_token.type.
 *
 * Spaces and tabs are skipped.  The MATCH_NEWLINE branch returns a '\n'
 * token.  Identifiers, numbers, string literals and character constants are
 * delegated to the corresponding parse_*() helpers.  Punctuators are
 * recognised with the MAYBE_PROLOG / MAYBE / ELSE macros using one character
 * of lookahead at a time; the digraphs <: :> <% %> %: and %:%: produce the
 * same token types as [ ] { } # and ##.  Comments are skipped, after which
 * the function calls itself to deliver the token that follows.  At end of
 * input T_EOF is returned; a character that starts no token is reported via
 * errorf(), skipped, and yields T_ERROR.
 *
 * MATCH_NEWLINE, SYMBOL_CHARS and DIGITS are macros defined outside this
 * function; the latter two are used here in the position of case labels.
 */
void lexer_next_preprocessing_token(void)
{
        while(1) {
                switch(c) {
                /* horizontal whitespace: skip it and look at the next character */
                case ' ':
                case '\t':
                        next_char();
                        break;

                MATCH_NEWLINE(
                        lexer_token.type = '\n';
                        return;
                )

                SYMBOL_CHARS
                        parse_symbol();
                        /* might be a wide string ( L"string" ) */
                        if(lexer_token.type == T_IDENTIFIER &&
                            lexer_token.v.symbol == symbol_L) {
                            if(c == '"') {
                                        parse_wide_string_literal();
                                } else if(c == '\'') {
                                        parse_wide_character_constant();
                                }
                        }
                        return;

                DIGITS
                        parse_number();
                        return;

                case '"':
                        parse_string_literal();
                        return;

                case '\'':
                        parse_character_constant();
                        return;

                /* '.' starts a number such as ".5", the "..." token, or is a
                 * lone '.' */
                case '.':
                        MAYBE_PROLOG
                                DIGITS
                                        /* hand the digit back and make '.' the
                                         * current character again before
                                         * calling parse_number_dec() */
                                        put_back(c);
                                        c = '.';
                                        parse_number_dec();
                                        return;

                                /* "..": T_DOTDOTDOT if a third '.' follows;
                                 * otherwise a single '.' token is returned
                                 * and the second '.' becomes the current
                                 * character again */
                                case '.':
                                        MAYBE_PROLOG
                                        MAYBE('.', T_DOTDOTDOT)
                                        ELSE_CODE(
                                                put_back(c);
                                                c = '.';
                                                lexer_token.type = '.';
                                                return;
                                        )
                        ELSE('.')
                case '&':
                        MAYBE_PROLOG
                        MAYBE('&', T_ANDAND)
                        MAYBE('=', T_ANDEQUAL)
                        ELSE('&')
                case '*':
                        MAYBE_PROLOG
                        MAYBE('=', T_ASTERISKEQUAL)
                        ELSE('*')
                case '+':
                        MAYBE_PROLOG
                        MAYBE('+', T_PLUSPLUS)
                        MAYBE('=', T_PLUSEQUAL)
                        ELSE('+')
                case '-':
                        MAYBE_PROLOG
                        MAYBE('>', T_MINUSGREATER)
                        MAYBE('-', T_MINUSMINUS)
                        MAYBE('=', T_MINUSEQUAL)
                        ELSE('-')
                case '!':
                        MAYBE_PROLOG
                        MAYBE('=', T_EXCLAMATIONMARKEQUAL)
                        ELSE('!')
                /* '/' is a division operator, "/=", or the start of a
                 * comment; after a comment has been skipped the function
                 * calls itself to produce the token that follows it */
                case '/':
                        MAYBE_PROLOG
                        MAYBE('=', T_SLASHEQUAL)
                                case '*':
                                        next_char();
                                        skip_multiline_comment();
                                        lexer_next_preprocessing_token();
                                        return;
                                case '/':
                                        next_char();
                                        skip_line_comment();
                                        lexer_next_preprocessing_token();
                                        return;
                        ELSE('/')
                /* digraphs: "%>" yields '}', "%:" yields '#' and "%:%:" yields
                 * T_HASHHASH.  For "%:%" followed by anything else, '#' is
                 * returned and the second '%' is restored as the current
                 * character. */
                case '%':
                        MAYBE_PROLOG
                        MAYBE('>', '}')
                        MAYBE('=', T_PERCENTEQUAL)
                                case ':':
                                        MAYBE_PROLOG
                                                case '%':
                                                        MAYBE_PROLOG
                                                        MAYBE(':', T_HASHHASH)
                                                        ELSE_CODE(
                                                                put_back(c);
                                                                c = '%';
                                                                lexer_token.type = '#';
                                                                return;
                                                        )
                                        ELSE('#')
                        ELSE('%')
                /* digraphs: "<:" yields '[' and "<%" yields '{' */
                case '<':
                        MAYBE_PROLOG
                        MAYBE(':', '[')
                        MAYBE('%', '{')
                        MAYBE('=', T_LESSEQUAL)
                                case '<':
                                        MAYBE_PROLOG
                                        MAYBE('=', T_LESSLESSEQUAL)
                                        ELSE(T_LESSLESS)
                        ELSE('<')
                case '>':
                        MAYBE_PROLOG
                        MAYBE('=', T_GREATEREQUAL)
                                case '>':
                                        MAYBE_PROLOG
                                        MAYBE('=', T_GREATERGREATEREQUAL)
                                        ELSE(T_GREATERGREATER)
                        ELSE('>')
                case '^':
                        MAYBE_PROLOG
                        MAYBE('=', T_CARETEQUAL)
                        ELSE('^')
                case '|':
                        MAYBE_PROLOG
                        MAYBE('=', T_PIPEEQUAL)
                        MAYBE('|', T_PIPEPIPE)
                        ELSE('|')
                /* digraph: ":>" yields ']' */
                case ':':
                        MAYBE_PROLOG
                        MAYBE('>', ']')
                        ELSE(':')
                case '=':
                        MAYBE_PROLOG
                        MAYBE('=', T_EQUALEQUAL)
                        ELSE('=')
                case '#':
                        MAYBE_PROLOG
                        MAYBE('#', T_HASHHASH)
                        ELSE('#')

                /* single-character tokens: the character value itself is
                 * used as the token type */
                case '?':
                case '[':
                case ']':
                case '(':
                case ')':
                case '{':
                case '}':
                case '~':
                case ';':
                case ',':
                case '\\':
                        lexer_token.type = c;
                        next_char();
                        return;

                /* c is not advanced here, so later calls see EOF again and
                 * keep returning T_EOF */
                case EOF:
                        lexer_token.type = T_EOF;
                        return;

                /* NOTE(review): the dollar_sign label is not referenced in
                 * the code visible in this function; presumably it is a goto
                 * target used by the SYMBOL_CHARS expansion -- confirm before
                 * removing it. */
                default:
dollar_sign:
                        errorf(&lexer_token.source_position, "unknown character '%c' found", c);
                        next_char();
                        lexer_token.type = T_ERROR;
                        return;
                }
        }
}
1904
1905 void lexer_next_token(void)
1906 {
1907         lexer_next_preprocessing_token();
1908
1909         while (lexer_token.type == '\n') {
1910 newline_found:
1911                 lexer_next_preprocessing_token();
1912         }
1913
1914         if (lexer_token.type == '#') {
1915                 parse_preprocessor_directive();
1916                 goto newline_found;
1917         }
1918 }
1919
1920 void init_lexer(void)
1921 {
1922         strset_init(&stringset);
1923         symbol_L = symbol_table_insert("L");
1924 }
1925
1926 void lexer_open_stream(FILE *stream, const char *input_name)
1927 {
1928         input                                  = stream;
1929         lexer_token.source_position.linenr     = 0;
1930         lexer_token.source_position.input_name = input_name;
1931
1932         bufpos = NULL;
1933         bufend = NULL;
1934
1935         /* place a virtual \n at the beginning so the lexer knows that we're
1936          * at the beginning of a line */
1937         c = '\n';
1938 }
1939
1940 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
1941 {
1942         input                                  = NULL;
1943         lexer_token.source_position.linenr     = 0;
1944         lexer_token.source_position.input_name = input_name;
1945
1946 #if 0 // TODO
1947         bufpos = buffer;
1948         bufend = buffer + len;
1949 #else
1950         (void)buffer;
1951         (void)len;
1952         panic("builtin lexing not done yet");
1953 #endif
1954
1955         /* place a virtual \n at the beginning so the lexer knows that we're
1956          * at the beginning of a line */
1957         c = '\n';
1958 }
1959
1960 void exit_lexer(void)
1961 {
1962         strset_destroy(&stringset);
1963 }
1964
1965 static __attribute__((unused))
1966 void dbg_pos(const source_position_t source_position)
1967 {
1968         fprintf(stdout, "%s:%u\n", source_position.input_name,
1969                 source_position.linenr);
1970         fflush(stdout);
1971 }