nsz Git - cparser/blob - lexer.c

   1 /*
   2  * This file is part of cparser.
   3  * Copyright (C) 2007-2009 Matthias Braun <matze@braunis.de>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18  * 02111-1307, USA.
  19  */
  20 #include <config.h>
  21
  22 #include "diagnostic.h"
  23 #include "lexer.h"
  24 #include "symbol_t.h"
  25 #include "token_t.h"
  26 #include "symbol_table_t.h"
  27 #include "adt/error.h"
  28 #include "adt/strset.h"
  29 #include "adt/util.h"
  30 #include "types.h"
  31 #include "type_t.h"
  32 #include "target_architecture.h"
  33 #include "parser.h"
  34 #include "warning.h"
  35 #include "lang_features.h"
  36
  37 #include <assert.h>
  38 #include <errno.h>
  39 #include <string.h>
  40 #include <stdbool.h>
  41 #include <ctype.h>
  42
  43 #ifndef _WIN32
  44 #include <strings.h>
  45 #endif
  46
/* Uncomment to trace every character read and every put back on stdout. */
//#define DEBUG_CHARS
/* Number of slots reserved in front of the decoded data in buf for put_back(). */
#define MAX_PUTBACK 3
/* Number of bytes requested from the input stream per read_block() call. */
#define BUF_SIZE    1024

#if defined(_WIN32) || defined(__CYGWIN__)
/* No strtold on windows and no replacement yet: fall back to double precision */
#define strtold(s, e) strtod(s, e)
#endif
  55
/** One decoded input character; the decoders below store Unicode code points. */
typedef unsigned int utf32;

/** The current input character, as set by next_real_char()/next_char(). */
static utf32        c;
/** The token that the parse_* functions fill in. */
token_t             lexer_token;
/* NOTE(review): not referenced in this part of the file; presumably the
 * symbol for the identifier "L" (wide literal prefix) - confirm. */
symbol_t           *symbol_L;
/** Stream the decoders read from; next_real_char() yields EOF while it is NULL. */
static FILE        *input;
/** Decoded characters; the decoders fill it starting at buf + MAX_PUTBACK,
 *  which leaves MAX_PUTBACK slots in front for put_back(). */
static utf32        buf[BUF_SIZE + MAX_PUTBACK];
/** One past the last valid character in buf. */
static const utf32 *bufend;
/** The next character in buf that next_real_char() will deliver. */
static const utf32 *bufpos;
/* NOTE(review): not referenced in this part of the file; usage to be
 * confirmed against the rest of the lexer. */
static strset_t     stringset;
/** Whether '$' is accepted as part of a symbol (consulted by SYMBOL_CHARS). */
bool                allow_dollar_in_symbol = true;
  67
/**
 * Reports an error at the source position of the current token.
 *
 * The message is handed to errorf() as the argument of a "%s" format, so
 * it is never interpreted as a format string itself.
 *
 * @param msg   the error message
 */
static void parse_error(const char *msg)
{
        errorf(&lexer_token.source_position, "%s", msg);
}
  77
/**
 * Reports an internal error at the source position of the current token.
 *
 * The message is handed to internal_errorf() as the argument of a "%s"
 * format, so it is never interpreted as a format string itself.
 *
 * NOTE(review): no explicit return type is spelled out here; presumably the
 * NORETURN macro expands to one - confirm against its definition.
 *
 * @param msg   the error message
 */
static NORETURN internal_error(const char *msg)
{
        internal_errorf(&lexer_token.source_position, "%s", msg);
}
  87
  88 static size_t read_block(unsigned char *const read_buf, size_t const n)
  89 {
  90         size_t const s = fread(read_buf, 1, n, input);
  91         if (s == 0) {
  92                 if (ferror(input))
  93                         parse_error("read from input failed");
  94                 buf[MAX_PUTBACK] = EOF;
  95                 bufpos           = buf + MAX_PUTBACK;
  96                 bufend           = buf + MAX_PUTBACK + 1;
  97         }
  98         return s;
  99 }
 100
 101 static void decode_iso_8859_1(void)
 102 {
 103         unsigned char read_buf[BUF_SIZE];
 104         size_t const s = read_block(read_buf, sizeof(read_buf));
 105         if (s == 0)
 106                 return;
 107
 108         unsigned char const *src = read_buf;
 109         unsigned char const *end = read_buf + s;
 110         utf32               *dst = buf + MAX_PUTBACK;
 111         while (src != end)
 112                 *dst++ = *src++;
 113
 114         bufpos = buf + MAX_PUTBACK;
 115         bufend = dst;
 116 }
 117
 118 static void decode_iso_8859_15(void)
 119 {
 120         unsigned char read_buf[BUF_SIZE];
 121         size_t const s = read_block(read_buf, sizeof(read_buf));
 122         if (s == 0)
 123                 return;
 124
 125         unsigned char const *src = read_buf;
 126         unsigned char const *end = read_buf + s;
 127         utf32               *dst = buf + MAX_PUTBACK;
 128         while (src != end) {
 129                 utf32 tc = *src++;
 130                 switch (tc) {
 131                         case 0xA4: tc = 0x20AC; break; // €
 132                         case 0xA6: tc = 0x0160; break; // Š
 133                         case 0xA8: tc = 0x0161; break; // š
 134                         case 0xB4: tc = 0x017D; break; // Ž
 135                         case 0xB8: tc = 0x017E; break; // ž
 136                         case 0xBC: tc = 0x0152; break; // Œ
 137                         case 0xBD: tc = 0x0153; break; // œ
 138                         case 0xBE: tc = 0x0178; break; // Ÿ
 139                 }
 140                 *dst++ = tc;
 141         }
 142
 143         bufpos = buf + MAX_PUTBACK;
 144         bufend = dst;
 145 }
 146
/**
 * Decoder for UTF-8 input.
 *
 * Reads blocks from the input until at least one character could be decoded
 * (or the input ended) and stores the decoded characters in the character
 * buffer. A multi-byte sequence that is cut off by the end of a block is
 * remembered in the static part_decoded_* variables and continued with the
 * next block.
 *
 * Malformed input is reported through parse_error(). A sequence with a bad
 * lead or continuation byte is dropped and decoding resumes behind it; a
 * completely decoded but invalid code point is reported and still stored.
 */
static void decode_utf8(void)
{
        /* State carried over to the next block: the smallest code point allowed
         * for the sequence length, the bits decoded so far and the number of
         * continuation bytes still missing. A rest length of 4 is a marker for
         * "the previous block ended while skipping an invalid sequence". */
        static utf32  part_decoded_min_code;
        static utf32  part_decoded_char;
        static size_t part_decoded_rest_len;

        do {
                unsigned char read_buf[BUF_SIZE];
                size_t const s = read_block(read_buf, sizeof(read_buf));
                if (s == 0) {
                        /* read_block() already installed the EOF buffer */
                        if (part_decoded_rest_len > 0)
                                parse_error("incomplete input char at end of input");
                        return;
                }

                unsigned char const *src = read_buf;
                unsigned char const *end = read_buf + s;
                utf32               *dst = buf + MAX_PUTBACK;
                utf32                decoded;
                utf32                min_code;

                /* Resume a sequence that was interrupted by the end of the
                 * previous block by jumping to the matching state below. */
                if (part_decoded_rest_len != 0) {
                        min_code              = part_decoded_min_code;
                        decoded               = part_decoded_char;
                        size_t const rest_len = part_decoded_rest_len;
                        part_decoded_rest_len = 0;
                        switch (rest_len) {
                                case 4:  goto realign;
                                case 3:  goto three_more;
                                case 2:  goto two_more;
                                default: goto one_more;
                        }
                }

                while (src != end) {
                        if ((*src & 0x80) == 0) {
                                /* 0xxxxxxx: single byte */
                                decoded = *src++;
                        } else if ((*src & 0xE0) == 0xC0) {
                                /* 110xxxxx: lead byte of a two byte sequence */
                                min_code = 0x80;
                                decoded  = *src++ & 0x1F;
one_more:
                                /* one continuation byte is missing */
                                if (src == end) {
                                        part_decoded_min_code = min_code;
                                        part_decoded_char     = decoded;
                                        part_decoded_rest_len = 1;
                                        break;
                                }
                                if ((*src & 0xC0) == 0x80) {
                                        decoded = (decoded << 6) | (*src++ & 0x3F);
                                } else {
                                        goto invalid_char;
                                }
                                /* The sequence is complete: reject overlong encodings
                                 * and code points that must not occur. The character
                                 * is reported but still stored below. */
                                if (decoded < min_code                      ||
                                                decoded > 0x10FFFF                      ||
                                                (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
                                                (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
                                                (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
                                        parse_error("invalid byte sequence in input");
                                }
                        } else if ((*src & 0xF0) == 0xE0) {
                                /* 1110xxxx: lead byte of a three byte sequence */
                                min_code = 0x800;
                                decoded  = *src++ & 0x0F;
two_more:
                                /* two continuation bytes are missing */
                                if (src == end) {
                                        part_decoded_min_code = min_code;
                                        part_decoded_char     = decoded;
                                        part_decoded_rest_len = 2;
                                        break;
                                }
                                if ((*src & 0xC0) == 0x80) {
                                        decoded = (decoded << 6) | (*src++ & 0x3F);
                                } else {
                                        goto invalid_char;
                                }
                                goto one_more;
                        } else if ((*src & 0xF8) == 0xF0) {
                                /* 11110xxx: lead byte of a four byte sequence */
                                min_code = 0x10000;
                                decoded  = *src++ & 0x07;
three_more:
                                /* three continuation bytes are missing */
                                if (src == end) {
                                        part_decoded_min_code = min_code;
                                        part_decoded_char     = decoded;
                                        part_decoded_rest_len = 3;
                                        break;
                                }
                                if ((*src & 0xC0) == 0x80) {
                                        decoded = (decoded << 6) | (*src++ & 0x3F);
                                } else {
                                        goto invalid_char;
                                }
                                goto two_more;
                        } else {
invalid_char:
                                parse_error("invalid byte sequence in input");
realign:
                                /* Skip forward to a byte that can start a new
                                 * character, i.e. neither a continuation byte
                                 * (10xxxxxx) nor 11111xxx.
                                 * NOTE(review): the byte at src is skipped before
                                 * anything is tested. This drops the byte that
                                 * triggered invalid_char even if it could start a
                                 * character, and drops the first byte of a new block
                                 * when resuming with rest length 4 - confirm that
                                 * this is intended. */
                                do {
                                        ++src;
                                        if (src == end) {
                                                part_decoded_rest_len = 4;
                                                break;
                                        }
                                } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
                                /* nothing was decoded, so do not store a character */
                                continue;
                        }
                        *dst++ = decoded;
                }

                bufpos = buf + MAX_PUTBACK;
                bufend = dst;
        /* read another block if this one did not yield any character */
        } while (bufpos == bufend);
}
 258
 259 static void decode_windows_1252(void)
 260 {
 261         unsigned char read_buf[BUF_SIZE];
 262         size_t const s = read_block(read_buf, sizeof(read_buf));
 263         if (s == 0)
 264                 return;
 265
 266         unsigned char const *src = read_buf;
 267         unsigned char const *end = read_buf + s;
 268         utf32               *dst = buf + MAX_PUTBACK;
 269         while (src != end) {
 270                 utf32 tc = *src++;
 271                 switch (tc) {
 272                         case 0x80: tc = 0x20AC; break; // €
 273                         case 0x82: tc = 0x201A; break; // ‚
 274                         case 0x83: tc = 0x0192; break; // ƒ
 275                         case 0x84: tc = 0x201E; break; // „
 276                         case 0x85: tc = 0x2026; break; // …
 277                         case 0x86: tc = 0x2020; break; // †
 278                         case 0x87: tc = 0x2021; break; // ‡
 279                         case 0x88: tc = 0x02C6; break; // ˆ
 280                         case 0x89: tc = 0x2030; break; // ‰
 281                         case 0x8A: tc = 0x0160; break; // Š
 282                         case 0x8B: tc = 0x2039; break; // ‹
 283                         case 0x8C: tc = 0x0152; break; // Œ
 284                         case 0x8E: tc = 0x017D; break; // Ž
 285                         case 0x91: tc = 0x2018; break; // ‘
 286                         case 0x92: tc = 0x2019; break; // ’
 287                         case 0x93: tc = 0x201C; break; // “
 288                         case 0x94: tc = 0x201D; break; // ”
 289                         case 0x95: tc = 0x2022; break; // •
 290                         case 0x96: tc = 0x2013; break; // –
 291                         case 0x97: tc = 0x2014; break; // —
 292                         case 0x98: tc = 0x02DC; break; // ˜
 293                         case 0x99: tc = 0x2122; break; // ™
 294                         case 0x9A: tc = 0x0161; break; // š
 295                         case 0x9B: tc = 0x203A; break; // ›
 296                         case 0x9C: tc = 0x0153; break; // œ
 297                         case 0x9E: tc = 0x017E; break; // ž
 298                         case 0x9F: tc = 0x0178; break; // Ÿ
 299                 }
 300                 *dst++ = tc;
 301         }
 302
 303         bufpos = buf + MAX_PUTBACK;
 304         bufend = dst;
 305 }
 306
/** A decoder refills the character buffer with the next block of input. */
typedef void (*decoder_t)(void);

/** The decoder used by next_real_char(); UTF-8 unless another one is selected. */
static decoder_t decoder = decode_utf8;

/** Associates an encoding name with the decoder that handles it. */
typedef struct named_decoder_t {
        char const *name;
        decoder_t   decoder;
} named_decoder_t;

/**
 * The encodings known to select_input_encoding(), which compares the names
 * ignoring case. The list is terminated by an entry with a NULL name.
 */
static named_decoder_t const decoders[] = {
        { "CP819",           decode_iso_8859_1   }, // official alias
        { "IBM819",          decode_iso_8859_1   }, // official alias
        { "ISO-8859-1",      decode_iso_8859_1   }, // official alias
        { "ISO-8859-15",     decode_iso_8859_15  }, // official name
        { "ISO8859-1",       decode_iso_8859_1   },
        { "ISO8859-15",      decode_iso_8859_15  },
        { "ISO_8859-1",      decode_iso_8859_1   }, // official alias
        { "ISO_8859-15",     decode_iso_8859_15  }, // official alias
        { "ISO_8859-1:1987", decode_iso_8859_1   }, // official name
        { "Latin-9",         decode_iso_8859_15  }, // official alias
        { "UTF-8",           decode_utf8         }, // official name
        { "csISOLatin1",     decode_iso_8859_1   }, // official alias
        { "cp1252",          decode_windows_1252 },
        { "iso-ir-100",      decode_iso_8859_1   }, // official alias
        { "l1",              decode_iso_8859_1   }, // official alias
        { "latin1",          decode_iso_8859_1   }, // official alias
        { "windows-1252",    decode_windows_1252 }, // official name

        { NULL,              NULL                }
};
 337
 338 void select_input_encoding(char const* const encoding)
 339 {
 340         for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
 341                 if (strcasecmp(encoding, i->name) != 0)
 342                         continue;
 343                 decoder = i->decoder;
 344                 return;
 345         }
 346         fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
 347 }
 348
 349 static inline void next_real_char(void)
 350 {
 351         assert(bufpos <= bufend);
 352         if (bufpos >= bufend) {
 353                 if (input == NULL) {
 354                         c = EOF;
 355                         return;
 356                 }
 357                 decoder();
 358         }
 359         c = *bufpos++;
 360 }
 361
 362 /**
 363  * Put a character back into the buffer.
 364  *
 365  * @param pc  the character to put back
 366  */
 367 static inline void put_back(utf32 const pc)
 368 {
 369         assert(bufpos > buf);
 370         *(--bufpos - buf + buf) = pc;
 371
 372 #ifdef DEBUG_CHARS
 373         printf("putback '%lc'\n", pc);
 374 #endif
 375 }
 376
/* defined below; already needed by the macros and maybe_concat_lines() */
static inline void next_char(void);

/**
 * Expands to the case labels for a line end inside a switch over c.
 * A lone '\r', a '\r' followed by '\n' and a lone '\n' each count as one
 * line end: the characters are consumed, the line number of the current
 * token position is incremented and then the given code is executed.
 *
 * No break is appended, so code has to leave the switch on its own
 * (otherwise the '\r' case falls through into the '\n' case).
 */
#define MATCH_NEWLINE(code)                   \
        case '\r':                                \
                next_char();                          \
                if(c == '\n') {                       \
                        next_char();                      \
                }                                     \
                lexer_token.source_position.linenr++; \
                code                                  \
        case '\n':                                \
                next_char();                          \
                lexer_token.source_position.linenr++; \
                code
 391
 392 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 393
 394 static void maybe_concat_lines(void)
 395 {
 396         eat('\\');
 397
 398         switch(c) {
 399         MATCH_NEWLINE(return;)
 400
 401         default:
 402                 break;
 403         }
 404
 405         put_back(c);
 406         c = '\\';
 407 }
 408
 409 /**
 410  * Set c to the next input character, ie.
 411  * after expanding trigraphs.
 412  */
 413 static inline void next_char(void)
 414 {
 415         next_real_char();
 416
 417         /* filter trigraphs */
 418         if(UNLIKELY(c == '\\')) {
 419                 maybe_concat_lines();
 420                 goto end_of_next_char;
 421         }
 422
 423         if(LIKELY(c != '?'))
 424                 goto end_of_next_char;
 425
 426         next_real_char();
 427         if(LIKELY(c != '?')) {
 428                 put_back(c);
 429                 c = '?';
 430                 goto end_of_next_char;
 431         }
 432
 433         next_real_char();
 434         switch(c) {
 435         case '=': c = '#'; break;
 436         case '(': c = '['; break;
 437         case '/': c = '\\'; maybe_concat_lines(); break;
 438         case ')': c = ']'; break;
 439         case '\'': c = '^'; break;
 440         case '<': c = '{'; break;
 441         case '!': c = '|'; break;
 442         case '>': c = '}'; break;
 443         case '-': c = '~'; break;
 444         default:
 445                 put_back(c);
 446                 put_back('?');
 447                 c = '?';
 448                 break;
 449         }
 450
 451 end_of_next_char:;
 452 #ifdef DEBUG_CHARS
 453         printf("nchar '%c'\n", c);
 454 #endif
 455 }
 456
 457 #define SYMBOL_CHARS  \
 458         case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
 459         case 'a':         \
 460         case 'b':         \
 461         case 'c':         \
 462         case 'd':         \
 463         case 'e':         \
 464         case 'f':         \
 465         case 'g':         \
 466         case 'h':         \
 467         case 'i':         \
 468         case 'j':         \
 469         case 'k':         \
 470         case 'l':         \
 471         case 'm':         \
 472         case 'n':         \
 473         case 'o':         \
 474         case 'p':         \
 475         case 'q':         \
 476         case 'r':         \
 477         case 's':         \
 478         case 't':         \
 479         case 'u':         \
 480         case 'v':         \
 481         case 'w':         \
 482         case 'x':         \
 483         case 'y':         \
 484         case 'z':         \
 485         case 'A':         \
 486         case 'B':         \
 487         case 'C':         \
 488         case 'D':         \
 489         case 'E':         \
 490         case 'F':         \
 491         case 'G':         \
 492         case 'H':         \
 493         case 'I':         \
 494         case 'J':         \
 495         case 'K':         \
 496         case 'L':         \
 497         case 'M':         \
 498         case 'N':         \
 499         case 'O':         \
 500         case 'P':         \
 501         case 'Q':         \
 502         case 'R':         \
 503         case 'S':         \
 504         case 'T':         \
 505         case 'U':         \
 506         case 'V':         \
 507         case 'W':         \
 508         case 'X':         \
 509         case 'Y':         \
 510         case 'Z':         \
 511         case '_':
 512
 513 #define DIGITS        \
 514         case '0':         \
 515         case '1':         \
 516         case '2':         \
 517         case '3':         \
 518         case '4':         \
 519         case '5':         \
 520         case '6':         \
 521         case '7':         \
 522         case '8':         \
 523         case '9':
 524
 525 /**
 526  * Read a symbol from the input and build
 527  * the lexer_token.
 528  */
 529 static void parse_symbol(void)
 530 {
 531         symbol_t *symbol;
 532         char     *string;
 533
 534         obstack_1grow(&symbol_obstack, (char) c);
 535         next_char();
 536
 537         while(1) {
 538                 switch(c) {
 539                 DIGITS
 540                 SYMBOL_CHARS
 541                         obstack_1grow(&symbol_obstack, (char) c);
 542                         next_char();
 543                         break;
 544
 545                 default:
 546 dollar_sign:
 547                         goto end_symbol;
 548                 }
 549         }
 550
 551 end_symbol:
 552         obstack_1grow(&symbol_obstack, '\0');
 553
 554         string = obstack_finish(&symbol_obstack);
 555         symbol = symbol_table_insert(string);
 556
 557         lexer_token.type     = symbol->ID;
 558         lexer_token.v.symbol = symbol;
 559
 560         if(symbol->string != string) {
 561                 obstack_free(&symbol_obstack, string);
 562         }
 563 }
 564
 565 static void parse_integer_suffix(bool is_oct_hex)
 566 {
 567         bool is_unsigned     = false;
 568         bool min_long        = false;
 569         bool min_longlong    = false;
 570         bool not_traditional = false;
 571         int  pos             = 0;
 572         char suffix[4];
 573
 574         if (c == 'U' || c == 'u') {
 575                 not_traditional = true;
 576                 suffix[pos++]   = toupper(c);
 577                 is_unsigned     = true;
 578                 next_char();
 579                 if (c == 'L' || c == 'l') {
 580                         suffix[pos++] = toupper(c);
 581                         min_long = true;
 582                         next_char();
 583                         if (c == 'L' || c == 'l') {
 584                                 suffix[pos++] = toupper(c);
 585                                 min_longlong = true;
 586                                 next_char();
 587                         }
 588                 }
 589         } else if (c == 'l' || c == 'L') {
 590                 suffix[pos++] = toupper(c);
 591                 min_long = true;
 592                 next_char();
 593                 if (c == 'l' || c == 'L') {
 594                         not_traditional = true;
 595                         suffix[pos++]   = toupper(c);
 596                         min_longlong    = true;
 597                         next_char();
 598                         if (c == 'u' || c == 'U') {
 599                                 suffix[pos++] = toupper(c);
 600                                 is_unsigned   = true;
 601                                 next_char();
 602                         }
 603                 } else if (c == 'u' || c == 'U') {
 604                         not_traditional = true;
 605                         suffix[pos++]   = toupper(c);
 606                         is_unsigned     = true;
 607                         next_char();
 608                         lexer_token.datatype = type_unsigned_long;
 609                 }
 610         }
 611
 612         if (warning.traditional && not_traditional) {
 613                 suffix[pos] = '\0';
 614                 warningf(&lexer_token.source_position,
 615                         "traditional C rejects the '%s' suffix", suffix);
 616         }
 617         if (!is_unsigned) {
 618                 long long v = lexer_token.v.intvalue;
 619                 if (!min_long) {
 620                         if (v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 621                                 lexer_token.datatype = type_int;
 622                                 return;
 623                         } else if (is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 624                                 lexer_token.datatype = type_unsigned_int;
 625                                 return;
 626                         }
 627                 }
 628                 if (!min_longlong) {
 629                         if (v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 630                                 lexer_token.datatype = type_long;
 631                                 return;
 632                         } else if (is_oct_hex && v >= 0 && (unsigned long long)v <= (unsigned long long)TARGET_ULONG_MAX) {
 633                                 lexer_token.datatype = type_unsigned_long;
 634                                 return;
 635                         }
 636                 }
 637                 unsigned long long uv = (unsigned long long) v;
 638                 if (is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 639                         lexer_token.datatype = type_unsigned_long_long;
 640                         return;
 641                 }
 642
 643                 lexer_token.datatype = type_long_long;
 644         } else {
 645                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 646                 if (!min_long && v <= TARGET_UINT_MAX) {
 647                         lexer_token.datatype = type_unsigned_int;
 648                         return;
 649                 }
 650                 if (!min_longlong && v <= TARGET_ULONG_MAX) {
 651                         lexer_token.datatype = type_unsigned_long;
 652                         return;
 653                 }
 654                 lexer_token.datatype = type_unsigned_long_long;
 655         }
 656 }
 657
 658 static void parse_floating_suffix(void)
 659 {
 660         switch(c) {
 661         /* TODO: do something useful with the suffixes... */
 662         case 'f':
 663         case 'F':
 664                 if (warning.traditional) {
 665                         warningf(&lexer_token.source_position,
 666                                 "traditional C rejects the 'F' suffix");
 667                 }
 668                 next_char();
 669                 lexer_token.datatype = type_float;
 670                 break;
 671         case 'l':
 672         case 'L':
 673                 if (warning.traditional) {
 674                         warningf(&lexer_token.source_position,
 675                                 "traditional C rejects the 'F' suffix");
 676                 }
 677                 next_char();
 678                 lexer_token.datatype = type_long_double;
 679                 break;
 680         default:
 681                 lexer_token.datatype = type_double;
 682                 break;
 683         }
 684 }
 685
 686 /**
 687  * A replacement for strtoull. Only those parts needed for
 688  * our parser are implemented.
 689  */
 690 static unsigned long long parse_int_string(const char *s, const char **endptr, int base)
 691 {
 692         unsigned long long v = 0;
 693
 694         switch (base) {
 695         case 16:
 696                 for (;; ++s) {
 697                         /* check for overrun */
 698                         if (v >= 0x1000000000000000ULL)
 699                                 break;
 700                         switch (tolower(*s)) {
 701                         case '0': v <<= 4; break;
 702                         case '1': v <<= 4; v |= 0x1; break;
 703                         case '2': v <<= 4; v |= 0x2; break;
 704                         case '3': v <<= 4; v |= 0x3; break;
 705                         case '4': v <<= 4; v |= 0x4; break;
 706                         case '5': v <<= 4; v |= 0x5; break;
 707                         case '6': v <<= 4; v |= 0x6; break;
 708                         case '7': v <<= 4; v |= 0x7; break;
 709                         case '8': v <<= 4; v |= 0x8; break;
 710                         case '9': v <<= 4; v |= 0x9; break;
 711                         case 'a': v <<= 4; v |= 0xa; break;
 712                         case 'b': v <<= 4; v |= 0xb; break;
 713                         case 'c': v <<= 4; v |= 0xc; break;
 714                         case 'd': v <<= 4; v |= 0xd; break;
 715                         case 'e': v <<= 4; v |= 0xe; break;
 716                         case 'f': v <<= 4; v |= 0xf; break;
 717                         default:
 718                                 goto end;
 719                         }
 720                 }
 721                 break;
 722         case 8:
 723                 for (;; ++s) {
 724                         /* check for overrun */
 725                         if (v >= 0x2000000000000000ULL)
 726                                 break;
 727                         switch (tolower(*s)) {
 728                         case '0': v <<= 3; break;
 729                         case '1': v <<= 3; v |= 1; break;
 730                         case '2': v <<= 3; v |= 2; break;
 731                         case '3': v <<= 3; v |= 3; break;
 732                         case '4': v <<= 3; v |= 4; break;
 733                         case '5': v <<= 3; v |= 5; break;
 734                         case '6': v <<= 3; v |= 6; break;
 735                         case '7': v <<= 3; v |= 7; break;
 736                         default:
 737                                 goto end;
 738                         }
 739                 }
 740                 break;
 741         case 10:
 742                 for (;; ++s) {
 743                         /* check for overrun */
 744                         if (v > 0x1999999999999999ULL)
 745                                 break;
 746                         switch (tolower(*s)) {
 747                         case '0': v *= 10; break;
 748                         case '1': v *= 10; v += 1; break;
 749                         case '2': v *= 10; v += 2; break;
 750                         case '3': v *= 10; v += 3; break;
 751                         case '4': v *= 10; v += 4; break;
 752                         case '5': v *= 10; v += 5; break;
 753                         case '6': v *= 10; v += 6; break;
 754                         case '7': v *= 10; v += 7; break;
 755                         case '8': v *= 10; v += 8; break;
 756                         case '9': v *= 10; v += 9; break;
 757                         default:
 758                                 goto end;
 759                         }
 760                 }
 761                 break;
 762         default:
 763                 assert(0);
 764                 break;
 765         }
 766 end:
 767         *endptr = s;
 768         return v;
 769 }
 770
 771 /**
 772  * Parses a hex number including hex floats and set the
 773  * lexer_token.
 774  */
 775 static void parse_number_hex(void)
 776 {
 777         bool is_float = false;
 778         assert(c == 'x' || c == 'X');
 779         next_char();
 780
 781         obstack_1grow(&symbol_obstack, '0');
 782         obstack_1grow(&symbol_obstack, 'x');
 783
 784         while(isxdigit(c)) {
 785                 obstack_1grow(&symbol_obstack, (char) c);
 786                 next_char();
 787         }
 788
 789         if (c == '.') {
 790                 obstack_1grow(&symbol_obstack, (char) c);
 791                 next_char();
 792
 793                 while (isxdigit(c)) {
 794                         obstack_1grow(&symbol_obstack, (char) c);
 795                         next_char();
 796                 }
 797                 is_float = true;
 798         }
 799         if (c == 'p' || c == 'P') {
 800                 obstack_1grow(&symbol_obstack, (char) c);
 801                 next_char();
 802
 803                 if (c == '-' || c == '+') {
 804                         obstack_1grow(&symbol_obstack, (char) c);
 805                         next_char();
 806                 }
 807
 808                 while (isxdigit(c)) {
 809                         obstack_1grow(&symbol_obstack, (char) c);
 810                         next_char();
 811                 }
 812                 is_float = true;
 813         }
 814
 815         obstack_1grow(&symbol_obstack, '\0');
 816         char *string = obstack_finish(&symbol_obstack);
 817         if(*string == '\0') {
 818                 parse_error("invalid hex number");
 819                 lexer_token.type = T_ERROR;
 820                 obstack_free(&symbol_obstack, string);
 821                 return;
 822         }
 823
 824         if (is_float) {
 825                 char *endptr;
 826                 lexer_token.type         = T_FLOATINGPOINT;
 827                 lexer_token.v.floatvalue = strtold(string, &endptr);
 828
 829                 if(*endptr != '\0') {
 830                         parse_error("invalid hex float literal");
 831                 }
 832
 833                 parse_floating_suffix();
 834         } else {
 835                 const char *endptr;
 836                 lexer_token.type       = T_INTEGER;
 837                 lexer_token.v.intvalue = parse_int_string(string + 2, &endptr, 16);
 838                 if(*endptr != '\0') {
 839                         parse_error("hex number literal too long");
 840                 }
 841                 parse_integer_suffix(true);
 842         }
 843
 844         obstack_free(&symbol_obstack, string);
 845 }
 846
 847 /**
 848  * Returns true if the given char is a octal digit.
 849  *
 850  * @param char  the character to check
 851  */
 852 static inline bool is_octal_digit(utf32 chr)
 853 {
 854         switch(chr) {
 855         case '0':
 856         case '1':
 857         case '2':
 858         case '3':
 859         case '4':
 860         case '5':
 861         case '6':
 862         case '7':
 863                 return true;
 864         default:
 865                 return false;
 866         }
 867 }
 868
 869 /**
 870  * Parses a octal number and set the lexer_token.
 871  */
 872 static void parse_number_oct(void)
 873 {
 874         while(is_octal_digit(c)) {
 875                 obstack_1grow(&symbol_obstack, (char) c);
 876                 next_char();
 877         }
 878         obstack_1grow(&symbol_obstack, '\0');
 879         char *string = obstack_finish(&symbol_obstack);
 880
 881         const char *endptr;
 882         lexer_token.type       = T_INTEGER;
 883         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 884         if(*endptr != '\0') {
 885                 parse_error("octal number literal too long");
 886         }
 887
 888         obstack_free(&symbol_obstack, string);
 889         parse_integer_suffix(true);
 890 }
 891
 892 /**
 893  * Parses a decimal including float number and set the
 894  * lexer_token.
 895  */
 896 static void parse_number_dec(void)
 897 {
 898         bool is_float = false;
 899         while (isdigit(c)) {
 900                 obstack_1grow(&symbol_obstack, (char) c);
 901                 next_char();
 902         }
 903
 904         if (c == '.') {
 905                 obstack_1grow(&symbol_obstack, '.');
 906                 next_char();
 907
 908                 while (isdigit(c)) {
 909                         obstack_1grow(&symbol_obstack, (char) c);
 910                         next_char();
 911                 }
 912                 is_float = true;
 913         }
 914         if(c == 'e' || c == 'E') {
 915                 obstack_1grow(&symbol_obstack, (char) c);
 916                 next_char();
 917
 918                 if(c == '-' || c == '+') {
 919                         obstack_1grow(&symbol_obstack, (char) c);
 920                         next_char();
 921                 }
 922
 923                 while(isdigit(c)) {
 924                         obstack_1grow(&symbol_obstack, (char) c);
 925                         next_char();
 926                 }
 927                 is_float = true;
 928         }
 929
 930         obstack_1grow(&symbol_obstack, '\0');
 931         char *string = obstack_finish(&symbol_obstack);
 932
 933         if(is_float) {
 934                 char *endptr;
 935                 lexer_token.type         = T_FLOATINGPOINT;
 936                 lexer_token.v.floatvalue = strtold(string, &endptr);
 937
 938                 if(*endptr != '\0') {
 939                         parse_error("invalid number literal");
 940                 }
 941
 942                 parse_floating_suffix();
 943         } else {
 944                 const char *endptr;
 945                 lexer_token.type       = T_INTEGER;
 946                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 947
 948                 if(*endptr != '\0') {
 949                         parse_error("invalid number literal");
 950                 }
 951
 952                 parse_integer_suffix(false);
 953         }
 954         obstack_free(&symbol_obstack, string);
 955 }
 956
 957 /**
 958  * Parses a number and sets the lexer_token.
 959  */
 960 static void parse_number(void)
 961 {
 962         if (c == '0') {
 963                 next_char();
 964                 switch (c) {
 965                         case 'X':
 966                         case 'x':
 967                                 parse_number_hex();
 968                                 break;
 969                         case '0':
 970                         case '1':
 971                         case '2':
 972                         case '3':
 973                         case '4':
 974                         case '5':
 975                         case '6':
 976                         case '7':
 977                                 parse_number_oct();
 978                                 break;
 979                         case '8':
 980                         case '9':
 981                                 next_char();
 982                                 parse_error("invalid octal number");
 983                                 lexer_token.type = T_ERROR;
 984                                 return;
 985                         case '.':
 986                         case 'e':
 987                         case 'E':
 988                         default:
 989                                 obstack_1grow(&symbol_obstack, '0');
 990                                 parse_number_dec();
 991                                 return;
 992                 }
 993         } else {
 994                 parse_number_dec();
 995         }
 996 }
 997
 998 /**
 999  * Returns the value of a digit.
1000  * The only portable way to do it ...
1001  */
1002 static int digit_value(utf32 const digit)
1003 {
1004         switch (digit) {
1005         case '0': return 0;
1006         case '1': return 1;
1007         case '2': return 2;
1008         case '3': return 3;
1009         case '4': return 4;
1010         case '5': return 5;
1011         case '6': return 6;
1012         case '7': return 7;
1013         case '8': return 8;
1014         case '9': return 9;
1015         case 'a':
1016         case 'A': return 10;
1017         case 'b':
1018         case 'B': return 11;
1019         case 'c':
1020         case 'C': return 12;
1021         case 'd':
1022         case 'D': return 13;
1023         case 'e':
1024         case 'E': return 14;
1025         case 'f':
1026         case 'F': return 15;
1027         default:
1028                 internal_error("wrong character given");
1029         }
1030 }
1031
1032 /**
1033  * Parses an octal character sequence.
1034  *
1035  * @param first_digit  the already read first digit
1036  */
1037 static utf32 parse_octal_sequence(utf32 const first_digit)
1038 {
1039         assert(is_octal_digit(first_digit));
1040         utf32 value = digit_value(first_digit);
1041         if (!is_octal_digit(c)) return value;
1042         value = 8 * value + digit_value(c);
1043         next_char();
1044         if (!is_octal_digit(c)) return value;
1045         value = 8 * value + digit_value(c);
1046         next_char();
1047         return value;
1048 }
1049
1050 /**
1051  * Parses a hex character sequence.
1052  */
1053 static utf32 parse_hex_sequence(void)
1054 {
1055         utf32 value = 0;
1056         while(isxdigit(c)) {
1057                 value = 16 * value + digit_value(c);
1058                 next_char();
1059         }
1060         return value;
1061 }
1062
1063 /**
1064  * Parse an escape sequence.
1065  */
1066 static utf32 parse_escape_sequence(void)
1067 {
1068         eat('\\');
1069
1070         utf32 const ec = c;
1071         next_char();
1072
1073         switch (ec) {
1074         case '"':  return '"';
1075         case '\'': return '\'';
1076         case '\\': return '\\';
1077         case '?': return '\?';
1078         case 'a': return '\a';
1079         case 'b': return '\b';
1080         case 'f': return '\f';
1081         case 'n': return '\n';
1082         case 'r': return '\r';
1083         case 't': return '\t';
1084         case 'v': return '\v';
1085         case 'x':
1086                 return parse_hex_sequence();
1087         case '0':
1088         case '1':
1089         case '2':
1090         case '3':
1091         case '4':
1092         case '5':
1093         case '6':
1094         case '7':
1095                 return parse_octal_sequence(ec);
1096         case EOF:
1097                 parse_error("reached end of file while parsing escape sequence");
1098                 return EOF;
1099         /* \E is not documented, but handled, by GCC.  It is acceptable according
1100          * to §6.11.4, whereas \e is not. */
1101         case 'E':
1102         case 'e':
1103                 if (c_mode & _GNUC)
1104                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
1105                 /* FALLTHROUGH */
1106         default:
1107                 /* §6.4.4.4:8 footnote 64 */
1108                 parse_error("unknown escape sequence");
1109                 return EOF;
1110         }
1111 }
1112
1113 /**
1114  * Concatenate two strings.
1115  */
1116 string_t concat_strings(const string_t *const s1, const string_t *const s2)
1117 {
1118         const size_t len1 = s1->size - 1;
1119         const size_t len2 = s2->size - 1;
1120
1121         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
1122         memcpy(concat, s1->begin, len1);
1123         memcpy(concat + len1, s2->begin, len2 + 1);
1124
1125         if (warning.traditional) {
1126                 warningf(&lexer_token.source_position,
1127                         "traditional C rejects string constant concatenation");
1128         }
1129 #if 0 /* TODO hash */
1130         const char *result = strset_insert(&stringset, concat);
1131         if(result != concat) {
1132                 obstack_free(&symbol_obstack, concat);
1133         }
1134
1135         return result;
1136 #else
1137         return (string_t){ concat, len1 + len2 + 1 };
1138 #endif
1139 }
1140
1141 /**
1142  * Concatenate a string and a wide string.
1143  */
1144 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
1145 {
1146         const size_t len1 = s1->size - 1;
1147         const size_t len2 = s2->size - 1;
1148
1149         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1150         const char *const src = s1->begin;
1151         for (size_t i = 0; i != len1; ++i) {
1152                 concat[i] = src[i];
1153         }
1154         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1155         if (warning.traditional) {
1156                 warningf(&lexer_token.source_position,
1157                         "traditional C rejects string constant concatenation");
1158         }
1159
1160         return (wide_string_t){ concat, len1 + len2 + 1 };
1161 }
1162
1163 /**
1164  * Concatenate two wide strings.
1165  */
1166 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
1167 {
1168         const size_t len1 = s1->size - 1;
1169         const size_t len2 = s2->size - 1;
1170
1171         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1172         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
1173         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1174         if (warning.traditional) {
1175                 warningf(&lexer_token.source_position,
1176                         "traditional C rejects string constant concatenation");
1177         }
1178
1179         return (wide_string_t){ concat, len1 + len2 + 1 };
1180 }
1181
1182 /**
1183  * Concatenate a wide string and a string.
1184  */
1185 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
1186 {
1187         const size_t len1 = s1->size - 1;
1188         const size_t len2 = s2->size - 1;
1189
1190         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1191         memcpy(concat, s1->begin, len1 * sizeof(*concat));
1192         const char  *const src = s2->begin;
1193         wchar_rep_t *const dst = concat + len1;
1194         for (size_t i = 0; i != len2 + 1; ++i) {
1195                 dst[i] = src[i];
1196         }
1197         if (warning.traditional) {
1198                 warningf(&lexer_token.source_position,
1199                         "traditional C rejects string constant concatenation");
1200         }
1201
1202         return (wide_string_t){ concat, len1 + len2 + 1 };
1203 }
1204
1205 static void grow_symbol(utf32 const tc)
1206 {
1207         struct obstack *const o  = &symbol_obstack;
1208         if (tc < 0x80U) {
1209                 obstack_1grow(o, tc);
1210         } else if (tc < 0x800) {
1211                 obstack_1grow(o, 0xC0 | (tc >> 6));
1212                 obstack_1grow(o, 0x80 | (tc & 0x3F));
1213         } else if (tc < 0x10000) {
1214                 obstack_1grow(o, 0xE0 | ( tc >> 12));
1215                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1216                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1217         } else {
1218                 obstack_1grow(o, 0xF0 | ( tc >> 18));
1219                 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
1220                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1221                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1222         }
1223 }
1224
1225 /**
1226  * Parse a string literal and set lexer_token.
1227  */
1228 static void parse_string_literal(void)
1229 {
1230         const unsigned start_linenr = lexer_token.source_position.linenr;
1231
1232         eat('"');
1233
1234         while(1) {
1235                 switch(c) {
1236                 case '\\': {
1237                         utf32 const tc = parse_escape_sequence();
1238                         if (tc >= 0x100) {
1239                                 warningf(&lexer_token.source_position,
1240                                                 "escape sequence out of range");
1241                         }
1242                         obstack_1grow(&symbol_obstack, tc);
1243                         break;
1244                 }
1245
1246                 case EOF: {
1247                         source_position_t source_position;
1248                         source_position.input_name = lexer_token.source_position.input_name;
1249                         source_position.linenr     = start_linenr;
1250                         errorf(&source_position, "string has no end");
1251                         lexer_token.type = T_ERROR;
1252                         return;
1253                 }
1254
1255                 case '"':
1256                         next_char();
1257                         goto end_of_string;
1258
1259                 default:
1260                         grow_symbol(c);
1261                         next_char();
1262                         break;
1263                 }
1264         }
1265
1266 end_of_string:
1267
1268         /* TODO: concatenate multiple strings separated by whitespace... */
1269
1270         /* add finishing 0 to the string */
1271         obstack_1grow(&symbol_obstack, '\0');
1272         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1273         const char *const string = obstack_finish(&symbol_obstack);
1274
1275 #if 0 /* TODO hash */
1276         /* check if there is already a copy of the string */
1277         result = strset_insert(&stringset, string);
1278         if(result != string) {
1279                 obstack_free(&symbol_obstack, string);
1280         }
1281 #else
1282         const char *const result = string;
1283 #endif
1284
1285         lexer_token.type           = T_STRING_LITERAL;
1286         lexer_token.v.string.begin = result;
1287         lexer_token.v.string.size  = size;
1288 }
1289
1290 /**
1291  * Parse a wide character constant and set lexer_token.
1292  */
1293 static void parse_wide_character_constant(void)
1294 {
1295         const unsigned start_linenr = lexer_token.source_position.linenr;
1296
1297         eat('\'');
1298
1299         while(1) {
1300                 switch(c) {
1301                 case '\\': {
1302                         wchar_rep_t tc = parse_escape_sequence();
1303                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1304                         break;
1305                 }
1306
1307                 MATCH_NEWLINE(
1308                         parse_error("newline while parsing character constant");
1309                         break;
1310                 )
1311
1312                 case '\'':
1313                         next_char();
1314                         goto end_of_wide_char_constant;
1315
1316                 case EOF: {
1317                         source_position_t source_position = lexer_token.source_position;
1318                         source_position.linenr = start_linenr;
1319                         errorf(&source_position, "EOF while parsing character constant");
1320                         lexer_token.type = T_ERROR;
1321                         return;
1322                 }
1323
1324                 default: {
1325                         wchar_rep_t tc = (wchar_rep_t) c;
1326                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1327                         next_char();
1328                         break;
1329                 }
1330                 }
1331         }
1332
1333 end_of_wide_char_constant:;
1334         size_t             size   = (size_t) obstack_object_size(&symbol_obstack);
1335         assert(size % sizeof(wchar_rep_t) == 0);
1336         size /= sizeof(wchar_rep_t);
1337
1338         const wchar_rep_t *string = obstack_finish(&symbol_obstack);
1339
1340         lexer_token.type                = T_WIDE_CHARACTER_CONSTANT;
1341         lexer_token.v.wide_string.begin = string;
1342         lexer_token.v.wide_string.size  = size;
1343         lexer_token.datatype            = type_wchar_t;
1344 }
1345
1346 /**
1347  * Parse a wide string literal and set lexer_token.
1348  */
1349 static void parse_wide_string_literal(void)
1350 {
1351         const unsigned start_linenr = lexer_token.source_position.linenr;
1352
1353         assert(c == '"');
1354         next_char();
1355
1356         while(1) {
1357                 switch(c) {
1358                 case '\\': {
1359                         wchar_rep_t tc = parse_escape_sequence();
1360                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1361                         break;
1362                 }
1363
1364                 case EOF: {
1365                         source_position_t source_position;
1366                         source_position.input_name = lexer_token.source_position.input_name;
1367                         source_position.linenr     = start_linenr;
1368                         errorf(&source_position, "string has no end");
1369                         lexer_token.type = T_ERROR;
1370                         return;
1371                 }
1372
1373                 case '"':
1374                         next_char();
1375                         goto end_of_string;
1376
1377                 default: {
1378                         wchar_rep_t tc = c;
1379                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1380                         next_char();
1381                         break;
1382                 }
1383                 }
1384         }
1385
1386 end_of_string:;
1387
1388         /* TODO: concatenate multiple strings separated by whitespace... */
1389
1390         /* add finishing 0 to the string */
1391         wchar_rep_t nul = L'\0';
1392         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
1393         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
1394         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1395
1396 #if 0 /* TODO hash */
1397         /* check if there is already a copy of the string */
1398         const wchar_rep_t *const result = strset_insert(&stringset, string);
1399         if(result != string) {
1400                 obstack_free(&symbol_obstack, string);
1401         }
1402 #else
1403         const wchar_rep_t *const result = string;
1404 #endif
1405
1406         lexer_token.type                = T_WIDE_STRING_LITERAL;
1407         lexer_token.v.wide_string.begin = result;
1408         lexer_token.v.wide_string.size  = size;
1409 }
1410
1411 /**
1412  * Parse a character constant and set lexer_token.
1413  */
1414 static void parse_character_constant(void)
1415 {
1416         const unsigned start_linenr = lexer_token.source_position.linenr;
1417
1418         eat('\'');
1419
1420         while(1) {
1421                 switch(c) {
1422                 case '\\': {
1423                         utf32 const tc = parse_escape_sequence();
1424                         if (tc >= 0x100) {
1425                                 warningf(&lexer_token.source_position,
1426                                                 "escape sequence out of range");
1427                         }
1428                         obstack_1grow(&symbol_obstack, tc);
1429                         break;
1430                 }
1431
1432                 MATCH_NEWLINE(
1433                         parse_error("newline while parsing character constant");
1434                         break;
1435                 )
1436
1437                 case '\'':
1438                         next_char();
1439                         goto end_of_char_constant;
1440
1441                 case EOF: {
1442                         source_position_t source_position;
1443                         source_position.input_name = lexer_token.source_position.input_name;
1444                         source_position.linenr     = start_linenr;
1445                         errorf(&source_position, "EOF while parsing character constant");
1446                         lexer_token.type = T_ERROR;
1447                         return;
1448                 }
1449
1450                 default:
1451                         grow_symbol(c);
1452                         next_char();
1453                         break;
1454
1455                 }
1456         }
1457
1458 end_of_char_constant:;
1459         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1460         const char *const string = obstack_finish(&symbol_obstack);
1461
1462         lexer_token.type           = T_CHARACTER_CONSTANT;
1463         lexer_token.v.string.begin = string;
1464         lexer_token.v.string.size  = size;
1465         lexer_token.datatype       = c_mode & _CXX && size == 1 ? type_char : type_int;
1466 }
1467
1468 /**
1469  * Skip a multiline comment.
1470  */
1471 static void skip_multiline_comment(void)
1472 {
1473         unsigned start_linenr = lexer_token.source_position.linenr;
1474
1475         while(1) {
1476                 switch(c) {
1477                 case '/':
1478                         next_char();
1479                         if (c == '*') {
1480                                 /* nested comment, warn here */
1481                                 if (warning.comment) {
1482                                         warningf(&lexer_token.source_position, "'/*' within comment");
1483                                 }
1484                         }
1485                         break;
1486                 case '*':
1487                         next_char();
1488                         if(c == '/') {
1489                                 next_char();
1490                                 return;
1491                         }
1492                         break;
1493
1494                 MATCH_NEWLINE(break;)
1495
1496                 case EOF: {
1497                         source_position_t source_position;
1498                         source_position.input_name = lexer_token.source_position.input_name;
1499                         source_position.linenr     = start_linenr;
1500                         errorf(&source_position, "at end of file while looking for comment end");
1501                         return;
1502                 }
1503
1504                 default:
1505                         next_char();
1506                         break;
1507                 }
1508         }
1509 }
1510
1511 /**
1512  * Skip a single line comment.
1513  */
1514 static void skip_line_comment(void)
1515 {
1516         while(1) {
1517                 switch(c) {
1518                 case EOF:
1519                         return;
1520
1521                 case '\n':
1522                 case '\r':
1523                         return;
1524
1525                 case '\\':
1526                         next_char();
1527                         if (c == '\n' || c == '\r') {
1528                                 if (warning.comment)
1529                                         warningf(&lexer_token.source_position, "multi-line comment");
1530                                 return;
1531                         }
1532                         break;
1533
1534                 default:
1535                         next_char();
1536                         break;
1537                 }
1538         }
1539 }
1540
/**
 * The current preprocessor token: the copy of lexer_token that
 * next_pp_token() takes after each preprocessing token has been read.
 */
static token_t pp_token;

/**
 * Reads the next preprocessing token and mirrors the resulting lexer_token
 * in pp_token.
 */
static inline void next_pp_token(void)
{
	lexer_next_preprocessing_token();
	pp_token = lexer_token;
}
1552
1553 /**
1554  * Eat all preprocessor tokens until newline.
1555  */
1556 static void eat_until_newline(void)
1557 {
1558         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1559                 next_pp_token();
1560         }
1561 }
1562
1563 /**
1564  * Handle the define directive.
1565  */
1566 static void define_directive(void)
1567 {
1568         lexer_next_preprocessing_token();
1569         if(lexer_token.type != T_IDENTIFIER) {
1570                 parse_error("expected identifier after #define\n");
1571                 eat_until_newline();
1572         }
1573 }
1574
/**
 * Handle the ifdef directive (also used for ifndef).
 *
 * Currently a stub: it reads one more preprocessing token and does not
 * evaluate anything.
 *
 * @param is_ifndef  non-zero for the ifndef form; not used yet
 */
static void ifdef_directive(int is_ifndef)
{
	(void) is_ifndef;
	lexer_next_preprocessing_token();
	/* TODO: expect an identifier here ... */
	/* ... followed by a newline */
}
1585
/**
 * Handle the endif directive.
 *
 * Currently a stub that does nothing.
 */
static void endif_directive(void)
{
	/* TODO: expect a newline here */
}
1593
1594 /**
1595  * Parse the line directive.
1596  */
1597 static void parse_line_directive(void)
1598 {
1599         if(pp_token.type != T_INTEGER) {
1600                 parse_error("expected integer");
1601         } else {
1602                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1603                 next_pp_token();
1604         }
1605         if(pp_token.type == T_STRING_LITERAL) {
1606                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1607                 next_pp_token();
1608         }
1609
1610         eat_until_newline();
1611 }
1612
/**
 * STDC pragmas: the pragma names that parse_pragma() recognises after
 * "#pragma STDC".
 */
typedef enum stdc_pragma_kind_t {
	STDC_UNKNOWN,          /**< initial value, kept when no known name follows */
	STDC_FP_CONTRACT,      /**< chosen for the TP_FP_CONTRACT token */
	STDC_FENV_ACCESS,      /**< chosen for the TP_FENV_ACCESS token */
	STDC_CX_LIMITED_RANGE  /**< chosen for the TP_CX_LIMITED_RANGE token */
} stdc_pragma_kind_t;
1622
/**
 * STDC pragma values: the arguments that parse_pragma() accepts after the
 * name of a STDC pragma.
 */
typedef enum stdc_pragma_value_kind_t {
	STDC_VALUE_UNKNOWN,  /**< initial value, kept when no known argument follows */
	STDC_VALUE_ON,       /**< chosen for the TP_ON token */
	STDC_VALUE_OFF,      /**< chosen for the TP_OFF token */
	STDC_VALUE_DEFAULT   /**< chosen for the TP_DEFAULT token */
} stdc_pragma_value_kind_t;
1632
1633 /**
1634  * Parse a pragma directive.
1635  */
1636 static void parse_pragma(void)
1637 {
1638         bool unknown_pragma = true;
1639
1640         next_pp_token();
1641         if (pp_token.v.symbol->pp_ID == TP_STDC) {
1642                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1643                 /* a STDC pragma */
1644                 if (c_mode & _C99) {
1645                         next_pp_token();
1646
1647                         switch (pp_token.v.symbol->pp_ID) {
1648                         case TP_FP_CONTRACT:
1649                                 kind = STDC_FP_CONTRACT;
1650                                 break;
1651                         case TP_FENV_ACCESS:
1652                                 kind = STDC_FENV_ACCESS;
1653                                 break;
1654                         case TP_CX_LIMITED_RANGE:
1655                                 kind = STDC_CX_LIMITED_RANGE;
1656                                 break;
1657                         default:
1658                                 break;
1659                         }
1660                         if (kind != STDC_UNKNOWN) {
1661                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1662                                 next_pp_token();
1663                                 switch (pp_token.v.symbol->pp_ID) {
1664                                 case TP_ON:
1665                                         value = STDC_VALUE_ON;
1666                                         break;
1667                                 case TP_OFF:
1668                                         value = STDC_VALUE_OFF;
1669                                         break;
1670                                 case TP_DEFAULT:
1671                                         value = STDC_VALUE_DEFAULT;
1672                                         break;
1673                                 default:
1674                                         break;
1675                                 }
1676                                 if (value != STDC_VALUE_UNKNOWN) {
1677                                         unknown_pragma = false;
1678                                 } else {
1679                                         errorf(&pp_token.source_position, "bad STDC pragma argument");
1680                                 }
1681                         }
1682                 }
1683         } else {
1684                 unknown_pragma = true;
1685         }
1686         eat_until_newline();
1687         if (unknown_pragma && warning.unknown_pragmas) {
1688                 warningf(&pp_token.source_position, "encountered unknown #pragma");
1689         }
1690 }
1691
1692 /**
1693  * Parse a preprocessor non-null directive.
1694  */
1695 static void parse_preprocessor_identifier(void)
1696 {
1697         assert(pp_token.type == T_IDENTIFIER);
1698         symbol_t *symbol = pp_token.v.symbol;
1699
1700         switch(symbol->pp_ID) {
1701         case TP_include:
1702                 printf("include - enable header name parsing!\n");
1703                 break;
1704         case TP_define:
1705                 define_directive();
1706                 break;
1707         case TP_ifdef:
1708                 ifdef_directive(0);
1709                 break;
1710         case TP_ifndef:
1711                 ifdef_directive(1);
1712                 break;
1713         case TP_endif:
1714                 endif_directive();
1715                 break;
1716         case TP_line:
1717                 next_pp_token();
1718                 parse_line_directive();
1719                 break;
1720         case TP_if:
1721         case TP_else:
1722         case TP_elif:
1723         case TP_undef:
1724         case TP_error:
1725                 /* TODO; output the rest of the line */
1726                 parse_error("#error directive: ");
1727                 break;
1728         case TP_pragma:
1729                 parse_pragma();
1730                 break;
1731         }
1732 }
1733
1734 /**
1735  * Parse a preprocessor directive.
1736  */
1737 static void parse_preprocessor_directive(void)
1738 {
1739         next_pp_token();
1740
1741         switch(pp_token.type) {
1742         case T_IDENTIFIER:
1743                 parse_preprocessor_identifier();
1744                 break;
1745         case T_INTEGER:
1746                 parse_line_directive();
1747                 break;
1748         case '\n':
1749                 /* NULL directive, see §6.10.7 */
1750                 break;
1751         default:
1752                 parse_error("invalid preprocessor directive");
1753                 eat_until_newline();
1754                 break;
1755         }
1756 }
1757
/*
 * Helper macros for lexing punctuators that may be longer than one
 * character. They are meant to be used together inside a case of a
 * switch over the current character c, within a function returning void:
 *
 *      case '&':
 *              MAYBE_PROLOG
 *              MAYBE('&', T_ANDAND)
 *              MAYBE('=', T_ANDEQUAL)
 *              ELSE('&')
 *
 * MAYBE_PROLOG consumes the current character and opens a nested loop and
 * switch over the following character. Additional hand-written case labels
 * may be placed between the MAYBE lines. ELSE / ELSE_CODE provide the
 * default label, close the nested switch and loop again and finish with a
 * break for the enclosing switch.
 */
#define MAYBE_PROLOG                                               \
        next_char();                                               \
        for (;;) {                                                 \
                switch (c) {

/* If the next character is ch: consume it and return token set_type. */
#define MAYBE(ch, set_type)                                        \
                case ch:                                           \
                        next_char();                               \
                        lexer_token.type = set_type;               \
                        return;

/* Run code for any other character, then close what MAYBE_PROLOG opened. */
#define ELSE_CODE(code)                                            \
                default:                                           \
                        code                                       \
                }                                                  \
        } /* closes the loop opened by MAYBE_PROLOG */             \
        break;

/* Any other character: return token set_type without consuming it. */
#define ELSE(set_type)                                             \
        ELSE_CODE(                                                 \
                lexer_token.type = set_type;                       \
                return;                                            \
        )
1781
1782 void lexer_next_preprocessing_token(void)
1783 {
1784         while(1) {
1785                 switch(c) {
1786                 case ' ':
1787                 case '\t':
1788                         next_char();
1789                         break;
1790
1791                 MATCH_NEWLINE(
1792                         lexer_token.type = '\n';
1793                         return;
1794                 )
1795
1796                 SYMBOL_CHARS
1797                         parse_symbol();
1798                         /* might be a wide string ( L"string" ) */
1799                         if(lexer_token.type == T_IDENTIFIER &&
1800                             lexer_token.v.symbol == symbol_L) {
1801                             if(c == '"') {
1802                                         parse_wide_string_literal();
1803                                 } else if(c == '\'') {
1804                                         parse_wide_character_constant();
1805                                 }
1806                         }
1807                         return;
1808
1809                 DIGITS
1810                         parse_number();
1811                         return;
1812
1813                 case '"':
1814                         parse_string_literal();
1815                         return;
1816
1817                 case '\'':
1818                         parse_character_constant();
1819                         return;
1820
1821                 case '.':
1822                         MAYBE_PROLOG
1823                                 DIGITS
1824                                         put_back(c);
1825                                         c = '.';
1826                                         parse_number_dec();
1827                                         return;
1828
1829                                 case '.':
1830                                         MAYBE_PROLOG
1831                                         MAYBE('.', T_DOTDOTDOT)
1832                                         ELSE_CODE(
1833                                                 put_back(c);
1834                                                 c = '.';
1835                                                 lexer_token.type = '.';
1836                                                 return;
1837                                         )
1838                         ELSE('.')
1839                 case '&':
1840                         MAYBE_PROLOG
1841                         MAYBE('&', T_ANDAND)
1842                         MAYBE('=', T_ANDEQUAL)
1843                         ELSE('&')
1844                 case '*':
1845                         MAYBE_PROLOG
1846                         MAYBE('=', T_ASTERISKEQUAL)
1847                         ELSE('*')
1848                 case '+':
1849                         MAYBE_PROLOG
1850                         MAYBE('+', T_PLUSPLUS)
1851                         MAYBE('=', T_PLUSEQUAL)
1852                         ELSE('+')
1853                 case '-':
1854                         MAYBE_PROLOG
1855                         MAYBE('>', T_MINUSGREATER)
1856                         MAYBE('-', T_MINUSMINUS)
1857                         MAYBE('=', T_MINUSEQUAL)
1858                         ELSE('-')
1859                 case '!':
1860                         MAYBE_PROLOG
1861                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1862                         ELSE('!')
1863                 case '/':
1864                         MAYBE_PROLOG
1865                         MAYBE('=', T_SLASHEQUAL)
1866                                 case '*':
1867                                         next_char();
1868                                         skip_multiline_comment();
1869                                         lexer_next_preprocessing_token();
1870                                         return;
1871                                 case '/':
1872                                         next_char();
1873                                         skip_line_comment();
1874                                         lexer_next_preprocessing_token();
1875                                         return;
1876                         ELSE('/')
1877                 case '%':
1878                         MAYBE_PROLOG
1879                         MAYBE('>', '}')
1880                         MAYBE('=', T_PERCENTEQUAL)
1881                                 case ':':
1882                                         MAYBE_PROLOG
1883                                                 case '%':
1884                                                         MAYBE_PROLOG
1885                                                         MAYBE(':', T_HASHHASH)
1886                                                         ELSE_CODE(
1887                                                                 put_back(c);
1888                                                                 c = '%';
1889                                                                 lexer_token.type = '#';
1890                                                                 return;
1891                                                         )
1892                                         ELSE('#')
1893                         ELSE('%')
1894                 case '<':
1895                         MAYBE_PROLOG
1896                         MAYBE(':', '[')
1897                         MAYBE('%', '{')
1898                         MAYBE('=', T_LESSEQUAL)
1899                                 case '<':
1900                                         MAYBE_PROLOG
1901                                         MAYBE('=', T_LESSLESSEQUAL)
1902                                         ELSE(T_LESSLESS)
1903                         ELSE('<')
1904                 case '>':
1905                         MAYBE_PROLOG
1906                         MAYBE('=', T_GREATEREQUAL)
1907                                 case '>':
1908                                         MAYBE_PROLOG
1909                                         MAYBE('=', T_GREATERGREATEREQUAL)
1910                                         ELSE(T_GREATERGREATER)
1911                         ELSE('>')
1912                 case '^':
1913                         MAYBE_PROLOG
1914                         MAYBE('=', T_CARETEQUAL)
1915                         ELSE('^')
1916                 case '|':
1917                         MAYBE_PROLOG
1918                         MAYBE('=', T_PIPEEQUAL)
1919                         MAYBE('|', T_PIPEPIPE)
1920                         ELSE('|')
1921                 case ':':
1922                         MAYBE_PROLOG
1923                         MAYBE('>', ']')
1924                         ELSE(':')
1925                 case '=':
1926                         MAYBE_PROLOG
1927                         MAYBE('=', T_EQUALEQUAL)
1928                         ELSE('=')
1929                 case '#':
1930                         MAYBE_PROLOG
1931                         MAYBE('#', T_HASHHASH)
1932                         ELSE('#')
1933
1934                 case '?':
1935                 case '[':
1936                 case ']':
1937                 case '(':
1938                 case ')':
1939                 case '{':
1940                 case '}':
1941                 case '~':
1942                 case ';':
1943                 case ',':
1944                 case '\\':
1945                         lexer_token.type = c;
1946                         next_char();
1947                         return;
1948
1949                 case EOF:
1950                         lexer_token.type = T_EOF;
1951                         return;
1952
1953                 default:
1954 dollar_sign:
1955                         errorf(&lexer_token.source_position, "unknown character '%c' found", c);
1956                         next_char();
1957                         lexer_token.type = T_ERROR;
1958                         return;
1959                 }
1960         }
1961 }
1962
1963 void lexer_next_token(void)
1964 {
1965         lexer_next_preprocessing_token();
1966
1967         while (lexer_token.type == '\n') {
1968 newline_found:
1969                 lexer_next_preprocessing_token();
1970         }
1971
1972         if (lexer_token.type == '#') {
1973                 parse_preprocessor_directive();
1974                 goto newline_found;
1975         }
1976 }
1977
1978 void init_lexer(void)
1979 {
1980         strset_init(&stringset);
1981         symbol_L = symbol_table_insert("L");
1982 }
1983
1984 void lexer_open_stream(FILE *stream, const char *input_name)
1985 {
1986         input                                  = stream;
1987         lexer_token.source_position.linenr     = 0;
1988         lexer_token.source_position.input_name = input_name;
1989
1990         bufpos = NULL;
1991         bufend = NULL;
1992
1993         /* place a virtual \n at the beginning so the lexer knows that we're
1994          * at the beginning of a line */
1995         c = '\n';
1996 }
1997
1998 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
1999 {
2000         input                                  = NULL;
2001         lexer_token.source_position.linenr     = 0;
2002         lexer_token.source_position.input_name = input_name;
2003
2004 #if 0 // TODO
2005         bufpos = buffer;
2006         bufend = buffer + len;
2007 #else
2008         (void)buffer;
2009         (void)len;
2010         panic("builtin lexing not done yet");
2011 #endif
2012
2013         /* place a virtual \n at the beginning so the lexer knows that we're
2014          * at the beginning of a line */
2015         c = '\n';
2016 }
2017
2018 void exit_lexer(void)
2019 {
2020         strset_destroy(&stringset);
2021 }
2022
2023 static __attribute__((unused))
2024 void dbg_pos(const source_position_t source_position)
2025 {
2026         fprintf(stdout, "%s:%u\n", source_position.input_name,
2027                 source_position.linenr);
2028         fflush(stdout);
2029 }