nsz Git - cparser/blob - lexer.c

   1 /*
   2  * This file is part of cparser.
   3  * Copyright (C) 2007-2008 Matthias Braun <matze@braunis.de>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18  * 02111-1307, USA.
  19  */
  20 #include <config.h>
  21
  22 #include "diagnostic.h"
  23 #include "lexer.h"
  24 #include "symbol_t.h"
  25 #include "token_t.h"
  26 #include "symbol_table_t.h"
  27 #include "adt/error.h"
  28 #include "adt/strset.h"
  29 #include "adt/util.h"
  30 #include "types.h"
  31 #include "type_t.h"
  32 #include "target_architecture.h"
  33 #include "parser.h"
  34 #include "warning.h"
  35 #include "lang_features.h"
  36
  37 #include <assert.h>
  38 #include <errno.h>
  39 #include <string.h>
  40 #include <stdbool.h>
  41 #include <ctype.h>
  42
  43 #ifndef _WIN32
  44 #include <strings.h>
  45 #endif
  46
  47 //#define DEBUG_CHARS
  48 #define MAX_PUTBACK 3
  49 #define BUF_SIZE    1024
  50
  51 #if defined(_WIN32) || defined(__CYGWIN__)
  52 /* No strtold on windows and no replacement yet */
  53 #define strtold(s, e)     strtod(s, e)
  54 #define strcasecmp(a, b)  stricmp(a, b)
  55 #endif
  56
  57 typedef unsigned int utf32;
  58
  59 static utf32        c;
  60 token_t             lexer_token;
  61 symbol_t           *symbol_L;
  62 static FILE        *input;
  63 static utf32        buf[BUF_SIZE + MAX_PUTBACK];
  64 static const utf32 *bufend;
  65 static const utf32 *bufpos;
  66 static strset_t     stringset;
  67 bool                allow_dollar_in_symbol = true;
  68
  69 /**
  70  * Prints a parse error message at the current token.
  71  *
  72  * @param msg   the error message
  73  */
  74 static void parse_error(const char *msg)
  75 {
  76         errorf(&lexer_token.source_position, "%s", msg);
  77 }
  78
  79 /**
  80  * Prints an internal error message at the current token.
  81  *
  82  * @param msg   the error message
  83  */
  84 static NORETURN internal_error(const char *msg)
  85 {
  86         internal_errorf(&lexer_token.source_position, "%s", msg);
  87 }
  88
  89 static size_t read_block(unsigned char *const read_buf, size_t const n)
  90 {
  91         size_t const s = fread(read_buf, 1, n, input);
  92         if (s == 0) {
  93                 if (ferror(input))
  94                         parse_error("read from input failed");
  95                 buf[MAX_PUTBACK] = EOF;
  96                 bufpos           = buf + MAX_PUTBACK;
  97                 bufend           = buf + MAX_PUTBACK + 1;
  98         }
  99         return s;
 100 }
 101
 102 static void decode_iso_8859_1(void)
 103 {
 104         unsigned char read_buf[BUF_SIZE];
 105         size_t const s = read_block(read_buf, sizeof(read_buf));
 106         if (s == 0)
 107                 return;
 108
 109         unsigned char const *src = read_buf;
 110         unsigned char const *end = read_buf + s;
 111         utf32               *dst = buf + MAX_PUTBACK;
 112         while (src != end)
 113                 *dst++ = *src++;
 114
 115         bufpos = buf + MAX_PUTBACK;
 116         bufend = dst;
 117 }
 118
 119 static void decode_iso_8859_15(void)
 120 {
 121         unsigned char read_buf[BUF_SIZE];
 122         size_t const s = read_block(read_buf, sizeof(read_buf));
 123         if (s == 0)
 124                 return;
 125
 126         unsigned char const *src = read_buf;
 127         unsigned char const *end = read_buf + s;
 128         utf32               *dst = buf + MAX_PUTBACK;
 129         while (src != end) {
 130                 utf32 tc = *src++;
 131                 switch (tc) {
 132                         case 0xA4: tc = 0x20AC; break; // €
 133                         case 0xA6: tc = 0x0160; break; // Š
 134                         case 0xA8: tc = 0x0161; break; // š
 135                         case 0xB4: tc = 0x017D; break; // Ž
 136                         case 0xB8: tc = 0x017E; break; // ž
 137                         case 0xBC: tc = 0x0152; break; // Œ
 138                         case 0xBD: tc = 0x0153; break; // œ
 139                         case 0xBE: tc = 0x0178; break; // Ÿ
 140                 }
 141                 *dst++ = tc;
 142         }
 143
 144         bufpos = buf + MAX_PUTBACK;
 145         bufend = dst;
 146 }
 147
 148 static void decode_utf8(void)
 149 {
 150         static utf32  part_decoded_min_code;
 151         static utf32  part_decoded_char;
 152         static size_t part_decoded_rest_len;
 153
 154         do {
 155                 unsigned char read_buf[BUF_SIZE];
 156                 size_t const s = read_block(read_buf, sizeof(read_buf));
 157                 if (s == 0) {
 158                         if (part_decoded_rest_len > 0)
 159                                 parse_error("incomplete input char at end of input");
 160                         return;
 161                 }
 162
 163                 unsigned char const *src = read_buf;
 164                 unsigned char const *end = read_buf + s;
 165                 utf32               *dst = buf + MAX_PUTBACK;
 166                 utf32                decoded;
 167                 utf32                min_code;
 168
 169                 if (part_decoded_rest_len != 0) {
 170                         min_code              = part_decoded_min_code;
 171                         decoded               = part_decoded_char;
 172                         size_t const rest_len = part_decoded_rest_len;
 173                         part_decoded_rest_len = 0;
 174                         switch (rest_len) {
 175                                 case 4:  goto realign;
 176                                 case 3:  goto three_more;
 177                                 case 2:  goto two_more;
 178                                 default: goto one_more;
 179                         }
 180                 }
 181
 182                 while (src != end) {
 183                         if ((*src & 0x80) == 0) {
 184                                 decoded = *src++;
 185                         } else if ((*src & 0xE0) == 0xC0) {
 186                                 min_code = 0x80;
 187                                 decoded  = *src++ & 0x1F;
 188 one_more:
 189                                 if (src == end) {
 190                                         part_decoded_min_code = min_code;
 191                                         part_decoded_char     = decoded;
 192                                         part_decoded_rest_len = 1;
 193                                         break;
 194                                 }
 195                                 if ((*src & 0xC0) == 0x80) {
 196                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 197                                 } else {
 198                                         goto invalid_char;
 199                                 }
 200                                 if (decoded < min_code                      ||
 201                                                 decoded > 0x10FFFF                      ||
 202                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
 203                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
 204                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
 205                                         parse_error("invalid byte sequence in input");
 206                                 }
 207                         } else if ((*src & 0xF0) == 0xE0) {
 208                                 min_code = 0x800;
 209                                 decoded  = *src++ & 0x0F;
 210 two_more:
 211                                 if (src == end) {
 212                                         part_decoded_min_code = min_code;
 213                                         part_decoded_char     = decoded;
 214                                         part_decoded_rest_len = 2;
 215                                         break;
 216                                 }
 217                                 if ((*src & 0xC0) == 0x80) {
 218                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 219                                 } else {
 220                                         goto invalid_char;
 221                                 }
 222                                 goto one_more;
 223                         } else if ((*src & 0xF8) == 0xF0) {
 224                                 min_code = 0x10000;
 225                                 decoded  = *src++ & 0x07;
 226 three_more:
 227                                 if (src == end) {
 228                                         part_decoded_min_code = min_code;
 229                                         part_decoded_char     = decoded;
 230                                         part_decoded_rest_len = 3;
 231                                         break;
 232                                 }
 233                                 if ((*src & 0xC0) == 0x80) {
 234                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 235                                 } else {
 236                                         goto invalid_char;
 237                                 }
 238                                 goto two_more;
 239                         } else {
 240 invalid_char:
 241                                 parse_error("invalid byte sequence in input");
 242 realign:
 243                                 do {
 244                                         ++src;
 245                                         if (src == end) {
 246                                                 part_decoded_rest_len = 4;
 247                                                 break;
 248                                         }
 249                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
 250                                 continue;
 251                         }
 252                         *dst++ = decoded;
 253                 }
 254
 255                 bufpos = buf + MAX_PUTBACK;
 256                 bufend = dst;
 257         } while (bufpos == bufend);
 258 }
 259
 260 static void decode_windows_1252(void)
 261 {
 262         unsigned char read_buf[BUF_SIZE];
 263         size_t const s = read_block(read_buf, sizeof(read_buf));
 264         if (s == 0)
 265                 return;
 266
 267         unsigned char const *src = read_buf;
 268         unsigned char const *end = read_buf + s;
 269         utf32               *dst = buf + MAX_PUTBACK;
 270         while (src != end) {
 271                 utf32 tc = *src++;
 272                 switch (tc) {
 273                         case 0x80: tc = 0x20AC; break; // €
 274                         case 0x82: tc = 0x201A; break; // ‚
 275                         case 0x83: tc = 0x0192; break; // ƒ
 276                         case 0x84: tc = 0x201E; break; // „
 277                         case 0x85: tc = 0x2026; break; // …
 278                         case 0x86: tc = 0x2020; break; // †
 279                         case 0x87: tc = 0x2021; break; // ‡
 280                         case 0x88: tc = 0x02C6; break; // ˆ
 281                         case 0x89: tc = 0x2030; break; // ‰
 282                         case 0x8A: tc = 0x0160; break; // Š
 283                         case 0x8B: tc = 0x2039; break; // ‹
 284                         case 0x8C: tc = 0x0152; break; // Œ
 285                         case 0x8E: tc = 0x017D; break; // Ž
 286                         case 0x91: tc = 0x2018; break; // ‘
 287                         case 0x92: tc = 0x2019; break; // ’
 288                         case 0x93: tc = 0x201C; break; // “
 289                         case 0x94: tc = 0x201D; break; // ”
 290                         case 0x95: tc = 0x2022; break; // •
 291                         case 0x96: tc = 0x2013; break; // –
 292                         case 0x97: tc = 0x2014; break; // —
 293                         case 0x98: tc = 0x02DC; break; // ˜
 294                         case 0x99: tc = 0x2122; break; // ™
 295                         case 0x9A: tc = 0x0161; break; // š
 296                         case 0x9B: tc = 0x203A; break; // ›
 297                         case 0x9C: tc = 0x0153; break; // œ
 298                         case 0x9E: tc = 0x017E; break; // ž
 299                         case 0x9F: tc = 0x0178; break; // Ÿ
 300                 }
 301                 *dst++ = tc;
 302         }
 303
 304         bufpos = buf + MAX_PUTBACK;
 305         bufend = dst;
 306 }
 307
 308 typedef void (*decoder_t)(void);
 309
 310 static decoder_t decoder = decode_utf8;
 311
 312 typedef struct named_decoder_t {
 313         char const *name;
 314         decoder_t   decoder;
 315 } named_decoder_t;
 316
 317 static named_decoder_t const decoders[] = {
 318         { "CP819",           decode_iso_8859_1   }, // offical alias
 319         { "IBM819",          decode_iso_8859_1   }, // offical alias
 320         { "ISO-8859-1",      decode_iso_8859_1   }, // offical alias
 321         { "ISO-8859-15",     decode_iso_8859_15  }, // offical name
 322         { "ISO8859-1",       decode_iso_8859_1   },
 323         { "ISO8859-15",      decode_iso_8859_15  },
 324         { "ISO_8859-1",      decode_iso_8859_1   }, // offical alias
 325         { "ISO_8859-15",     decode_iso_8859_15  }, // offical alias
 326         { "ISO_8859-1:1987", decode_iso_8859_1   }, // offical name
 327         { "Latin-9",         decode_iso_8859_15  }, // offical alias
 328         { "UTF-8",           decode_utf8         }, // offical name
 329         { "csISOLatin1",     decode_iso_8859_1   }, // offical alias
 330         { "cp1252",          decode_windows_1252 },
 331         { "iso-ir-100",      decode_iso_8859_1   }, // offical alias
 332         { "l1",              decode_iso_8859_1   }, // offical alias
 333         { "latin1",          decode_iso_8859_1   }, // offical alias
 334         { "windows-1252",    decode_windows_1252 }, // official name
 335
 336         { NULL,              NULL                }
 337 };
 338
 339 void select_input_encoding(char const* const encoding)
 340 {
 341         for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
 342                 if (strcasecmp(encoding, i->name) != 0)
 343                         continue;
 344                 decoder = i->decoder;
 345                 return;
 346         }
 347         fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
 348 }
 349
 350 static inline void next_real_char(void)
 351 {
 352         assert(bufpos <= bufend);
 353         if (bufpos >= bufend) {
 354                 if (input == NULL) {
 355                         c = EOF;
 356                         return;
 357                 }
 358                 decoder();
 359         }
 360         c = *bufpos++;
 361 }
 362
 363 /**
 364  * Put a character back into the buffer.
 365  *
 366  * @param pc  the character to put back
 367  */
 368 static inline void put_back(utf32 const pc)
 369 {
 370         assert(bufpos > buf);
 371         *(--bufpos - buf + buf) = pc;
 372
 373 #ifdef DEBUG_CHARS
 374         printf("putback '%lc'\n", pc);
 375 #endif
 376 }
 377
 378 static inline void next_char(void);
 379
 380 #define MATCH_NEWLINE(code)                   \
 381         case '\r':                                \
 382                 next_char();                          \
 383                 if(c == '\n') {                       \
 384                         next_char();                      \
 385                 }                                     \
 386                 lexer_token.source_position.linenr++; \
 387                 code                                  \
 388         case '\n':                                \
 389                 next_char();                          \
 390                 lexer_token.source_position.linenr++; \
 391                 code
 392
 393 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 394
 395 static void maybe_concat_lines(void)
 396 {
 397         eat('\\');
 398
 399         switch(c) {
 400         MATCH_NEWLINE(return;)
 401
 402         default:
 403                 break;
 404         }
 405
 406         put_back(c);
 407         c = '\\';
 408 }
 409
 410 /**
 411  * Set c to the next input character, ie.
 412  * after expanding trigraphs.
 413  */
 414 static inline void next_char(void)
 415 {
 416         next_real_char();
 417
 418         /* filter trigraphs */
 419         if(UNLIKELY(c == '\\')) {
 420                 maybe_concat_lines();
 421                 goto end_of_next_char;
 422         }
 423
 424         if(LIKELY(c != '?'))
 425                 goto end_of_next_char;
 426
 427         next_real_char();
 428         if(LIKELY(c != '?')) {
 429                 put_back(c);
 430                 c = '?';
 431                 goto end_of_next_char;
 432         }
 433
 434         next_real_char();
 435         switch(c) {
 436         case '=': c = '#'; break;
 437         case '(': c = '['; break;
 438         case '/': c = '\\'; maybe_concat_lines(); break;
 439         case ')': c = ']'; break;
 440         case '\'': c = '^'; break;
 441         case '<': c = '{'; break;
 442         case '!': c = '|'; break;
 443         case '>': c = '}'; break;
 444         case '-': c = '~'; break;
 445         default:
 446                 put_back(c);
 447                 put_back('?');
 448                 c = '?';
 449                 break;
 450         }
 451
 452 end_of_next_char:;
 453 #ifdef DEBUG_CHARS
 454         printf("nchar '%c'\n", c);
 455 #endif
 456 }
 457
 458 #define SYMBOL_CHARS  \
 459         case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
 460         case 'a':         \
 461         case 'b':         \
 462         case 'c':         \
 463         case 'd':         \
 464         case 'e':         \
 465         case 'f':         \
 466         case 'g':         \
 467         case 'h':         \
 468         case 'i':         \
 469         case 'j':         \
 470         case 'k':         \
 471         case 'l':         \
 472         case 'm':         \
 473         case 'n':         \
 474         case 'o':         \
 475         case 'p':         \
 476         case 'q':         \
 477         case 'r':         \
 478         case 's':         \
 479         case 't':         \
 480         case 'u':         \
 481         case 'v':         \
 482         case 'w':         \
 483         case 'x':         \
 484         case 'y':         \
 485         case 'z':         \
 486         case 'A':         \
 487         case 'B':         \
 488         case 'C':         \
 489         case 'D':         \
 490         case 'E':         \
 491         case 'F':         \
 492         case 'G':         \
 493         case 'H':         \
 494         case 'I':         \
 495         case 'J':         \
 496         case 'K':         \
 497         case 'L':         \
 498         case 'M':         \
 499         case 'N':         \
 500         case 'O':         \
 501         case 'P':         \
 502         case 'Q':         \
 503         case 'R':         \
 504         case 'S':         \
 505         case 'T':         \
 506         case 'U':         \
 507         case 'V':         \
 508         case 'W':         \
 509         case 'X':         \
 510         case 'Y':         \
 511         case 'Z':         \
 512         case '_':
 513
 514 #define DIGITS        \
 515         case '0':         \
 516         case '1':         \
 517         case '2':         \
 518         case '3':         \
 519         case '4':         \
 520         case '5':         \
 521         case '6':         \
 522         case '7':         \
 523         case '8':         \
 524         case '9':
 525
 526 /**
 527  * Read a symbol from the input and build
 528  * the lexer_token.
 529  */
 530 static void parse_symbol(void)
 531 {
 532         symbol_t *symbol;
 533         char     *string;
 534
 535         obstack_1grow(&symbol_obstack, (char) c);
 536         next_char();
 537
 538         while(1) {
 539                 switch(c) {
 540                 DIGITS
 541                 SYMBOL_CHARS
 542                         obstack_1grow(&symbol_obstack, (char) c);
 543                         next_char();
 544                         break;
 545
 546                 default:
 547 dollar_sign:
 548                         goto end_symbol;
 549                 }
 550         }
 551
 552 end_symbol:
 553         obstack_1grow(&symbol_obstack, '\0');
 554
 555         string = obstack_finish(&symbol_obstack);
 556         symbol = symbol_table_insert(string);
 557
 558         lexer_token.type     = symbol->ID;
 559         lexer_token.v.symbol = symbol;
 560
 561         if(symbol->string != string) {
 562                 obstack_free(&symbol_obstack, string);
 563         }
 564 }
 565
 566 static void parse_integer_suffix(bool is_oct_hex)
 567 {
 568         bool is_unsigned     = false;
 569         bool min_long        = false;
 570         bool min_longlong    = false;
 571         bool not_traditional = false;
 572         int  pos             = 0;
 573         char suffix[4];
 574
 575         if (c == 'U' || c == 'u') {
 576                 not_traditional = true;
 577                 suffix[pos++]   = toupper(c);
 578                 is_unsigned     = true;
 579                 next_char();
 580                 if (c == 'L' || c == 'l') {
 581                         suffix[pos++] = toupper(c);
 582                         min_long = true;
 583                         next_char();
 584                         if (c == 'L' || c == 'l') {
 585                                 suffix[pos++] = toupper(c);
 586                                 min_longlong = true;
 587                                 next_char();
 588                         }
 589                 }
 590         } else if (c == 'l' || c == 'L') {
 591                 suffix[pos++] = toupper(c);
 592                 min_long = true;
 593                 next_char();
 594                 if (c == 'l' || c == 'L') {
 595                         not_traditional = true;
 596                         suffix[pos++]   = toupper(c);
 597                         min_longlong    = true;
 598                         next_char();
 599                         if (c == 'u' || c == 'U') {
 600                                 suffix[pos++] = toupper(c);
 601                                 is_unsigned   = true;
 602                                 next_char();
 603                         }
 604                 } else if (c == 'u' || c == 'U') {
 605                         not_traditional = true;
 606                         suffix[pos++]   = toupper(c);
 607                         is_unsigned     = true;
 608                         next_char();
 609                         lexer_token.datatype = type_unsigned_long;
 610                 }
 611         }
 612
 613         if (warning.traditional && not_traditional) {
 614                 suffix[pos] = '\0';
 615                 warningf(&lexer_token.source_position,
 616                         "traditional C rejects the '%s' suffix", suffix);
 617         }
 618         if (!is_unsigned) {
 619                 long long v = lexer_token.v.intvalue;
 620                 if (!min_long) {
 621                         if (v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 622                                 lexer_token.datatype = type_int;
 623                                 return;
 624                         } else if (is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 625                                 lexer_token.datatype = type_unsigned_int;
 626                                 return;
 627                         }
 628                 }
 629                 if (!min_longlong) {
 630                         if (v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 631                                 lexer_token.datatype = type_long;
 632                                 return;
 633                         } else if (is_oct_hex && v >= 0 && (unsigned long long)v <= (unsigned long long)TARGET_ULONG_MAX) {
 634                                 lexer_token.datatype = type_unsigned_long;
 635                                 return;
 636                         }
 637                 }
 638                 unsigned long long uv = (unsigned long long) v;
 639                 if (is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 640                         lexer_token.datatype = type_unsigned_long_long;
 641                         return;
 642                 }
 643
 644                 lexer_token.datatype = type_long_long;
 645         } else {
 646                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 647                 if (!min_long && v <= TARGET_UINT_MAX) {
 648                         lexer_token.datatype = type_unsigned_int;
 649                         return;
 650                 }
 651                 if (!min_longlong && v <= TARGET_ULONG_MAX) {
 652                         lexer_token.datatype = type_unsigned_long;
 653                         return;
 654                 }
 655                 lexer_token.datatype = type_unsigned_long_long;
 656         }
 657 }
 658
 659 static void parse_floating_suffix(void)
 660 {
 661         switch(c) {
 662         /* TODO: do something useful with the suffixes... */
 663         case 'f':
 664         case 'F':
 665                 if (warning.traditional) {
 666                         warningf(&lexer_token.source_position,
 667                                 "traditional C rejects the 'F' suffix");
 668                 }
 669                 next_char();
 670                 lexer_token.datatype = type_float;
 671                 break;
 672         case 'l':
 673         case 'L':
 674                 if (warning.traditional) {
 675                         warningf(&lexer_token.source_position,
 676                                 "traditional C rejects the 'F' suffix");
 677                 }
 678                 next_char();
 679                 lexer_token.datatype = type_long_double;
 680                 break;
 681         default:
 682                 lexer_token.datatype = type_double;
 683                 break;
 684         }
 685 }
 686
 687 /**
 688  * A replacement for strtoull. Only those parts needed for
 689  * our parser are implemented.
 690  */
 691 static unsigned long long parse_int_string(const char *s, const char **endptr, int base)
 692 {
 693         unsigned long long v = 0;
 694
 695         switch (base) {
 696         case 16:
 697                 for (;; ++s) {
 698                         /* check for overrun */
 699                         if (v >= 0x1000000000000000ULL)
 700                                 break;
 701                         switch (tolower(*s)) {
 702                         case '0': v <<= 4; break;
 703                         case '1': v <<= 4; v |= 0x1; break;
 704                         case '2': v <<= 4; v |= 0x2; break;
 705                         case '3': v <<= 4; v |= 0x3; break;
 706                         case '4': v <<= 4; v |= 0x4; break;
 707                         case '5': v <<= 4; v |= 0x5; break;
 708                         case '6': v <<= 4; v |= 0x6; break;
 709                         case '7': v <<= 4; v |= 0x7; break;
 710                         case '8': v <<= 4; v |= 0x8; break;
 711                         case '9': v <<= 4; v |= 0x9; break;
 712                         case 'a': v <<= 4; v |= 0xa; break;
 713                         case 'b': v <<= 4; v |= 0xb; break;
 714                         case 'c': v <<= 4; v |= 0xc; break;
 715                         case 'd': v <<= 4; v |= 0xd; break;
 716                         case 'e': v <<= 4; v |= 0xe; break;
 717                         case 'f': v <<= 4; v |= 0xf; break;
 718                         default:
 719                                 goto end;
 720                         }
 721                 }
 722                 break;
 723         case 8:
 724                 for (;; ++s) {
 725                         /* check for overrun */
 726                         if (v >= 0x2000000000000000ULL)
 727                                 break;
 728                         switch (tolower(*s)) {
 729                         case '0': v <<= 3; break;
 730                         case '1': v <<= 3; v |= 1; break;
 731                         case '2': v <<= 3; v |= 2; break;
 732                         case '3': v <<= 3; v |= 3; break;
 733                         case '4': v <<= 3; v |= 4; break;
 734                         case '5': v <<= 3; v |= 5; break;
 735                         case '6': v <<= 3; v |= 6; break;
 736                         case '7': v <<= 3; v |= 7; break;
 737                         default:
 738                                 goto end;
 739                         }
 740                 }
 741                 break;
 742         case 10:
 743                 for (;; ++s) {
 744                         /* check for overrun */
 745                         if (v > 0x1999999999999999ULL)
 746                                 break;
 747                         switch (tolower(*s)) {
 748                         case '0': v *= 10; break;
 749                         case '1': v *= 10; v += 1; break;
 750                         case '2': v *= 10; v += 2; break;
 751                         case '3': v *= 10; v += 3; break;
 752                         case '4': v *= 10; v += 4; break;
 753                         case '5': v *= 10; v += 5; break;
 754                         case '6': v *= 10; v += 6; break;
 755                         case '7': v *= 10; v += 7; break;
 756                         case '8': v *= 10; v += 8; break;
 757                         case '9': v *= 10; v += 9; break;
 758                         default:
 759                                 goto end;
 760                         }
 761                 }
 762                 break;
 763         default:
 764                 assert(0);
 765                 break;
 766         }
 767 end:
 768         *endptr = s;
 769         return v;
 770 }
 771
 772 /**
 773  * Parses a hex number including hex floats and set the
 774  * lexer_token.
 775  */
 776 static void parse_number_hex(void)
 777 {
 778         bool is_float = false;
 779         assert(c == 'x' || c == 'X');
 780         next_char();
 781
 782         obstack_1grow(&symbol_obstack, '0');
 783         obstack_1grow(&symbol_obstack, 'x');
 784
 785         while(isxdigit(c)) {
 786                 obstack_1grow(&symbol_obstack, (char) c);
 787                 next_char();
 788         }
 789
 790         if (c == '.') {
 791                 obstack_1grow(&symbol_obstack, (char) c);
 792                 next_char();
 793
 794                 while (isxdigit(c)) {
 795                         obstack_1grow(&symbol_obstack, (char) c);
 796                         next_char();
 797                 }
 798                 is_float = true;
 799         }
 800         if (c == 'p' || c == 'P') {
 801                 obstack_1grow(&symbol_obstack, (char) c);
 802                 next_char();
 803
 804                 if (c == '-' || c == '+') {
 805                         obstack_1grow(&symbol_obstack, (char) c);
 806                         next_char();
 807                 }
 808
 809                 while (isxdigit(c)) {
 810                         obstack_1grow(&symbol_obstack, (char) c);
 811                         next_char();
 812                 }
 813                 is_float = true;
 814         }
 815
 816         obstack_1grow(&symbol_obstack, '\0');
 817         char *string = obstack_finish(&symbol_obstack);
 818         if(*string == '\0') {
 819                 parse_error("invalid hex number");
 820                 lexer_token.type = T_ERROR;
 821                 obstack_free(&symbol_obstack, string);
 822                 return;
 823         }
 824
 825         if (is_float) {
 826                 char *endptr;
 827                 lexer_token.type         = T_FLOATINGPOINT;
 828                 lexer_token.v.floatvalue = strtold(string, &endptr);
 829
 830                 if(*endptr != '\0') {
 831                         parse_error("invalid hex float literal");
 832                 }
 833
 834                 parse_floating_suffix();
 835         } else {
 836                 const char *endptr;
 837                 lexer_token.type       = T_INTEGER;
 838                 lexer_token.v.intvalue = parse_int_string(string + 2, &endptr, 16);
 839                 if(*endptr != '\0') {
 840                         parse_error("hex number literal too long");
 841                 }
 842                 parse_integer_suffix(true);
 843         }
 844
 845         obstack_free(&symbol_obstack, string);
 846 }
 847
 848 /**
 849  * Returns true if the given char is a octal digit.
 850  *
 851  * @param char  the character to check
 852  */
 853 static inline bool is_octal_digit(utf32 chr)
 854 {
 855         switch(chr) {
 856         case '0':
 857         case '1':
 858         case '2':
 859         case '3':
 860         case '4':
 861         case '5':
 862         case '6':
 863         case '7':
 864                 return true;
 865         default:
 866                 return false;
 867         }
 868 }
 869
 870 /**
 871  * Parses a octal number and set the lexer_token.
 872  */
 873 static void parse_number_oct(void)
 874 {
 875         while(is_octal_digit(c)) {
 876                 obstack_1grow(&symbol_obstack, (char) c);
 877                 next_char();
 878         }
 879         obstack_1grow(&symbol_obstack, '\0');
 880         char *string = obstack_finish(&symbol_obstack);
 881
 882         const char *endptr;
 883         lexer_token.type       = T_INTEGER;
 884         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 885         if(*endptr != '\0') {
 886                 parse_error("octal number literal too long");
 887         }
 888
 889         obstack_free(&symbol_obstack, string);
 890         parse_integer_suffix(true);
 891 }
 892
 893 /**
 894  * Parses a decimal including float number and set the
 895  * lexer_token.
 896  */
 897 static void parse_number_dec(void)
 898 {
 899         bool is_float = false;
 900         while (isdigit(c)) {
 901                 obstack_1grow(&symbol_obstack, (char) c);
 902                 next_char();
 903         }
 904
 905         if (c == '.') {
 906                 obstack_1grow(&symbol_obstack, '.');
 907                 next_char();
 908
 909                 while (isdigit(c)) {
 910                         obstack_1grow(&symbol_obstack, (char) c);
 911                         next_char();
 912                 }
 913                 is_float = true;
 914         }
 915         if(c == 'e' || c == 'E') {
 916                 obstack_1grow(&symbol_obstack, (char) c);
 917                 next_char();
 918
 919                 if(c == '-' || c == '+') {
 920                         obstack_1grow(&symbol_obstack, (char) c);
 921                         next_char();
 922                 }
 923
 924                 while(isdigit(c)) {
 925                         obstack_1grow(&symbol_obstack, (char) c);
 926                         next_char();
 927                 }
 928                 is_float = true;
 929         }
 930
 931         obstack_1grow(&symbol_obstack, '\0');
 932         char *string = obstack_finish(&symbol_obstack);
 933
 934         if(is_float) {
 935                 char *endptr;
 936                 lexer_token.type         = T_FLOATINGPOINT;
 937                 lexer_token.v.floatvalue = strtold(string, &endptr);
 938
 939                 if(*endptr != '\0') {
 940                         parse_error("invalid number literal");
 941                 }
 942
 943                 parse_floating_suffix();
 944         } else {
 945                 const char *endptr;
 946                 lexer_token.type       = T_INTEGER;
 947                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 948
 949                 if(*endptr != '\0') {
 950                         parse_error("invalid number literal");
 951                 }
 952
 953                 parse_integer_suffix(false);
 954         }
 955         obstack_free(&symbol_obstack, string);
 956 }
 957
 958 /**
 959  * Parses a number and sets the lexer_token.
 960  */
 961 static void parse_number(void)
 962 {
 963         if (c == '0') {
 964                 next_char();
 965                 switch (c) {
 966                         case 'X':
 967                         case 'x':
 968                                 parse_number_hex();
 969                                 break;
 970                         case '0':
 971                         case '1':
 972                         case '2':
 973                         case '3':
 974                         case '4':
 975                         case '5':
 976                         case '6':
 977                         case '7':
 978                                 parse_number_oct();
 979                                 break;
 980                         case '8':
 981                         case '9':
 982                                 next_char();
 983                                 parse_error("invalid octal number");
 984                                 lexer_token.type = T_ERROR;
 985                                 return;
 986                         case '.':
 987                         case 'e':
 988                         case 'E':
 989                         default:
 990                                 obstack_1grow(&symbol_obstack, '0');
 991                                 parse_number_dec();
 992                                 return;
 993                 }
 994         } else {
 995                 parse_number_dec();
 996         }
 997 }
 998
 999 /**
1000  * Returns the value of a digit.
1001  * The only portable way to do it ...
1002  */
1003 static int digit_value(utf32 const digit)
1004 {
1005         switch (digit) {
1006         case '0': return 0;
1007         case '1': return 1;
1008         case '2': return 2;
1009         case '3': return 3;
1010         case '4': return 4;
1011         case '5': return 5;
1012         case '6': return 6;
1013         case '7': return 7;
1014         case '8': return 8;
1015         case '9': return 9;
1016         case 'a':
1017         case 'A': return 10;
1018         case 'b':
1019         case 'B': return 11;
1020         case 'c':
1021         case 'C': return 12;
1022         case 'd':
1023         case 'D': return 13;
1024         case 'e':
1025         case 'E': return 14;
1026         case 'f':
1027         case 'F': return 15;
1028         default:
1029                 internal_error("wrong character given");
1030         }
1031 }
1032
1033 /**
1034  * Parses an octal character sequence.
1035  *
1036  * @param first_digit  the already read first digit
1037  */
1038 static utf32 parse_octal_sequence(utf32 const first_digit)
1039 {
1040         assert(is_octal_digit(first_digit));
1041         utf32 value = digit_value(first_digit);
1042         if (!is_octal_digit(c)) return value;
1043         value = 8 * value + digit_value(c);
1044         next_char();
1045         if (!is_octal_digit(c)) return value;
1046         value = 8 * value + digit_value(c);
1047         next_char();
1048         return value;
1049 }
1050
1051 /**
1052  * Parses a hex character sequence.
1053  */
1054 static utf32 parse_hex_sequence(void)
1055 {
1056         utf32 value = 0;
1057         while(isxdigit(c)) {
1058                 value = 16 * value + digit_value(c);
1059                 next_char();
1060         }
1061         return value;
1062 }
1063
1064 /**
1065  * Parse an escape sequence.
1066  */
1067 static utf32 parse_escape_sequence(void)
1068 {
1069         eat('\\');
1070
1071         utf32 const ec = c;
1072         next_char();
1073
1074         switch (ec) {
1075         case '"':  return '"';
1076         case '\'': return '\'';
1077         case '\\': return '\\';
1078         case '?': return '\?';
1079         case 'a': return '\a';
1080         case 'b': return '\b';
1081         case 'f': return '\f';
1082         case 'n': return '\n';
1083         case 'r': return '\r';
1084         case 't': return '\t';
1085         case 'v': return '\v';
1086         case 'x':
1087                 return parse_hex_sequence();
1088         case '0':
1089         case '1':
1090         case '2':
1091         case '3':
1092         case '4':
1093         case '5':
1094         case '6':
1095         case '7':
1096                 return parse_octal_sequence(ec);
1097         case EOF:
1098                 parse_error("reached end of file while parsing escape sequence");
1099                 return EOF;
1100         /* \E is not documented, but handled, by GCC.  It is acceptable according
1101          * to §6.11.4, whereas \e is not. */
1102         case 'E':
1103         case 'e':
1104                 if (c_mode & _GNUC)
1105                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
1106                 /* FALLTHROUGH */
1107         default:
1108                 /* §6.4.4.4:8 footnote 64 */
1109                 parse_error("unknown escape sequence");
1110                 return EOF;
1111         }
1112 }
1113
1114 /**
1115  * Concatenate two strings.
1116  */
1117 string_t concat_strings(const string_t *const s1, const string_t *const s2)
1118 {
1119         const size_t len1 = s1->size - 1;
1120         const size_t len2 = s2->size - 1;
1121
1122         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
1123         memcpy(concat, s1->begin, len1);
1124         memcpy(concat + len1, s2->begin, len2 + 1);
1125
1126         if (warning.traditional) {
1127                 warningf(&lexer_token.source_position,
1128                         "traditional C rejects string constant concatenation");
1129         }
1130 #if 0 /* TODO hash */
1131         const char *result = strset_insert(&stringset, concat);
1132         if(result != concat) {
1133                 obstack_free(&symbol_obstack, concat);
1134         }
1135
1136         return result;
1137 #else
1138         return (string_t){ concat, len1 + len2 + 1 };
1139 #endif
1140 }
1141
1142 /**
1143  * Concatenate a string and a wide string.
1144  */
1145 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
1146 {
1147         const size_t len1 = s1->size - 1;
1148         const size_t len2 = s2->size - 1;
1149
1150         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1151         const char *const src = s1->begin;
1152         for (size_t i = 0; i != len1; ++i) {
1153                 concat[i] = src[i];
1154         }
1155         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1156         if (warning.traditional) {
1157                 warningf(&lexer_token.source_position,
1158                         "traditional C rejects string constant concatenation");
1159         }
1160
1161         return (wide_string_t){ concat, len1 + len2 + 1 };
1162 }
1163
1164 /**
1165  * Concatenate two wide strings.
1166  */
1167 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
1168 {
1169         const size_t len1 = s1->size - 1;
1170         const size_t len2 = s2->size - 1;
1171
1172         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1173         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
1174         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1175         if (warning.traditional) {
1176                 warningf(&lexer_token.source_position,
1177                         "traditional C rejects string constant concatenation");
1178         }
1179
1180         return (wide_string_t){ concat, len1 + len2 + 1 };
1181 }
1182
1183 /**
1184  * Concatenate a wide string and a string.
1185  */
1186 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
1187 {
1188         const size_t len1 = s1->size - 1;
1189         const size_t len2 = s2->size - 1;
1190
1191         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1192         memcpy(concat, s1->begin, len1 * sizeof(*concat));
1193         const char  *const src = s2->begin;
1194         wchar_rep_t *const dst = concat + len1;
1195         for (size_t i = 0; i != len2 + 1; ++i) {
1196                 dst[i] = src[i];
1197         }
1198         if (warning.traditional) {
1199                 warningf(&lexer_token.source_position,
1200                         "traditional C rejects string constant concatenation");
1201         }
1202
1203         return (wide_string_t){ concat, len1 + len2 + 1 };
1204 }
1205
1206 static void grow_symbol(utf32 const tc)
1207 {
1208         struct obstack *const o  = &symbol_obstack;
1209         if (tc < 0x80U) {
1210                 obstack_1grow(o, tc);
1211         } else if (tc < 0x800) {
1212                 obstack_1grow(o, 0xC0 | (tc >> 6));
1213                 obstack_1grow(o, 0x80 | (tc & 0x3F));
1214         } else if (tc < 0x10000) {
1215                 obstack_1grow(o, 0xE0 | ( tc >> 12));
1216                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1217                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1218         } else {
1219                 obstack_1grow(o, 0xF0 | ( tc >> 18));
1220                 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
1221                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1222                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1223         }
1224 }
1225
1226 /**
1227  * Parse a string literal and set lexer_token.
1228  */
1229 static void parse_string_literal(void)
1230 {
1231         const unsigned start_linenr = lexer_token.source_position.linenr;
1232
1233         eat('"');
1234
1235         while(1) {
1236                 switch(c) {
1237                 case '\\': {
1238                         utf32 const tc = parse_escape_sequence();
1239                         if (tc >= 0x100) {
1240                                 warningf(&lexer_token.source_position,
1241                                                 "escape sequence out of range");
1242                         }
1243                         obstack_1grow(&symbol_obstack, tc);
1244                         break;
1245                 }
1246
1247                 case EOF: {
1248                         source_position_t source_position;
1249                         source_position.input_name = lexer_token.source_position.input_name;
1250                         source_position.linenr     = start_linenr;
1251                         errorf(&source_position, "string has no end");
1252                         lexer_token.type = T_ERROR;
1253                         return;
1254                 }
1255
1256                 case '"':
1257                         next_char();
1258                         goto end_of_string;
1259
1260                 default:
1261                         grow_symbol(c);
1262                         next_char();
1263                         break;
1264                 }
1265         }
1266
1267 end_of_string:
1268
1269         /* TODO: concatenate multiple strings separated by whitespace... */
1270
1271         /* add finishing 0 to the string */
1272         obstack_1grow(&symbol_obstack, '\0');
1273         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1274         const char *const string = obstack_finish(&symbol_obstack);
1275
1276 #if 0 /* TODO hash */
1277         /* check if there is already a copy of the string */
1278         result = strset_insert(&stringset, string);
1279         if(result != string) {
1280                 obstack_free(&symbol_obstack, string);
1281         }
1282 #else
1283         const char *const result = string;
1284 #endif
1285
1286         lexer_token.type           = T_STRING_LITERAL;
1287         lexer_token.v.string.begin = result;
1288         lexer_token.v.string.size  = size;
1289 }
1290
1291 /**
1292  * Parse a wide character constant and set lexer_token.
1293  */
1294 static void parse_wide_character_constant(void)
1295 {
1296         const unsigned start_linenr = lexer_token.source_position.linenr;
1297
1298         eat('\'');
1299
1300         while(1) {
1301                 switch(c) {
1302                 case '\\': {
1303                         wchar_rep_t tc = parse_escape_sequence();
1304                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1305                         break;
1306                 }
1307
1308                 MATCH_NEWLINE(
1309                         parse_error("newline while parsing character constant");
1310                         break;
1311                 )
1312
1313                 case '\'':
1314                         next_char();
1315                         goto end_of_wide_char_constant;
1316
1317                 case EOF: {
1318                         source_position_t source_position = lexer_token.source_position;
1319                         source_position.linenr = start_linenr;
1320                         errorf(&source_position, "EOF while parsing character constant");
1321                         lexer_token.type = T_ERROR;
1322                         return;
1323                 }
1324
1325                 default: {
1326                         wchar_rep_t tc = (wchar_rep_t) c;
1327                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1328                         next_char();
1329                         break;
1330                 }
1331                 }
1332         }
1333
1334 end_of_wide_char_constant:;
1335         size_t             size   = (size_t) obstack_object_size(&symbol_obstack);
1336         assert(size % sizeof(wchar_rep_t) == 0);
1337         size /= sizeof(wchar_rep_t);
1338
1339         const wchar_rep_t *string = obstack_finish(&symbol_obstack);
1340
1341         lexer_token.type                = T_WIDE_CHARACTER_CONSTANT;
1342         lexer_token.v.wide_string.begin = string;
1343         lexer_token.v.wide_string.size  = size;
1344         lexer_token.datatype            = type_wchar_t;
1345 }
1346
1347 /**
1348  * Parse a wide string literal and set lexer_token.
1349  */
1350 static void parse_wide_string_literal(void)
1351 {
1352         const unsigned start_linenr = lexer_token.source_position.linenr;
1353
1354         assert(c == '"');
1355         next_char();
1356
1357         while(1) {
1358                 switch(c) {
1359                 case '\\': {
1360                         wchar_rep_t tc = parse_escape_sequence();
1361                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1362                         break;
1363                 }
1364
1365                 case EOF: {
1366                         source_position_t source_position;
1367                         source_position.input_name = lexer_token.source_position.input_name;
1368                         source_position.linenr     = start_linenr;
1369                         errorf(&source_position, "string has no end");
1370                         lexer_token.type = T_ERROR;
1371                         return;
1372                 }
1373
1374                 case '"':
1375                         next_char();
1376                         goto end_of_string;
1377
1378                 default: {
1379                         wchar_rep_t tc = c;
1380                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1381                         next_char();
1382                         break;
1383                 }
1384                 }
1385         }
1386
1387 end_of_string:;
1388
1389         /* TODO: concatenate multiple strings separated by whitespace... */
1390
1391         /* add finishing 0 to the string */
1392         wchar_rep_t nul = L'\0';
1393         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
1394         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
1395         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1396
1397 #if 0 /* TODO hash */
1398         /* check if there is already a copy of the string */
1399         const wchar_rep_t *const result = strset_insert(&stringset, string);
1400         if(result != string) {
1401                 obstack_free(&symbol_obstack, string);
1402         }
1403 #else
1404         const wchar_rep_t *const result = string;
1405 #endif
1406
1407         lexer_token.type                = T_WIDE_STRING_LITERAL;
1408         lexer_token.v.wide_string.begin = result;
1409         lexer_token.v.wide_string.size  = size;
1410 }
1411
1412 /**
1413  * Parse a character constant and set lexer_token.
1414  */
1415 static void parse_character_constant(void)
1416 {
1417         const unsigned start_linenr = lexer_token.source_position.linenr;
1418
1419         eat('\'');
1420
1421         while(1) {
1422                 switch(c) {
1423                 case '\\': {
1424                         utf32 const tc = parse_escape_sequence();
1425                         if (tc >= 0x100) {
1426                                 warningf(&lexer_token.source_position,
1427                                                 "escape sequence out of range");
1428                         }
1429                         obstack_1grow(&symbol_obstack, tc);
1430                         break;
1431                 }
1432
1433                 MATCH_NEWLINE(
1434                         parse_error("newline while parsing character constant");
1435                         break;
1436                 )
1437
1438                 case '\'':
1439                         next_char();
1440                         goto end_of_char_constant;
1441
1442                 case EOF: {
1443                         source_position_t source_position;
1444                         source_position.input_name = lexer_token.source_position.input_name;
1445                         source_position.linenr     = start_linenr;
1446                         errorf(&source_position, "EOF while parsing character constant");
1447                         lexer_token.type = T_ERROR;
1448                         return;
1449                 }
1450
1451                 default:
1452                         grow_symbol(c);
1453                         next_char();
1454                         break;
1455
1456                 }
1457         }
1458
1459 end_of_char_constant:;
1460         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1461         const char *const string = obstack_finish(&symbol_obstack);
1462
1463         lexer_token.type           = T_CHARACTER_CONSTANT;
1464         lexer_token.v.string.begin = string;
1465         lexer_token.v.string.size  = size;
1466         lexer_token.datatype       = c_mode & _CXX && size == 1 ? type_char : type_int;
1467 }
1468
1469 /**
1470  * Skip a multiline comment.
1471  */
1472 static void skip_multiline_comment(void)
1473 {
1474         unsigned start_linenr = lexer_token.source_position.linenr;
1475
1476         while(1) {
1477                 switch(c) {
1478                 case '/':
1479                         next_char();
1480                         if (c == '*') {
1481                                 /* nested comment, warn here */
1482                                 if (warning.comment) {
1483                                         warningf(&lexer_token.source_position, "'/*' within comment");
1484                                 }
1485                         }
1486                         break;
1487                 case '*':
1488                         next_char();
1489                         if(c == '/') {
1490                                 next_char();
1491                                 return;
1492                         }
1493                         break;
1494
1495                 MATCH_NEWLINE(break;)
1496
1497                 case EOF: {
1498                         source_position_t source_position;
1499                         source_position.input_name = lexer_token.source_position.input_name;
1500                         source_position.linenr     = start_linenr;
1501                         errorf(&source_position, "at end of file while looking for comment end");
1502                         return;
1503                 }
1504
1505                 default:
1506                         next_char();
1507                         break;
1508                 }
1509         }
1510 }
1511
1512 /**
1513  * Skip a single line comment.
1514  */
1515 static void skip_line_comment(void)
1516 {
1517         while(1) {
1518                 switch(c) {
1519                 case EOF:
1520                         return;
1521
1522                 case '\n':
1523                 case '\r':
1524                         return;
1525
1526                 case '\\':
1527                         next_char();
1528                         if (c == '\n' || c == '\r') {
1529                                 if (warning.comment)
1530                                         warningf(&lexer_token.source_position, "multi-line comment");
1531                                 return;
1532                         }
1533                         break;
1534
1535                 default:
1536                         next_char();
1537                         break;
1538                 }
1539         }
1540 }
1541
1542 /** The current preprocessor token. */
1543 static token_t pp_token;
1544
1545 /**
1546  * Read the next preprocessor token.
1547  */
1548 static inline void next_pp_token(void)
1549 {
1550         lexer_next_preprocessing_token();
1551         pp_token = lexer_token;
1552 }
1553
1554 /**
1555  * Eat all preprocessor tokens until newline.
1556  */
1557 static void eat_until_newline(void)
1558 {
1559         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1560                 next_pp_token();
1561         }
1562 }
1563
1564 /**
1565  * Handle the define directive.
1566  */
1567 static void define_directive(void)
1568 {
1569         lexer_next_preprocessing_token();
1570         if(lexer_token.type != T_IDENTIFIER) {
1571                 parse_error("expected identifier after #define\n");
1572                 eat_until_newline();
1573         }
1574 }
1575
/**
 * Handles the ifdef and ifndef directives.
 *
 * @param is_ifndef  non-zero for ifndef, zero for ifdef (currently unused)
 */
static void ifdef_directive(int is_ifndef)
{
        /* TODO: expect an identifier followed by a newline; for now only
         * the next token is read. */
        lexer_next_preprocessing_token();
        (void) is_ifndef;
}
1586
/**
 * Handles the endif directive.  Currently a no-op.
 */
static void endif_directive(void)
{
        /* TODO: expect a newline */
}
1594
1595 /**
1596  * Parse the line directive.
1597  */
1598 static void parse_line_directive(void)
1599 {
1600         if(pp_token.type != T_INTEGER) {
1601                 parse_error("expected integer");
1602         } else {
1603                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1604                 next_pp_token();
1605         }
1606         if(pp_token.type == T_STRING_LITERAL) {
1607                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1608                 next_pp_token();
1609         }
1610
1611         eat_until_newline();
1612 }
1613
/**
 * The kinds of STDC pragmas distinguished by parse_pragma().
 */
typedef enum stdc_pragma_kind_t {
        STDC_UNKNOWN = 0,       /**< not one of the pragmas below */
        STDC_FP_CONTRACT,       /**< FP_CONTRACT */
        STDC_FENV_ACCESS,       /**< FENV_ACCESS */
        STDC_CX_LIMITED_RANGE   /**< CX_LIMITED_RANGE */
} stdc_pragma_kind_t;
1623
/**
 * The argument values of a STDC pragma distinguished by parse_pragma().
 */
typedef enum stdc_pragma_value_kind_t {
        STDC_VALUE_UNKNOWN = 0, /**< none of the values below */
        STDC_VALUE_ON,          /**< ON */
        STDC_VALUE_OFF,         /**< OFF */
        STDC_VALUE_DEFAULT      /**< DEFAULT */
} stdc_pragma_value_kind_t;
1633
1634 /**
1635  * Parse a pragma directive.
1636  */
1637 static void parse_pragma(void)
1638 {
1639         bool unknown_pragma = true;
1640
1641         next_pp_token();
1642         if (pp_token.v.symbol->pp_ID == TP_STDC) {
1643                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1644                 /* a STDC pragma */
1645                 if (c_mode & _C99) {
1646                         next_pp_token();
1647
1648                         switch (pp_token.v.symbol->pp_ID) {
1649                         case TP_FP_CONTRACT:
1650                                 kind = STDC_FP_CONTRACT;
1651                                 break;
1652                         case TP_FENV_ACCESS:
1653                                 kind = STDC_FENV_ACCESS;
1654                                 break;
1655                         case TP_CX_LIMITED_RANGE:
1656                                 kind = STDC_CX_LIMITED_RANGE;
1657                                 break;
1658                         default:
1659                                 break;
1660                         }
1661                         if (kind != STDC_UNKNOWN) {
1662                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1663                                 next_pp_token();
1664                                 switch (pp_token.v.symbol->pp_ID) {
1665                                 case TP_ON:
1666                                         value = STDC_VALUE_ON;
1667                                         break;
1668                                 case TP_OFF:
1669                                         value = STDC_VALUE_OFF;
1670                                         break;
1671                                 case TP_DEFAULT:
1672                                         value = STDC_VALUE_DEFAULT;
1673                                         break;
1674                                 default:
1675                                         break;
1676                                 }
1677                                 if (value != STDC_VALUE_UNKNOWN) {
1678                                         unknown_pragma = false;
1679                                 } else {
1680                                         errorf(&pp_token.source_position, "bad STDC pragma argument");
1681                                 }
1682                         }
1683                 }
1684         } else {
1685                 unknown_pragma = true;
1686         }
1687         eat_until_newline();
1688         if (unknown_pragma && warning.unknown_pragmas) {
1689                 warningf(&pp_token.source_position, "encountered unknown #pragma");
1690         }
1691 }
1692
1693 /**
1694  * Parse a preprocessor non-null directive.
1695  */
1696 static void parse_preprocessor_identifier(void)
1697 {
1698         assert(pp_token.type == T_IDENTIFIER);
1699         symbol_t *symbol = pp_token.v.symbol;
1700
1701         switch(symbol->pp_ID) {
1702         case TP_include:
1703                 printf("include - enable header name parsing!\n");
1704                 break;
1705         case TP_define:
1706                 define_directive();
1707                 break;
1708         case TP_ifdef:
1709                 ifdef_directive(0);
1710                 break;
1711         case TP_ifndef:
1712                 ifdef_directive(1);
1713                 break;
1714         case TP_endif:
1715                 endif_directive();
1716                 break;
1717         case TP_line:
1718                 next_pp_token();
1719                 parse_line_directive();
1720                 break;
1721         case TP_if:
1722         case TP_else:
1723         case TP_elif:
1724         case TP_undef:
1725         case TP_error:
1726                 /* TODO; output the rest of the line */
1727                 parse_error("#error directive: ");
1728                 break;
1729         case TP_pragma:
1730                 parse_pragma();
1731                 break;
1732         }
1733 }
1734
1735 /**
1736  * Parse a preprocessor directive.
1737  */
1738 static void parse_preprocessor_directive(void)
1739 {
1740         next_pp_token();
1741
1742         switch(pp_token.type) {
1743         case T_IDENTIFIER:
1744                 parse_preprocessor_identifier();
1745                 break;
1746         case T_INTEGER:
1747                 parse_line_directive();
1748                 break;
1749         case '\n':
1750                 /* NULL directive, see §6.10.7 */
1751                 break;
1752         default:
1753                 parse_error("invalid preprocessor directive");
1754                 eat_until_newline();
1755                 break;
1756         }
1757 }
1758
/*
 * Building blocks for lexing punctuators that may continue with further
 * characters. They are meant to be used together, directly after a case
 * label inside a switch over the current character c:
 *
 *   MAYBE_PROLOG        advances to the next character and opens an endless
 *                       loop around a nested switch over c
 *   MAYBE(ch, tok)      if c is ch: consume it, set the token type to tok
 *                       and return from the enclosing function
 *   ELSE_CODE(stmts)    default branch running stmts; also closes the nested
 *                       switch and loop and ends with a break for the
 *                       enclosing switch
 *   ELSE(tok)           ELSE_CODE variant that sets the token type to tok
 *                       and returns without consuming c
 *
 * The parameter names are deliberately not "type", since that would also
 * replace the member name in lexer_token.type.
 */
#define MAYBE_PROLOG                                          \
			next_char();                                      \
			for (;;) {                                        \
				switch (c) {

#define MAYBE(expected, tok)                                  \
				case expected:                                \
					next_char();                              \
					lexer_token.type = tok;                   \
					return;

#define ELSE_CODE(stmts)                                      \
				default:                                      \
					stmts                                     \
				}                                             \
			} /* end of for(;;) */                            \
			break;

#define ELSE(tok)                                             \
		ELSE_CODE(                                            \
			lexer_token.type = tok;                           \
			return;                                           \
		)
1782
/**
 * Read the next preprocessing token into lexer_token.
 *
 * Dispatches on the current character c. Spaces and tabs are skipped, a
 * newline is reported as a token of type '\n', and comments are skipped
 * before lexing restarts. Identifiers, numbers, string literals and character
 * constants are delegated to the parse_* helpers; punctuators (including the
 * digraphs <: :> <% %> %: %:%:) set lexer_token.type directly. On EOF the
 * type is T_EOF; an unrecognised character is reported and yields T_ERROR.
 *
 * NOTE(review): SYMBOL_CHARS, DIGITS and MATCH_NEWLINE are macros defined
 * outside this function; from their use here they presumably expand to case
 * labels (MATCH_NEWLINE additionally taking the code to run) - confirm
 * against their definitions.
 */
1783 void lexer_next_preprocessing_token(void)
1784 {
1785         while(1) {
1786                 switch(c) {
                 /* horizontal whitespace: skip it and look at the next character */
1787                 case ' ':
1788                 case '\t':
1789                         next_char();
1790                         break;
1791
1792                 MATCH_NEWLINE(
1793                         lexer_token.type = '\n';
1794                         return;
1795                 )
1796
1797                 SYMBOL_CHARS
1798                         parse_symbol();
1799                         /* might be a wide string ( L"string" ) */
1800                         if(lexer_token.type == T_IDENTIFIER &&
1801                             lexer_token.v.symbol == symbol_L) {
1802                             if(c == '"') {
1803                                         parse_wide_string_literal();
1804                                 } else if(c == '\'') {
1805                                         parse_wide_character_constant();
1806                                 }
1807                         }
1808                         return;
1809
1810                 DIGITS
1811                         parse_number();
1812                         return;
1813
1814                 case '"':
1815                         parse_string_literal();
1816                         return;
1817
1818                 case '\'':
1819                         parse_character_constant();
1820                         return;
1821
                 /* '.', '...' or the start of a number such as ".5" */
1822                 case '.':
1823                         MAYBE_PROLOG
                                 /* '.' followed by a digit: push the digit back, make
                                  * '.' the current character again and lex a number */
1824                                 DIGITS
1825                                         put_back(c);
1826                                         c = '.';
1827                                         parse_number_dec();
1828                                         return;
1829
                                 /* '..': only a third '.' forms T_DOTDOTDOT; otherwise
                                  * push the lookahead back, make the second '.' current
                                  * again and emit a single '.' */
1830                                 case '.':
1831                                         MAYBE_PROLOG
1832                                         MAYBE('.', T_DOTDOTDOT)
1833                                         ELSE_CODE(
1834                                                 put_back(c);
1835                                                 c = '.';
1836                                                 lexer_token.type = '.';
1837                                                 return;
1838                                         )
1839                         ELSE('.')
1840                 case '&':
1841                         MAYBE_PROLOG
1842                         MAYBE('&', T_ANDAND)
1843                         MAYBE('=', T_ANDEQUAL)
1844                         ELSE('&')
1845                 case '*':
1846                         MAYBE_PROLOG
1847                         MAYBE('=', T_ASTERISKEQUAL)
1848                         ELSE('*')
1849                 case '+':
1850                         MAYBE_PROLOG
1851                         MAYBE('+', T_PLUSPLUS)
1852                         MAYBE('=', T_PLUSEQUAL)
1853                         ELSE('+')
1854                 case '-':
1855                         MAYBE_PROLOG
1856                         MAYBE('>', T_MINUSGREATER)
1857                         MAYBE('-', T_MINUSMINUS)
1858                         MAYBE('=', T_MINUSEQUAL)
1859                         ELSE('-')
1860                 case '!':
1861                         MAYBE_PROLOG
1862                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1863                         ELSE('!')
                 /* '/', '/=' or the start of a comment; after a comment has been
                  * skipped, lexing restarts through a recursive call */
1864                 case '/':
1865                         MAYBE_PROLOG
1866                         MAYBE('=', T_SLASHEQUAL)
1867                                 case '*':
1868                                         next_char();
1869                                         skip_multiline_comment();
1870                                         lexer_next_preprocessing_token();
1871                                         return;
1872                                 case '/':
1873                                         next_char();
1874                                         skip_line_comment();
1875                                         lexer_next_preprocessing_token();
1876                                         return;
1877                         ELSE('/')
                 /* '%', '%=' and the digraphs '%>' (for '}'), '%:' (for '#') and
                  * '%:%:' (for '##') */
1878                 case '%':
1879                         MAYBE_PROLOG
1880                         MAYBE('>', '}')
1881                         MAYBE('=', T_PERCENTEQUAL)
1882                                 case ':':
1883                                         MAYBE_PROLOG
                                                 /* '%:%' not followed by ':': push the
                                                  * lookahead back, make '%' current again
                                                  * and emit '#' for the leading '%:' */
1884                                                 case '%':
1885                                                         MAYBE_PROLOG
1886                                                         MAYBE(':', T_HASHHASH)
1887                                                         ELSE_CODE(
1888                                                                 put_back(c);
1889                                                                 c = '%';
1890                                                                 lexer_token.type = '#';
1891                                                                 return;
1892                                                         )
1893                                         ELSE('#')
1894                         ELSE('%')
                 /* '<', '<=', '<<', '<<=' and the digraphs '<:' (for '[') and
                  * '<%' (for '{') */
1895                 case '<':
1896                         MAYBE_PROLOG
1897                         MAYBE(':', '[')
1898                         MAYBE('%', '{')
1899                         MAYBE('=', T_LESSEQUAL)
1900                                 case '<':
1901                                         MAYBE_PROLOG
1902                                         MAYBE('=', T_LESSLESSEQUAL)
1903                                         ELSE(T_LESSLESS)
1904                         ELSE('<')
1905                 case '>':
1906                         MAYBE_PROLOG
1907                         MAYBE('=', T_GREATEREQUAL)
1908                                 case '>':
1909                                         MAYBE_PROLOG
1910                                         MAYBE('=', T_GREATERGREATEREQUAL)
1911                                         ELSE(T_GREATERGREATER)
1912                         ELSE('>')
1913                 case '^':
1914                         MAYBE_PROLOG
1915                         MAYBE('=', T_CARETEQUAL)
1916                         ELSE('^')
1917                 case '|':
1918                         MAYBE_PROLOG
1919                         MAYBE('=', T_PIPEEQUAL)
1920                         MAYBE('|', T_PIPEPIPE)
1921                         ELSE('|')
                 /* ':' or the digraph ':>' (for ']') */
1922                 case ':':
1923                         MAYBE_PROLOG
1924                         MAYBE('>', ']')
1925                         ELSE(':')
1926                 case '=':
1927                         MAYBE_PROLOG
1928                         MAYBE('=', T_EQUALEQUAL)
1929                         ELSE('=')
1930                 case '#':
1931                         MAYBE_PROLOG
1932                         MAYBE('#', T_HASHHASH)
1933                         ELSE('#')
1934
                 /* single-character tokens: the character itself is the token type */
1935                 case '?':
1936                 case '[':
1937                 case ']':
1938                 case '(':
1939                 case ')':
1940                 case '{':
1941                 case '}':
1942                 case '~':
1943                 case ';':
1944                 case ',':
1945                 case '\\':
1946                         lexer_token.type = c;
1947                         next_char();
1948                         return;
1949
                 /* end of input: c is left untouched, no further character is read */
1950                 case EOF:
1951                         lexer_token.type = T_EOF;
1952                         return;
1953
                 /* NOTE(review): no goto to dollar_sign is visible in this function;
                  * presumably it is targeted from within the SYMBOL_CHARS macro -
                  * verify before removing the label */
1954                 default:
1955 dollar_sign:
1956                         errorf(&lexer_token.source_position, "unknown character '%c' found", c);
1957                         next_char();
1958                         lexer_token.type = T_ERROR;
1959                         return;
1960                 }
1961         }
1962 }
1963
1964 void lexer_next_token(void)
1965 {
1966         lexer_next_preprocessing_token();
1967
1968         while (lexer_token.type == '\n') {
1969 newline_found:
1970                 lexer_next_preprocessing_token();
1971         }
1972
1973         if (lexer_token.type == '#') {
1974                 parse_preprocessor_directive();
1975                 goto newline_found;
1976         }
1977 }
1978
1979 void init_lexer(void)
1980 {
1981         strset_init(&stringset);
1982         symbol_L = symbol_table_insert("L");
1983 }
1984
1985 void lexer_open_stream(FILE *stream, const char *input_name)
1986 {
1987         input                                  = stream;
1988         lexer_token.source_position.linenr     = 0;
1989         lexer_token.source_position.input_name = input_name;
1990
1991         bufpos = NULL;
1992         bufend = NULL;
1993
1994         /* place a virtual \n at the beginning so the lexer knows that we're
1995          * at the beginning of a line */
1996         c = '\n';
1997 }
1998
1999 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
2000 {
2001         input                                  = NULL;
2002         lexer_token.source_position.linenr     = 0;
2003         lexer_token.source_position.input_name = input_name;
2004
2005 #if 0 // TODO
2006         bufpos = buffer;
2007         bufend = buffer + len;
2008 #else
2009         (void)buffer;
2010         (void)len;
2011         panic("builtin lexing not done yet");
2012 #endif
2013
2014         /* place a virtual \n at the beginning so the lexer knows that we're
2015          * at the beginning of a line */
2016         c = '\n';
2017 }
2018
2019 void exit_lexer(void)
2020 {
2021         strset_destroy(&stringset);
2022 }
2023
2024 static __attribute__((unused))
2025 void dbg_pos(const source_position_t source_position)
2026 {
2027         fprintf(stdout, "%s:%u\n", source_position.input_name,
2028                 source_position.linenr);
2029         fflush(stdout);
2030 }