nsz Git - cparser/blob - lexer.c

   1 /*
   2  * This file is part of cparser.
   3  * Copyright (C) 2007-2008 Matthias Braun <matze@braunis.de>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18  * 02111-1307, USA.
  19  */
  20 #include <config.h>
  21
  22 #include "diagnostic.h"
  23 #include "lexer.h"
  24 #include "symbol_t.h"
  25 #include "token_t.h"
  26 #include "symbol_table_t.h"
  27 #include "adt/error.h"
  28 #include "adt/strset.h"
  29 #include "adt/util.h"
  30 #include "types.h"
  31 #include "type_t.h"
  32 #include "target_architecture.h"
  33 #include "parser.h"
  34 #include "warning.h"
  35 #include "lang_features.h"
  36
  37 #include <assert.h>
  38 #include <errno.h>
  39 #include <string.h>
  40 #include <stdbool.h>
  41 #include <ctype.h>
  42
  43 #ifndef _WIN32
  44 #include <strings.h>
  45 #endif
  46
  47 //#define DEBUG_CHARS
  48 #define MAX_PUTBACK 3
  49 #define BUF_SIZE    1024
  50
  51 #if defined(_WIN32) || defined(__CYGWIN__)
  52 /* No strtold on windows and no replacement yet */
  53 #define strtold(s, e)     strtod(s, e)
  54 #define strcasecmp(a, b)  stricmp(a, b)
  55 #endif
  56
  57 typedef unsigned int utf32;
  58
  59 static utf32        c;
  60 token_t             lexer_token;
  61 symbol_t           *symbol_L;
  62 static FILE        *input;
  63 static utf32        buf[BUF_SIZE + MAX_PUTBACK];
  64 static const utf32 *bufend;
  65 static const utf32 *bufpos;
  66 static strset_t     stringset;
  67 bool                allow_dollar_in_symbol = true;
  68
  69 /**
  70  * Prints a parse error message at the current token.
  71  *
  72  * @param msg   the error message
  73  */
  74 static void parse_error(const char *msg)
  75 {
  76         errorf(&lexer_token.source_position, "%s", msg);
  77 }
  78
  79 /**
  80  * Prints an internal error message at the current token.
  81  *
  82  * @param msg   the error message
  83  */
  84 static NORETURN internal_error(const char *msg)
  85 {
  86         internal_errorf(&lexer_token.source_position, "%s", msg);
  87 }
  88
  89 static size_t read_block(unsigned char *const read_buf, size_t const n)
  90 {
  91         size_t const s = fread(read_buf, 1, n, input);
  92         if (s == 0) {
  93                 if (ferror(input))
  94                         parse_error("read from input failed");
  95                 buf[MAX_PUTBACK] = EOF;
  96                 bufpos           = buf + MAX_PUTBACK;
  97                 bufend           = buf + MAX_PUTBACK + 1;
  98         }
  99         return s;
 100 }
 101
 102 static void decode_iso_8859_1(void)
 103 {
 104         unsigned char read_buf[BUF_SIZE];
 105         size_t const s = read_block(read_buf, sizeof(read_buf));
 106         if (s == 0)
 107                 return;
 108
 109         unsigned char const *src = read_buf;
 110         unsigned char const *end = read_buf + s;
 111         utf32               *dst = buf + MAX_PUTBACK;
 112         while (src != end)
 113                 *dst++ = *src++;
 114
 115         bufpos = buf + MAX_PUTBACK;
 116         bufend = dst;
 117 }
 118
 119 static void decode_iso_8859_15(void)
 120 {
 121         unsigned char read_buf[BUF_SIZE];
 122         size_t const s = read_block(read_buf, sizeof(read_buf));
 123         if (s == 0)
 124                 return;
 125
 126         unsigned char const *src = read_buf;
 127         unsigned char const *end = read_buf + s;
 128         utf32               *dst = buf + MAX_PUTBACK;
 129         while (src != end) {
 130                 utf32 tc = *src++;
 131                 switch (tc) {
 132                         case 0xA4: tc = 0x20AC; break; // €
 133                         case 0xA6: tc = 0x0160; break; // Š
 134                         case 0xA8: tc = 0x0161; break; // š
 135                         case 0xB4: tc = 0x017D; break; // Ž
 136                         case 0xB8: tc = 0x017E; break; // ž
 137                         case 0xBC: tc = 0x0152; break; // Œ
 138                         case 0xBD: tc = 0x0153; break; // œ
 139                         case 0xBE: tc = 0x0178; break; // Ÿ
 140                 }
 141                 *dst++ = tc;
 142         }
 143
 144         bufpos = buf + MAX_PUTBACK;
 145         bufend = dst;
 146 }
 147
 148 static void decode_utf8(void)
 149 {
 150         static utf32  part_decoded_min_code;
 151         static utf32  part_decoded_char;
 152         static size_t part_decoded_rest_len;
 153
 154         do {
 155                 unsigned char read_buf[BUF_SIZE];
 156                 size_t const s = read_block(read_buf, sizeof(read_buf));
 157                 if (s == 0) {
 158                         if (part_decoded_rest_len > 0)
 159                                 parse_error("incomplete input char at end of input");
 160                         return;
 161                 }
 162
 163                 unsigned char const *src = read_buf;
 164                 unsigned char const *end = read_buf + s;
 165                 utf32               *dst = buf + MAX_PUTBACK;
 166                 utf32                decoded;
 167                 utf32                min_code;
 168
 169                 if (part_decoded_rest_len != 0) {
 170                         min_code              = part_decoded_min_code;
 171                         decoded               = part_decoded_char;
 172                         size_t const rest_len = part_decoded_rest_len;
 173                         part_decoded_rest_len = 0;
 174                         switch (rest_len) {
 175                                 case 4:  goto realign;
 176                                 case 3:  goto three_more;
 177                                 case 2:  goto two_more;
 178                                 default: goto one_more;
 179                         }
 180                 }
 181
 182                 while (src != end) {
 183                         if ((*src & 0x80) == 0) {
 184                                 decoded = *src++;
 185                         } else if ((*src & 0xE0) == 0xC0) {
 186                                 min_code = 0x80;
 187                                 decoded  = *src++ & 0x1F;
 188 one_more:
 189                                 if (src == end) {
 190                                         part_decoded_min_code = min_code;
 191                                         part_decoded_char     = decoded;
 192                                         part_decoded_rest_len = 1;
 193                                         break;
 194                                 }
 195                                 if ((*src & 0xC0) == 0x80) {
 196                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 197                                 } else {
 198                                         goto invalid_char;
 199                                 }
 200                                 if (decoded < min_code                      ||
 201                                                 decoded > 0x10FFFF                      ||
 202                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
 203                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
 204                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
 205                                         parse_error("invalid byte sequence in input");
 206                                 }
 207                         } else if ((*src & 0xF0) == 0xE0) {
 208                                 min_code = 0x800;
 209                                 decoded  = *src++ & 0x0F;
 210 two_more:
 211                                 if (src == end) {
 212                                         part_decoded_min_code = min_code;
 213                                         part_decoded_char     = decoded;
 214                                         part_decoded_rest_len = 2;
 215                                         break;
 216                                 }
 217                                 if ((*src & 0xC0) == 0x80) {
 218                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 219                                 } else {
 220                                         goto invalid_char;
 221                                 }
 222                                 goto one_more;
 223                         } else if ((*src & 0xF8) == 0xF0) {
 224                                 min_code = 0x10000;
 225                                 decoded  = *src++ & 0x07;
 226 three_more:
 227                                 if (src == end) {
 228                                         part_decoded_min_code = min_code;
 229                                         part_decoded_char     = decoded;
 230                                         part_decoded_rest_len = 3;
 231                                         break;
 232                                 }
 233                                 if ((*src & 0xC0) == 0x80) {
 234                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 235                                 } else {
 236                                         goto invalid_char;
 237                                 }
 238                                 goto two_more;
 239                         } else {
 240 invalid_char:
 241                                 parse_error("invalid byte sequence in input");
 242 realign:
 243                                 do {
 244                                         ++src;
 245                                         if (src == end) {
 246                                                 part_decoded_rest_len = 4;
 247                                                 break;
 248                                         }
 249                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
 250                                 continue;
 251                         }
 252                         *dst++ = decoded;
 253                 }
 254
 255                 bufpos = buf + MAX_PUTBACK;
 256                 bufend = dst;
 257         } while (bufpos == bufend);
 258 }
 259
 260 typedef void (*decoder_t)(void);
 261
 262 static decoder_t decoder = decode_utf8;
 263
 264 typedef struct named_decoder_t {
 265         char const *name;
 266         decoder_t   decoder;
 267 } named_decoder_t;
 268
 269 static named_decoder_t const decoders[] = {
 270         { "CP819",           decode_iso_8859_1  }, // offical alias
 271         { "IBM819",          decode_iso_8859_1  }, // offical alias
 272         { "ISO-8859-1",      decode_iso_8859_1  }, // offical alias
 273         { "ISO-8859-15",     decode_iso_8859_15 }, // offical name
 274         { "ISO8859-1",       decode_iso_8859_1  },
 275         { "ISO8859-15",      decode_iso_8859_15 },
 276         { "ISO_8859-1",      decode_iso_8859_1  }, // offical alias
 277         { "ISO_8859-15",     decode_iso_8859_15 }, // offical alias
 278         { "ISO_8859-1:1987", decode_iso_8859_1  }, // offical name
 279         { "Latin-9",         decode_iso_8859_15 }, // offical alias
 280         { "UTF-8",           decode_utf8        }, // offical name
 281         { "csISOLatin1",     decode_iso_8859_1  }, // offical alias
 282         { "iso-ir-100",      decode_iso_8859_1  }, // offical alias
 283         { "l1",              decode_iso_8859_1  }, // offical alias
 284         { "latin1",          decode_iso_8859_1  }, // offical alias
 285
 286         { NULL,              NULL               }
 287 };
 288
 289 void select_input_encoding(char const* const encoding)
 290 {
 291         for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
 292                 if (strcasecmp(encoding, i->name) != 0)
 293                         continue;
 294                 decoder = i->decoder;
 295                 return;
 296         }
 297         fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
 298 }
 299
 300 static inline void next_real_char(void)
 301 {
 302         assert(bufpos <= bufend);
 303         if (bufpos >= bufend) {
 304                 if (input == NULL) {
 305                         c = EOF;
 306                         return;
 307                 }
 308                 decoder();
 309         }
 310         c = *bufpos++;
 311 }
 312
 313 /**
 314  * Put a character back into the buffer.
 315  *
 316  * @param pc  the character to put back
 317  */
 318 static inline void put_back(utf32 const pc)
 319 {
 320         assert(bufpos > buf);
 321         *(--bufpos - buf + buf) = pc;
 322
 323 #ifdef DEBUG_CHARS
 324         printf("putback '%lc'\n", pc);
 325 #endif
 326 }
 327
 328 static inline void next_char(void);
 329
 330 #define MATCH_NEWLINE(code)                   \
 331         case '\r':                                \
 332                 next_char();                          \
 333                 if(c == '\n') {                       \
 334                         next_char();                      \
 335                 }                                     \
 336                 lexer_token.source_position.linenr++; \
 337                 code                                  \
 338         case '\n':                                \
 339                 next_char();                          \
 340                 lexer_token.source_position.linenr++; \
 341                 code
 342
 343 #define eat(c_type)  do { assert(c == c_type); next_char(); } while(0)
 344
 345 static void maybe_concat_lines(void)
 346 {
 347         eat('\\');
 348
 349         switch(c) {
 350         MATCH_NEWLINE(return;)
 351
 352         default:
 353                 break;
 354         }
 355
 356         put_back(c);
 357         c = '\\';
 358 }
 359
 360 /**
 361  * Set c to the next input character, ie.
 362  * after expanding trigraphs.
 363  */
 364 static inline void next_char(void)
 365 {
 366         next_real_char();
 367
 368         /* filter trigraphs */
 369         if(UNLIKELY(c == '\\')) {
 370                 maybe_concat_lines();
 371                 goto end_of_next_char;
 372         }
 373
 374         if(LIKELY(c != '?'))
 375                 goto end_of_next_char;
 376
 377         next_real_char();
 378         if(LIKELY(c != '?')) {
 379                 put_back(c);
 380                 c = '?';
 381                 goto end_of_next_char;
 382         }
 383
 384         next_real_char();
 385         switch(c) {
 386         case '=': c = '#'; break;
 387         case '(': c = '['; break;
 388         case '/': c = '\\'; maybe_concat_lines(); break;
 389         case ')': c = ']'; break;
 390         case '\'': c = '^'; break;
 391         case '<': c = '{'; break;
 392         case '!': c = '|'; break;
 393         case '>': c = '}'; break;
 394         case '-': c = '~'; break;
 395         default:
 396                 put_back(c);
 397                 put_back('?');
 398                 c = '?';
 399                 break;
 400         }
 401
 402 end_of_next_char:;
 403 #ifdef DEBUG_CHARS
 404         printf("nchar '%c'\n", c);
 405 #endif
 406 }
 407
 408 #define SYMBOL_CHARS  \
 409         case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
 410         case 'a':         \
 411         case 'b':         \
 412         case 'c':         \
 413         case 'd':         \
 414         case 'e':         \
 415         case 'f':         \
 416         case 'g':         \
 417         case 'h':         \
 418         case 'i':         \
 419         case 'j':         \
 420         case 'k':         \
 421         case 'l':         \
 422         case 'm':         \
 423         case 'n':         \
 424         case 'o':         \
 425         case 'p':         \
 426         case 'q':         \
 427         case 'r':         \
 428         case 's':         \
 429         case 't':         \
 430         case 'u':         \
 431         case 'v':         \
 432         case 'w':         \
 433         case 'x':         \
 434         case 'y':         \
 435         case 'z':         \
 436         case 'A':         \
 437         case 'B':         \
 438         case 'C':         \
 439         case 'D':         \
 440         case 'E':         \
 441         case 'F':         \
 442         case 'G':         \
 443         case 'H':         \
 444         case 'I':         \
 445         case 'J':         \
 446         case 'K':         \
 447         case 'L':         \
 448         case 'M':         \
 449         case 'N':         \
 450         case 'O':         \
 451         case 'P':         \
 452         case 'Q':         \
 453         case 'R':         \
 454         case 'S':         \
 455         case 'T':         \
 456         case 'U':         \
 457         case 'V':         \
 458         case 'W':         \
 459         case 'X':         \
 460         case 'Y':         \
 461         case 'Z':         \
 462         case '_':
 463
 464 #define DIGITS        \
 465         case '0':         \
 466         case '1':         \
 467         case '2':         \
 468         case '3':         \
 469         case '4':         \
 470         case '5':         \
 471         case '6':         \
 472         case '7':         \
 473         case '8':         \
 474         case '9':
 475
 476 /**
 477  * Read a symbol from the input and build
 478  * the lexer_token.
 479  */
 480 static void parse_symbol(void)
 481 {
 482         symbol_t *symbol;
 483         char     *string;
 484
 485         obstack_1grow(&symbol_obstack, (char) c);
 486         next_char();
 487
 488         while(1) {
 489                 switch(c) {
 490                 DIGITS
 491                 SYMBOL_CHARS
 492                         obstack_1grow(&symbol_obstack, (char) c);
 493                         next_char();
 494                         break;
 495
 496                 default:
 497 dollar_sign:
 498                         goto end_symbol;
 499                 }
 500         }
 501
 502 end_symbol:
 503         obstack_1grow(&symbol_obstack, '\0');
 504
 505         string = obstack_finish(&symbol_obstack);
 506         symbol = symbol_table_insert(string);
 507
 508         lexer_token.type     = symbol->ID;
 509         lexer_token.v.symbol = symbol;
 510
 511         if(symbol->string != string) {
 512                 obstack_free(&symbol_obstack, string);
 513         }
 514 }
 515
 516 static void parse_integer_suffix(bool is_oct_hex)
 517 {
 518         bool is_unsigned     = false;
 519         bool min_long        = false;
 520         bool min_longlong    = false;
 521         bool not_traditional = false;
 522         int  pos             = 0;
 523         char suffix[4];
 524
 525         if (c == 'U' || c == 'u') {
 526                 not_traditional = true;
 527                 suffix[pos++]   = toupper(c);
 528                 is_unsigned     = true;
 529                 next_char();
 530                 if (c == 'L' || c == 'l') {
 531                         suffix[pos++] = toupper(c);
 532                         min_long = true;
 533                         next_char();
 534                         if (c == 'L' || c == 'l') {
 535                                 suffix[pos++] = toupper(c);
 536                                 min_longlong = true;
 537                                 next_char();
 538                         }
 539                 }
 540         } else if (c == 'l' || c == 'L') {
 541                 suffix[pos++] = toupper(c);
 542                 min_long = true;
 543                 next_char();
 544                 if (c == 'l' || c == 'L') {
 545                         not_traditional = true;
 546                         suffix[pos++]   = toupper(c);
 547                         min_longlong    = true;
 548                         next_char();
 549                         if (c == 'u' || c == 'U') {
 550                                 suffix[pos++] = toupper(c);
 551                                 is_unsigned   = true;
 552                                 next_char();
 553                         }
 554                 } else if (c == 'u' || c == 'U') {
 555                         not_traditional = true;
 556                         suffix[pos++]   = toupper(c);
 557                         is_unsigned     = true;
 558                         next_char();
 559                         lexer_token.datatype = type_unsigned_long;
 560                 }
 561         }
 562
 563         if (warning.traditional && not_traditional) {
 564                 suffix[pos] = '\0';
 565                 warningf(&lexer_token.source_position,
 566                         "traditional C rejects the '%s' suffix", suffix);
 567         }
 568         if (!is_unsigned) {
 569                 long long v = lexer_token.v.intvalue;
 570                 if (!min_long) {
 571                         if (v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
 572                                 lexer_token.datatype = type_int;
 573                                 return;
 574                         } else if (is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
 575                                 lexer_token.datatype = type_unsigned_int;
 576                                 return;
 577                         }
 578                 }
 579                 if (!min_longlong) {
 580                         if (v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
 581                                 lexer_token.datatype = type_long;
 582                                 return;
 583                         } else if (is_oct_hex && v >= 0 && (unsigned long long)v <= (unsigned long long)TARGET_ULONG_MAX) {
 584                                 lexer_token.datatype = type_unsigned_long;
 585                                 return;
 586                         }
 587                 }
 588                 unsigned long long uv = (unsigned long long) v;
 589                 if (is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
 590                         lexer_token.datatype = type_unsigned_long_long;
 591                         return;
 592                 }
 593
 594                 lexer_token.datatype = type_long_long;
 595         } else {
 596                 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
 597                 if (!min_long && v <= TARGET_UINT_MAX) {
 598                         lexer_token.datatype = type_unsigned_int;
 599                         return;
 600                 }
 601                 if (!min_longlong && v <= TARGET_ULONG_MAX) {
 602                         lexer_token.datatype = type_unsigned_long;
 603                         return;
 604                 }
 605                 lexer_token.datatype = type_unsigned_long_long;
 606         }
 607 }
 608
 609 static void parse_floating_suffix(void)
 610 {
 611         switch(c) {
 612         /* TODO: do something useful with the suffixes... */
 613         case 'f':
 614         case 'F':
 615                 if (warning.traditional) {
 616                         warningf(&lexer_token.source_position,
 617                                 "traditional C rejects the 'F' suffix");
 618                 }
 619                 next_char();
 620                 lexer_token.datatype = type_float;
 621                 break;
 622         case 'l':
 623         case 'L':
 624                 if (warning.traditional) {
 625                         warningf(&lexer_token.source_position,
 626                                 "traditional C rejects the 'F' suffix");
 627                 }
 628                 next_char();
 629                 lexer_token.datatype = type_long_double;
 630                 break;
 631         default:
 632                 lexer_token.datatype = type_double;
 633                 break;
 634         }
 635 }
 636
 637 /**
 638  * A replacement for strtoull. Only those parts needed for
 639  * our parser are implemented.
 640  */
 641 static unsigned long long parse_int_string(const char *s, const char **endptr, int base)
 642 {
 643         unsigned long long v = 0;
 644
 645         switch (base) {
 646         case 16:
 647                 for (;; ++s) {
 648                         /* check for overrun */
 649                         if (v >= 0x1000000000000000ULL)
 650                                 break;
 651                         switch (tolower(*s)) {
 652                         case '0': v <<= 4; break;
 653                         case '1': v <<= 4; v |= 0x1; break;
 654                         case '2': v <<= 4; v |= 0x2; break;
 655                         case '3': v <<= 4; v |= 0x3; break;
 656                         case '4': v <<= 4; v |= 0x4; break;
 657                         case '5': v <<= 4; v |= 0x5; break;
 658                         case '6': v <<= 4; v |= 0x6; break;
 659                         case '7': v <<= 4; v |= 0x7; break;
 660                         case '8': v <<= 4; v |= 0x8; break;
 661                         case '9': v <<= 4; v |= 0x9; break;
 662                         case 'a': v <<= 4; v |= 0xa; break;
 663                         case 'b': v <<= 4; v |= 0xb; break;
 664                         case 'c': v <<= 4; v |= 0xc; break;
 665                         case 'd': v <<= 4; v |= 0xd; break;
 666                         case 'e': v <<= 4; v |= 0xe; break;
 667                         case 'f': v <<= 4; v |= 0xf; break;
 668                         default:
 669                                 goto end;
 670                         }
 671                 }
 672                 break;
 673         case 8:
 674                 for (;; ++s) {
 675                         /* check for overrun */
 676                         if (v >= 0x2000000000000000ULL)
 677                                 break;
 678                         switch (tolower(*s)) {
 679                         case '0': v <<= 3; break;
 680                         case '1': v <<= 3; v |= 1; break;
 681                         case '2': v <<= 3; v |= 2; break;
 682                         case '3': v <<= 3; v |= 3; break;
 683                         case '4': v <<= 3; v |= 4; break;
 684                         case '5': v <<= 3; v |= 5; break;
 685                         case '6': v <<= 3; v |= 6; break;
 686                         case '7': v <<= 3; v |= 7; break;
 687                         default:
 688                                 goto end;
 689                         }
 690                 }
 691                 break;
 692         case 10:
 693                 for (;; ++s) {
 694                         /* check for overrun */
 695                         if (v > 0x1999999999999999ULL)
 696                                 break;
 697                         switch (tolower(*s)) {
 698                         case '0': v *= 10; break;
 699                         case '1': v *= 10; v += 1; break;
 700                         case '2': v *= 10; v += 2; break;
 701                         case '3': v *= 10; v += 3; break;
 702                         case '4': v *= 10; v += 4; break;
 703                         case '5': v *= 10; v += 5; break;
 704                         case '6': v *= 10; v += 6; break;
 705                         case '7': v *= 10; v += 7; break;
 706                         case '8': v *= 10; v += 8; break;
 707                         case '9': v *= 10; v += 9; break;
 708                         default:
 709                                 goto end;
 710                         }
 711                 }
 712                 break;
 713         default:
 714                 assert(0);
 715                 break;
 716         }
 717 end:
 718         *endptr = s;
 719         return v;
 720 }
 721
 722 /**
 723  * Parses a hex number including hex floats and set the
 724  * lexer_token.
 725  */
 726 static void parse_number_hex(void)
 727 {
 728         bool is_float = false;
 729         assert(c == 'x' || c == 'X');
 730         next_char();
 731
 732         obstack_1grow(&symbol_obstack, '0');
 733         obstack_1grow(&symbol_obstack, 'x');
 734
 735         while(isxdigit(c)) {
 736                 obstack_1grow(&symbol_obstack, (char) c);
 737                 next_char();
 738         }
 739
 740         if (c == '.') {
 741                 obstack_1grow(&symbol_obstack, (char) c);
 742                 next_char();
 743
 744                 while (isxdigit(c)) {
 745                         obstack_1grow(&symbol_obstack, (char) c);
 746                         next_char();
 747                 }
 748                 is_float = true;
 749         }
 750         if (c == 'p' || c == 'P') {
 751                 obstack_1grow(&symbol_obstack, (char) c);
 752                 next_char();
 753
 754                 if (c == '-' || c == '+') {
 755                         obstack_1grow(&symbol_obstack, (char) c);
 756                         next_char();
 757                 }
 758
 759                 while (isxdigit(c)) {
 760                         obstack_1grow(&symbol_obstack, (char) c);
 761                         next_char();
 762                 }
 763                 is_float = true;
 764         }
 765
 766         obstack_1grow(&symbol_obstack, '\0');
 767         char *string = obstack_finish(&symbol_obstack);
 768         if(*string == '\0') {
 769                 parse_error("invalid hex number");
 770                 lexer_token.type = T_ERROR;
 771                 obstack_free(&symbol_obstack, string);
 772                 return;
 773         }
 774
 775         if (is_float) {
 776                 char *endptr;
 777                 lexer_token.type         = T_FLOATINGPOINT;
 778                 lexer_token.v.floatvalue = strtold(string, &endptr);
 779
 780                 if(*endptr != '\0') {
 781                         parse_error("invalid hex float literal");
 782                 }
 783
 784                 parse_floating_suffix();
 785         } else {
 786                 const char *endptr;
 787                 lexer_token.type       = T_INTEGER;
 788                 lexer_token.v.intvalue = parse_int_string(string + 2, &endptr, 16);
 789                 if(*endptr != '\0') {
 790                         parse_error("hex number literal too long");
 791                 }
 792                 parse_integer_suffix(true);
 793         }
 794
 795         obstack_free(&symbol_obstack, string);
 796 }
 797
 798 /**
 799  * Returns true if the given char is a octal digit.
 800  *
 801  * @param char  the character to check
 802  */
 803 static inline bool is_octal_digit(utf32 chr)
 804 {
 805         switch(chr) {
 806         case '0':
 807         case '1':
 808         case '2':
 809         case '3':
 810         case '4':
 811         case '5':
 812         case '6':
 813         case '7':
 814                 return true;
 815         default:
 816                 return false;
 817         }
 818 }
 819
 820 /**
 821  * Parses a octal number and set the lexer_token.
 822  */
 823 static void parse_number_oct(void)
 824 {
 825         while(is_octal_digit(c)) {
 826                 obstack_1grow(&symbol_obstack, (char) c);
 827                 next_char();
 828         }
 829         obstack_1grow(&symbol_obstack, '\0');
 830         char *string = obstack_finish(&symbol_obstack);
 831
 832         const char *endptr;
 833         lexer_token.type       = T_INTEGER;
 834         lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
 835         if(*endptr != '\0') {
 836                 parse_error("octal number literal too long");
 837         }
 838
 839         obstack_free(&symbol_obstack, string);
 840         parse_integer_suffix(true);
 841 }
 842
 843 /**
 844  * Parses a decimal including float number and set the
 845  * lexer_token.
 846  */
 847 static void parse_number_dec(void)
 848 {
 849         bool is_float = false;
 850         while (isdigit(c)) {
 851                 obstack_1grow(&symbol_obstack, (char) c);
 852                 next_char();
 853         }
 854
 855         if (c == '.') {
 856                 obstack_1grow(&symbol_obstack, '.');
 857                 next_char();
 858
 859                 while (isdigit(c)) {
 860                         obstack_1grow(&symbol_obstack, (char) c);
 861                         next_char();
 862                 }
 863                 is_float = true;
 864         }
 865         if(c == 'e' || c == 'E') {
 866                 obstack_1grow(&symbol_obstack, (char) c);
 867                 next_char();
 868
 869                 if(c == '-' || c == '+') {
 870                         obstack_1grow(&symbol_obstack, (char) c);
 871                         next_char();
 872                 }
 873
 874                 while(isdigit(c)) {
 875                         obstack_1grow(&symbol_obstack, (char) c);
 876                         next_char();
 877                 }
 878                 is_float = true;
 879         }
 880
 881         obstack_1grow(&symbol_obstack, '\0');
 882         char *string = obstack_finish(&symbol_obstack);
 883
 884         if(is_float) {
 885                 char *endptr;
 886                 lexer_token.type         = T_FLOATINGPOINT;
 887                 lexer_token.v.floatvalue = strtold(string, &endptr);
 888
 889                 if(*endptr != '\0') {
 890                         parse_error("invalid number literal");
 891                 }
 892
 893                 parse_floating_suffix();
 894         } else {
 895                 const char *endptr;
 896                 lexer_token.type       = T_INTEGER;
 897                 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
 898
 899                 if(*endptr != '\0') {
 900                         parse_error("invalid number literal");
 901                 }
 902
 903                 parse_integer_suffix(false);
 904         }
 905         obstack_free(&symbol_obstack, string);
 906 }
 907
 908 /**
 909  * Parses a number and sets the lexer_token.
 910  */
 911 static void parse_number(void)
 912 {
 913         if (c == '0') {
 914                 next_char();
 915                 switch (c) {
 916                         case 'X':
 917                         case 'x':
 918                                 parse_number_hex();
 919                                 break;
 920                         case '0':
 921                         case '1':
 922                         case '2':
 923                         case '3':
 924                         case '4':
 925                         case '5':
 926                         case '6':
 927                         case '7':
 928                                 parse_number_oct();
 929                                 break;
 930                         case '8':
 931                         case '9':
 932                                 next_char();
 933                                 parse_error("invalid octal number");
 934                                 lexer_token.type = T_ERROR;
 935                                 return;
 936                         case '.':
 937                         case 'e':
 938                         case 'E':
 939                         default:
 940                                 obstack_1grow(&symbol_obstack, '0');
 941                                 parse_number_dec();
 942                                 return;
 943                 }
 944         } else {
 945                 parse_number_dec();
 946         }
 947 }
 948
 949 /**
 950  * Returns the value of a digit.
 951  * The only portable way to do it ...
 952  */
 953 static int digit_value(utf32 const digit)
 954 {
 955         switch (digit) {
 956         case '0': return 0;
 957         case '1': return 1;
 958         case '2': return 2;
 959         case '3': return 3;
 960         case '4': return 4;
 961         case '5': return 5;
 962         case '6': return 6;
 963         case '7': return 7;
 964         case '8': return 8;
 965         case '9': return 9;
 966         case 'a':
 967         case 'A': return 10;
 968         case 'b':
 969         case 'B': return 11;
 970         case 'c':
 971         case 'C': return 12;
 972         case 'd':
 973         case 'D': return 13;
 974         case 'e':
 975         case 'E': return 14;
 976         case 'f':
 977         case 'F': return 15;
 978         default:
 979                 internal_error("wrong character given");
 980         }
 981 }
 982
 983 /**
 984  * Parses an octal character sequence.
 985  *
 986  * @param first_digit  the already read first digit
 987  */
 988 static utf32 parse_octal_sequence(utf32 const first_digit)
 989 {
 990         assert(is_octal_digit(first_digit));
 991         utf32 value = digit_value(first_digit);
 992         if (!is_octal_digit(c)) return value;
 993         value = 8 * value + digit_value(c);
 994         next_char();
 995         if (!is_octal_digit(c)) return value;
 996         value = 8 * value + digit_value(c);
 997         next_char();
 998         return value;
 999 }
1000
1001 /**
1002  * Parses a hex character sequence.
1003  */
1004 static utf32 parse_hex_sequence(void)
1005 {
1006         utf32 value = 0;
1007         while(isxdigit(c)) {
1008                 value = 16 * value + digit_value(c);
1009                 next_char();
1010         }
1011         return value;
1012 }
1013
1014 /**
1015  * Parse an escape sequence.
1016  */
1017 static utf32 parse_escape_sequence(void)
1018 {
1019         eat('\\');
1020
1021         utf32 const ec = c;
1022         next_char();
1023
1024         switch (ec) {
1025         case '"':  return '"';
1026         case '\'': return '\'';
1027         case '\\': return '\\';
1028         case '?': return '\?';
1029         case 'a': return '\a';
1030         case 'b': return '\b';
1031         case 'f': return '\f';
1032         case 'n': return '\n';
1033         case 'r': return '\r';
1034         case 't': return '\t';
1035         case 'v': return '\v';
1036         case 'x':
1037                 return parse_hex_sequence();
1038         case '0':
1039         case '1':
1040         case '2':
1041         case '3':
1042         case '4':
1043         case '5':
1044         case '6':
1045         case '7':
1046                 return parse_octal_sequence(ec);
1047         case EOF:
1048                 parse_error("reached end of file while parsing escape sequence");
1049                 return EOF;
1050         /* \E is not documented, but handled, by GCC.  It is acceptable according
1051          * to §6.11.4, whereas \e is not. */
1052         case 'E':
1053         case 'e':
1054                 if (c_mode & _GNUC)
1055                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
1056                 /* FALLTHROUGH */
1057         default:
1058                 /* §6.4.4.4:8 footnote 64 */
1059                 parse_error("unknown escape sequence");
1060                 return EOF;
1061         }
1062 }
1063
1064 /**
1065  * Concatenate two strings.
1066  */
1067 string_t concat_strings(const string_t *const s1, const string_t *const s2)
1068 {
1069         const size_t len1 = s1->size - 1;
1070         const size_t len2 = s2->size - 1;
1071
1072         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
1073         memcpy(concat, s1->begin, len1);
1074         memcpy(concat + len1, s2->begin, len2 + 1);
1075
1076         if (warning.traditional) {
1077                 warningf(&lexer_token.source_position,
1078                         "traditional C rejects string constant concatenation");
1079         }
1080 #if 0 /* TODO hash */
1081         const char *result = strset_insert(&stringset, concat);
1082         if(result != concat) {
1083                 obstack_free(&symbol_obstack, concat);
1084         }
1085
1086         return result;
1087 #else
1088         return (string_t){ concat, len1 + len2 + 1 };
1089 #endif
1090 }
1091
1092 /**
1093  * Concatenate a string and a wide string.
1094  */
1095 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
1096 {
1097         const size_t len1 = s1->size - 1;
1098         const size_t len2 = s2->size - 1;
1099
1100         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1101         const char *const src = s1->begin;
1102         for (size_t i = 0; i != len1; ++i) {
1103                 concat[i] = src[i];
1104         }
1105         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1106         if (warning.traditional) {
1107                 warningf(&lexer_token.source_position,
1108                         "traditional C rejects string constant concatenation");
1109         }
1110
1111         return (wide_string_t){ concat, len1 + len2 + 1 };
1112 }
1113
1114 /**
1115  * Concatenate two wide strings.
1116  */
1117 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
1118 {
1119         const size_t len1 = s1->size - 1;
1120         const size_t len2 = s2->size - 1;
1121
1122         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1123         memcpy(concat,        s1->begin, len1       * sizeof(*concat));
1124         memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1125         if (warning.traditional) {
1126                 warningf(&lexer_token.source_position,
1127                         "traditional C rejects string constant concatenation");
1128         }
1129
1130         return (wide_string_t){ concat, len1 + len2 + 1 };
1131 }
1132
1133 /**
1134  * Concatenate a wide string and a string.
1135  */
1136 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
1137 {
1138         const size_t len1 = s1->size - 1;
1139         const size_t len2 = s2->size - 1;
1140
1141         wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1142         memcpy(concat, s1->begin, len1 * sizeof(*concat));
1143         const char  *const src = s2->begin;
1144         wchar_rep_t *const dst = concat + len1;
1145         for (size_t i = 0; i != len2 + 1; ++i) {
1146                 dst[i] = src[i];
1147         }
1148         if (warning.traditional) {
1149                 warningf(&lexer_token.source_position,
1150                         "traditional C rejects string constant concatenation");
1151         }
1152
1153         return (wide_string_t){ concat, len1 + len2 + 1 };
1154 }
1155
1156 static void grow_symbol(utf32 const tc)
1157 {
1158         struct obstack *const o  = &symbol_obstack;
1159         if (tc < 0x80U) {
1160                 obstack_1grow(o, tc);
1161         } else if (tc < 0x800) {
1162                 obstack_1grow(o, 0xC0 | (tc >> 6));
1163                 obstack_1grow(o, 0x80 | (tc & 0x3F));
1164         } else if (tc < 0x10000) {
1165                 obstack_1grow(o, 0xE0 | ( tc >> 12));
1166                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1167                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1168         } else {
1169                 obstack_1grow(o, 0xF0 | ( tc >> 18));
1170                 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
1171                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
1172                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
1173         }
1174 }
1175
1176 /**
1177  * Parse a string literal and set lexer_token.
1178  */
1179 static void parse_string_literal(void)
1180 {
1181         const unsigned start_linenr = lexer_token.source_position.linenr;
1182
1183         eat('"');
1184
1185         while(1) {
1186                 switch(c) {
1187                 case '\\': {
1188                         utf32 const tc = parse_escape_sequence();
1189                         if (tc >= 0x100) {
1190                                 warningf(&lexer_token.source_position,
1191                                                 "escape sequence out of range");
1192                         }
1193                         obstack_1grow(&symbol_obstack, tc);
1194                         break;
1195                 }
1196
1197                 case EOF: {
1198                         source_position_t source_position;
1199                         source_position.input_name = lexer_token.source_position.input_name;
1200                         source_position.linenr     = start_linenr;
1201                         errorf(&source_position, "string has no end");
1202                         lexer_token.type = T_ERROR;
1203                         return;
1204                 }
1205
1206                 case '"':
1207                         next_char();
1208                         goto end_of_string;
1209
1210                 default:
1211                         grow_symbol(c);
1212                         next_char();
1213                         break;
1214                 }
1215         }
1216
1217 end_of_string:
1218
1219         /* TODO: concatenate multiple strings separated by whitespace... */
1220
1221         /* add finishing 0 to the string */
1222         obstack_1grow(&symbol_obstack, '\0');
1223         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1224         const char *const string = obstack_finish(&symbol_obstack);
1225
1226 #if 0 /* TODO hash */
1227         /* check if there is already a copy of the string */
1228         result = strset_insert(&stringset, string);
1229         if(result != string) {
1230                 obstack_free(&symbol_obstack, string);
1231         }
1232 #else
1233         const char *const result = string;
1234 #endif
1235
1236         lexer_token.type           = T_STRING_LITERAL;
1237         lexer_token.v.string.begin = result;
1238         lexer_token.v.string.size  = size;
1239 }
1240
1241 /**
1242  * Parse a wide character constant and set lexer_token.
1243  */
1244 static void parse_wide_character_constant(void)
1245 {
1246         const unsigned start_linenr = lexer_token.source_position.linenr;
1247
1248         eat('\'');
1249
1250         while(1) {
1251                 switch(c) {
1252                 case '\\': {
1253                         wchar_rep_t tc = parse_escape_sequence();
1254                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1255                         break;
1256                 }
1257
1258                 MATCH_NEWLINE(
1259                         parse_error("newline while parsing character constant");
1260                         break;
1261                 )
1262
1263                 case '\'':
1264                         next_char();
1265                         goto end_of_wide_char_constant;
1266
1267                 case EOF: {
1268                         source_position_t source_position = lexer_token.source_position;
1269                         source_position.linenr = start_linenr;
1270                         errorf(&source_position, "EOF while parsing character constant");
1271                         lexer_token.type = T_ERROR;
1272                         return;
1273                 }
1274
1275                 default: {
1276                         wchar_rep_t tc = (wchar_rep_t) c;
1277                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1278                         next_char();
1279                         break;
1280                 }
1281                 }
1282         }
1283
1284 end_of_wide_char_constant:;
1285         size_t             size   = (size_t) obstack_object_size(&symbol_obstack);
1286         assert(size % sizeof(wchar_rep_t) == 0);
1287         size /= sizeof(wchar_rep_t);
1288
1289         const wchar_rep_t *string = obstack_finish(&symbol_obstack);
1290
1291         lexer_token.type                = T_WIDE_CHARACTER_CONSTANT;
1292         lexer_token.v.wide_string.begin = string;
1293         lexer_token.v.wide_string.size  = size;
1294         lexer_token.datatype            = type_wchar_t;
1295 }
1296
1297 /**
1298  * Parse a wide string literal and set lexer_token.
1299  */
1300 static void parse_wide_string_literal(void)
1301 {
1302         const unsigned start_linenr = lexer_token.source_position.linenr;
1303
1304         assert(c == '"');
1305         next_char();
1306
1307         while(1) {
1308                 switch(c) {
1309                 case '\\': {
1310                         wchar_rep_t tc = parse_escape_sequence();
1311                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1312                         break;
1313                 }
1314
1315                 case EOF: {
1316                         source_position_t source_position;
1317                         source_position.input_name = lexer_token.source_position.input_name;
1318                         source_position.linenr     = start_linenr;
1319                         errorf(&source_position, "string has no end");
1320                         lexer_token.type = T_ERROR;
1321                         return;
1322                 }
1323
1324                 case '"':
1325                         next_char();
1326                         goto end_of_string;
1327
1328                 default: {
1329                         wchar_rep_t tc = c;
1330                         obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1331                         next_char();
1332                         break;
1333                 }
1334                 }
1335         }
1336
1337 end_of_string:;
1338
1339         /* TODO: concatenate multiple strings separated by whitespace... */
1340
1341         /* add finishing 0 to the string */
1342         wchar_rep_t nul = L'\0';
1343         obstack_grow(&symbol_obstack, &nul, sizeof(nul));
1344         const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
1345         const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1346
1347 #if 0 /* TODO hash */
1348         /* check if there is already a copy of the string */
1349         const wchar_rep_t *const result = strset_insert(&stringset, string);
1350         if(result != string) {
1351                 obstack_free(&symbol_obstack, string);
1352         }
1353 #else
1354         const wchar_rep_t *const result = string;
1355 #endif
1356
1357         lexer_token.type                = T_WIDE_STRING_LITERAL;
1358         lexer_token.v.wide_string.begin = result;
1359         lexer_token.v.wide_string.size  = size;
1360 }
1361
1362 /**
1363  * Parse a character constant and set lexer_token.
1364  */
1365 static void parse_character_constant(void)
1366 {
1367         const unsigned start_linenr = lexer_token.source_position.linenr;
1368
1369         eat('\'');
1370
1371         while(1) {
1372                 switch(c) {
1373                 case '\\': {
1374                         utf32 const tc = parse_escape_sequence();
1375                         if (tc >= 0x100) {
1376                                 warningf(&lexer_token.source_position,
1377                                                 "escape sequence out of range");
1378                         }
1379                         obstack_1grow(&symbol_obstack, tc);
1380                         break;
1381                 }
1382
1383                 MATCH_NEWLINE(
1384                         parse_error("newline while parsing character constant");
1385                         break;
1386                 )
1387
1388                 case '\'':
1389                         next_char();
1390                         goto end_of_char_constant;
1391
1392                 case EOF: {
1393                         source_position_t source_position;
1394                         source_position.input_name = lexer_token.source_position.input_name;
1395                         source_position.linenr     = start_linenr;
1396                         errorf(&source_position, "EOF while parsing character constant");
1397                         lexer_token.type = T_ERROR;
1398                         return;
1399                 }
1400
1401                 default:
1402                         grow_symbol(c);
1403                         next_char();
1404                         break;
1405
1406                 }
1407         }
1408
1409 end_of_char_constant:;
1410         const size_t      size   = (size_t)obstack_object_size(&symbol_obstack);
1411         const char *const string = obstack_finish(&symbol_obstack);
1412
1413         lexer_token.type           = T_CHARACTER_CONSTANT;
1414         lexer_token.v.string.begin = string;
1415         lexer_token.v.string.size  = size;
1416         lexer_token.datatype       = c_mode & _CXX && size == 1 ? type_char : type_int;
1417 }
1418
1419 /**
1420  * Skip a multiline comment.
1421  */
1422 static void skip_multiline_comment(void)
1423 {
1424         unsigned start_linenr = lexer_token.source_position.linenr;
1425
1426         while(1) {
1427                 switch(c) {
1428                 case '/':
1429                         next_char();
1430                         if (c == '*') {
1431                                 /* nested comment, warn here */
1432                                 if (warning.comment) {
1433                                         warningf(&lexer_token.source_position, "'/*' within comment");
1434                                 }
1435                         }
1436                         break;
1437                 case '*':
1438                         next_char();
1439                         if(c == '/') {
1440                                 next_char();
1441                                 return;
1442                         }
1443                         break;
1444
1445                 MATCH_NEWLINE(break;)
1446
1447                 case EOF: {
1448                         source_position_t source_position;
1449                         source_position.input_name = lexer_token.source_position.input_name;
1450                         source_position.linenr     = start_linenr;
1451                         errorf(&source_position, "at end of file while looking for comment end");
1452                         return;
1453                 }
1454
1455                 default:
1456                         next_char();
1457                         break;
1458                 }
1459         }
1460 }
1461
1462 /**
1463  * Skip a single line comment.
1464  */
1465 static void skip_line_comment(void)
1466 {
1467         while(1) {
1468                 switch(c) {
1469                 case EOF:
1470                         return;
1471
1472                 case '\n':
1473                 case '\r':
1474                         return;
1475
1476                 case '\\':
1477                         next_char();
1478                         if (c == '\n' || c == '\r') {
1479                                 if (warning.comment)
1480                                         warningf(&lexer_token.source_position, "multi-line comment");
1481                                 return;
1482                         }
1483                         break;
1484
1485                 default:
1486                         next_char();
1487                         break;
1488                 }
1489         }
1490 }
1491
1492 /** The current preprocessor token. */
1493 static token_t pp_token;
1494
1495 /**
1496  * Read the next preprocessor token.
1497  */
1498 static inline void next_pp_token(void)
1499 {
1500         lexer_next_preprocessing_token();
1501         pp_token = lexer_token;
1502 }
1503
1504 /**
1505  * Eat all preprocessor tokens until newline.
1506  */
1507 static void eat_until_newline(void)
1508 {
1509         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1510                 next_pp_token();
1511         }
1512 }
1513
1514 /**
1515  * Handle the define directive.
1516  */
1517 static void define_directive(void)
1518 {
1519         lexer_next_preprocessing_token();
1520         if(lexer_token.type != T_IDENTIFIER) {
1521                 parse_error("expected identifier after #define\n");
1522                 eat_until_newline();
1523         }
1524 }
1525
/**
 * Handle the ifdef (and ifndef) directive.
 *
 * Currently only reads the token following the directive name; no
 * conditional processing is done.
 *
 * @param is_ifndef  non-zero for ifndef, zero for ifdef (not used yet)
 */
static void ifdef_directive(int is_ifndef)
{
	lexer_next_preprocessing_token();
	/* TODO: expect an identifier here, followed by a newline */
	(void) is_ifndef;
}
1536
/**
 * Handle the endif directive.
 *
 * Placeholder: nothing is done yet.
 */
static void endif_directive(void)
{
	/* TODO: expect a newline here */
}
1544
1545 /**
1546  * Parse the line directive.
1547  */
1548 static void parse_line_directive(void)
1549 {
1550         if(pp_token.type != T_INTEGER) {
1551                 parse_error("expected integer");
1552         } else {
1553                 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1554                 next_pp_token();
1555         }
1556         if(pp_token.type == T_STRING_LITERAL) {
1557                 lexer_token.source_position.input_name = pp_token.v.string.begin;
1558                 next_pp_token();
1559         }
1560
1561         eat_until_newline();
1562 }
1563
/**
 * The pragmas recognised after "#pragma STDC".
 */
typedef enum stdc_pragma_kind_t {
	STDC_UNKNOWN          = 0,  /**< not (yet) recognised */
	STDC_FP_CONTRACT      = 1,  /**< FP_CONTRACT */
	STDC_FENV_ACCESS      = 2,  /**< FENV_ACCESS */
	STDC_CX_LIMITED_RANGE = 3   /**< CX_LIMITED_RANGE */
} stdc_pragma_kind_t;
1573
/**
 * The argument values accepted by the STDC pragmas.
 */
typedef enum stdc_pragma_value_kind_t {
	STDC_VALUE_UNKNOWN = 0,  /**< not (yet) recognised */
	STDC_VALUE_ON      = 1,  /**< ON */
	STDC_VALUE_OFF     = 2,  /**< OFF */
	STDC_VALUE_DEFAULT = 3   /**< DEFAULT */
} stdc_pragma_value_kind_t;
1583
1584 /**
1585  * Parse a pragma directive.
1586  */
1587 static void parse_pragma(void)
1588 {
1589         bool unknown_pragma = true;
1590
1591         next_pp_token();
1592         if (pp_token.v.symbol->pp_ID == TP_STDC) {
1593                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1594                 /* a STDC pragma */
1595                 if (c_mode & _C99) {
1596                         next_pp_token();
1597
1598                         switch (pp_token.v.symbol->pp_ID) {
1599                         case TP_FP_CONTRACT:
1600                                 kind = STDC_FP_CONTRACT;
1601                                 break;
1602                         case TP_FENV_ACCESS:
1603                                 kind = STDC_FENV_ACCESS;
1604                                 break;
1605                         case TP_CX_LIMITED_RANGE:
1606                                 kind = STDC_CX_LIMITED_RANGE;
1607                                 break;
1608                         default:
1609                                 break;
1610                         }
1611                         if (kind != STDC_UNKNOWN) {
1612                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1613                                 next_pp_token();
1614                                 switch (pp_token.v.symbol->pp_ID) {
1615                                 case TP_ON:
1616                                         value = STDC_VALUE_ON;
1617                                         break;
1618                                 case TP_OFF:
1619                                         value = STDC_VALUE_OFF;
1620                                         break;
1621                                 case TP_DEFAULT:
1622                                         value = STDC_VALUE_DEFAULT;
1623                                         break;
1624                                 default:
1625                                         break;
1626                                 }
1627                                 if (value != STDC_VALUE_UNKNOWN) {
1628                                         unknown_pragma = false;
1629                                 } else {
1630                                         errorf(&pp_token.source_position, "bad STDC pragma argument");
1631                                 }
1632                         }
1633                 }
1634         } else {
1635                 unknown_pragma = true;
1636         }
1637         eat_until_newline();
1638         if (unknown_pragma && warning.unknown_pragmas) {
1639                 warningf(&pp_token.source_position, "encountered unknown #pragma");
1640         }
1641 }
1642
1643 /**
1644  * Parse a preprocessor non-null directive.
1645  */
1646 static void parse_preprocessor_identifier(void)
1647 {
1648         assert(pp_token.type == T_IDENTIFIER);
1649         symbol_t *symbol = pp_token.v.symbol;
1650
1651         switch(symbol->pp_ID) {
1652         case TP_include:
1653                 printf("include - enable header name parsing!\n");
1654                 break;
1655         case TP_define:
1656                 define_directive();
1657                 break;
1658         case TP_ifdef:
1659                 ifdef_directive(0);
1660                 break;
1661         case TP_ifndef:
1662                 ifdef_directive(1);
1663                 break;
1664         case TP_endif:
1665                 endif_directive();
1666                 break;
1667         case TP_line:
1668                 next_pp_token();
1669                 parse_line_directive();
1670                 break;
1671         case TP_if:
1672         case TP_else:
1673         case TP_elif:
1674         case TP_undef:
1675         case TP_error:
1676                 /* TODO; output the rest of the line */
1677                 parse_error("#error directive: ");
1678                 break;
1679         case TP_pragma:
1680                 parse_pragma();
1681                 break;
1682         }
1683 }
1684
1685 /**
1686  * Parse a preprocessor directive.
1687  */
1688 static void parse_preprocessor_directive(void)
1689 {
1690         next_pp_token();
1691
1692         switch(pp_token.type) {
1693         case T_IDENTIFIER:
1694                 parse_preprocessor_identifier();
1695                 break;
1696         case T_INTEGER:
1697                 parse_line_directive();
1698                 break;
1699         case '\n':
1700                 /* NULL directive, see § 6.10.7 */
1701                 break;
1702         default:
1703                 parse_error("invalid preprocessor directive");
1704                 eat_until_newline();
1705                 break;
1706         }
1707 }
1708
/*
 * Building blocks for lexing a token whose kind depends on the character
 * that follows.  They only work in combination:
 *
 *  MAYBE_PROLOG     consumes the current character and opens a
 *                   "while (1) { switch (c) {" construct
 *  MAYBE(ch, type)  adds a case: if the next character is ch, it is
 *                   consumed, lexer_token.type becomes type and the
 *                   enclosing function returns
 *  ELSE_CODE(code)  adds the default case running code, closes the
 *                   switch and the loop and ends with a "break"
 *  ELSE(type)       ELSE_CODE that sets lexer_token.type to type and
 *                   returns
 */
#define MAYBE_PROLOG                 \
	next_char();                     \
	while (1) {                      \
		switch (c) {

#define MAYBE(ch, set_type)          \
		case ch:                     \
			next_char();             \
			lexer_token.type = set_type; \
			return;

#define ELSE_CODE(code)              \
		default:                     \
			code                     \
		}                            \
	} /* end of while (1) */         \
	break;

#define ELSE(set_type)               \
	ELSE_CODE(                       \
		lexer_token.type = set_type; \
		return;                      \
	)
1732
/**
 * Read the next preprocessing token and store it in lexer_token.
 *
 * The function dispatches on the current character c:
 *  - ' ' and '\t' are consumed and skipped
 *  - the newline case (MATCH_NEWLINE, defined elsewhere in this file)
 *    yields a '\n' token
 *  - identifiers, numbers, string literals and character constants are
 *    delegated to the parse_*() helpers; an identifier that is the symbol
 *    L directly followed by '"' or '\'' is lexed as a wide string literal
 *    or wide character constant
 *  - punctuators are matched with the MAYBE_PROLOG/MAYBE/ELSE macros,
 *    including the digraphs <: :> <% %> %: and %:%:
 *  - comments are skipped and the token after them is returned
 *  - EOF yields T_EOF
 *  - any other character is reported with errorf() and yields T_ERROR
 */
1733 void lexer_next_preprocessing_token(void)
1734 {
1735         while(1) {
1736                 switch(c) {
                      /* blanks: consume and dispatch again on the next character */
1737                 case ' ':
1738                 case '\t':
1739                         next_char();
1740                         break;
1741
1742                 MATCH_NEWLINE(
1743                         lexer_token.type = '\n';
1744                         return;
1745                 )
1746
1747                 SYMBOL_CHARS
1748                         parse_symbol();
1749                         /* might be a wide string ( L"string" ) */
1750                         if(lexer_token.type == T_IDENTIFIER &&
1751                             lexer_token.v.symbol == symbol_L) {
1752                             if(c == '"') {
1753                                         parse_wide_string_literal();
1754                                 } else if(c == '\'') {
1755                                         parse_wide_character_constant();
1756                                 }
1757                         }
1758                         return;
1759
1760                 DIGITS
1761                         parse_number();
1762                         return;
1763
1764                 case '"':
1765                         parse_string_literal();
1766                         return;
1767
1768                 case '\'':
1769                         parse_character_constant();
1770                         return;
1771
1772                 case '.':
1773                         MAYBE_PROLOG
                                      /* '.' followed by a digit: un-read the digit and make
                                       * '.' the current character again before handing over
                                       * to parse_number_dec() */
1774                                 DIGITS
1775                                         put_back(c);
1776                                         c = '.';
1777                                         parse_number_dec();
1778                                         return;
1779
                                      /* "..." is T_DOTDOTDOT; for ".." followed by anything
                                       * else the third character is put back, the second
                                       * '.' becomes current again and a single '.' token is
                                       * returned */
1780                                 case '.':
1781                                         MAYBE_PROLOG
1782                                         MAYBE('.', T_DOTDOTDOT)
1783                                         ELSE_CODE(
1784                                                 put_back(c);
1785                                                 c = '.';
1786                                                 lexer_token.type = '.';
1787                                                 return;
1788                                         )
1789                         ELSE('.')
1790                 case '&':
1791                         MAYBE_PROLOG
1792                         MAYBE('&', T_ANDAND)
1793                         MAYBE('=', T_ANDEQUAL)
1794                         ELSE('&')
1795                 case '*':
1796                         MAYBE_PROLOG
1797                         MAYBE('=', T_ASTERISKEQUAL)
1798                         ELSE('*')
1799                 case '+':
1800                         MAYBE_PROLOG
1801                         MAYBE('+', T_PLUSPLUS)
1802                         MAYBE('=', T_PLUSEQUAL)
1803                         ELSE('+')
1804                 case '-':
1805                         MAYBE_PROLOG
1806                         MAYBE('>', T_MINUSGREATER)
1807                         MAYBE('-', T_MINUSMINUS)
1808                         MAYBE('=', T_MINUSEQUAL)
1809                         ELSE('-')
1810                 case '!':
1811                         MAYBE_PROLOG
1812                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1813                         ELSE('!')
1814                 case '/':
1815                         MAYBE_PROLOG
1816                         MAYBE('=', T_SLASHEQUAL)
                                      /* comments produce no token: skip them and return
                                       * whatever token follows via a recursive call */
1817                                 case '*':
1818                                         next_char();
1819                                         skip_multiline_comment();
1820                                         lexer_next_preprocessing_token();
1821                                         return;
1822                                 case '/':
1823                                         next_char();
1824                                         skip_line_comment();
1825                                         lexer_next_preprocessing_token();
1826                                         return;
1827                         ELSE('/')
1828                 case '%':
1829                         MAYBE_PROLOG
                              /* digraph %> stands for '}' */
1830                         MAYBE('>', '}')
1831                         MAYBE('=', T_PERCENTEQUAL)
                                      /* digraph %: stands for '#', %:%: for T_HASHHASH;
                                       * on "%:%" followed by something else the last
                                       * character is put back, '%' becomes current again
                                       * and only '#' is returned */
1832                                 case ':':
1833                                         MAYBE_PROLOG
1834                                                 case '%':
1835                                                         MAYBE_PROLOG
1836                                                         MAYBE(':', T_HASHHASH)
1837                                                         ELSE_CODE(
1838                                                                 put_back(c);
1839                                                                 c = '%';
1840                                                                 lexer_token.type = '#';
1841                                                                 return;
1842                                                         )
1843                                         ELSE('#')
1844                         ELSE('%')
1845                 case '<':
1846                         MAYBE_PROLOG
                              /* digraphs <: and <% stand for '[' and '{' */
1847                         MAYBE(':', '[')
1848                         MAYBE('%', '{')
1849                         MAYBE('=', T_LESSEQUAL)
1850                                 case '<':
1851                                         MAYBE_PROLOG
1852                                         MAYBE('=', T_LESSLESSEQUAL)
1853                                         ELSE(T_LESSLESS)
1854                         ELSE('<')
1855                 case '>':
1856                         MAYBE_PROLOG
1857                         MAYBE('=', T_GREATEREQUAL)
1858                                 case '>':
1859                                         MAYBE_PROLOG
1860                                         MAYBE('=', T_GREATERGREATEREQUAL)
1861                                         ELSE(T_GREATERGREATER)
1862                         ELSE('>')
1863                 case '^':
1864                         MAYBE_PROLOG
1865                         MAYBE('=', T_CARETEQUAL)
1866                         ELSE('^')
1867                 case '|':
1868                         MAYBE_PROLOG
1869                         MAYBE('=', T_PIPEEQUAL)
1870                         MAYBE('|', T_PIPEPIPE)
1871                         ELSE('|')
1872                 case ':':
1873                         MAYBE_PROLOG
                              /* digraph :> stands for ']' */
1874                         MAYBE('>', ']')
1875                         ELSE(':')
1876                 case '=':
1877                         MAYBE_PROLOG
1878                         MAYBE('=', T_EQUALEQUAL)
1879                         ELSE('=')
1880                 case '#':
1881                         MAYBE_PROLOG
1882                         MAYBE('#', T_HASHHASH)
1883                         ELSE('#')
1884
                      /* single character tokens: the character itself is the
                       * token type */
1885                 case '?':
1886                 case '[':
1887                 case ']':
1888                 case '(':
1889                 case ')':
1890                 case '{':
1891                 case '}':
1892                 case '~':
1893                 case ';':
1894                 case ',':
1895                 case '\\':
1896                         lexer_token.type = c;
1897                         next_char();
1898                         return;
1899
                      /* end of input: c is not advanced here */
1900                 case EOF:
1901                         lexer_token.type = T_EOF;
1902                         return;
1903
1904                 default:
                      /* NOTE(review): no goto to dollar_sign is visible in this
                       * function; presumably it is the target of a goto inside one
                       * of the macros used above - confirm */
1905 dollar_sign:
1906                         errorf(&lexer_token.source_position, "unknown character '%c' found", c);
1907                         next_char();
1908                         lexer_token.type = T_ERROR;
1909                         return;
1910                 }
1911         }
1912 }
1913
/**
 * Read the next token into lexer_token, hiding newlines and preprocessor
 * directives from the caller.
 *
 * '\n' tokens are skipped. A '#' token is handed to
 * parse_preprocessor_directive(); afterwards the next preprocessing token
 * is fetched and the newline/'#' checks are repeated, so the function only
 * returns with a token that is neither '\n' nor '#'.
 *
 * NOTE(review): every '#' token is treated as the start of a directive;
 * there is no check here that it is the first token on its line - confirm
 * this is intended.
 */
1914 void lexer_next_token(void)
1915 {
1916         lexer_next_preprocessing_token();
1917
1918         while (lexer_token.type == '\n') {
              /* also entered by the goto below: fetch one token, then fall back
               * into the loop condition */
1919 newline_found:
1920                 lexer_next_preprocessing_token();
1921         }
1922
1923         if (lexer_token.type == '#') {
1924                 parse_preprocessor_directive();
                      /* jump into the loop body to read the token after the
                       * directive and re-run both checks */
1925                 goto newline_found;
1926         }
1927 }
1928
/**
 * Set up the lexer's global state: initializes stringset and inserts the
 * symbol "L" into the symbol table, keeping it in symbol_L.
 * lexer_next_preprocessing_token() compares identifiers against symbol_L to
 * recognize the L prefix of wide string literals and wide character
 * constants.
 */
1929 void init_lexer(void)
1930 {
1931         strset_init(&stringset);
1932         symbol_L = symbol_table_insert("L");
1933 }
1934
/**
 * Make the lexer read from a stdio stream.
 *
 * @param stream      stream stored in input
 * @param input_name  name stored in the source position of lexer_token
 *
 * Resets the line number to 0 and clears bufpos/bufend. The stream is only
 * stored here; nothing is read from it in this function.
 */
1935 void lexer_open_stream(FILE *stream, const char *input_name)
1936 {
1937         input                                  = stream;
1938         lexer_token.source_position.linenr     = 0;
1939         lexer_token.source_position.input_name = input_name;
1940
              /* NOTE(review): presumably an empty buffer makes the first
               * next_char() refill from the stream - confirm */
1941         bufpos = NULL;
1942         bufend = NULL;
1943
1944         /* place a virtual \n at the beginning so the lexer knows that we're
1945          * at the beginning of a line */
1946         c = '\n';
1947 }
1948
/**
 * Make the lexer read from an in-memory buffer. Not implemented yet: the
 * code that would set bufpos/bufend is compiled out with "#if 0", buffer
 * and len are ignored and the function calls
 * panic("builtin lexing not done yet").
 *
 * @param buffer      start of the text to lex (currently unused)
 * @param len         length of buffer (currently unused)
 * @param input_name  name stored in the source position of lexer_token
 */
1949 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
1950 {
1951         input                                  = NULL;
1952         lexer_token.source_position.linenr     = 0;
1953         lexer_token.source_position.input_name = input_name;
1954
1955 #if 0 // TODO
1956         bufpos = buffer;
1957         bufend = buffer + len;
1958 #else
              /* the casts only mark the parameters as intentionally unused */
1959         (void)buffer;
1960         (void)len;
1961         panic("builtin lexing not done yet");
1962 #endif
1963
1964         /* place a virtual \n at the beginning so the lexer knows that we're
1965          * at the beginning of a line */
1966         c = '\n';
1967 }
1968
/**
 * Tear down the lexer: destroys stringset, which init_lexer() initialized.
 */
1969 void exit_lexer(void)
1970 {
1971         strset_destroy(&stringset);
1972 }
1973
/**
 * Debugging aid: prints a source position as "<input_name>:<linenr>"
 * followed by a newline to stdout and flushes stdout.
 * The unused attribute is there because nothing in the visible part of
 * this file calls it (presumably meant to be called from a debugger).
 *
 * @param source_position  position to print, passed by value
 */
1974 static __attribute__((unused))
1975 void dbg_pos(const source_position_t source_position)
1976 {
1977         fprintf(stdout, "%s:%u\n", source_position.input_name,
1978                 source_position.linenr);
1979         fflush(stdout);
1980 }