nsz Git - cparser/blob - lexer.c

   1 /*
   2  * This file is part of cparser.
   3  * Copyright (C) 2007-2009 Matthias Braun <matze@braunis.de>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18  * 02111-1307, USA.
  19  */
  20 #include <config.h>
  21
  22 #include "diagnostic.h"
  23 #include "lexer.h"
  24 #include "symbol_t.h"
  25 #include "token_t.h"
  26 #include "symbol_table_t.h"
  27 #include "adt/error.h"
  28 #include "adt/strset.h"
  29 #include "adt/util.h"
  30 #include "types.h"
  31 #include "type_t.h"
  32 #include "target_architecture.h"
  33 #include "parser.h"
  34 #include "warning.h"
  35 #include "lang_features.h"
  36
  37 #include <assert.h>
  38 #include <errno.h>
  39 #include <string.h>
  40 #include <stdbool.h>
  41 #include <ctype.h>
  42
  43 #ifndef _WIN32
  44 #include <strings.h>
  45 #endif
  46
  47 //#define DEBUG_CHARS
  48 #define MAX_PUTBACK 3
  49 #define BUF_SIZE    1024
  50
  51 static utf32             c;
  52 static source_position_t lexer_pos;
  53 token_t                  lexer_token;
  54 static symbol_t         *symbol_L;
  55 static FILE             *input;
  56 static utf32             buf[BUF_SIZE + MAX_PUTBACK];
  57 static const utf32      *bufend;
  58 static const utf32      *bufpos;
  59 static strset_t          stringset;
  60 bool                     allow_dollar_in_symbol = true;
  61
  62 /**
  63  * Prints a parse error message at the current token.
  64  *
  65  * @param msg   the error message
  66  */
  67 static void parse_error(const char *msg)
  68 {
  69         errorf(&lexer_pos, "%s", msg);
  70 }
  71
  72 /**
  73  * Prints an internal error message at the current token.
  74  *
  75  * @param msg   the error message
  76  */
  77 static NORETURN internal_error(const char *msg)
  78 {
  79         internal_errorf(&lexer_pos, "%s", msg);
  80 }
  81
  82 static size_t read_block(unsigned char *const read_buf, size_t const n)
  83 {
  84         size_t const s = fread(read_buf, 1, n, input);
  85         if (s == 0) {
  86                 /* on OS/X ferror appears to return true on eof as well when running
  87                  * the application in gdb... */
  88                 if (!feof(input) && ferror(input))
  89                         parse_error("read from input failed");
  90                 buf[MAX_PUTBACK] = EOF;
  91                 bufpos           = buf + MAX_PUTBACK;
  92                 bufend           = buf + MAX_PUTBACK + 1;
  93         }
  94         return s;
  95 }
  96
  97 static void decode_iso_8859_1(void)
  98 {
  99         unsigned char read_buf[BUF_SIZE];
 100         size_t const s = read_block(read_buf, sizeof(read_buf));
 101         if (s == 0)
 102                 return;
 103
 104         unsigned char const *src = read_buf;
 105         unsigned char const *end = read_buf + s;
 106         utf32               *dst = buf + MAX_PUTBACK;
 107         while (src != end)
 108                 *dst++ = *src++;
 109
 110         bufpos = buf + MAX_PUTBACK;
 111         bufend = dst;
 112 }
 113
 114 static void decode_iso_8859_15(void)
 115 {
 116         unsigned char read_buf[BUF_SIZE];
 117         size_t const s = read_block(read_buf, sizeof(read_buf));
 118         if (s == 0)
 119                 return;
 120
 121         unsigned char const *src = read_buf;
 122         unsigned char const *end = read_buf + s;
 123         utf32               *dst = buf + MAX_PUTBACK;
 124         while (src != end) {
 125                 utf32 tc = *src++;
 126                 switch (tc) {
 127                         case 0xA4: tc = 0x20AC; break; // €
 128                         case 0xA6: tc = 0x0160; break; // Š
 129                         case 0xA8: tc = 0x0161; break; // š
 130                         case 0xB4: tc = 0x017D; break; // Ž
 131                         case 0xB8: tc = 0x017E; break; // ž
 132                         case 0xBC: tc = 0x0152; break; // Œ
 133                         case 0xBD: tc = 0x0153; break; // œ
 134                         case 0xBE: tc = 0x0178; break; // Ÿ
 135                 }
 136                 *dst++ = tc;
 137         }
 138
 139         bufpos = buf + MAX_PUTBACK;
 140         bufend = dst;
 141 }
 142
 143 static void decode_utf8(void)
 144 {
 145         static utf32  part_decoded_min_code;
 146         static utf32  part_decoded_char;
 147         static size_t part_decoded_rest_len;
 148
 149         do {
 150                 unsigned char read_buf[BUF_SIZE];
 151                 size_t const s = read_block(read_buf, sizeof(read_buf));
 152                 if (s == 0) {
 153                         if (part_decoded_rest_len > 0)
 154                                 parse_error("incomplete input char at end of input");
 155                         return;
 156                 }
 157
 158                 unsigned char const *src = read_buf;
 159                 unsigned char const *end = read_buf + s;
 160                 utf32               *dst = buf + MAX_PUTBACK;
 161                 utf32                decoded;
 162                 utf32                min_code;
 163
 164                 if (part_decoded_rest_len != 0) {
 165                         min_code              = part_decoded_min_code;
 166                         decoded               = part_decoded_char;
 167                         size_t const rest_len = part_decoded_rest_len;
 168                         part_decoded_rest_len = 0;
 169                         switch (rest_len) {
 170                                 case 4:  goto realign;
 171                                 case 3:  goto three_more;
 172                                 case 2:  goto two_more;
 173                                 default: goto one_more;
 174                         }
 175                 }
 176
 177                 while (src != end) {
 178                         if ((*src & 0x80) == 0) {
 179                                 decoded = *src++;
 180                         } else if ((*src & 0xE0) == 0xC0) {
 181                                 min_code = 0x80;
 182                                 decoded  = *src++ & 0x1F;
 183 one_more:
 184                                 if (src == end) {
 185                                         part_decoded_min_code = min_code;
 186                                         part_decoded_char     = decoded;
 187                                         part_decoded_rest_len = 1;
 188                                         break;
 189                                 }
 190                                 if ((*src & 0xC0) == 0x80) {
 191                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 192                                 } else {
 193                                         goto invalid_char;
 194                                 }
 195                                 if (decoded < min_code                      ||
 196                                                 decoded > 0x10FFFF                      ||
 197                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
 198                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
 199                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
 200                                         parse_error("invalid byte sequence in input");
 201                                 }
 202                         } else if ((*src & 0xF0) == 0xE0) {
 203                                 min_code = 0x800;
 204                                 decoded  = *src++ & 0x0F;
 205 two_more:
 206                                 if (src == end) {
 207                                         part_decoded_min_code = min_code;
 208                                         part_decoded_char     = decoded;
 209                                         part_decoded_rest_len = 2;
 210                                         break;
 211                                 }
 212                                 if ((*src & 0xC0) == 0x80) {
 213                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 214                                 } else {
 215                                         goto invalid_char;
 216                                 }
 217                                 goto one_more;
 218                         } else if ((*src & 0xF8) == 0xF0) {
 219                                 min_code = 0x10000;
 220                                 decoded  = *src++ & 0x07;
 221 three_more:
 222                                 if (src == end) {
 223                                         part_decoded_min_code = min_code;
 224                                         part_decoded_char     = decoded;
 225                                         part_decoded_rest_len = 3;
 226                                         break;
 227                                 }
 228                                 if ((*src & 0xC0) == 0x80) {
 229                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 230                                 } else {
 231                                         goto invalid_char;
 232                                 }
 233                                 goto two_more;
 234                         } else {
 235 invalid_char:
 236                                 parse_error("invalid byte sequence in input");
 237 realign:
 238                                 do {
 239                                         ++src;
 240                                         if (src == end) {
 241                                                 part_decoded_rest_len = 4;
 242                                                 break;
 243                                         }
 244                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
 245                                 continue;
 246                         }
 247                         *dst++ = decoded;
 248                 }
 249
 250                 bufpos = buf + MAX_PUTBACK;
 251                 bufend = dst;
 252         } while (bufpos == bufend);
 253 }
 254
 255 static void decode_windows_1252(void)
 256 {
 257         unsigned char read_buf[BUF_SIZE];
 258         size_t const s = read_block(read_buf, sizeof(read_buf));
 259         if (s == 0)
 260                 return;
 261
 262         unsigned char const *src = read_buf;
 263         unsigned char const *end = read_buf + s;
 264         utf32               *dst = buf + MAX_PUTBACK;
 265         while (src != end) {
 266                 utf32 tc = *src++;
 267                 switch (tc) {
 268                         case 0x80: tc = 0x20AC; break; // €
 269                         case 0x82: tc = 0x201A; break; // ‚
 270                         case 0x83: tc = 0x0192; break; // ƒ
 271                         case 0x84: tc = 0x201E; break; // „
 272                         case 0x85: tc = 0x2026; break; // …
 273                         case 0x86: tc = 0x2020; break; // †
 274                         case 0x87: tc = 0x2021; break; // ‡
 275                         case 0x88: tc = 0x02C6; break; // ˆ
 276                         case 0x89: tc = 0x2030; break; // ‰
 277                         case 0x8A: tc = 0x0160; break; // Š
 278                         case 0x8B: tc = 0x2039; break; // ‹
 279                         case 0x8C: tc = 0x0152; break; // Œ
 280                         case 0x8E: tc = 0x017D; break; // Ž
 281                         case 0x91: tc = 0x2018; break; // ‘
 282                         case 0x92: tc = 0x2019; break; // ’
 283                         case 0x93: tc = 0x201C; break; // “
 284                         case 0x94: tc = 0x201D; break; // ”
 285                         case 0x95: tc = 0x2022; break; // •
 286                         case 0x96: tc = 0x2013; break; // –
 287                         case 0x97: tc = 0x2014; break; // —
 288                         case 0x98: tc = 0x02DC; break; // ˜
 289                         case 0x99: tc = 0x2122; break; // ™
 290                         case 0x9A: tc = 0x0161; break; // š
 291                         case 0x9B: tc = 0x203A; break; // ›
 292                         case 0x9C: tc = 0x0153; break; // œ
 293                         case 0x9E: tc = 0x017E; break; // ž
 294                         case 0x9F: tc = 0x0178; break; // Ÿ
 295                 }
 296                 *dst++ = tc;
 297         }
 298
 299         bufpos = buf + MAX_PUTBACK;
 300         bufend = dst;
 301 }
 302
 303 typedef void (*decoder_t)(void);
 304
 305 static decoder_t decoder = decode_utf8;
 306
 307 typedef struct named_decoder_t {
 308         char const *name;
 309         decoder_t   decoder;
 310 } named_decoder_t;
 311
 312 static named_decoder_t const decoders[] = {
 313         { "CP819",           decode_iso_8859_1   }, // offical alias
 314         { "IBM819",          decode_iso_8859_1   }, // offical alias
 315         { "ISO-8859-1",      decode_iso_8859_1   }, // offical alias
 316         { "ISO-8859-15",     decode_iso_8859_15  }, // offical name
 317         { "ISO8859-1",       decode_iso_8859_1   },
 318         { "ISO8859-15",      decode_iso_8859_15  },
 319         { "ISO_8859-1",      decode_iso_8859_1   }, // offical alias
 320         { "ISO_8859-15",     decode_iso_8859_15  }, // offical alias
 321         { "ISO_8859-1:1987", decode_iso_8859_1   }, // offical name
 322         { "Latin-9",         decode_iso_8859_15  }, // offical alias
 323         { "UTF-8",           decode_utf8         }, // offical name
 324         { "csISOLatin1",     decode_iso_8859_1   }, // offical alias
 325         { "cp1252",          decode_windows_1252 },
 326         { "iso-ir-100",      decode_iso_8859_1   }, // offical alias
 327         { "l1",              decode_iso_8859_1   }, // offical alias
 328         { "latin1",          decode_iso_8859_1   }, // offical alias
 329         { "windows-1252",    decode_windows_1252 }, // official name
 330
 331         { NULL,              NULL                }
 332 };
 333
 /**
  * Case-insensitive string comparison.
  * strcasecmp is not part of C99 so we need our own implementation here.
  *
  * Characters are converted to unsigned char before being passed to
  * tolower(), because tolower() is undefined for negative values other than
  * EOF. The result is the difference of the lower-cased characters at the
  * first mismatch, so its sign reflects the case-insensitive order.
  *
  * @param s1  first NUL-terminated string
  * @param s2  second NUL-terminated string
  * @return 0 if the strings are equal ignoring case, a negative value if s1
  *         sorts before s2, a positive value otherwise
  */
 static int my_strcasecmp(const char *s1, const char *s2)
 {
         for (;; ++s1, ++s2) {
                 int const l1 = tolower((unsigned char)*s1);
                 int const l2 = tolower((unsigned char)*s2);
                 if (l1 != l2)
                         return l1 - l2;
                 /* both strings ended at the same position */
                 if (l1 == 0)
                         return 0;
         }
 }
 343
 344 void select_input_encoding(char const* const encoding)
 345 {
 346         for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
 347                 if (my_strcasecmp(encoding, i->name) != 0)
 348                         continue;
 349                 decoder = i->decoder;
 350                 return;
 351         }
 352         fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
 353 }
 354
 355 static inline void next_real_char(void)
 356 {
 357         assert(bufpos <= bufend);
 358         if (bufpos >= bufend) {
 359                 if (input == NULL) {
 360                         c = EOF;
 361                         return;
 362                 }
 363                 decoder();
 364         }
 365         c = *bufpos++;
 366         ++lexer_pos.colno;
 367 }
 368
 369 /**
 370  * Put a character back into the buffer.
 371  *
 372  * @param pc  the character to put back
 373  */
 374 static inline void put_back(utf32 const pc)
 375 {
 376         assert(bufpos > buf);
 377         *(--bufpos - buf + buf) = pc;
 378         --lexer_pos.colno;
 379
 380 #ifdef DEBUG_CHARS
 381         printf("putback '%lc'\n", pc);
 382 #endif
 383 }
 384
 static inline void next_char(void);

 /*
  * Expands to the case labels for the newline forms "\r", "\r\n" and "\n",
  * to be used inside a switch over c. The newline is consumed, the line
  * counter is incremented and the column is reset to 1; afterwards the
  * statement(s) passed as `code` are executed.
  * The '\n' label sits inside the if-body of the '\r' case on purpose, so
  * that both paths share the same next_char() call.
  */
 #define MATCH_NEWLINE(code)          \
         case '\r':                   \
                 next_char();         \
                 if (c == '\n') {     \
         case '\n':                   \
                         next_char(); \
                 }                    \
                 ++lexer_pos.lineno;  \
                 lexer_pos.colno = 1; \
                 code

 /* Asserts that the current character is c_type and moves on to the next. */
 #define eat(c_type) (assert(c == (c_type)), next_char())
 399
 400 static void maybe_concat_lines(void)
 401 {
 402         eat('\\');
 403
 404         switch (c) {
 405         MATCH_NEWLINE(return;)
 406
 407         default:
 408                 break;
 409         }
 410
 411         put_back(c);
 412         c = '\\';
 413 }
 414
 415 /**
 416  * Set c to the next input character, ie.
 417  * after expanding trigraphs.
 418  */
 419 static inline void next_char(void)
 420 {
 421         next_real_char();
 422
 423         /* filter trigraphs */
 424         if (UNLIKELY(c == '\\')) {
 425                 maybe_concat_lines();
 426                 goto end_of_next_char;
 427         }
 428
 429         if (LIKELY(c != '?'))
 430                 goto end_of_next_char;
 431
 432         next_real_char();
 433         if (LIKELY(c != '?')) {
 434                 put_back(c);
 435                 c = '?';
 436                 goto end_of_next_char;
 437         }
 438
 439         next_real_char();
 440         switch (c) {
 441         case '=': c = '#'; break;
 442         case '(': c = '['; break;
 443         case '/': c = '\\'; maybe_concat_lines(); break;
 444         case ')': c = ']'; break;
 445         case '\'': c = '^'; break;
 446         case '<': c = '{'; break;
 447         case '!': c = '|'; break;
 448         case '>': c = '}'; break;
 449         case '-': c = '~'; break;
 450         default:
 451                 put_back(c);
 452                 put_back('?');
 453                 c = '?';
 454                 break;
 455         }
 456
 457 end_of_next_char:;
 458 #ifdef DEBUG_CHARS
 459         printf("nchar '%c'\n", c);
 460 #endif
 461 }
 462
 /*
  * Case labels for the characters '$', 'a'..'z', 'A'..'Z' and '_'.
  * For '$' control jumps to a label named dollar_sign, which the enclosing
  * function has to provide, unless allow_dollar_in_symbol is set; otherwise
  * '$' falls through to the statements following the macro like the other
  * characters.
  */
 #define SYMBOL_CHARS \
         case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': \
         case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': \
         case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': \
         case 'v': case 'w': case 'x': case 'y': case 'z':                     \
         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': \
         case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': \
         case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': \
         case 'V': case 'W': case 'X': case 'Y': case 'Z':                     \
         case '_':
 518
 /* Case labels for the decimal digits '0'..'9'. */
 #define DIGITS \
         case '0': case '1': case '2': case '3': case '4': \
         case '5': case '6': case '7': case '8': case '9':
 530
 531 /**
 532  * Read a symbol from the input and build
 533  * the lexer_token.
 534  */
 535 static void parse_symbol(void)
 536 {
 537         obstack_1grow(&symbol_obstack, (char) c);
 538         next_char();
 539
 540         while (true) {
 541                 switch (c) {
 542                 DIGITS
 543                 SYMBOL_CHARS
 544                         obstack_1grow(&symbol_obstack, (char) c);
 545                         next_char();
 546                         break;
 547
 548                 default:
 549 dollar_sign:
 550                         goto end_symbol;
 551                 }
 552         }
 553
 554 end_symbol:
 555         obstack_1grow(&symbol_obstack, '\0');
 556
 557         char     *string = obstack_finish(&symbol_obstack);
 558         symbol_t *symbol = symbol_table_insert(string);
 559
 560         lexer_token.type   = symbol->ID;
 561         lexer_token.symbol = symbol;
 562
 563         if (symbol->string != string) {
 564                 obstack_free(&symbol_obstack, string);
 565         }
 566 }
 567
 568 /**
 569  * parse suffixes like 'LU' or 'f' after numbers
 570  */
 571 static void parse_number_suffix(void)
 572 {
 573         assert(obstack_object_size(&symbol_obstack) == 0);
 574         while (true) {
 575                 switch (c) {
 576                 SYMBOL_CHARS
 577                         obstack_1grow(&symbol_obstack, (char) c);
 578                         next_char();
 579                         break;
 580                 default:
 581                 dollar_sign:
 582                         goto finish_suffix;
 583                 }
 584         }
 585 finish_suffix:
 586         if (obstack_object_size(&symbol_obstack) == 0) {
 587                 lexer_token.symbol = NULL;
 588                 return;
 589         }
 590
 591         obstack_1grow(&symbol_obstack, '\0');
 592         char     *string = obstack_finish(&symbol_obstack);
 593         symbol_t *symbol = symbol_table_insert(string);
 594
 595         if (symbol->string != string) {
 596                 obstack_free(&symbol_obstack, string);
 597         }
 598         lexer_token.symbol = symbol;
 599 }
 600
 601 static string_t identify_string(char *string, size_t len)
 602 {
 603         /* TODO hash */
 604 #if 0
 605         const char *result = strset_insert(&stringset, concat);
 606         if (result != concat) {
 607                 obstack_free(&symbol_obstack, concat);
 608         }
 609 #else
 610         const char *result = string;
 611 #endif
 612         return (string_t) {result, len};
 613 }
 614
 615 /**
 616  * Parses a hex number including hex floats and set the
 617  * lexer_token.
 618  */
 619 static void parse_number_hex(void)
 620 {
 621         bool is_float   = false;
 622         bool has_digits = false;
 623
 624         assert(obstack_object_size(&symbol_obstack) == 0);
 625         while (isxdigit(c)) {
 626                 has_digits = true;
 627                 obstack_1grow(&symbol_obstack, (char) c);
 628                 next_char();
 629         }
 630
 631         if (c == '.') {
 632                 is_float = true;
 633                 obstack_1grow(&symbol_obstack, (char) c);
 634                 next_char();
 635
 636                 while (isxdigit(c)) {
 637                         has_digits = true;
 638                         obstack_1grow(&symbol_obstack, (char) c);
 639                         next_char();
 640                 }
 641         }
 642         if (c == 'p' || c == 'P') {
 643                 is_float = true;
 644                 obstack_1grow(&symbol_obstack, (char) c);
 645                 next_char();
 646
 647                 if (c == '-' || c == '+') {
 648                         obstack_1grow(&symbol_obstack, (char) c);
 649                         next_char();
 650                 }
 651
 652                 while (isxdigit(c)) {
 653                         obstack_1grow(&symbol_obstack, (char) c);
 654                         next_char();
 655                 }
 656         } else if (is_float) {
 657                 errorf(&lexer_token.source_position,
 658                        "hexadecimal floatingpoint constant requires an exponent");
 659         }
 660         obstack_1grow(&symbol_obstack, '\0');
 661
 662         size_t  size   = obstack_object_size(&symbol_obstack) - 1;
 663         char   *string = obstack_finish(&symbol_obstack);
 664         lexer_token.literal = identify_string(string, size);
 665
 666         lexer_token.type    =
 667                 is_float ? T_FLOATINGPOINT_HEXADECIMAL : T_INTEGER_HEXADECIMAL;
 668
 669         if (!has_digits) {
 670                 errorf(&lexer_token.source_position, "invalid number literal '0x%S'",
 671                        &lexer_token.literal);
 672                 lexer_token.literal.begin = "0";
 673                 lexer_token.literal.size  = 1;
 674         }
 675
 676         parse_number_suffix();
 677 }
 678
 679 /**
 680  * Returns true if the given char is a octal digit.
 681  *
 682  * @param char  the character to check
 683  */
 684 static bool is_octal_digit(utf32 chr)
 685 {
 686         return '0' <= chr && chr <= '7';
 687 }
 688
 689 /**
 690  * Parses a number and sets the lexer_token.
 691  */
 692 static void parse_number(void)
 693 {
 694         bool is_float   = false;
 695         bool has_digits = false;
 696
 697         assert(obstack_object_size(&symbol_obstack) == 0);
 698         if (c == '0') {
 699                 next_char();
 700                 if (c == 'x' || c == 'X') {
 701                         next_char();
 702                         parse_number_hex();
 703                         return;
 704                 } else {
 705                         has_digits = true;
 706                 }
 707                 obstack_1grow(&symbol_obstack, '0');
 708         }
 709
 710         while (isdigit(c)) {
 711                 has_digits = true;
 712                 obstack_1grow(&symbol_obstack, (char) c);
 713                 next_char();
 714         }
 715
 716         if (c == '.') {
 717                 is_float = true;
 718                 obstack_1grow(&symbol_obstack, '.');
 719                 next_char();
 720
 721                 while (isdigit(c)) {
 722                         has_digits = true;
 723                         obstack_1grow(&symbol_obstack, (char) c);
 724                         next_char();
 725                 }
 726         }
 727         if (c == 'e' || c == 'E') {
 728                 is_float = true;
 729                 obstack_1grow(&symbol_obstack, 'e');
 730                 next_char();
 731
 732                 if (c == '-' || c == '+') {
 733                         obstack_1grow(&symbol_obstack, (char) c);
 734                         next_char();
 735                 }
 736
 737                 while (isdigit(c)) {
 738                         obstack_1grow(&symbol_obstack, (char) c);
 739                         next_char();
 740                 }
 741         }
 742
 743         obstack_1grow(&symbol_obstack, '\0');
 744         size_t  size   = obstack_object_size(&symbol_obstack) - 1;
 745         char   *string = obstack_finish(&symbol_obstack);
 746         lexer_token.literal = identify_string(string, size);
 747
 748         /* is it an octal number? */
 749         if (is_float) {
 750                 lexer_token.type = T_FLOATINGPOINT;
 751         } else if (string[0] == '0') {
 752                 lexer_token.type = T_INTEGER_OCTAL;
 753
 754                 /* check for invalid octal digits */
 755                 for (size_t i= 0; i < size; ++i) {
 756                         char t = string[i];
 757                         if (t >= '8')
 758                                 errorf(&lexer_token.source_position,
 759                                        "invalid digit '%c' in octal number", t);
 760                 }
 761         } else {
 762                 lexer_token.type = T_INTEGER;
 763         }
 764
 765         if (!has_digits) {
 766                 errorf(&lexer_token.source_position, "invalid number literal '%S'",
 767                        &lexer_token.literal);
 768         }
 769
 770         parse_number_suffix();
 771 }
 772
 773 /**
 774  * Returns the value of a digit.
 775  * The only portable way to do it ...
 776  */
 777 static int digit_value(utf32 const digit)
 778 {
 779         switch (digit) {
 780         case '0': return 0;
 781         case '1': return 1;
 782         case '2': return 2;
 783         case '3': return 3;
 784         case '4': return 4;
 785         case '5': return 5;
 786         case '6': return 6;
 787         case '7': return 7;
 788         case '8': return 8;
 789         case '9': return 9;
 790         case 'a':
 791         case 'A': return 10;
 792         case 'b':
 793         case 'B': return 11;
 794         case 'c':
 795         case 'C': return 12;
 796         case 'd':
 797         case 'D': return 13;
 798         case 'e':
 799         case 'E': return 14;
 800         case 'f':
 801         case 'F': return 15;
 802         default:
 803                 internal_error("wrong character given");
 804         }
 805 }
 806
 807 /**
 808  * Parses an octal character sequence.
 809  *
 810  * @param first_digit  the already read first digit
 811  */
 812 static utf32 parse_octal_sequence(utf32 const first_digit)
 813 {
 814         assert(is_octal_digit(first_digit));
 815         utf32 value = digit_value(first_digit);
 816         if (!is_octal_digit(c)) return value;
 817         value = 8 * value + digit_value(c);
 818         next_char();
 819         if (!is_octal_digit(c)) return value;
 820         value = 8 * value + digit_value(c);
 821         next_char();
 822         return value;
 823 }
 824
 825 /**
 826  * Parses a hex character sequence.
 827  */
 828 static utf32 parse_hex_sequence(void)
 829 {
 830         utf32 value = 0;
 831         while (isxdigit(c)) {
 832                 value = 16 * value + digit_value(c);
 833                 next_char();
 834         }
 835         return value;
 836 }
 837
 838 /**
 839  * Parse an escape sequence.
 840  */
 841 static utf32 parse_escape_sequence(void)
 842 {
 843         eat('\\');
 844
 845         utf32 const ec = c;
 846         next_char();
 847
 848         switch (ec) {
 849         case '"':  return '"';
 850         case '\'': return '\'';
 851         case '\\': return '\\';
 852         case '?': return '\?';
 853         case 'a': return '\a';
 854         case 'b': return '\b';
 855         case 'f': return '\f';
 856         case 'n': return '\n';
 857         case 'r': return '\r';
 858         case 't': return '\t';
 859         case 'v': return '\v';
 860         case 'x':
 861                 return parse_hex_sequence();
 862         case '0':
 863         case '1':
 864         case '2':
 865         case '3':
 866         case '4':
 867         case '5':
 868         case '6':
 869         case '7':
 870                 return parse_octal_sequence(ec);
 871         case EOF:
 872                 parse_error("reached end of file while parsing escape sequence");
 873                 return EOF;
 874         /* \E is not documented, but handled, by GCC.  It is acceptable according
 875          * to §6.11.4, whereas \e is not. */
 876         case 'E':
 877         case 'e':
 878                 if (c_mode & _GNUC)
 879                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
 880                 break;
 881         case 'u':
 882         case 'U':
 883                 parse_error("universal character parsing not implemented yet");
 884                 return EOF;
 885         default:
 886                 break;
 887         }
 888         /* §6.4.4.4:8 footnote 64 */
 889         parse_error("unknown escape sequence");
 890         return EOF;
 891 }
 892
 893 /**
 894  * Concatenate two strings.
 895  */
 896 string_t concat_strings(const string_t *const s1, const string_t *const s2)
 897 {
 898         const size_t len1 = s1->size - 1;
 899         const size_t len2 = s2->size - 1;
 900
 901         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 902         memcpy(concat, s1->begin, len1);
 903         memcpy(concat + len1, s2->begin, len2 + 1);
 904
 905         return identify_string(concat, len1 + len2 + 1);
 906 }
 907
 908 string_t make_string(const char *string)
 909 {
 910         size_t      len   = strlen(string) + 1;
 911         char *const space = obstack_alloc(&symbol_obstack, len);
 912         memcpy(space, string, len);
 913
 914         return identify_string(space, len);
 915 }
 916
 917 static void grow_symbol(utf32 const tc)
 918 {
 919         struct obstack *const o  = &symbol_obstack;
 920         if (tc < 0x80U) {
 921                 obstack_1grow(o, tc);
 922         } else if (tc < 0x800) {
 923                 obstack_1grow(o, 0xC0 | (tc >> 6));
 924                 obstack_1grow(o, 0x80 | (tc & 0x3F));
 925         } else if (tc < 0x10000) {
 926                 obstack_1grow(o, 0xE0 | ( tc >> 12));
 927                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
 928                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
 929         } else {
 930                 obstack_1grow(o, 0xF0 | ( tc >> 18));
 931                 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
 932                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
 933                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
 934         }
 935 }
 936
 937 /**
 938  * Parse a string literal and set lexer_token.
 939  */
 940 static void parse_string_literal(void)
 941 {
 942         eat('"');
 943
 944         while (true) {
 945                 switch (c) {
 946                 case '\\': {
 947                         utf32 const tc = parse_escape_sequence();
 948                         if (tc >= 0x100) {
 949                                 warningf(WARN_OTHER, &lexer_pos, "escape sequence out of range");
 950                         }
 951                         obstack_1grow(&symbol_obstack, tc);
 952                         break;
 953                 }
 954
 955                 case EOF: {
 956                         errorf(&lexer_token.source_position, "string has no end");
 957                         lexer_token.type = T_ERROR;
 958                         return;
 959                 }
 960
 961                 case '"':
 962                         next_char();
 963                         goto end_of_string;
 964
 965                 default:
 966                         grow_symbol(c);
 967                         next_char();
 968                         break;
 969                 }
 970         }
 971
 972 end_of_string:
 973
 974         /* TODO: concatenate multiple strings separated by whitespace... */
 975
 976         /* add finishing 0 to the string */
 977         obstack_1grow(&symbol_obstack, '\0');
 978         const size_t  size   = (size_t)obstack_object_size(&symbol_obstack);
 979         char         *string = obstack_finish(&symbol_obstack);
 980
 981         lexer_token.type    = T_STRING_LITERAL;
 982         lexer_token.literal = identify_string(string, size);
 983 }
 984
 985 /**
 986  * Parse a wide character constant and set lexer_token.
 987  */
 988 static void parse_wide_character_constant(void)
 989 {
 990         eat('\'');
 991
 992         while (true) {
 993                 switch (c) {
 994                 case '\\': {
 995                         const utf32 tc = parse_escape_sequence();
 996                         grow_symbol(tc);
 997                         break;
 998                 }
 999
1000                 MATCH_NEWLINE(
1001                         parse_error("newline while parsing character constant");
1002                         break;
1003                 )
1004
1005                 case '\'':
1006                         next_char();
1007                         goto end_of_wide_char_constant;
1008
1009                 case EOF: {
1010                         errorf(&lexer_token.source_position, "EOF while parsing character constant");
1011                         lexer_token.type = T_ERROR;
1012                         return;
1013                 }
1014
1015                 default:
1016                         grow_symbol(c);
1017                         next_char();
1018                         break;
1019                 }
1020         }
1021
1022 end_of_wide_char_constant:;
1023         obstack_1grow(&symbol_obstack, '\0');
1024         size_t  size   = (size_t) obstack_object_size(&symbol_obstack) - 1;
1025         char   *string = obstack_finish(&symbol_obstack);
1026
1027         lexer_token.type     = T_WIDE_CHARACTER_CONSTANT;
1028         lexer_token.literal  = identify_string(string, size);
1029
1030         if (size == 0) {
1031                 errorf(&lexer_token.source_position, "empty character constant");
1032         }
1033 }
1034
1035 /**
1036  * Parse a wide string literal and set lexer_token.
1037  */
1038 static void parse_wide_string_literal(void)
1039 {
1040         parse_string_literal();
1041         if (lexer_token.type == T_STRING_LITERAL)
1042                 lexer_token.type = T_WIDE_STRING_LITERAL;
1043 }
1044
1045 /**
1046  * Parse a character constant and set lexer_token.
1047  */
1048 static void parse_character_constant(void)
1049 {
1050         eat('\'');
1051
1052         while (true) {
1053                 switch (c) {
1054                 case '\\': {
1055                         utf32 const tc = parse_escape_sequence();
1056                         if (tc >= 0x100) {
1057                                 warningf(WARN_OTHER, &lexer_pos, "escape sequence out of range");
1058                         }
1059                         obstack_1grow(&symbol_obstack, tc);
1060                         break;
1061                 }
1062
1063                 MATCH_NEWLINE(
1064                         parse_error("newline while parsing character constant");
1065                         break;
1066                 )
1067
1068                 case '\'':
1069                         next_char();
1070                         goto end_of_char_constant;
1071
1072                 case EOF: {
1073                         errorf(&lexer_token.source_position, "EOF while parsing character constant");
1074                         lexer_token.type = T_ERROR;
1075                         return;
1076                 }
1077
1078                 default:
1079                         grow_symbol(c);
1080                         next_char();
1081                         break;
1082
1083                 }
1084         }
1085
1086 end_of_char_constant:;
1087         obstack_1grow(&symbol_obstack, '\0');
1088         const size_t        size   = (size_t)obstack_object_size(&symbol_obstack)-1;
1089         char         *const string = obstack_finish(&symbol_obstack);
1090
1091         lexer_token.type    = T_CHARACTER_CONSTANT;
1092         lexer_token.literal = identify_string(string, size);
1093
1094         if (size == 0) {
1095                 errorf(&lexer_token.source_position, "empty character constant");
1096         }
1097 }
1098
1099 /**
1100  * Skip a multiline comment.
1101  */
1102 static void skip_multiline_comment(void)
1103 {
1104         while (true) {
1105                 switch (c) {
1106                 case '/':
1107                         next_char();
1108                         if (c == '*') {
1109                                 /* nested comment, warn here */
1110                                 warningf(WARN_COMMENT, &lexer_pos, "'/*' within comment");
1111                         }
1112                         break;
1113                 case '*':
1114                         next_char();
1115                         if (c == '/') {
1116                                 next_char();
1117                                 return;
1118                         }
1119                         break;
1120
1121                 MATCH_NEWLINE(break;)
1122
1123                 case EOF: {
1124                         errorf(&lexer_token.source_position, "at end of file while looking for comment end");
1125                         return;
1126                 }
1127
1128                 default:
1129                         next_char();
1130                         break;
1131                 }
1132         }
1133 }
1134
1135 /**
1136  * Skip a single line comment.
1137  */
1138 static void skip_line_comment(void)
1139 {
1140         while (true) {
1141                 switch (c) {
1142                 case EOF:
1143                         return;
1144
1145                 case '\n':
1146                 case '\r':
1147                         return;
1148
1149                 case '\\':
1150                         next_char();
1151                         if (c == '\n' || c == '\r') {
1152                                 warningf(WARN_COMMENT, &lexer_pos, "multi-line comment");
1153                                 return;
1154                         }
1155                         break;
1156
1157                 default:
1158                         next_char();
1159                         break;
1160                 }
1161         }
1162 }
1163
1164 /** The current preprocessor token. */
1165 static token_t pp_token;
1166
1167 /**
1168  * Read the next preprocessor token.
1169  */
1170 static inline void next_pp_token(void)
1171 {
1172         lexer_next_preprocessing_token();
1173         pp_token = lexer_token;
1174 }
1175
1176 /**
1177  * Eat all preprocessor tokens until newline.
1178  */
1179 static void eat_until_newline(void)
1180 {
1181         while (pp_token.type != '\n' && pp_token.type != T_EOF) {
1182                 next_pp_token();
1183         }
1184 }
1185
1186 /**
1187  * Handle the define directive.
1188  */
1189 static void define_directive(void)
1190 {
1191         lexer_next_preprocessing_token();
1192         if (lexer_token.type != T_IDENTIFIER) {
1193                 parse_error("expected identifier after #define\n");
1194                 eat_until_newline();
1195         }
1196 }
1197
/**
 * Handle the ifdef directive.
 *
 * Currently this only reads the token following the directive name; the
 * condition is not evaluated.
 *
 * @param is_ifndef  non-zero for ifndef, zero for ifdef (unused so far)
 */
static void ifdef_directive(int is_ifndef)
{
	/* TODO: expect an identifier followed by a newline */
	lexer_next_preprocessing_token();
	(void)is_ifndef;
}
1208
/**
 * Handle the endif directive.
 *
 * Does nothing so far.
 */
static void endif_directive(void)
{
	/* TODO: expect a newline */
}
1216
1217 /**
1218  * Parse the line directive.
1219  */
1220 static void parse_line_directive(void)
1221 {
1222         if (pp_token.type != T_INTEGER) {
1223                 parse_error("expected integer");
1224         } else {
1225                 /* use offset -1 as this is about the next line */
1226                 lexer_pos.lineno = atoi(pp_token.literal.begin) - 1;
1227                 next_pp_token();
1228         }
1229         if (pp_token.type == T_STRING_LITERAL) {
1230                 lexer_pos.input_name = pp_token.literal.begin;
1231                 next_pp_token();
1232         }
1233
1234         eat_until_newline();
1235 }
1236
/**
 * The pragmas that may follow "#pragma STDC".
 */
typedef enum stdc_pragma_kind_t {
	STDC_UNKNOWN,          /**< none of the pragmas below */
	STDC_FP_CONTRACT,      /**< FP_CONTRACT */
	STDC_FENV_ACCESS,      /**< FENV_ACCESS */
	STDC_CX_LIMITED_RANGE  /**< CX_LIMITED_RANGE */
} stdc_pragma_kind_t;
1246
/**
 * The arguments a STDC pragma may be given.
 */
typedef enum stdc_pragma_value_kind_t {
	STDC_VALUE_UNKNOWN,  /**< none of the arguments below */
	STDC_VALUE_ON,       /**< ON */
	STDC_VALUE_OFF,      /**< OFF */
	STDC_VALUE_DEFAULT   /**< DEFAULT */
} stdc_pragma_value_kind_t;
1256
1257 /**
1258  * Parse a pragma directive.
1259  */
1260 static void parse_pragma(void)
1261 {
1262         bool unknown_pragma = true;
1263
1264         next_pp_token();
1265         if (pp_token.symbol->pp_ID == TP_STDC) {
1266                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1267                 /* a STDC pragma */
1268                 if (c_mode & _C99) {
1269                         next_pp_token();
1270
1271                         switch (pp_token.symbol->pp_ID) {
1272                         case TP_FP_CONTRACT:
1273                                 kind = STDC_FP_CONTRACT;
1274                                 break;
1275                         case TP_FENV_ACCESS:
1276                                 kind = STDC_FENV_ACCESS;
1277                                 break;
1278                         case TP_CX_LIMITED_RANGE:
1279                                 kind = STDC_CX_LIMITED_RANGE;
1280                                 break;
1281                         default:
1282                                 break;
1283                         }
1284                         if (kind != STDC_UNKNOWN) {
1285                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1286                                 next_pp_token();
1287                                 switch (pp_token.symbol->pp_ID) {
1288                                 case TP_ON:
1289                                         value = STDC_VALUE_ON;
1290                                         break;
1291                                 case TP_OFF:
1292                                         value = STDC_VALUE_OFF;
1293                                         break;
1294                                 case TP_DEFAULT:
1295                                         value = STDC_VALUE_DEFAULT;
1296                                         break;
1297                                 default:
1298                                         break;
1299                                 }
1300                                 if (value != STDC_VALUE_UNKNOWN) {
1301                                         unknown_pragma = false;
1302                                 } else {
1303                                         errorf(&pp_token.source_position, "bad STDC pragma argument");
1304                                 }
1305                         }
1306                 }
1307         } else {
1308                 unknown_pragma = true;
1309         }
1310         eat_until_newline();
1311         if (unknown_pragma) {
1312                 warningf(WARN_UNKNOWN_PRAGMAS, &pp_token.source_position, "encountered unknown #pragma");
1313         }
1314 }
1315
1316 /**
1317  * Parse a preprocessor non-null directive.
1318  */
1319 static void parse_preprocessor_identifier(void)
1320 {
1321         assert(pp_token.type == T_IDENTIFIER);
1322         symbol_t *symbol = pp_token.symbol;
1323
1324         switch (symbol->pp_ID) {
1325         case TP_include:
1326                 printf("include - enable header name parsing!\n");
1327                 break;
1328         case TP_define:
1329                 define_directive();
1330                 break;
1331         case TP_ifdef:
1332                 ifdef_directive(0);
1333                 break;
1334         case TP_ifndef:
1335                 ifdef_directive(1);
1336                 break;
1337         case TP_endif:
1338                 endif_directive();
1339                 break;
1340         case TP_line:
1341                 next_pp_token();
1342                 parse_line_directive();
1343                 break;
1344         case TP_if:
1345         case TP_else:
1346         case TP_elif:
1347         case TP_undef:
1348         case TP_error:
1349                 /* TODO; output the rest of the line */
1350                 parse_error("#error directive: ");
1351                 break;
1352         case TP_pragma:
1353                 parse_pragma();
1354                 break;
1355         }
1356 }
1357
1358 /**
1359  * Parse a preprocessor directive.
1360  */
1361 static void parse_preprocessor_directive(void)
1362 {
1363         next_pp_token();
1364
1365         switch (pp_token.type) {
1366         case T_IDENTIFIER:
1367                 parse_preprocessor_identifier();
1368                 break;
1369         case T_INTEGER:
1370                 parse_line_directive();
1371                 break;
1372         case '\n':
1373                 /* NULL directive, see §6.10.7 */
1374                 break;
1375         default:
1376                 parse_error("invalid preprocessor directive");
1377                 eat_until_newline();
1378                 break;
1379         }
1380 }
1381
/*
 * Helpers for lexing punctuators that share their first character. They are
 * chained inside a case of lexer_next_preprocessing_token(): MAYBE_PROLOG
 * opens a loop and a switch over the following character, MAYBE and
 * MAYBE_MODE add cases to that switch, and ELSE or ELSE_CODE provides the
 * default case and closes the switch and the loop again.
 */

/** Skip the current character and start deciding on the one after it. */
#define MAYBE_PROLOG \
	next_char(); \
	for (;;) { \
		switch (c) {

/** If the character is chr, consume it and return a token of type token_type. */
#define MAYBE(chr, token_type) \
		case chr: \
			next_char(); \
			lexer_token.type = token_type; \
			return;

/**
 * Like MAYBE, but only if c_mode contains mode; otherwise control falls
 * through into whatever follows.
 * must use this as last thing
 */
#define MAYBE_MODE(chr, token_type, mode) \
		case chr: \
			if (c_mode & (mode)) { \
				next_char(); \
				lexer_token.type = token_type; \
				return; \
			} \
			/* fallthrough */

/** For any other character: execute code, return, and close the switch and the loop. */
#define ELSE_CODE(code) \
		default: \
			code \
			return; \
		} \
	} /* end of the loop opened by MAYBE_PROLOG */

/** For any other character: return a token of type token_type without consuming it. */
#define ELSE(token_type) \
	ELSE_CODE(lexer_token.type = token_type;)
1414
1415 void lexer_next_preprocessing_token(void)
1416 {
1417         while (true) {
1418                 lexer_token.source_position = lexer_pos;
1419
1420                 switch (c) {
1421                 case ' ':
1422                 case '\t':
1423                         next_char();
1424                         break;
1425
1426                 MATCH_NEWLINE(
1427                         lexer_token.type = '\n';
1428                         return;
1429                 )
1430
1431                 SYMBOL_CHARS
1432                         parse_symbol();
1433                         /* might be a wide string ( L"string" ) */
1434                         if (lexer_token.symbol == symbol_L) {
1435                                 switch (c) {
1436                                         case '"':  parse_wide_string_literal();     break;
1437                                         case '\'': parse_wide_character_constant(); break;
1438                                 }
1439                         }
1440                         return;
1441
1442                 DIGITS
1443                         parse_number();
1444                         return;
1445
1446                 case '"':
1447                         parse_string_literal();
1448                         return;
1449
1450                 case '\'':
1451                         parse_character_constant();
1452                         return;
1453
1454                 case '.':
1455                         MAYBE_PROLOG
1456                                 DIGITS
1457                                         put_back(c);
1458                                         c = '.';
1459                                         parse_number();
1460                                         return;
1461
1462                                 case '.':
1463                                         MAYBE_PROLOG
1464                                         MAYBE('.', T_DOTDOTDOT)
1465                                         ELSE_CODE(
1466                                                 put_back(c);
1467                                                 c = '.';
1468                                                 lexer_token.type = '.';
1469                                         )
1470                         ELSE('.')
1471                 case '&':
1472                         MAYBE_PROLOG
1473                         MAYBE('&', T_ANDAND)
1474                         MAYBE('=', T_ANDEQUAL)
1475                         ELSE('&')
1476                 case '*':
1477                         MAYBE_PROLOG
1478                         MAYBE('=', T_ASTERISKEQUAL)
1479                         ELSE('*')
1480                 case '+':
1481                         MAYBE_PROLOG
1482                         MAYBE('+', T_PLUSPLUS)
1483                         MAYBE('=', T_PLUSEQUAL)
1484                         ELSE('+')
1485                 case '-':
1486                         MAYBE_PROLOG
1487                         MAYBE('>', T_MINUSGREATER)
1488                         MAYBE('-', T_MINUSMINUS)
1489                         MAYBE('=', T_MINUSEQUAL)
1490                         ELSE('-')
1491                 case '!':
1492                         MAYBE_PROLOG
1493                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1494                         ELSE('!')
1495                 case '/':
1496                         MAYBE_PROLOG
1497                         MAYBE('=', T_SLASHEQUAL)
1498                                 case '*':
1499                                         next_char();
1500                                         skip_multiline_comment();
1501                                         lexer_next_preprocessing_token();
1502                                         return;
1503                                 case '/':
1504                                         next_char();
1505                                         skip_line_comment();
1506                                         lexer_next_preprocessing_token();
1507                                         return;
1508                         ELSE('/')
1509                 case '%':
1510                         MAYBE_PROLOG
1511                         MAYBE('>', '}')
1512                         MAYBE('=', T_PERCENTEQUAL)
1513                                 case ':':
1514                                         MAYBE_PROLOG
1515                                                 case '%':
1516                                                         MAYBE_PROLOG
1517                                                         MAYBE(':', T_HASHHASH)
1518                                                         ELSE_CODE(
1519                                                                 put_back(c);
1520                                                                 c = '%';
1521                                                                 lexer_token.type = '#';
1522                                                         )
1523                                         ELSE('#')
1524                         ELSE('%')
1525                 case '<':
1526                         MAYBE_PROLOG
1527                         MAYBE(':', '[')
1528                         MAYBE('%', '{')
1529                         MAYBE('=', T_LESSEQUAL)
1530                                 case '<':
1531                                         MAYBE_PROLOG
1532                                         MAYBE('=', T_LESSLESSEQUAL)
1533                                         ELSE(T_LESSLESS)
1534                         ELSE('<')
1535                 case '>':
1536                         MAYBE_PROLOG
1537                         MAYBE('=', T_GREATEREQUAL)
1538                                 case '>':
1539                                         MAYBE_PROLOG
1540                                         MAYBE('=', T_GREATERGREATEREQUAL)
1541                                         ELSE(T_GREATERGREATER)
1542                         ELSE('>')
1543                 case '^':
1544                         MAYBE_PROLOG
1545                         MAYBE('=', T_CARETEQUAL)
1546                         ELSE('^')
1547                 case '|':
1548                         MAYBE_PROLOG
1549                         MAYBE('=', T_PIPEEQUAL)
1550                         MAYBE('|', T_PIPEPIPE)
1551                         ELSE('|')
1552                 case ':':
1553                         MAYBE_PROLOG
1554                         MAYBE('>', ']')
1555                         MAYBE_MODE(':', T_COLONCOLON, _CXX)
1556                         ELSE(':')
1557                 case '=':
1558                         MAYBE_PROLOG
1559                         MAYBE('=', T_EQUALEQUAL)
1560                         ELSE('=')
1561                 case '#':
1562                         MAYBE_PROLOG
1563                         MAYBE('#', T_HASHHASH)
1564                         ELSE('#')
1565
1566                 case '?':
1567                 case '[':
1568                 case ']':
1569                 case '(':
1570                 case ')':
1571                 case '{':
1572                 case '}':
1573                 case '~':
1574                 case ';':
1575                 case ',':
1576                 case '\\':
1577                         lexer_token.type = c;
1578                         next_char();
1579                         return;
1580
1581                 case EOF:
1582                         lexer_token.type = T_EOF;
1583                         return;
1584
1585                 default:
1586 dollar_sign:
1587                         errorf(&lexer_pos, "unknown character '%c' found", c);
1588                         next_char();
1589                         lexer_token.type = T_ERROR;
1590                         return;
1591                 }
1592         }
1593 }
1594
1595 void lexer_next_token(void)
1596 {
1597         lexer_next_preprocessing_token();
1598
1599         while (lexer_token.type == '\n') {
1600 newline_found:
1601                 lexer_next_preprocessing_token();
1602         }
1603
1604         if (lexer_token.type == '#') {
1605                 parse_preprocessor_directive();
1606                 goto newline_found;
1607         }
1608 }
1609
1610 void init_lexer(void)
1611 {
1612         strset_init(&stringset);
1613         symbol_L = symbol_table_insert("L");
1614 }
1615
1616 void lexer_open_stream(FILE *stream, const char *input_name)
1617 {
1618         input                = stream;
1619         lexer_pos.lineno     = 0;
1620         lexer_pos.colno      = 0;
1621         lexer_pos.input_name = input_name;
1622
1623         bufpos = NULL;
1624         bufend = NULL;
1625
1626         /* place a virtual \n at the beginning so the lexer knows that we're
1627          * at the beginning of a line */
1628         c = '\n';
1629 }
1630
1631 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
1632 {
1633         input                = NULL;
1634         lexer_pos.lineno     = 0;
1635         lexer_pos.colno      = 0;
1636         lexer_pos.input_name = input_name;
1637
1638 #if 0 // TODO
1639         bufpos = buffer;
1640         bufend = buffer + len;
1641
1642         /* place a virtual \n at the beginning so the lexer knows that we're
1643          * at the beginning of a line */
1644         c = '\n';
1645 #else
1646         (void)buffer;
1647         (void)len;
1648         panic("builtin lexing not done yet");
1649 #endif
1650 }
1651
1652 void exit_lexer(void)
1653 {
1654         strset_destroy(&stringset);
1655 }
1656
1657 static __attribute__((unused))
1658 void dbg_pos(const source_position_t source_position)
1659 {
1660         fprintf(stdout, "%s:%u:%u\n", source_position.input_name,
1661                 source_position.lineno, source_position.colno);
1662         fflush(stdout);
1663 }