nsz Git - cparser/blob - lexer.c

   1 /*
   2  * This file is part of cparser.
   3  * Copyright (C) 2007-2009 Matthias Braun <matze@braunis.de>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  18  * 02111-1307, USA.
  19  */
  20 #include <config.h>
  21
  22 #include "diagnostic.h"
  23 #include "lexer.h"
  24 #include "symbol_t.h"
  25 #include "token_t.h"
  26 #include "symbol_table_t.h"
  27 #include "adt/error.h"
  28 #include "adt/strset.h"
  29 #include "adt/util.h"
  30 #include "types.h"
  31 #include "type_t.h"
  32 #include "target_architecture.h"
  33 #include "parser.h"
  34 #include "warning.h"
  35 #include "lang_features.h"
  36
  37 #include <assert.h>
  38 #include <errno.h>
  39 #include <string.h>
  40 #include <stdbool.h>
  41 #include <ctype.h>
  42
  43 #ifndef _WIN32
  44 #include <strings.h>
  45 #endif
  46
/* Uncomment to trace every character that is read or put back
 * (enables the DEBUG_CHARS sections in put_back() and next_char()). */
//#define DEBUG_CHARS
/* Number of slots reserved in front of the decoded data in buf so that
 * characters can be pushed back with put_back(). */
#define MAX_PUTBACK 3
/* Number of raw input bytes read (and at most characters decoded) per block. */
#define BUF_SIZE    1024

#if defined(_WIN32) || defined(__CYGWIN__)
/* No strtold on windows and no replacement yet */
#define strtold(s, e) strtod(s, e)
#endif

/* The current input character, as delivered by next_char(). */
static utf32        c;
/* The token that is currently being built by the lexer. */
token_t             lexer_token;
/* NOTE(review): not used in this part of the file; presumably the symbol for
 * the wide-string prefix "L" -- confirm against the rest of the lexer. */
symbol_t           *symbol_L;
/* Input stream the decoders read from; NULL makes next_real_char() yield EOF. */
static FILE        *input;
/* Decoded characters; decoders store data starting at buf + MAX_PUTBACK. */
static utf32        buf[BUF_SIZE + MAX_PUTBACK];
/* One past the last valid decoded character in buf. */
static const utf32 *bufend;
/* Next character in buf to be handed out by next_real_char(). */
static const utf32 *bufpos;
/* Only referenced from the disabled (#if 0) code in identify_string(). */
static strset_t     stringset;
/* When false, '$' terminates identifiers and number suffixes (see SYMBOL_CHARS). */
bool                allow_dollar_in_symbol = true;
  65
/**
 * Prints a parse error message at the current token.
 *
 * The message is reported at lexer_token.source_position and is passed as an
 * argument to a "%s" format, so '%' characters inside it are printed verbatim.
 *
 * @param msg   the error message
 */
static void parse_error(const char *msg)
{
        errorf(&lexer_token.source_position, "%s", msg);
}
  75
/**
 * Prints an internal error message at the current token.
 *
 * The message is reported at lexer_token.source_position via
 * internal_errorf(). The function is declared NORETURN, so callers may rely
 * on control not coming back (presumably internal_errorf() aborts -- its
 * definition is not part of this file).
 *
 * @param msg   the error message
 */
static NORETURN internal_error(const char *msg)
{
        internal_errorf(&lexer_token.source_position, "%s", msg);
}
  85
  86 static size_t read_block(unsigned char *const read_buf, size_t const n)
  87 {
  88         size_t const s = fread(read_buf, 1, n, input);
  89         if (s == 0) {
  90                 if (ferror(input))
  91                         parse_error("read from input failed");
  92                 buf[MAX_PUTBACK] = EOF;
  93                 bufpos           = buf + MAX_PUTBACK;
  94                 bufend           = buf + MAX_PUTBACK + 1;
  95         }
  96         return s;
  97 }
  98
  99 static void decode_iso_8859_1(void)
 100 {
 101         unsigned char read_buf[BUF_SIZE];
 102         size_t const s = read_block(read_buf, sizeof(read_buf));
 103         if (s == 0)
 104                 return;
 105
 106         unsigned char const *src = read_buf;
 107         unsigned char const *end = read_buf + s;
 108         utf32               *dst = buf + MAX_PUTBACK;
 109         while (src != end)
 110                 *dst++ = *src++;
 111
 112         bufpos = buf + MAX_PUTBACK;
 113         bufend = dst;
 114 }
 115
 116 static void decode_iso_8859_15(void)
 117 {
 118         unsigned char read_buf[BUF_SIZE];
 119         size_t const s = read_block(read_buf, sizeof(read_buf));
 120         if (s == 0)
 121                 return;
 122
 123         unsigned char const *src = read_buf;
 124         unsigned char const *end = read_buf + s;
 125         utf32               *dst = buf + MAX_PUTBACK;
 126         while (src != end) {
 127                 utf32 tc = *src++;
 128                 switch (tc) {
 129                         case 0xA4: tc = 0x20AC; break; // €
 130                         case 0xA6: tc = 0x0160; break; // Š
 131                         case 0xA8: tc = 0x0161; break; // š
 132                         case 0xB4: tc = 0x017D; break; // Ž
 133                         case 0xB8: tc = 0x017E; break; // ž
 134                         case 0xBC: tc = 0x0152; break; // Œ
 135                         case 0xBD: tc = 0x0153; break; // œ
 136                         case 0xBE: tc = 0x0178; break; // Ÿ
 137                 }
 138                 *dst++ = tc;
 139         }
 140
 141         bufpos = buf + MAX_PUTBACK;
 142         bufend = dst;
 143 }
 144
 145 static void decode_utf8(void)
 146 {
 147         static utf32  part_decoded_min_code;
 148         static utf32  part_decoded_char;
 149         static size_t part_decoded_rest_len;
 150
 151         do {
 152                 unsigned char read_buf[BUF_SIZE];
 153                 size_t const s = read_block(read_buf, sizeof(read_buf));
 154                 if (s == 0) {
 155                         if (part_decoded_rest_len > 0)
 156                                 parse_error("incomplete input char at end of input");
 157                         return;
 158                 }
 159
 160                 unsigned char const *src = read_buf;
 161                 unsigned char const *end = read_buf + s;
 162                 utf32               *dst = buf + MAX_PUTBACK;
 163                 utf32                decoded;
 164                 utf32                min_code;
 165
 166                 if (part_decoded_rest_len != 0) {
 167                         min_code              = part_decoded_min_code;
 168                         decoded               = part_decoded_char;
 169                         size_t const rest_len = part_decoded_rest_len;
 170                         part_decoded_rest_len = 0;
 171                         switch (rest_len) {
 172                                 case 4:  goto realign;
 173                                 case 3:  goto three_more;
 174                                 case 2:  goto two_more;
 175                                 default: goto one_more;
 176                         }
 177                 }
 178
 179                 while (src != end) {
 180                         if ((*src & 0x80) == 0) {
 181                                 decoded = *src++;
 182                         } else if ((*src & 0xE0) == 0xC0) {
 183                                 min_code = 0x80;
 184                                 decoded  = *src++ & 0x1F;
 185 one_more:
 186                                 if (src == end) {
 187                                         part_decoded_min_code = min_code;
 188                                         part_decoded_char     = decoded;
 189                                         part_decoded_rest_len = 1;
 190                                         break;
 191                                 }
 192                                 if ((*src & 0xC0) == 0x80) {
 193                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 194                                 } else {
 195                                         goto invalid_char;
 196                                 }
 197                                 if (decoded < min_code                      ||
 198                                                 decoded > 0x10FFFF                      ||
 199                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
 200                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
 201                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
 202                                         parse_error("invalid byte sequence in input");
 203                                 }
 204                         } else if ((*src & 0xF0) == 0xE0) {
 205                                 min_code = 0x800;
 206                                 decoded  = *src++ & 0x0F;
 207 two_more:
 208                                 if (src == end) {
 209                                         part_decoded_min_code = min_code;
 210                                         part_decoded_char     = decoded;
 211                                         part_decoded_rest_len = 2;
 212                                         break;
 213                                 }
 214                                 if ((*src & 0xC0) == 0x80) {
 215                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 216                                 } else {
 217                                         goto invalid_char;
 218                                 }
 219                                 goto one_more;
 220                         } else if ((*src & 0xF8) == 0xF0) {
 221                                 min_code = 0x10000;
 222                                 decoded  = *src++ & 0x07;
 223 three_more:
 224                                 if (src == end) {
 225                                         part_decoded_min_code = min_code;
 226                                         part_decoded_char     = decoded;
 227                                         part_decoded_rest_len = 3;
 228                                         break;
 229                                 }
 230                                 if ((*src & 0xC0) == 0x80) {
 231                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 232                                 } else {
 233                                         goto invalid_char;
 234                                 }
 235                                 goto two_more;
 236                         } else {
 237 invalid_char:
 238                                 parse_error("invalid byte sequence in input");
 239 realign:
 240                                 do {
 241                                         ++src;
 242                                         if (src == end) {
 243                                                 part_decoded_rest_len = 4;
 244                                                 break;
 245                                         }
 246                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
 247                                 continue;
 248                         }
 249                         *dst++ = decoded;
 250                 }
 251
 252                 bufpos = buf + MAX_PUTBACK;
 253                 bufend = dst;
 254         } while (bufpos == bufend);
 255 }
 256
 257 static void decode_windows_1252(void)
 258 {
 259         unsigned char read_buf[BUF_SIZE];
 260         size_t const s = read_block(read_buf, sizeof(read_buf));
 261         if (s == 0)
 262                 return;
 263
 264         unsigned char const *src = read_buf;
 265         unsigned char const *end = read_buf + s;
 266         utf32               *dst = buf + MAX_PUTBACK;
 267         while (src != end) {
 268                 utf32 tc = *src++;
 269                 switch (tc) {
 270                         case 0x80: tc = 0x20AC; break; // €
 271                         case 0x82: tc = 0x201A; break; // ‚
 272                         case 0x83: tc = 0x0192; break; // ƒ
 273                         case 0x84: tc = 0x201E; break; // „
 274                         case 0x85: tc = 0x2026; break; // …
 275                         case 0x86: tc = 0x2020; break; // †
 276                         case 0x87: tc = 0x2021; break; // ‡
 277                         case 0x88: tc = 0x02C6; break; // ˆ
 278                         case 0x89: tc = 0x2030; break; // ‰
 279                         case 0x8A: tc = 0x0160; break; // Š
 280                         case 0x8B: tc = 0x2039; break; // ‹
 281                         case 0x8C: tc = 0x0152; break; // Œ
 282                         case 0x8E: tc = 0x017D; break; // Ž
 283                         case 0x91: tc = 0x2018; break; // ‘
 284                         case 0x92: tc = 0x2019; break; // ’
 285                         case 0x93: tc = 0x201C; break; // “
 286                         case 0x94: tc = 0x201D; break; // ”
 287                         case 0x95: tc = 0x2022; break; // •
 288                         case 0x96: tc = 0x2013; break; // –
 289                         case 0x97: tc = 0x2014; break; // —
 290                         case 0x98: tc = 0x02DC; break; // ˜
 291                         case 0x99: tc = 0x2122; break; // ™
 292                         case 0x9A: tc = 0x0161; break; // š
 293                         case 0x9B: tc = 0x203A; break; // ›
 294                         case 0x9C: tc = 0x0153; break; // œ
 295                         case 0x9E: tc = 0x017E; break; // ž
 296                         case 0x9F: tc = 0x0178; break; // Ÿ
 297                 }
 298                 *dst++ = tc;
 299         }
 300
 301         bufpos = buf + MAX_PUTBACK;
 302         bufend = dst;
 303 }
 304
/** A decoder refills buf with the next block of decoded input characters. */
typedef void (*decoder_t)(void);

/** The decoder used by next_real_char(); UTF-8 unless another one is selected. */
static decoder_t decoder = decode_utf8;

/** Associates an encoding name with the decoder implementing it. */
typedef struct named_decoder_t {
        char const *name;
        decoder_t   decoder;
} named_decoder_t;

/**
 * Supported input encodings.  The table is terminated by an entry whose name
 * is NULL; select_input_encoding() compares names case-insensitively.
 */
static named_decoder_t const decoders[] = {
        { "CP819",           decode_iso_8859_1   }, // official alias
        { "IBM819",          decode_iso_8859_1   }, // official alias
        { "ISO-8859-1",      decode_iso_8859_1   }, // official alias
        { "ISO-8859-15",     decode_iso_8859_15  }, // official name
        { "ISO8859-1",       decode_iso_8859_1   },
        { "ISO8859-15",      decode_iso_8859_15  },
        { "ISO_8859-1",      decode_iso_8859_1   }, // official alias
        { "ISO_8859-15",     decode_iso_8859_15  }, // official alias
        { "ISO_8859-1:1987", decode_iso_8859_1   }, // official name
        { "Latin-9",         decode_iso_8859_15  }, // official alias
        { "UTF-8",           decode_utf8         }, // official name
        { "csISOLatin1",     decode_iso_8859_1   }, // official alias
        { "cp1252",          decode_windows_1252 },
        { "iso-ir-100",      decode_iso_8859_1   }, // official alias
        { "l1",              decode_iso_8859_1   }, // official alias
        { "latin1",          decode_iso_8859_1   }, // official alias
        { "windows-1252",    decode_windows_1252 }, // official name

        { NULL,              NULL                }
};
 335
 336 void select_input_encoding(char const* const encoding)
 337 {
 338         for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
 339                 if (strcasecmp(encoding, i->name) != 0)
 340                         continue;
 341                 decoder = i->decoder;
 342                 return;
 343         }
 344         fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
 345 }
 346
 347 static inline void next_real_char(void)
 348 {
 349         assert(bufpos <= bufend);
 350         if (bufpos >= bufend) {
 351                 if (input == NULL) {
 352                         c = EOF;
 353                         return;
 354                 }
 355                 decoder();
 356         }
 357         c = *bufpos++;
 358 }
 359
 360 /**
 361  * Put a character back into the buffer.
 362  *
 363  * @param pc  the character to put back
 364  */
 365 static inline void put_back(utf32 const pc)
 366 {
 367         assert(bufpos > buf);
 368         *(--bufpos - buf + buf) = pc;
 369
 370 #ifdef DEBUG_CHARS
 371         printf("putback '%lc'\n", pc);
 372 #endif
 373 }
 374
/* Forward declaration: next_char() and maybe_concat_lines() call each other. */
static inline void next_char(void);

/*
 * Expands to the switch labels for both newline forms.  "\r" (optionally
 * followed by "\n") and "\n" each consume the newline and increment the line
 * number of lexer_token, then execute `code`.
 *
 * `code` is pasted twice and must leave the switch itself (break, return,
 * goto ...): the '\r' branch would otherwise fall through into the '\n'
 * branch and count the line a second time.
 */
#define MATCH_NEWLINE(code)                   \
        case '\r':                                \
                next_char();                          \
                if (c == '\n') {                      \
                        next_char();                      \
                }                                     \
                lexer_token.source_position.linenr++; \
                code                                  \
        case '\n':                                \
                next_char();                          \
                lexer_token.source_position.linenr++; \
                code

/* Consume the current character, asserting that it is the expected one. */
#define eat(c_type)  do { assert(c == c_type); next_char(); } while (0)
 391
 392 static void maybe_concat_lines(void)
 393 {
 394         eat('\\');
 395
 396         switch (c) {
 397         MATCH_NEWLINE(return;)
 398
 399         default:
 400                 break;
 401         }
 402
 403         put_back(c);
 404         c = '\\';
 405 }
 406
 407 /**
 408  * Set c to the next input character, ie.
 409  * after expanding trigraphs.
 410  */
 411 static inline void next_char(void)
 412 {
 413         next_real_char();
 414
 415         /* filter trigraphs */
 416         if (UNLIKELY(c == '\\')) {
 417                 maybe_concat_lines();
 418                 goto end_of_next_char;
 419         }
 420
 421         if (LIKELY(c != '?'))
 422                 goto end_of_next_char;
 423
 424         next_real_char();
 425         if (LIKELY(c != '?')) {
 426                 put_back(c);
 427                 c = '?';
 428                 goto end_of_next_char;
 429         }
 430
 431         next_real_char();
 432         switch (c) {
 433         case '=': c = '#'; break;
 434         case '(': c = '['; break;
 435         case '/': c = '\\'; maybe_concat_lines(); break;
 436         case ')': c = ']'; break;
 437         case '\'': c = '^'; break;
 438         case '<': c = '{'; break;
 439         case '!': c = '|'; break;
 440         case '>': c = '}'; break;
 441         case '-': c = '~'; break;
 442         default:
 443                 put_back(c);
 444                 put_back('?');
 445                 c = '?';
 446                 break;
 447         }
 448
 449 end_of_next_char:;
 450 #ifdef DEBUG_CHARS
 451         printf("nchar '%c'\n", c);
 452 #endif
 453 }
 454
/*
 * Switch labels for every character that may appear in an identifier,
 * excluding digits.
 *
 * '$' is only accepted while allow_dollar_in_symbol is set; otherwise control
 * jumps to a label named `dollar_sign`, which every switch using this macro
 * has to provide.  When '$' is accepted it deliberately falls through to the
 * letter labels.
 */
#define SYMBOL_CHARS  \
        case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
        case 'a':         \
        case 'b':         \
        case 'c':         \
        case 'd':         \
        case 'e':         \
        case 'f':         \
        case 'g':         \
        case 'h':         \
        case 'i':         \
        case 'j':         \
        case 'k':         \
        case 'l':         \
        case 'm':         \
        case 'n':         \
        case 'o':         \
        case 'p':         \
        case 'q':         \
        case 'r':         \
        case 's':         \
        case 't':         \
        case 'u':         \
        case 'v':         \
        case 'w':         \
        case 'x':         \
        case 'y':         \
        case 'z':         \
        case 'A':         \
        case 'B':         \
        case 'C':         \
        case 'D':         \
        case 'E':         \
        case 'F':         \
        case 'G':         \
        case 'H':         \
        case 'I':         \
        case 'J':         \
        case 'K':         \
        case 'L':         \
        case 'M':         \
        case 'N':         \
        case 'O':         \
        case 'P':         \
        case 'Q':         \
        case 'R':         \
        case 'S':         \
        case 'T':         \
        case 'U':         \
        case 'V':         \
        case 'W':         \
        case 'X':         \
        case 'Y':         \
        case 'Z':         \
        case '_':

/* Switch labels for the decimal digits '0'..'9'. */
#define DIGITS        \
        case '0':         \
        case '1':         \
        case '2':         \
        case '3':         \
        case '4':         \
        case '5':         \
        case '6':         \
        case '7':         \
        case '8':         \
        case '9':
 522
 523 /**
 524  * Read a symbol from the input and build
 525  * the lexer_token.
 526  */
 527 static void parse_symbol(void)
 528 {
 529         obstack_1grow(&symbol_obstack, (char) c);
 530         next_char();
 531
 532         while (true) {
 533                 switch (c) {
 534                 DIGITS
 535                 SYMBOL_CHARS
 536                         obstack_1grow(&symbol_obstack, (char) c);
 537                         next_char();
 538                         break;
 539
 540                 default:
 541 dollar_sign:
 542                         goto end_symbol;
 543                 }
 544         }
 545
 546 end_symbol:
 547         obstack_1grow(&symbol_obstack, '\0');
 548
 549         char     *string = obstack_finish(&symbol_obstack);
 550         symbol_t *symbol = symbol_table_insert(string);
 551
 552         lexer_token.type   = symbol->ID;
 553         lexer_token.symbol = symbol;
 554
 555         if (symbol->string != string) {
 556                 obstack_free(&symbol_obstack, string);
 557         }
 558 }
 559
 560 /**
 561  * parse suffixes like 'LU' or 'f' after numbers
 562  */
 563 static void parse_number_suffix(void)
 564 {
 565         assert(obstack_object_size(&symbol_obstack) == 0);
 566         while (true) {
 567                 switch (c) {
 568                 SYMBOL_CHARS
 569                         obstack_1grow(&symbol_obstack, (char) c);
 570                         next_char();
 571                         break;
 572                 default:
 573                 dollar_sign:
 574                         goto finish_suffix;
 575                 }
 576         }
 577 finish_suffix:
 578         if (obstack_object_size(&symbol_obstack) == 0) {
 579                 lexer_token.symbol = NULL;
 580                 return;
 581         }
 582
 583         obstack_1grow(&symbol_obstack, '\0');
 584         char     *string = obstack_finish(&symbol_obstack);
 585         symbol_t *symbol = symbol_table_insert(string);
 586
 587         if (symbol->string != string) {
 588                 obstack_free(&symbol_obstack, string);
 589         }
 590         lexer_token.symbol = symbol;
 591 }
 592
 593 static string_t identify_string(char *string, size_t len)
 594 {
 595         /* TODO hash */
 596 #if 0
 597         const char *result = strset_insert(&stringset, concat);
 598         if (result != concat) {
 599                 obstack_free(&symbol_obstack, concat);
 600         }
 601 #else
 602         const char *result = string;
 603 #endif
 604         return (string_t) {result, len};
 605 }
 606
 607 /**
 608  * Parses a hex number including hex floats and set the
 609  * lexer_token.
 610  */
 611 static void parse_number_hex(void)
 612 {
 613         bool is_float   = false;
 614         bool has_digits = false;
 615
 616         assert(obstack_object_size(&symbol_obstack) == 0);
 617         while (isxdigit(c)) {
 618                 has_digits = true;
 619                 obstack_1grow(&symbol_obstack, (char) c);
 620                 next_char();
 621         }
 622
 623         if (c == '.') {
 624                 is_float = true;
 625                 obstack_1grow(&symbol_obstack, (char) c);
 626                 next_char();
 627
 628                 while (isxdigit(c)) {
 629                         has_digits = true;
 630                         obstack_1grow(&symbol_obstack, (char) c);
 631                         next_char();
 632                 }
 633         }
 634         if (c == 'p' || c == 'P') {
 635                 is_float = true;
 636                 obstack_1grow(&symbol_obstack, (char) c);
 637                 next_char();
 638
 639                 if (c == '-' || c == '+') {
 640                         obstack_1grow(&symbol_obstack, (char) c);
 641                         next_char();
 642                 }
 643
 644                 while (isxdigit(c)) {
 645                         obstack_1grow(&symbol_obstack, (char) c);
 646                         next_char();
 647                 }
 648         } else if (is_float) {
 649                 errorf(&lexer_token.source_position,
 650                        "hexadecimal floatingpoint constant requires an exponent");
 651         }
 652         obstack_1grow(&symbol_obstack, '\0');
 653
 654         size_t  size   = obstack_object_size(&symbol_obstack) - 1;
 655         char   *string = obstack_finish(&symbol_obstack);
 656         lexer_token.literal = identify_string(string, size);
 657
 658         lexer_token.type    =
 659                 is_float ? T_FLOATINGPOINT_HEXADECIMAL : T_INTEGER_HEXADECIMAL;
 660
 661         if (!has_digits) {
 662                 errorf(&lexer_token.source_position, "invalid number literal '0x%S'",
 663                        &lexer_token.literal);
 664                 lexer_token.literal.begin = "0";
 665                 lexer_token.literal.size  = 1;
 666         }
 667
 668         parse_number_suffix();
 669 }
 670
 671 /**
 672  * Returns true if the given char is a octal digit.
 673  *
 674  * @param char  the character to check
 675  */
 676 static bool is_octal_digit(utf32 chr)
 677 {
 678         switch (chr) {
 679         case '0':
 680         case '1':
 681         case '2':
 682         case '3':
 683         case '4':
 684         case '5':
 685         case '6':
 686         case '7':
 687                 return true;
 688         default:
 689                 return false;
 690         }
 691 }
 692
 693 /**
 694  * Parses a number and sets the lexer_token.
 695  */
 696 static void parse_number(void)
 697 {
 698         bool is_float   = false;
 699         bool has_digits = false;
 700
 701         assert(obstack_object_size(&symbol_obstack) == 0);
 702         if (c == '0') {
 703                 next_char();
 704                 if (c == 'x' || c == 'X') {
 705                         next_char();
 706                         parse_number_hex();
 707                         return;
 708                 } else {
 709                         has_digits = true;
 710                 }
 711                 obstack_1grow(&symbol_obstack, '0');
 712         }
 713
 714         while (isdigit(c)) {
 715                 has_digits = true;
 716                 obstack_1grow(&symbol_obstack, (char) c);
 717                 next_char();
 718         }
 719
 720         if (c == '.') {
 721                 is_float = true;
 722                 obstack_1grow(&symbol_obstack, '.');
 723                 next_char();
 724
 725                 while (isdigit(c)) {
 726                         has_digits = true;
 727                         obstack_1grow(&symbol_obstack, (char) c);
 728                         next_char();
 729                 }
 730         }
 731         if (c == 'e' || c == 'E') {
 732                 is_float = true;
 733                 obstack_1grow(&symbol_obstack, 'e');
 734                 next_char();
 735
 736                 if (c == '-' || c == '+') {
 737                         obstack_1grow(&symbol_obstack, (char) c);
 738                         next_char();
 739                 }
 740
 741                 while (isdigit(c)) {
 742                         obstack_1grow(&symbol_obstack, (char) c);
 743                         next_char();
 744                 }
 745         }
 746
 747         obstack_1grow(&symbol_obstack, '\0');
 748         size_t  size   = obstack_object_size(&symbol_obstack) - 1;
 749         char   *string = obstack_finish(&symbol_obstack);
 750         lexer_token.literal = identify_string(string, size);
 751
 752         /* is it an octal number? */
 753         if (is_float) {
 754                 lexer_token.type = T_FLOATINGPOINT;
 755         } else if (string[0] == '0') {
 756                 lexer_token.type = T_INTEGER_OCTAL;
 757
 758                 /* check for invalid octal digits */
 759                 for (size_t i= 0; i < size; ++i) {
 760                         char t = string[i];
 761                         if (t == '8' || t == '9')
 762                                 errorf(&lexer_token.source_position,
 763                                        "invalid digit '%c' in octal number", t);
 764                 }
 765         } else {
 766                 lexer_token.type = T_INTEGER;
 767         }
 768
 769         if (!has_digits) {
 770                 errorf(&lexer_token.source_position, "invalid number literal '%S'",
 771                        &lexer_token.literal);
 772         }
 773
 774         parse_number_suffix();
 775 }
 776
 777 /**
 778  * Returns the value of a digit.
 779  * The only portable way to do it ...
 780  */
 781 static int digit_value(utf32 const digit)
 782 {
 783         switch (digit) {
 784         case '0': return 0;
 785         case '1': return 1;
 786         case '2': return 2;
 787         case '3': return 3;
 788         case '4': return 4;
 789         case '5': return 5;
 790         case '6': return 6;
 791         case '7': return 7;
 792         case '8': return 8;
 793         case '9': return 9;
 794         case 'a':
 795         case 'A': return 10;
 796         case 'b':
 797         case 'B': return 11;
 798         case 'c':
 799         case 'C': return 12;
 800         case 'd':
 801         case 'D': return 13;
 802         case 'e':
 803         case 'E': return 14;
 804         case 'f':
 805         case 'F': return 15;
 806         default:
 807                 internal_error("wrong character given");
 808         }
 809 }
 810
 811 /**
 812  * Parses an octal character sequence.
 813  *
 814  * @param first_digit  the already read first digit
 815  */
 816 static utf32 parse_octal_sequence(utf32 const first_digit)
 817 {
 818         assert(is_octal_digit(first_digit));
 819         utf32 value = digit_value(first_digit);
 820         if (!is_octal_digit(c)) return value;
 821         value = 8 * value + digit_value(c);
 822         next_char();
 823         if (!is_octal_digit(c)) return value;
 824         value = 8 * value + digit_value(c);
 825         next_char();
 826         return value;
 827 }
 828
 829 /**
 830  * Parses a hex character sequence.
 831  */
 832 static utf32 parse_hex_sequence(void)
 833 {
 834         utf32 value = 0;
 835         while (isxdigit(c)) {
 836                 value = 16 * value + digit_value(c);
 837                 next_char();
 838         }
 839         return value;
 840 }
 841
 842 /**
 843  * Parse an escape sequence.
 844  */
 845 static utf32 parse_escape_sequence(void)
 846 {
 847         eat('\\');
 848
 849         utf32 const ec = c;
 850         next_char();
 851
 852         switch (ec) {
 853         case '"':  return '"';
 854         case '\'': return '\'';
 855         case '\\': return '\\';
 856         case '?': return '\?';
 857         case 'a': return '\a';
 858         case 'b': return '\b';
 859         case 'f': return '\f';
 860         case 'n': return '\n';
 861         case 'r': return '\r';
 862         case 't': return '\t';
 863         case 'v': return '\v';
 864         case 'x':
 865                 return parse_hex_sequence();
 866         case '0':
 867         case '1':
 868         case '2':
 869         case '3':
 870         case '4':
 871         case '5':
 872         case '6':
 873         case '7':
 874                 return parse_octal_sequence(ec);
 875         case EOF:
 876                 parse_error("reached end of file while parsing escape sequence");
 877                 return EOF;
 878         /* \E is not documented, but handled, by GCC.  It is acceptable according
 879          * to §6.11.4, whereas \e is not. */
 880         case 'E':
 881         case 'e':
 882                 if (c_mode & _GNUC)
 883                         return 27;   /* hopefully 27 is ALWAYS the code for ESCAPE */
 884                 break;
 885         case 'u':
 886         case 'U':
 887                 parse_error("universal character parsing not implemented yet");
 888                 return EOF;
 889         default:
 890                 break;
 891         }
 892         /* §6.4.4.4:8 footnote 64 */
 893         parse_error("unknown escape sequence");
 894         return EOF;
 895 }
 896
 897 /**
 898  * Concatenate two strings.
 899  */
 900 string_t concat_strings(const string_t *const s1, const string_t *const s2)
 901 {
 902         const size_t len1 = s1->size - 1;
 903         const size_t len2 = s2->size - 1;
 904
 905         char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
 906         memcpy(concat, s1->begin, len1);
 907         memcpy(concat + len1, s2->begin, len2 + 1);
 908
 909         return identify_string(concat, len1 + len2 + 1);
 910 }
 911
 912 string_t make_string(const char *string)
 913 {
 914         size_t      len   = strlen(string) + 1;
 915         char *const space = obstack_alloc(&symbol_obstack, len);
 916         memcpy(space, string, len);
 917
 918         return identify_string(space, len);
 919 }
 920
 921 static void grow_symbol(utf32 const tc)
 922 {
 923         struct obstack *const o  = &symbol_obstack;
 924         if (tc < 0x80U) {
 925                 obstack_1grow(o, tc);
 926         } else if (tc < 0x800) {
 927                 obstack_1grow(o, 0xC0 | (tc >> 6));
 928                 obstack_1grow(o, 0x80 | (tc & 0x3F));
 929         } else if (tc < 0x10000) {
 930                 obstack_1grow(o, 0xE0 | ( tc >> 12));
 931                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
 932                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
 933         } else {
 934                 obstack_1grow(o, 0xF0 | ( tc >> 18));
 935                 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
 936                 obstack_1grow(o, 0x80 | ((tc >>  6) & 0x3F));
 937                 obstack_1grow(o, 0x80 | ( tc        & 0x3F));
 938         }
 939 }
 940
 941 /**
 942  * Parse a string literal and set lexer_token.
 943  */
 944 static void parse_string_literal(void)
 945 {
 946         const unsigned start_linenr = lexer_token.source_position.linenr;
 947
 948         eat('"');
 949
 950         while (true) {
 951                 switch (c) {
 952                 case '\\': {
 953                         utf32 const tc = parse_escape_sequence();
 954                         if (tc >= 0x100) {
 955                                 warningf(&lexer_token.source_position,
 956                                                 "escape sequence out of range");
 957                         }
 958                         obstack_1grow(&symbol_obstack, tc);
 959                         break;
 960                 }
 961
 962                 case EOF: {
 963                         source_position_t source_position;
 964                         source_position.input_name = lexer_token.source_position.input_name;
 965                         source_position.linenr     = start_linenr;
 966                         errorf(&source_position, "string has no end");
 967                         lexer_token.type = T_ERROR;
 968                         return;
 969                 }
 970
 971                 case '"':
 972                         next_char();
 973                         goto end_of_string;
 974
 975                 default:
 976                         grow_symbol(c);
 977                         next_char();
 978                         break;
 979                 }
 980         }
 981
 982 end_of_string:
 983
 984         /* TODO: concatenate multiple strings separated by whitespace... */
 985
 986         /* add finishing 0 to the string */
 987         obstack_1grow(&symbol_obstack, '\0');
 988         const size_t  size   = (size_t)obstack_object_size(&symbol_obstack);
 989         char         *string = obstack_finish(&symbol_obstack);
 990
 991         lexer_token.type    = T_STRING_LITERAL;
 992         lexer_token.literal = identify_string(string, size);
 993 }
 994
 995 /**
 996  * Parse a wide character constant and set lexer_token.
 997  */
 998 static void parse_wide_character_constant(void)
 999 {
1000         const unsigned start_linenr = lexer_token.source_position.linenr;
1001
1002         eat('\'');
1003
1004         while (true) {
1005                 switch (c) {
1006                 case '\\': {
1007                         const utf32 tc = parse_escape_sequence();
1008                         grow_symbol(tc);
1009                         break;
1010                 }
1011
1012                 MATCH_NEWLINE(
1013                         parse_error("newline while parsing character constant");
1014                         break;
1015                 )
1016
1017                 case '\'':
1018                         next_char();
1019                         goto end_of_wide_char_constant;
1020
1021                 case EOF: {
1022                         source_position_t source_position = lexer_token.source_position;
1023                         source_position.linenr = start_linenr;
1024                         errorf(&source_position, "EOF while parsing character constant");
1025                         lexer_token.type = T_ERROR;
1026                         return;
1027                 }
1028
1029                 default:
1030                         grow_symbol(c);
1031                         next_char();
1032                         break;
1033                 }
1034         }
1035
1036 end_of_wide_char_constant:;
1037         obstack_1grow(&symbol_obstack, '\0');
1038         size_t  size   = (size_t) obstack_object_size(&symbol_obstack) - 1;
1039         char   *string = obstack_finish(&symbol_obstack);
1040
1041         lexer_token.type     = T_WIDE_CHARACTER_CONSTANT;
1042         lexer_token.literal  = identify_string(string, size);
1043 }
1044
1045 /**
1046  * Parse a wide string literal and set lexer_token.
1047  */
1048 static void parse_wide_string_literal(void)
1049 {
1050         parse_string_literal();
1051         if (lexer_token.type == T_STRING_LITERAL)
1052                 lexer_token.type = T_WIDE_STRING_LITERAL;
1053 }
1054
1055 /**
1056  * Parse a character constant and set lexer_token.
1057  */
1058 static void parse_character_constant(void)
1059 {
1060         const unsigned start_linenr = lexer_token.source_position.linenr;
1061
1062         eat('\'');
1063
1064         while (true) {
1065                 switch (c) {
1066                 case '\\': {
1067                         utf32 const tc = parse_escape_sequence();
1068                         if (tc >= 0x100) {
1069                                 warningf(&lexer_token.source_position,
1070                                                 "escape sequence out of range");
1071                         }
1072                         obstack_1grow(&symbol_obstack, tc);
1073                         break;
1074                 }
1075
1076                 MATCH_NEWLINE(
1077                         parse_error("newline while parsing character constant");
1078                         break;
1079                 )
1080
1081                 case '\'':
1082                         next_char();
1083                         goto end_of_char_constant;
1084
1085                 case EOF: {
1086                         source_position_t source_position;
1087                         source_position.input_name = lexer_token.source_position.input_name;
1088                         source_position.linenr     = start_linenr;
1089                         errorf(&source_position, "EOF while parsing character constant");
1090                         lexer_token.type = T_ERROR;
1091                         return;
1092                 }
1093
1094                 default:
1095                         grow_symbol(c);
1096                         next_char();
1097                         break;
1098
1099                 }
1100         }
1101
1102 end_of_char_constant:;
1103         obstack_1grow(&symbol_obstack, '\0');
1104         const size_t        size   = (size_t)obstack_object_size(&symbol_obstack)-1;
1105         char         *const string = obstack_finish(&symbol_obstack);
1106
1107         lexer_token.type    = T_CHARACTER_CONSTANT;
1108         lexer_token.literal = identify_string(string, size);
1109 }
1110
1111 /**
1112  * Skip a multiline comment.
1113  */
1114 static void skip_multiline_comment(void)
1115 {
1116         unsigned start_linenr = lexer_token.source_position.linenr;
1117
1118         while (true) {
1119                 switch (c) {
1120                 case '/':
1121                         next_char();
1122                         if (c == '*') {
1123                                 /* nested comment, warn here */
1124                                 if (warning.comment) {
1125                                         warningf(&lexer_token.source_position, "'/*' within comment");
1126                                 }
1127                         }
1128                         break;
1129                 case '*':
1130                         next_char();
1131                         if (c == '/') {
1132                                 next_char();
1133                                 return;
1134                         }
1135                         break;
1136
1137                 MATCH_NEWLINE(break;)
1138
1139                 case EOF: {
1140                         source_position_t source_position;
1141                         source_position.input_name = lexer_token.source_position.input_name;
1142                         source_position.linenr     = start_linenr;
1143                         errorf(&source_position, "at end of file while looking for comment end");
1144                         return;
1145                 }
1146
1147                 default:
1148                         next_char();
1149                         break;
1150                 }
1151         }
1152 }
1153
1154 /**
1155  * Skip a single line comment.
1156  */
1157 static void skip_line_comment(void)
1158 {
1159         while (true) {
1160                 switch (c) {
1161                 case EOF:
1162                         return;
1163
1164                 case '\n':
1165                 case '\r':
1166                         return;
1167
1168                 case '\\':
1169                         next_char();
1170                         if (c == '\n' || c == '\r') {
1171                                 if (warning.comment)
1172                                         warningf(&lexer_token.source_position, "multi-line comment");
1173                                 return;
1174                         }
1175                         break;
1176
1177                 default:
1178                         next_char();
1179                         break;
1180                 }
1181         }
1182 }
1183
/**
 * The current preprocessor token.
 *
 * Holds a copy of lexer_token as of the last next_pp_token() call; the
 * directive handlers in this file inspect it.
 */
static token_t pp_token;

/**
 * Read the next preprocessor token.
 *
 * Advances the lexer by one preprocessing token and stores a copy of the
 * resulting lexer_token in pp_token.
 */
static inline void next_pp_token(void)
{
	lexer_next_preprocessing_token();
	pp_token = lexer_token;
}
1195
1196 /**
1197  * Eat all preprocessor tokens until newline.
1198  */
1199 static void eat_until_newline(void)
1200 {
1201         while (pp_token.type != '\n' && pp_token.type != T_EOF) {
1202                 next_pp_token();
1203         }
1204 }
1205
1206 /**
1207  * Handle the define directive.
1208  */
1209 static void define_directive(void)
1210 {
1211         lexer_next_preprocessing_token();
1212         if (lexer_token.type != T_IDENTIFIER) {
1213                 parse_error("expected identifier after #define\n");
1214                 eat_until_newline();
1215         }
1216 }
1217
/**
 * Handle the ifdef directive.
 *
 * Currently a stub: it only consumes the next preprocessing token and does
 * not evaluate any condition.
 *
 * @param is_ifndef  non-zero for #ifndef, zero for #ifdef (currently unused)
 */
static void ifdef_directive(int is_ifndef)
{
	(void) is_ifndef;
	lexer_next_preprocessing_token();
	/* TODO: expect_identifier(); */
	/* TODO: expect_newline(); */
}
1228
/**
 * Handle the endif directive.
 *
 * Currently a stub: no tokens are consumed and nothing is checked.
 */
static void endif_directive(void)
{
	/* TODO: expect_newline(); */
}
1236
1237 /**
1238  * Parse the line directive.
1239  */
1240 static void parse_line_directive(void)
1241 {
1242         if (pp_token.type != T_INTEGER) {
1243                 parse_error("expected integer");
1244         } else {
1245                 /* use offset -1 as this is about the next line */
1246                 lexer_token.source_position.linenr = atoi(pp_token.literal.begin) - 1;
1247                 next_pp_token();
1248         }
1249         if (pp_token.type == T_STRING_LITERAL) {
1250                 lexer_token.source_position.input_name = pp_token.literal.begin;
1251                 next_pp_token();
1252         }
1253
1254         eat_until_newline();
1255 }
1256
/**
 * STDC pragmas.
 *
 * Identifies which "#pragma STDC" switch was named; see parse_pragma().
 */
typedef enum stdc_pragma_kind_t {
	STDC_UNKNOWN,          /**< no recognised switch name */
	STDC_FP_CONTRACT,      /**< selected by the TP_FP_CONTRACT token */
	STDC_FENV_ACCESS,      /**< selected by the TP_FENV_ACCESS token */
	STDC_CX_LIMITED_RANGE  /**< selected by the TP_CX_LIMITED_RANGE token */
} stdc_pragma_kind_t;

/**
 * STDC pragma values.
 *
 * Identifies the argument given to a "#pragma STDC" switch; see
 * parse_pragma().
 */
typedef enum stdc_pragma_value_kind_t {
	STDC_VALUE_UNKNOWN,    /**< no recognised argument */
	STDC_VALUE_ON,         /**< selected by the TP_ON token */
	STDC_VALUE_OFF,        /**< selected by the TP_OFF token */
	STDC_VALUE_DEFAULT     /**< selected by the TP_DEFAULT token */
} stdc_pragma_value_kind_t;
1276
1277 /**
1278  * Parse a pragma directive.
1279  */
1280 static void parse_pragma(void)
1281 {
1282         bool unknown_pragma = true;
1283
1284         next_pp_token();
1285         if (pp_token.symbol->pp_ID == TP_STDC) {
1286                 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1287                 /* a STDC pragma */
1288                 if (c_mode & _C99) {
1289                         next_pp_token();
1290
1291                         switch (pp_token.symbol->pp_ID) {
1292                         case TP_FP_CONTRACT:
1293                                 kind = STDC_FP_CONTRACT;
1294                                 break;
1295                         case TP_FENV_ACCESS:
1296                                 kind = STDC_FENV_ACCESS;
1297                                 break;
1298                         case TP_CX_LIMITED_RANGE:
1299                                 kind = STDC_CX_LIMITED_RANGE;
1300                                 break;
1301                         default:
1302                                 break;
1303                         }
1304                         if (kind != STDC_UNKNOWN) {
1305                                 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1306                                 next_pp_token();
1307                                 switch (pp_token.symbol->pp_ID) {
1308                                 case TP_ON:
1309                                         value = STDC_VALUE_ON;
1310                                         break;
1311                                 case TP_OFF:
1312                                         value = STDC_VALUE_OFF;
1313                                         break;
1314                                 case TP_DEFAULT:
1315                                         value = STDC_VALUE_DEFAULT;
1316                                         break;
1317                                 default:
1318                                         break;
1319                                 }
1320                                 if (value != STDC_VALUE_UNKNOWN) {
1321                                         unknown_pragma = false;
1322                                 } else {
1323                                         errorf(&pp_token.source_position, "bad STDC pragma argument");
1324                                 }
1325                         }
1326                 }
1327         } else {
1328                 unknown_pragma = true;
1329         }
1330         eat_until_newline();
1331         if (unknown_pragma && warning.unknown_pragmas) {
1332                 warningf(&pp_token.source_position, "encountered unknown #pragma");
1333         }
1334 }
1335
1336 /**
1337  * Parse a preprocessor non-null directive.
1338  */
1339 static void parse_preprocessor_identifier(void)
1340 {
1341         assert(pp_token.type == T_IDENTIFIER);
1342         symbol_t *symbol = pp_token.symbol;
1343
1344         switch (symbol->pp_ID) {
1345         case TP_include:
1346                 printf("include - enable header name parsing!\n");
1347                 break;
1348         case TP_define:
1349                 define_directive();
1350                 break;
1351         case TP_ifdef:
1352                 ifdef_directive(0);
1353                 break;
1354         case TP_ifndef:
1355                 ifdef_directive(1);
1356                 break;
1357         case TP_endif:
1358                 endif_directive();
1359                 break;
1360         case TP_line:
1361                 next_pp_token();
1362                 parse_line_directive();
1363                 break;
1364         case TP_if:
1365         case TP_else:
1366         case TP_elif:
1367         case TP_undef:
1368         case TP_error:
1369                 /* TODO; output the rest of the line */
1370                 parse_error("#error directive: ");
1371                 break;
1372         case TP_pragma:
1373                 parse_pragma();
1374                 break;
1375         }
1376 }
1377
1378 /**
1379  * Parse a preprocessor directive.
1380  */
1381 static void parse_preprocessor_directive(void)
1382 {
1383         next_pp_token();
1384
1385         switch (pp_token.type) {
1386         case T_IDENTIFIER:
1387                 parse_preprocessor_identifier();
1388                 break;
1389         case T_INTEGER:
1390                 parse_line_directive();
1391                 break;
1392         case '\n':
1393                 /* NULL directive, see §6.10.7 */
1394                 break;
1395         default:
1396                 parse_error("invalid preprocessor directive");
1397                 eat_until_newline();
1398                 break;
1399         }
1400 }
1401
/*
 * Helper macros for lexing multi-character punctuators inside
 * lexer_next_preprocessing_token().  They are deliberately unbalanced and
 * only make sense in the sequence
 *
 *     case 'x':
 *         MAYBE_PROLOG
 *         MAYBE(...)            (zero or more, plus hand-written cases)
 *         ELSE(...) or ELSE_CODE(...)
 *
 * MAYBE_PROLOG opens a nested "while (true) { switch (c) {" that
 * ELSE_CODE closes again.
 */

/* Consume the current character and open the nested loop/switch on the
 * character that follows it. */
#define MAYBE_PROLOG                                       \
			next_char();                                   \
			while (true) {                                 \
				switch (c) {

/* If the following character is ch: consume it, set the token type to
 * set_type and return from the enclosing function. */
#define MAYBE(ch, set_type)                                \
				case ch:                                   \
					next_char();                           \
					lexer_token.type = set_type;           \
					return;

/* Like MAYBE, but only when one of the bits in mode is set in c_mode.
 * must use this as last thing: when the mode check fails, control falls
 * through into whatever label follows, which has to be the default label
 * supplied by ELSE/ELSE_CODE. */
#define MAYBE_MODE(ch, set_type, mode)                     \
				case ch:                                   \
					if (c_mode & mode) {                   \
						next_char();                       \
						lexer_token.type = set_type;       \
						return;                            \
					}                                      \
					/* fallthrough */

/* Supply the default label running code, then close the switch and the
 * loop opened by MAYBE_PROLOG.  The trailing break leaves the enclosing
 * switch if code did not return. */
#define ELSE_CODE(code)                                    \
				default:                                   \
					code                                   \
				}                                          \
			} /* end of while (true) */                    \
			break;

/* Default case: the first character alone forms the token; set the token
 * type to set_type and return without consuming the following character. */
#define ELSE(set_type)                                     \
		ELSE_CODE(                                         \
			lexer_token.type = set_type;                   \
			return;                                        \
		)
1435
/**
 * Lex the next preprocessing token into lexer_token.
 *
 * Spaces and tabs are skipped.  A line end is reported as a token of type
 * '\n'.  Comments are skipped and lexing continues behind them.  At the
 * end of input the token type is T_EOF.  A character that starts no known
 * token is reported as an error, consumed, and yields T_ERROR.
 *
 * Multi-character punctuators (including the digraph spellings of
 * [ ] { } # and ##) are recognised with the MAYBE_PROLOG / MAYBE / ELSE
 * macro family, which opens a nested switch on the following character.
 */
void lexer_next_preprocessing_token(void)
{
	while (true) {
		switch (c) {
		/* horizontal whitespace: skip and look again */
		case ' ':
		case '\t':
			next_char();
			break;

		/* MATCH_NEWLINE is defined outside this function; presumably it
		 * expands to the line-end case labels and consumes the line end
		 * before running the given code -- confirm at its definition */
		MATCH_NEWLINE(
			lexer_token.type = '\n';
			return;
		)

		/* SYMBOL_CHARS is defined outside this function; presumably the
		 * case labels for all characters that may start an identifier */
		SYMBOL_CHARS
			parse_symbol();
			/* might be a wide string ( L"string" ) */
			if (lexer_token.symbol == symbol_L) {
				switch (c) {
					case '"':  parse_wide_string_literal();     break;
					case '\'': parse_wide_character_constant(); break;
				}
			}
			return;

		/* DIGITS is defined outside this function; presumably the case
		 * labels for the decimal digits */
		DIGITS
			parse_number();
			return;

		case '"':
			parse_string_literal();
			return;

		case '\'':
			parse_character_constant();
			return;

		/* '.', a number starting with '.', or '...' */
		case '.':
			MAYBE_PROLOG
				/* a digit follows: push it back and make '.' the current
				 * character again so parse_number() sees the whole number */
				DIGITS
					put_back(c);
					c = '.';
					parse_number();
					return;

				/* ".." so far: only "..." is a token of its own; otherwise
				 * push back the third character, make the second '.' current
				 * again and return a single '.' */
				case '.':
					MAYBE_PROLOG
					MAYBE('.', T_DOTDOTDOT)
					ELSE_CODE(
						put_back(c);
						c = '.';
						lexer_token.type = '.';
						return;
					)
			ELSE('.')
		case '&':
			MAYBE_PROLOG
			MAYBE('&', T_ANDAND)
			MAYBE('=', T_ANDEQUAL)
			ELSE('&')
		case '*':
			MAYBE_PROLOG
			MAYBE('=', T_ASTERISKEQUAL)
			ELSE('*')
		case '+':
			MAYBE_PROLOG
			MAYBE('+', T_PLUSPLUS)
			MAYBE('=', T_PLUSEQUAL)
			ELSE('+')
		case '-':
			MAYBE_PROLOG
			MAYBE('>', T_MINUSGREATER)
			MAYBE('-', T_MINUSMINUS)
			MAYBE('=', T_MINUSEQUAL)
			ELSE('-')
		case '!':
			MAYBE_PROLOG
			MAYBE('=', T_EXCLAMATIONMARKEQUAL)
			ELSE('!')
		/* '/', '/=' or the start of a comment */
		case '/':
			MAYBE_PROLOG
			MAYBE('=', T_SLASHEQUAL)
				/* comments produce no token: skip them, then lex whatever
				 * follows by calling this function again */
				case '*':
					next_char();
					skip_multiline_comment();
					lexer_next_preprocessing_token();
					return;
				case '/':
					next_char();
					skip_line_comment();
					lexer_next_preprocessing_token();
					return;
			ELSE('/')
		/* '%', '%=', and the digraphs '%>' for '}', '%:' for '#' and
		 * '%:%:' for '##' */
		case '%':
			MAYBE_PROLOG
			MAYBE('>', '}')
			MAYBE('=', T_PERCENTEQUAL)
				case ':':
					MAYBE_PROLOG
						/* "%:%" so far: only "%:%:" is '##'; otherwise push
						 * back the fourth character, make '%' current again
						 * and return '#' for the leading "%:" */
						case '%':
							MAYBE_PROLOG
							MAYBE(':', T_HASHHASH)
							ELSE_CODE(
								put_back(c);
								c = '%';
								lexer_token.type = '#';
								return;
							)
					ELSE('#')
			ELSE('%')
		/* '<', '<=', '<<', '<<=', and the digraphs '<:' for '[' and
		 * '<%' for '{' */
		case '<':
			MAYBE_PROLOG
			MAYBE(':', '[')
			MAYBE('%', '{')
			MAYBE('=', T_LESSEQUAL)
				case '<':
					MAYBE_PROLOG
					MAYBE('=', T_LESSLESSEQUAL)
					ELSE(T_LESSLESS)
			ELSE('<')
		case '>':
			MAYBE_PROLOG
			MAYBE('=', T_GREATEREQUAL)
				case '>':
					MAYBE_PROLOG
					MAYBE('=', T_GREATERGREATEREQUAL)
					ELSE(T_GREATERGREATER)
			ELSE('>')
		case '^':
			MAYBE_PROLOG
			MAYBE('=', T_CARETEQUAL)
			ELSE('^')
		case '|':
			MAYBE_PROLOG
			MAYBE('=', T_PIPEEQUAL)
			MAYBE('|', T_PIPEPIPE)
			ELSE('|')
		/* ':', the digraph ':>' for ']', and '::' when the _CXX bit is set
		 * in c_mode (otherwise a second ':' is handled like any other
		 * following character) */
		case ':':
			MAYBE_PROLOG
			MAYBE('>', ']')
			MAYBE_MODE(':', T_COLONCOLON, _CXX)
			ELSE(':')
		case '=':
			MAYBE_PROLOG
			MAYBE('=', T_EQUALEQUAL)
			ELSE('=')
		case '#':
			MAYBE_PROLOG
			MAYBE('#', T_HASHHASH)
			ELSE('#')

		/* single-character tokens: the token type is the character itself */
		case '?':
		case '[':
		case ']':
		case '(':
		case ')':
		case '{':
		case '}':
		case '~':
		case ';':
		case ',':
		case '\\':
			lexer_token.type = c;
			next_char();
			return;

		case EOF:
			lexer_token.type = T_EOF;
			return;

		default:
		/* dollar_sign is not targeted by any goto visible in this function;
		 * presumably the SYMBOL_CHARS expansion jumps here -- confirm at its
		 * definition */
dollar_sign:
			errorf(&lexer_token.source_position, "unknown character '%c' found", c);
			next_char();
			lexer_token.type = T_ERROR;
			return;
		}
	}
}
1615
1616 void lexer_next_token(void)
1617 {
1618         lexer_next_preprocessing_token();
1619
1620         while (lexer_token.type == '\n') {
1621 newline_found:
1622                 lexer_next_preprocessing_token();
1623         }
1624
1625         if (lexer_token.type == '#') {
1626                 parse_preprocessor_directive();
1627                 goto newline_found;
1628         }
1629 }
1630
/**
 * Initialise the lexer's global state.
 *
 * Sets up the string set and obtains the symbol for "L" from the symbol
 * table; the lexer compares identifiers against symbol_L to recognise the
 * prefix of wide string literals and wide character constants.
 */
void init_lexer(void)
{
	strset_init(&stringset);
	symbol_L = symbol_table_insert("L");
}
1636
1637 void lexer_open_stream(FILE *stream, const char *input_name)
1638 {
1639         input                                  = stream;
1640         lexer_token.source_position.linenr     = 0;
1641         lexer_token.source_position.input_name = input_name;
1642
1643         bufpos = NULL;
1644         bufend = NULL;
1645
1646         /* place a virtual \n at the beginning so the lexer knows that we're
1647          * at the beginning of a line */
1648         c = '\n';
1649 }
1650
/**
 * Start lexing from an in-memory buffer.
 *
 * Not implemented yet: after resetting the input stream and the source
 * position this calls panic().  The buffer-based setup is kept disabled
 * under "#if 0".
 *
 * @param buffer      the text to lex (currently unused)
 * @param len         length of buffer (currently unused)
 * @param input_name  name recorded in source positions
 */
void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
{
	input                                  = NULL;
	lexer_token.source_position.linenr     = 0;
	lexer_token.source_position.input_name = input_name;

#if 0 // TODO
	bufpos = buffer;
	bufend = buffer + len;
#else
	/* silence unused-parameter warnings until buffers are supported */
	(void)buffer;
	(void)len;
	panic("builtin lexing not done yet");
#endif

	/* place a virtual \n at the beginning so the lexer knows that we're
	 * at the beginning of a line */
	c = '\n';
}
1670
/**
 * Tear down the lexer's global state: destroys the string set that
 * init_lexer() set up.
 */
void exit_lexer(void)
{
	strset_destroy(&stringset);
}
1675
1676 static __attribute__((unused))
1677 void dbg_pos(const source_position_t source_position)
1678 {
1679         fprintf(stdout, "%s:%u\n", source_position.input_name,
1680                 source_position.linenr);
1681         fflush(stdout);
1682 }