2 * This file is part of cparser.
3 * Copyright (C) 2007-2008 Matthias Braun <matze@braunis.de>
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
22 #include "diagnostic.h"
26 #include "symbol_table_t.h"
27 #include "adt/error.h"
28 #include "adt/strset.h"
32 #include "target_architecture.h"
35 #include "lang_features.h"
51 #if defined(_WIN32) || defined(__CYGWIN__)
52 /* No strtold on windows and no replacement yet */
53 #define strtold(s, e) strtod(s, e)
54 #define strcasecmp(a, b) stricmp(a, b)
57 typedef unsigned int utf32;
63 static utf32 buf[BUF_SIZE + MAX_PUTBACK];
64 static const utf32 *bufend;
65 static const utf32 *bufpos;
66 static strset_t stringset;
67 bool allow_dollar_in_symbol = true;
70 * Prints a parse error message at the current token.
72 * @param msg the error message
74 static void parse_error(const char *msg)
76 errorf(&lexer_token.source_position, "%s", msg);
80 * Prints an internal error message at the current token.
82 * @param msg the error message
84 static NORETURN internal_error(const char *msg)
86 internal_errorf(&lexer_token.source_position, "%s", msg);
89 static size_t read_block(unsigned char *const read_buf, size_t const n)
91 size_t const s = fread(read_buf, 1, n, input);
94 parse_error("read from input failed");
95 buf[MAX_PUTBACK] = EOF;
96 bufpos = buf + MAX_PUTBACK;
97 bufend = buf + MAX_PUTBACK + 1;
102 static void decode_iso_8859_1(void)
104 unsigned char read_buf[BUF_SIZE];
105 size_t const s = read_block(read_buf, sizeof(read_buf));
109 unsigned char const *src = read_buf;
110 unsigned char const *end = read_buf + s;
111 utf32 *dst = buf + MAX_PUTBACK;
115 bufpos = buf + MAX_PUTBACK;
119 static void decode_iso_8859_15(void)
121 unsigned char read_buf[BUF_SIZE];
122 size_t const s = read_block(read_buf, sizeof(read_buf));
126 unsigned char const *src = read_buf;
127 unsigned char const *end = read_buf + s;
128 utf32 *dst = buf + MAX_PUTBACK;
132 case 0xA4: tc = 0x20AC; break; // €
133 case 0xA6: tc = 0x0160; break; // Š
134 case 0xA8: tc = 0x0161; break; // š
135 case 0xB4: tc = 0x017D; break; // Ž
136 case 0xB8: tc = 0x017E; break; // ž
137 case 0xBC: tc = 0x0152; break; // Œ
138 case 0xBD: tc = 0x0153; break; // œ
139 case 0xBE: tc = 0x0178; break; // Ÿ
144 bufpos = buf + MAX_PUTBACK;
148 static void decode_utf8(void)
150 static utf32 part_decoded_min_code;
151 static utf32 part_decoded_char;
152 static size_t part_decoded_rest_len;
155 unsigned char read_buf[BUF_SIZE];
156 size_t const s = read_block(read_buf, sizeof(read_buf));
158 if (part_decoded_rest_len > 0)
159 parse_error("incomplete input char at end of input");
163 unsigned char const *src = read_buf;
164 unsigned char const *end = read_buf + s;
165 utf32 *dst = buf + MAX_PUTBACK;
169 if (part_decoded_rest_len != 0) {
170 min_code = part_decoded_min_code;
171 decoded = part_decoded_char;
172 size_t const rest_len = part_decoded_rest_len;
173 part_decoded_rest_len = 0;
175 case 4: goto realign;
176 case 3: goto three_more;
177 case 2: goto two_more;
178 default: goto one_more;
183 if ((*src & 0x80) == 0) {
185 } else if ((*src & 0xE0) == 0xC0) {
187 decoded = *src++ & 0x1F;
190 part_decoded_min_code = min_code;
191 part_decoded_char = decoded;
192 part_decoded_rest_len = 1;
195 if ((*src & 0xC0) == 0x80) {
196 decoded = (decoded << 6) | (*src++ & 0x3F);
200 if (decoded < min_code ||
201 decoded > 0x10FFFF ||
202 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
203 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
204 (decoded & 0xFFFE) == 0xFFFE) { // noncharacters
205 parse_error("invalid byte sequence in input");
207 } else if ((*src & 0xF0) == 0xE0) {
209 decoded = *src++ & 0x0F;
212 part_decoded_min_code = min_code;
213 part_decoded_char = decoded;
214 part_decoded_rest_len = 2;
217 if ((*src & 0xC0) == 0x80) {
218 decoded = (decoded << 6) | (*src++ & 0x3F);
223 } else if ((*src & 0xF8) == 0xF0) {
225 decoded = *src++ & 0x07;
228 part_decoded_min_code = min_code;
229 part_decoded_char = decoded;
230 part_decoded_rest_len = 3;
233 if ((*src & 0xC0) == 0x80) {
234 decoded = (decoded << 6) | (*src++ & 0x3F);
241 parse_error("invalid byte sequence in input");
246 part_decoded_rest_len = 4;
249 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
255 bufpos = buf + MAX_PUTBACK;
257 } while (bufpos == bufend);
260 typedef void (*decoder_t)(void);
262 static decoder_t decoder = decode_utf8;
264 typedef struct named_decoder_t {
/**
 * Table pairing input encoding names with their decoder function.
 * select_input_encoding() walks it and compares names with strcasecmp().
 */
269 static named_decoder_t const decoders[] = {
270 { "CP819", decode_iso_8859_1 }, // official alias
271 { "IBM819", decode_iso_8859_1 }, // official alias
272 { "ISO-8859-1", decode_iso_8859_1 }, // official alias
273 { "ISO-8859-15", decode_iso_8859_15 }, // official name
274 { "ISO8859-1", decode_iso_8859_1 },
275 { "ISO8859-15", decode_iso_8859_15 },
276 { "ISO_8859-1", decode_iso_8859_1 }, // official alias
277 { "ISO_8859-15", decode_iso_8859_15 }, // official alias
278 { "ISO_8859-1:1987", decode_iso_8859_1 }, // official name
279 { "Latin-9", decode_iso_8859_15 }, // official alias
280 { "UTF-8", decode_utf8 }, // official name
281 { "csISOLatin1", decode_iso_8859_1 }, // official alias
282 { "iso-ir-100", decode_iso_8859_1 }, // official alias
283 { "l1", decode_iso_8859_1 }, // official alias
284 { "latin1", decode_iso_8859_1 }, // official alias
289 void select_input_encoding(char const* const encoding)
291 for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
292 if (strcasecmp(encoding, i->name) != 0)
294 decoder = i->decoder;
297 fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
300 static inline void next_real_char(void)
302 assert(bufpos <= bufend);
303 if (bufpos >= bufend) {
314 * Put a character back into the buffer.
316 * @param pc the character to put back
318 static inline void put_back(utf32 const pc)
320 assert(bufpos > buf);
321 *(--bufpos - buf + buf) = pc;
324 printf("putback '%lc'\n", pc);
328 static inline void next_char(void);
330 #define MATCH_NEWLINE(code) \
336 lexer_token.source_position.linenr++; \
340 lexer_token.source_position.linenr++; \
343 #define eat(c_type) do { assert(c == c_type); next_char(); } while(0)
345 static void maybe_concat_lines(void)
350 MATCH_NEWLINE(return;)
361 * Set c to the next input character, i.e.
362 * after expanding trigraphs.
364 static inline void next_char(void)
368 /* filter trigraphs */
369 if(UNLIKELY(c == '\\')) {
370 maybe_concat_lines();
371 goto end_of_next_char;
375 goto end_of_next_char;
378 if(LIKELY(c != '?')) {
381 goto end_of_next_char;
386 case '=': c = '#'; break;
387 case '(': c = '['; break;
388 case '/': c = '\\'; maybe_concat_lines(); break;
389 case ')': c = ']'; break;
390 case '\'': c = '^'; break;
391 case '<': c = '{'; break;
392 case '!': c = '|'; break;
393 case '>': c = '}'; break;
394 case '-': c = '~'; break;
404 printf("nchar '%c'\n", c);
408 #define SYMBOL_CHARS \
409 case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
477 * Read a symbol from the input and build
480 static void parse_symbol(void)
485 obstack_1grow(&symbol_obstack, (char) c);
492 obstack_1grow(&symbol_obstack, (char) c);
503 obstack_1grow(&symbol_obstack, '\0');
505 string = obstack_finish(&symbol_obstack);
506 symbol = symbol_table_insert(string);
508 lexer_token.type = symbol->ID;
509 lexer_token.v.symbol = symbol;
511 if(symbol->string != string) {
512 obstack_free(&symbol_obstack, string);
516 static void parse_integer_suffix(bool is_oct_hex)
518 bool is_unsigned = false;
519 bool min_long = false;
520 bool min_longlong = false;
521 bool not_traditional = false;
525 if (c == 'U' || c == 'u') {
526 not_traditional = true;
527 suffix[pos++] = toupper(c);
530 if (c == 'L' || c == 'l') {
531 suffix[pos++] = toupper(c);
534 if (c == 'L' || c == 'l') {
535 suffix[pos++] = toupper(c);
540 } else if (c == 'l' || c == 'L') {
541 suffix[pos++] = toupper(c);
544 if (c == 'l' || c == 'L') {
545 not_traditional = true;
546 suffix[pos++] = toupper(c);
549 if (c == 'u' || c == 'U') {
550 suffix[pos++] = toupper(c);
554 } else if (c == 'u' || c == 'U') {
555 not_traditional = true;
556 suffix[pos++] = toupper(c);
559 lexer_token.datatype = type_unsigned_long;
563 if (warning.traditional && not_traditional) {
565 warningf(&lexer_token.source_position,
566 "traditional C rejects the '%s' suffix", suffix);
569 long long v = lexer_token.v.intvalue;
571 if (v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
572 lexer_token.datatype = type_int;
574 } else if (is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
575 lexer_token.datatype = type_unsigned_int;
580 if (v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
581 lexer_token.datatype = type_long;
583 } else if (is_oct_hex && v >= 0 && (unsigned long long)v <= (unsigned long long)TARGET_ULONG_MAX) {
584 lexer_token.datatype = type_unsigned_long;
588 unsigned long long uv = (unsigned long long) v;
589 if (is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
590 lexer_token.datatype = type_unsigned_long_long;
594 lexer_token.datatype = type_long_long;
596 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
597 if (!min_long && v <= TARGET_UINT_MAX) {
598 lexer_token.datatype = type_unsigned_int;
601 if (!min_longlong && v <= TARGET_ULONG_MAX) {
602 lexer_token.datatype = type_unsigned_long;
605 lexer_token.datatype = type_unsigned_long_long;
609 static void parse_floating_suffix(void)
612 /* TODO: do something useful with the suffixes... */
615 if (warning.traditional) {
616 warningf(&lexer_token.source_position,
617 "traditional C rejects the 'F' suffix");
620 lexer_token.datatype = type_float;
624 if (warning.traditional) {
625 warningf(&lexer_token.source_position,
626 "traditional C rejects the 'F' suffix");
629 lexer_token.datatype = type_long_double;
632 lexer_token.datatype = type_double;
/**
 * Returns the value of a digit character (hex letters in either case),
 * or -1 if ch is not a digit. A switch is used so that no assumption about
 * the ordering of letters in the execution character set is needed.
 */
static int int_string_digit_value(char const ch)
{
	switch (ch) {
	case '0': return 0;
	case '1': return 1;
	case '2': return 2;
	case '3': return 3;
	case '4': return 4;
	case '5': return 5;
	case '6': return 6;
	case '7': return 7;
	case '8': return 8;
	case '9': return 9;
	case 'a': case 'A': return 10;
	case 'b': case 'B': return 11;
	case 'c': case 'C': return 12;
	case 'd': case 'D': return 13;
	case 'e': case 'E': return 14;
	case 'f': case 'F': return 15;
	default:            return -1;
	}
}

/**
 * A replacement for strtoull. Only those parts needed for
 * our parser are implemented.
 *
 * Digits of the given base are consumed from s. Parsing stops at the first
 * character that is not a digit of that base, or at the first digit that
 * would push the value beyond 64 bits (0xFFFFFFFFFFFFFFFF).
 *
 * @param s       the digit string (no sign and no "0x" prefix)
 * @param endptr  receives the position where parsing stopped; it points at
 *                the terminating '\0' only if the whole string was consumed,
 *                so callers detect an overlong literal by checking *endptr
 * @param base    8, 10 or 16; for any other base nothing is consumed
 * @return the value of the digits that were consumed
 */
static unsigned long long parse_int_string(const char *s, const char **endptr, int base)
{
	unsigned long long const max = 0xFFFFFFFFFFFFFFFFULL;
	unsigned long long       v   = 0;

	if (base == 8 || base == 10 || base == 16) {
		unsigned long long const radix = (unsigned long long)base;
		for (;; ++s) {
			int const digit = int_string_digit_value(*s);
			if (digit < 0 || digit >= base)
				break;
			/* check for overrun: stop before v * radix + digit exceeds max.
			 * Taking the digit into account matters for base 10, where
			 * 0x1999999999999999 * 10 still fits but only with digits 0..5. */
			if (v > (max - (unsigned long long)digit) / radix)
				break;
			v = v * radix + (unsigned long long)digit;
		}
	}

	*endptr = s;
	return v;
}
723 * Parses a hex number including hex floats and set the
726 static void parse_number_hex(void)
728 bool is_float = false;
729 assert(c == 'x' || c == 'X');
732 obstack_1grow(&symbol_obstack, '0');
733 obstack_1grow(&symbol_obstack, 'x');
736 obstack_1grow(&symbol_obstack, (char) c);
741 obstack_1grow(&symbol_obstack, (char) c);
744 while (isxdigit(c)) {
745 obstack_1grow(&symbol_obstack, (char) c);
750 if (c == 'p' || c == 'P') {
751 obstack_1grow(&symbol_obstack, (char) c);
754 if (c == '-' || c == '+') {
755 obstack_1grow(&symbol_obstack, (char) c);
759 while (isxdigit(c)) {
760 obstack_1grow(&symbol_obstack, (char) c);
766 obstack_1grow(&symbol_obstack, '\0');
767 char *string = obstack_finish(&symbol_obstack);
768 if(*string == '\0') {
769 parse_error("invalid hex number");
770 lexer_token.type = T_ERROR;
771 obstack_free(&symbol_obstack, string);
777 lexer_token.type = T_FLOATINGPOINT;
778 lexer_token.v.floatvalue = strtold(string, &endptr);
780 if(*endptr != '\0') {
781 parse_error("invalid hex float literal");
784 parse_floating_suffix();
787 lexer_token.type = T_INTEGER;
788 lexer_token.v.intvalue = parse_int_string(string + 2, &endptr, 16);
789 if(*endptr != '\0') {
790 parse_error("hex number literal too long");
792 parse_integer_suffix(true);
795 obstack_free(&symbol_obstack, string);
799 * Returns true if the given char is an octal digit.
801 * @param chr the character to check
803 static inline bool is_octal_digit(utf32 chr)
821 * Parses an octal number and sets the lexer_token.
823 static void parse_number_oct(void)
825 while(is_octal_digit(c)) {
826 obstack_1grow(&symbol_obstack, (char) c);
829 obstack_1grow(&symbol_obstack, '\0');
830 char *string = obstack_finish(&symbol_obstack);
833 lexer_token.type = T_INTEGER;
834 lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
835 if(*endptr != '\0') {
836 parse_error("octal number literal too long");
839 obstack_free(&symbol_obstack, string);
840 parse_integer_suffix(true);
844 * Parses a decimal including float number and set the
847 static void parse_number_dec(void)
849 bool is_float = false;
851 obstack_1grow(&symbol_obstack, (char) c);
856 obstack_1grow(&symbol_obstack, '.');
860 obstack_1grow(&symbol_obstack, (char) c);
865 if(c == 'e' || c == 'E') {
866 obstack_1grow(&symbol_obstack, (char) c);
869 if(c == '-' || c == '+') {
870 obstack_1grow(&symbol_obstack, (char) c);
875 obstack_1grow(&symbol_obstack, (char) c);
881 obstack_1grow(&symbol_obstack, '\0');
882 char *string = obstack_finish(&symbol_obstack);
886 lexer_token.type = T_FLOATINGPOINT;
887 lexer_token.v.floatvalue = strtold(string, &endptr);
889 if(*endptr != '\0') {
890 parse_error("invalid number literal");
893 parse_floating_suffix();
896 lexer_token.type = T_INTEGER;
897 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
899 if(*endptr != '\0') {
900 parse_error("invalid number literal");
903 parse_integer_suffix(false);
905 obstack_free(&symbol_obstack, string);
909 * Parses a number and sets the lexer_token.
911 static void parse_number(void)
933 parse_error("invalid octal number");
934 lexer_token.type = T_ERROR;
940 obstack_1grow(&symbol_obstack, '0');
950 * Returns the value of a digit.
951 * The only portable way to do it ...
953 static int digit_value(utf32 const digit)
979 internal_error("wrong character given");
984 * Parses an octal character sequence.
986 * @param first_digit the already read first digit
988 static utf32 parse_octal_sequence(utf32 const first_digit)
990 assert(is_octal_digit(first_digit));
991 utf32 value = digit_value(first_digit);
992 if (!is_octal_digit(c)) return value;
993 value = 8 * value + digit_value(c);
995 if (!is_octal_digit(c)) return value;
996 value = 8 * value + digit_value(c);
1002 * Parses a hex character sequence.
1004 static utf32 parse_hex_sequence(void)
1007 while(isxdigit(c)) {
1008 value = 16 * value + digit_value(c);
1015 * Parse an escape sequence.
1017 static utf32 parse_escape_sequence(void)
1025 case '"': return '"';
1026 case '\'': return '\'';
1027 case '\\': return '\\';
1028 case '?': return '\?';
1029 case 'a': return '\a';
1030 case 'b': return '\b';
1031 case 'f': return '\f';
1032 case 'n': return '\n';
1033 case 'r': return '\r';
1034 case 't': return '\t';
1035 case 'v': return '\v';
1037 return parse_hex_sequence();
1046 return parse_octal_sequence(ec);
1048 parse_error("reached end of file while parsing escape sequence");
1050 /* \E is not documented, but handled, by GCC. It is acceptable according
1051 * to §6.11.4, whereas \e is not. */
1055 return 27; /* hopefully 27 is ALWAYS the code for ESCAPE */
1058 /* §6.4.4.4:8 footnote 64 */
1059 parse_error("unknown escape sequence");
1065 * Concatenate two strings.
1067 string_t concat_strings(const string_t *const s1, const string_t *const s2)
1069 const size_t len1 = s1->size - 1;
1070 const size_t len2 = s2->size - 1;
1072 char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
1073 memcpy(concat, s1->begin, len1);
1074 memcpy(concat + len1, s2->begin, len2 + 1);
1076 if (warning.traditional) {
1077 warningf(&lexer_token.source_position,
1078 "traditional C rejects string constant concatenation");
1080 #if 0 /* TODO hash */
1081 const char *result = strset_insert(&stringset, concat);
1082 if(result != concat) {
1083 obstack_free(&symbol_obstack, concat);
1088 return (string_t){ concat, len1 + len2 + 1 };
1093 * Concatenate a string and a wide string.
1095 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
1097 const size_t len1 = s1->size - 1;
1098 const size_t len2 = s2->size - 1;
1100 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1101 const char *const src = s1->begin;
1102 for (size_t i = 0; i != len1; ++i) {
1105 memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1106 if (warning.traditional) {
1107 warningf(&lexer_token.source_position,
1108 "traditional C rejects string constant concatenation");
1111 return (wide_string_t){ concat, len1 + len2 + 1 };
1115 * Concatenate two wide strings.
1117 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
1119 const size_t len1 = s1->size - 1;
1120 const size_t len2 = s2->size - 1;
1122 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1123 memcpy(concat, s1->begin, len1 * sizeof(*concat));
1124 memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1125 if (warning.traditional) {
1126 warningf(&lexer_token.source_position,
1127 "traditional C rejects string constant concatenation");
1130 return (wide_string_t){ concat, len1 + len2 + 1 };
1134 * Concatenate a wide string and a string.
1136 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
1138 const size_t len1 = s1->size - 1;
1139 const size_t len2 = s2->size - 1;
1141 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1142 memcpy(concat, s1->begin, len1 * sizeof(*concat));
1143 const char *const src = s2->begin;
1144 wchar_rep_t *const dst = concat + len1;
1145 for (size_t i = 0; i != len2 + 1; ++i) {
1148 if (warning.traditional) {
1149 warningf(&lexer_token.source_position,
1150 "traditional C rejects string constant concatenation");
1153 return (wide_string_t){ concat, len1 + len2 + 1 };
1156 static void grow_symbol(utf32 const tc)
1158 struct obstack *const o = &symbol_obstack;
1160 obstack_1grow(o, tc);
1161 } else if (tc < 0x800) {
1162 obstack_1grow(o, 0xC0 | (tc >> 6));
1163 obstack_1grow(o, 0x80 | (tc & 0x3F));
1164 } else if (tc < 0x10000) {
1165 obstack_1grow(o, 0xE0 | ( tc >> 12));
1166 obstack_1grow(o, 0x80 | ((tc >> 6) & 0x3F));
1167 obstack_1grow(o, 0x80 | ( tc & 0x3F));
1169 obstack_1grow(o, 0xF0 | ( tc >> 18));
1170 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
1171 obstack_1grow(o, 0x80 | ((tc >> 6) & 0x3F));
1172 obstack_1grow(o, 0x80 | ( tc & 0x3F));
1177 * Parse a string literal and set lexer_token.
1179 static void parse_string_literal(void)
1181 const unsigned start_linenr = lexer_token.source_position.linenr;
1188 utf32 const tc = parse_escape_sequence();
1190 warningf(&lexer_token.source_position,
1191 "escape sequence out of range");
1193 obstack_1grow(&symbol_obstack, tc);
1198 source_position_t source_position;
1199 source_position.input_name = lexer_token.source_position.input_name;
1200 source_position.linenr = start_linenr;
1201 errorf(&source_position, "string has no end");
1202 lexer_token.type = T_ERROR;
1219 /* TODO: concatenate multiple strings separated by whitespace... */
1221 /* add finishing 0 to the string */
1222 obstack_1grow(&symbol_obstack, '\0');
1223 const size_t size = (size_t)obstack_object_size(&symbol_obstack);
1224 const char *const string = obstack_finish(&symbol_obstack);
1226 #if 0 /* TODO hash */
1227 /* check if there is already a copy of the string */
1228 result = strset_insert(&stringset, string);
1229 if(result != string) {
1230 obstack_free(&symbol_obstack, string);
1233 const char *const result = string;
1236 lexer_token.type = T_STRING_LITERAL;
1237 lexer_token.v.string.begin = result;
1238 lexer_token.v.string.size = size;
1242 * Parse a wide character constant and set lexer_token.
1244 static void parse_wide_character_constant(void)
1246 const unsigned start_linenr = lexer_token.source_position.linenr;
1253 wchar_rep_t tc = parse_escape_sequence();
1254 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1259 parse_error("newline while parsing character constant");
1265 goto end_of_wide_char_constant;
1268 source_position_t source_position = lexer_token.source_position;
1269 source_position.linenr = start_linenr;
1270 errorf(&source_position, "EOF while parsing character constant");
1271 lexer_token.type = T_ERROR;
1276 wchar_rep_t tc = (wchar_rep_t) c;
1277 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1284 end_of_wide_char_constant:;
1285 size_t size = (size_t) obstack_object_size(&symbol_obstack);
1286 assert(size % sizeof(wchar_rep_t) == 0);
1287 size /= sizeof(wchar_rep_t);
1289 const wchar_rep_t *string = obstack_finish(&symbol_obstack);
1291 lexer_token.type = T_WIDE_CHARACTER_CONSTANT;
1292 lexer_token.v.wide_string.begin = string;
1293 lexer_token.v.wide_string.size = size;
1294 lexer_token.datatype = type_wchar_t;
1298 * Parse a wide string literal and set lexer_token.
1300 static void parse_wide_string_literal(void)
1302 const unsigned start_linenr = lexer_token.source_position.linenr;
1310 wchar_rep_t tc = parse_escape_sequence();
1311 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1316 source_position_t source_position;
1317 source_position.input_name = lexer_token.source_position.input_name;
1318 source_position.linenr = start_linenr;
1319 errorf(&source_position, "string has no end");
1320 lexer_token.type = T_ERROR;
1330 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1339 /* TODO: concatenate multiple strings separated by whitespace... */
1341 /* add finishing 0 to the string */
1342 wchar_rep_t nul = L'\0';
1343 obstack_grow(&symbol_obstack, &nul, sizeof(nul));
1344 const size_t size = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
1345 const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1347 #if 0 /* TODO hash */
1348 /* check if there is already a copy of the string */
1349 const wchar_rep_t *const result = strset_insert(&stringset, string);
1350 if(result != string) {
1351 obstack_free(&symbol_obstack, string);
1354 const wchar_rep_t *const result = string;
1357 lexer_token.type = T_WIDE_STRING_LITERAL;
1358 lexer_token.v.wide_string.begin = result;
1359 lexer_token.v.wide_string.size = size;
1363 * Parse a character constant and set lexer_token.
1365 static void parse_character_constant(void)
1367 const unsigned start_linenr = lexer_token.source_position.linenr;
1374 utf32 const tc = parse_escape_sequence();
1376 warningf(&lexer_token.source_position,
1377 "escape sequence out of range");
1379 obstack_1grow(&symbol_obstack, tc);
1384 parse_error("newline while parsing character constant");
1390 goto end_of_char_constant;
1393 source_position_t source_position;
1394 source_position.input_name = lexer_token.source_position.input_name;
1395 source_position.linenr = start_linenr;
1396 errorf(&source_position, "EOF while parsing character constant");
1397 lexer_token.type = T_ERROR;
1409 end_of_char_constant:;
1410 const size_t size = (size_t)obstack_object_size(&symbol_obstack);
1411 const char *const string = obstack_finish(&symbol_obstack);
1413 lexer_token.type = T_CHARACTER_CONSTANT;
1414 lexer_token.v.string.begin = string;
1415 lexer_token.v.string.size = size;
1416 lexer_token.datatype = c_mode & _CXX && size == 1 ? type_char : type_int;
1420 * Skip a multiline comment.
1422 static void skip_multiline_comment(void)
1424 unsigned start_linenr = lexer_token.source_position.linenr;
1431 /* nested comment, warn here */
1432 if (warning.comment) {
1433 warningf(&lexer_token.source_position, "'/*' within comment");
1445 MATCH_NEWLINE(break;)
1448 source_position_t source_position;
1449 source_position.input_name = lexer_token.source_position.input_name;
1450 source_position.linenr = start_linenr;
1451 errorf(&source_position, "at end of file while looking for comment end");
1463 * Skip a single line comment.
1465 static void skip_line_comment(void)
1478 if (c == '\n' || c == '\r') {
1479 if (warning.comment)
1480 warningf(&lexer_token.source_position, "multi-line comment");
1492 /** The current preprocessor token. */
1493 static token_t pp_token;
1496 * Read the next preprocessor token.
1498 static inline void next_pp_token(void)
1500 lexer_next_preprocessing_token();
1501 pp_token = lexer_token;
1505 * Eat all preprocessor tokens until newline.
1507 static void eat_until_newline(void)
1509 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1515 * Handle the define directive.
1517 static void define_directive(void)
1519 lexer_next_preprocessing_token();
1520 if(lexer_token.type != T_IDENTIFIER) {
1521 parse_error("expected identifier after #define\n");
1522 eat_until_newline();
1527 * Handle the ifdef directive.
1529 static void ifdef_directive(int is_ifndef)
1532 lexer_next_preprocessing_token();
1533 //expect_identifier();
1538 * Handle the endif directive.
1540 static void endif_directive(void)
1546 * Parse the line directive.
1548 static void parse_line_directive(void)
1550 if(pp_token.type != T_INTEGER) {
1551 parse_error("expected integer");
1553 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1556 if(pp_token.type == T_STRING_LITERAL) {
1557 lexer_token.source_position.input_name = pp_token.v.string.begin;
1561 eat_until_newline();
1567 typedef enum stdc_pragma_kind_t {
1571 STDC_CX_LIMITED_RANGE
1572 } stdc_pragma_kind_t;
1575 * STDC pragma values.
1577 typedef enum stdc_pragma_value_kind_t {
1582 } stdc_pragma_value_kind_t;
1585 * Parse a pragma directive.
1587 static void parse_pragma(void)
1589 bool unknown_pragma = true;
1592 if (pp_token.v.symbol->pp_ID == TP_STDC) {
1593 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1595 if (c_mode & _C99) {
1598 switch (pp_token.v.symbol->pp_ID) {
1599 case TP_FP_CONTRACT:
1600 kind = STDC_FP_CONTRACT;
1602 case TP_FENV_ACCESS:
1603 kind = STDC_FENV_ACCESS;
1605 case TP_CX_LIMITED_RANGE:
1606 kind = STDC_CX_LIMITED_RANGE;
1611 if (kind != STDC_UNKNOWN) {
1612 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1614 switch (pp_token.v.symbol->pp_ID) {
1616 value = STDC_VALUE_ON;
1619 value = STDC_VALUE_OFF;
1622 value = STDC_VALUE_DEFAULT;
1627 if (value != STDC_VALUE_UNKNOWN) {
1628 unknown_pragma = false;
1630 errorf(&pp_token.source_position, "bad STDC pragma argument");
1635 unknown_pragma = true;
1637 eat_until_newline();
1638 if (unknown_pragma && warning.unknown_pragmas) {
1639 warningf(&pp_token.source_position, "encountered unknown #pragma");
1644 * Parse a preprocessor non-null directive.
1646 static void parse_preprocessor_identifier(void)
1648 assert(pp_token.type == T_IDENTIFIER);
1649 symbol_t *symbol = pp_token.v.symbol;
1651 switch(symbol->pp_ID) {
1653 printf("include - enable header name parsing!\n");
1669 parse_line_directive();
1676 /* TODO; output the rest of the line */
1677 parse_error("#error directive: ");
1686 * Parse a preprocessor directive.
1688 static void parse_preprocessor_directive(void)
1692 switch(pp_token.type) {
1694 parse_preprocessor_identifier();
1697 parse_line_directive();
1700 /* NULL directive, see § 6.10.7 */
1703 parse_error("invalid preprocessor directive");
1704 eat_until_newline();
1709 #define MAYBE_PROLOG \
1714 #define MAYBE(ch, set_type) \
1717 lexer_token.type = set_type; \
1720 #define ELSE_CODE(code) \
1724 } /* end of while(1) */ \
1727 #define ELSE(set_type) \
1729 lexer_token.type = set_type; \
1733 void lexer_next_preprocessing_token(void)
1743 lexer_token.type = '\n';
1749 /* might be a wide string ( L"string" ) */
1750 if(lexer_token.type == T_IDENTIFIER &&
1751 lexer_token.v.symbol == symbol_L) {
1753 parse_wide_string_literal();
1754 } else if(c == '\'') {
1755 parse_wide_character_constant();
1765 parse_string_literal();
1769 parse_character_constant();
1782 MAYBE('.', T_DOTDOTDOT)
1786 lexer_token.type = '.';
1792 MAYBE('&', T_ANDAND)
1793 MAYBE('=', T_ANDEQUAL)
1797 MAYBE('=', T_ASTERISKEQUAL)
1801 MAYBE('+', T_PLUSPLUS)
1802 MAYBE('=', T_PLUSEQUAL)
1806 MAYBE('>', T_MINUSGREATER)
1807 MAYBE('-', T_MINUSMINUS)
1808 MAYBE('=', T_MINUSEQUAL)
1812 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1816 MAYBE('=', T_SLASHEQUAL)
1819 skip_multiline_comment();
1820 lexer_next_preprocessing_token();
1824 skip_line_comment();
1825 lexer_next_preprocessing_token();
1831 MAYBE('=', T_PERCENTEQUAL)
1836 MAYBE(':', T_HASHHASH)
1840 lexer_token.type = '#';
1849 MAYBE('=', T_LESSEQUAL)
1852 MAYBE('=', T_LESSLESSEQUAL)
1857 MAYBE('=', T_GREATEREQUAL)
1860 MAYBE('=', T_GREATERGREATEREQUAL)
1861 ELSE(T_GREATERGREATER)
1865 MAYBE('=', T_CARETEQUAL)
1869 MAYBE('=', T_PIPEEQUAL)
1870 MAYBE('|', T_PIPEPIPE)
1878 MAYBE('=', T_EQUALEQUAL)
1882 MAYBE('#', T_HASHHASH)
1896 lexer_token.type = c;
1901 lexer_token.type = T_EOF;
1906 errorf(&lexer_token.source_position, "unknown character '%c' found", c);
1908 lexer_token.type = T_ERROR;
1914 void lexer_next_token(void)
1916 lexer_next_preprocessing_token();
1918 while (lexer_token.type == '\n') {
1920 lexer_next_preprocessing_token();
1923 if (lexer_token.type == '#') {
1924 parse_preprocessor_directive();
1929 void init_lexer(void)
1931 strset_init(&stringset);
1932 symbol_L = symbol_table_insert("L");
1935 void lexer_open_stream(FILE *stream, const char *input_name)
1938 lexer_token.source_position.linenr = 0;
1939 lexer_token.source_position.input_name = input_name;
1944 /* place a virtual \n at the beginning so the lexer knows that we're
1945 * at the beginning of a line */
1949 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
1952 lexer_token.source_position.linenr = 0;
1953 lexer_token.source_position.input_name = input_name;
1957 bufend = buffer + len;
1961 panic("builtin lexing not done yet");
1964 /* place a virtual \n at the beginning so the lexer knows that we're
1965 * at the beginning of a line */
1969 void exit_lexer(void)
1971 strset_destroy(&stringset);
1974 static __attribute__((unused))
1975 void dbg_pos(const source_position_t source_position)
1977 fprintf(stdout, "%s:%u\n", source_position.input_name,
1978 source_position.linenr);