2 * This file is part of cparser.
3 * Copyright (C) 2007-2009 Matthias Braun <matze@braunis.de>
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
22 #include "diagnostic.h"
26 #include "symbol_table_t.h"
27 #include "adt/error.h"
28 #include "adt/strset.h"
32 #include "target_architecture.h"
35 #include "lang_features.h"
51 #if defined(_WIN32) || defined(__CYGWIN__)
52 /* No strtold on windows and no replacement yet */
53 #define strtold(s, e) strtod(s, e)
56 typedef unsigned int utf32;
62 static utf32 buf[BUF_SIZE + MAX_PUTBACK];
63 static const utf32 *bufend;
64 static const utf32 *bufpos;
65 static strset_t stringset;
66 bool allow_dollar_in_symbol = true;
69 * Prints a parse error message at the current token.
71 * @param msg the error message
73 static void parse_error(const char *msg)
75 errorf(&lexer_token.source_position, "%s", msg);
79 * Prints an internal error message at the current token.
81 * @param msg the error message
83 static NORETURN internal_error(const char *msg)
85 internal_errorf(&lexer_token.source_position, "%s", msg);
88 static size_t read_block(unsigned char *const read_buf, size_t const n)
90 size_t const s = fread(read_buf, 1, n, input);
93 parse_error("read from input failed");
94 buf[MAX_PUTBACK] = EOF;
95 bufpos = buf + MAX_PUTBACK;
96 bufend = buf + MAX_PUTBACK + 1;
101 static void decode_iso_8859_1(void)
103 unsigned char read_buf[BUF_SIZE];
104 size_t const s = read_block(read_buf, sizeof(read_buf));
108 unsigned char const *src = read_buf;
109 unsigned char const *end = read_buf + s;
110 utf32 *dst = buf + MAX_PUTBACK;
114 bufpos = buf + MAX_PUTBACK;
118 static void decode_iso_8859_15(void)
120 unsigned char read_buf[BUF_SIZE];
121 size_t const s = read_block(read_buf, sizeof(read_buf));
125 unsigned char const *src = read_buf;
126 unsigned char const *end = read_buf + s;
127 utf32 *dst = buf + MAX_PUTBACK;
131 case 0xA4: tc = 0x20AC; break; // €
132 case 0xA6: tc = 0x0160; break; // Š
133 case 0xA8: tc = 0x0161; break; // š
134 case 0xB4: tc = 0x017D; break; // Ž
135 case 0xB8: tc = 0x017E; break; // ž
136 case 0xBC: tc = 0x0152; break; // Œ
137 case 0xBD: tc = 0x0153; break; // œ
138 case 0xBE: tc = 0x0178; break; // Ÿ
143 bufpos = buf + MAX_PUTBACK;
147 static void decode_utf8(void)
149 static utf32 part_decoded_min_code;
150 static utf32 part_decoded_char;
151 static size_t part_decoded_rest_len;
154 unsigned char read_buf[BUF_SIZE];
155 size_t const s = read_block(read_buf, sizeof(read_buf));
157 if (part_decoded_rest_len > 0)
158 parse_error("incomplete input char at end of input");
162 unsigned char const *src = read_buf;
163 unsigned char const *end = read_buf + s;
164 utf32 *dst = buf + MAX_PUTBACK;
168 if (part_decoded_rest_len != 0) {
169 min_code = part_decoded_min_code;
170 decoded = part_decoded_char;
171 size_t const rest_len = part_decoded_rest_len;
172 part_decoded_rest_len = 0;
174 case 4: goto realign;
175 case 3: goto three_more;
176 case 2: goto two_more;
177 default: goto one_more;
182 if ((*src & 0x80) == 0) {
184 } else if ((*src & 0xE0) == 0xC0) {
186 decoded = *src++ & 0x1F;
189 part_decoded_min_code = min_code;
190 part_decoded_char = decoded;
191 part_decoded_rest_len = 1;
194 if ((*src & 0xC0) == 0x80) {
195 decoded = (decoded << 6) | (*src++ & 0x3F);
199 if (decoded < min_code ||
200 decoded > 0x10FFFF ||
201 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
202 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
203 (decoded & 0xFFFE) == 0xFFFE) { // noncharacters
204 parse_error("invalid byte sequence in input");
206 } else if ((*src & 0xF0) == 0xE0) {
208 decoded = *src++ & 0x0F;
211 part_decoded_min_code = min_code;
212 part_decoded_char = decoded;
213 part_decoded_rest_len = 2;
216 if ((*src & 0xC0) == 0x80) {
217 decoded = (decoded << 6) | (*src++ & 0x3F);
222 } else if ((*src & 0xF8) == 0xF0) {
224 decoded = *src++ & 0x07;
227 part_decoded_min_code = min_code;
228 part_decoded_char = decoded;
229 part_decoded_rest_len = 3;
232 if ((*src & 0xC0) == 0x80) {
233 decoded = (decoded << 6) | (*src++ & 0x3F);
240 parse_error("invalid byte sequence in input");
245 part_decoded_rest_len = 4;
248 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
254 bufpos = buf + MAX_PUTBACK;
256 } while (bufpos == bufend);
259 static void decode_windows_1252(void)
261 unsigned char read_buf[BUF_SIZE];
262 size_t const s = read_block(read_buf, sizeof(read_buf));
266 unsigned char const *src = read_buf;
267 unsigned char const *end = read_buf + s;
268 utf32 *dst = buf + MAX_PUTBACK;
272 case 0x80: tc = 0x20AC; break; // €
273 case 0x82: tc = 0x201A; break; // ‚
274 case 0x83: tc = 0x0192; break; // ƒ
275 case 0x84: tc = 0x201E; break; // „
276 case 0x85: tc = 0x2026; break; // …
277 case 0x86: tc = 0x2020; break; // †
278 case 0x87: tc = 0x2021; break; // ‡
279 case 0x88: tc = 0x02C6; break; // ˆ
280 case 0x89: tc = 0x2030; break; // ‰
281 case 0x8A: tc = 0x0160; break; // Š
282 case 0x8B: tc = 0x2039; break; // ‹
283 case 0x8C: tc = 0x0152; break; // Œ
284 case 0x8E: tc = 0x017D; break; // Ž
285 case 0x91: tc = 0x2018; break; // ‘
286 case 0x92: tc = 0x2019; break; // ’
287 case 0x93: tc = 0x201C; break; // “
288 case 0x94: tc = 0x201D; break; // ”
289 case 0x95: tc = 0x2022; break; // •
290 case 0x96: tc = 0x2013; break; // –
291 case 0x97: tc = 0x2014; break; // —
292 case 0x98: tc = 0x02DC; break; // ˜
293 case 0x99: tc = 0x2122; break; // ™
294 case 0x9A: tc = 0x0161; break; // š
295 case 0x9B: tc = 0x203A; break; // ›
296 case 0x9C: tc = 0x0153; break; // œ
297 case 0x9E: tc = 0x017E; break; // ž
298 case 0x9F: tc = 0x0178; break; // Ÿ
303 bufpos = buf + MAX_PUTBACK;
307 typedef void (*decoder_t)(void);
309 static decoder_t decoder = decode_utf8;
311 typedef struct named_decoder_t {
/* Encoding names accepted by select_input_encoding(), each paired with the
 * decoder function it selects. Names are matched case-insensitively. */
316 static named_decoder_t const decoders[] = {
317 { "CP819", decode_iso_8859_1 }, // official alias
318 { "IBM819", decode_iso_8859_1 }, // official alias
319 { "ISO-8859-1", decode_iso_8859_1 }, // official alias
320 { "ISO-8859-15", decode_iso_8859_15 }, // official name
321 { "ISO8859-1", decode_iso_8859_1 },
322 { "ISO8859-15", decode_iso_8859_15 },
323 { "ISO_8859-1", decode_iso_8859_1 }, // official alias
324 { "ISO_8859-15", decode_iso_8859_15 }, // official alias
325 { "ISO_8859-1:1987", decode_iso_8859_1 }, // official name
326 { "Latin-9", decode_iso_8859_15 }, // official alias
327 { "UTF-8", decode_utf8 }, // official name
328 { "csISOLatin1", decode_iso_8859_1 }, // official alias
329 { "cp1252", decode_windows_1252 },
330 { "iso-ir-100", decode_iso_8859_1 }, // official alias
331 { "l1", decode_iso_8859_1 }, // official alias
332 { "latin1", decode_iso_8859_1 }, // official alias
333 { "windows-1252", decode_windows_1252 }, // official name
338 void select_input_encoding(char const* const encoding)
340 for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
341 if (strcasecmp(encoding, i->name) != 0)
343 decoder = i->decoder;
346 fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
349 static inline void next_real_char(void)
351 assert(bufpos <= bufend);
352 if (bufpos >= bufend) {
363 * Put a character back into the buffer.
365 * @param pc the character to put back
367 static inline void put_back(utf32 const pc)
369 assert(bufpos > buf);
370 *(--bufpos - buf + buf) = pc;
373 printf("putback '%lc'\n", pc);
377 static inline void next_char(void);
379 #define MATCH_NEWLINE(code) \
385 lexer_token.source_position.linenr++; \
389 lexer_token.source_position.linenr++; \
/* Asserts that the current character c equals c_type, then advances to the
 * next character. The argument is parenthesised so that any expression can be
 * passed without changing the meaning of the comparison. */
#define eat(c_type) do { assert(c == (c_type)); next_char(); } while(0)
394 static void maybe_concat_lines(void)
399 MATCH_NEWLINE(return;)
410 * Set c to the next input character, ie.
411 * after expanding trigraphs.
413 static inline void next_char(void)
417 /* filter trigraphs */
418 if(UNLIKELY(c == '\\')) {
419 maybe_concat_lines();
420 goto end_of_next_char;
424 goto end_of_next_char;
427 if(LIKELY(c != '?')) {
430 goto end_of_next_char;
435 case '=': c = '#'; break;
436 case '(': c = '['; break;
437 case '/': c = '\\'; maybe_concat_lines(); break;
438 case ')': c = ']'; break;
439 case '\'': c = '^'; break;
440 case '<': c = '{'; break;
441 case '!': c = '|'; break;
442 case '>': c = '}'; break;
443 case '-': c = '~'; break;
453 printf("nchar '%c'\n", c);
457 #define SYMBOL_CHARS \
458 case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
526 * Read a symbol from the input and build
529 static void parse_symbol(void)
534 obstack_1grow(&symbol_obstack, (char) c);
541 obstack_1grow(&symbol_obstack, (char) c);
552 obstack_1grow(&symbol_obstack, '\0');
554 string = obstack_finish(&symbol_obstack);
555 symbol = symbol_table_insert(string);
557 lexer_token.type = symbol->ID;
558 lexer_token.v.symbol = symbol;
560 if(symbol->string != string) {
561 obstack_free(&symbol_obstack, string);
565 static void parse_integer_suffix(bool is_oct_hex)
567 bool is_unsigned = false;
568 bool min_long = false;
569 bool min_longlong = false;
570 bool not_traditional = false;
574 if (c == 'U' || c == 'u') {
575 not_traditional = true;
576 suffix[pos++] = toupper(c);
579 if (c == 'L' || c == 'l') {
580 suffix[pos++] = toupper(c);
583 if (c == 'L' || c == 'l') {
584 suffix[pos++] = toupper(c);
589 } else if (c == 'l' || c == 'L') {
590 suffix[pos++] = toupper(c);
593 if (c == 'l' || c == 'L') {
594 not_traditional = true;
595 suffix[pos++] = toupper(c);
598 if (c == 'u' || c == 'U') {
599 suffix[pos++] = toupper(c);
603 } else if (c == 'u' || c == 'U') {
604 not_traditional = true;
605 suffix[pos++] = toupper(c);
608 lexer_token.datatype = type_unsigned_long;
612 if (warning.traditional && not_traditional) {
614 warningf(&lexer_token.source_position,
615 "traditional C rejects the '%s' suffix", suffix);
618 long long v = lexer_token.v.intvalue;
620 if (v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
621 lexer_token.datatype = type_int;
623 } else if (is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
624 lexer_token.datatype = type_unsigned_int;
629 if (v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
630 lexer_token.datatype = type_long;
632 } else if (is_oct_hex && v >= 0 && (unsigned long long)v <= (unsigned long long)TARGET_ULONG_MAX) {
633 lexer_token.datatype = type_unsigned_long;
637 unsigned long long uv = (unsigned long long) v;
638 if (is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
639 lexer_token.datatype = type_unsigned_long_long;
643 lexer_token.datatype = type_long_long;
645 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
646 if (!min_long && v <= TARGET_UINT_MAX) {
647 lexer_token.datatype = type_unsigned_int;
650 if (!min_longlong && v <= TARGET_ULONG_MAX) {
651 lexer_token.datatype = type_unsigned_long;
654 lexer_token.datatype = type_unsigned_long_long;
658 static void parse_floating_suffix(void)
661 /* TODO: do something useful with the suffixes... */
664 if (warning.traditional) {
665 warningf(&lexer_token.source_position,
666 "traditional C rejects the 'F' suffix");
669 lexer_token.datatype = type_float;
673 if (warning.traditional) {
674 warningf(&lexer_token.source_position,
675 "traditional C rejects the 'F' suffix");
678 lexer_token.datatype = type_long_double;
681 lexer_token.datatype = type_double;
/**
 * Returns the numeric value of an ASCII digit or hex letter.
 *
 * A switch is used because only '0'..'9' are guaranteed to be contiguous in
 * the execution character set; it also avoids calling <ctype.h> functions
 * with a possibly negative plain char.
 *
 * @param ch  the character to classify
 * @return    the value 0..15, or -1 if ch is no digit in any supported base
 */
static int int_string_digit_value(char const ch)
{
	switch (ch) {
	case '0': return 0;
	case '1': return 1;
	case '2': return 2;
	case '3': return 3;
	case '4': return 4;
	case '5': return 5;
	case '6': return 6;
	case '7': return 7;
	case '8': return 8;
	case '9': return 9;
	case 'a': case 'A': return 10;
	case 'b': case 'B': return 11;
	case 'c': case 'C': return 12;
	case 'd': case 'D': return 13;
	case 'e': case 'E': return 14;
	case 'f': case 'F': return 15;
	default:            return -1;
	}
}

/**
 * A replacement for strtoull. Only those parts needed for
 * our parser are implemented.
 *
 * Parsing stops at the first character that is not a digit of the given
 * base, or at the first digit that would make the value exceed the range of
 * unsigned long long. Callers detect both cases by checking whether *endptr
 * points at the terminating '\0'.
 *
 * @param s       the string to parse (no prefix, no sign, no whitespace)
 * @param endptr  receives a pointer to the first character not consumed
 * @param base    the number base; must be 8, 10 or 16
 * @return        the value of the digits consumed
 */
static unsigned long long parse_int_string(const char *s, const char **endptr, int base)
{
	unsigned long long const max = ~0ULL;
	unsigned long long       v   = 0;

	if (base != 8 && base != 10 && base != 16) {
		assert(0);
		*endptr = s;
		return 0;
	}

	unsigned long long const ubase = (unsigned long long)base;
	for (;; ++s) {
		int const digit = int_string_digit_value(*s);
		if (digit < 0 || digit >= base)
			break;
		/* check for overrun: v * base + digit has to fit, so the digit
		 * itself must be taken into account (matters for base 10) */
		if (v > (max - (unsigned long long)digit) / ubase)
			break;
		v = v * ubase + (unsigned long long)digit;
	}

	*endptr = s;
	return v;
}
772 * Parses a hex number including hex floats and set the
775 static void parse_number_hex(void)
777 bool is_float = false;
778 assert(c == 'x' || c == 'X');
781 obstack_1grow(&symbol_obstack, '0');
782 obstack_1grow(&symbol_obstack, 'x');
785 obstack_1grow(&symbol_obstack, (char) c);
790 obstack_1grow(&symbol_obstack, (char) c);
793 while (isxdigit(c)) {
794 obstack_1grow(&symbol_obstack, (char) c);
799 if (c == 'p' || c == 'P') {
800 obstack_1grow(&symbol_obstack, (char) c);
803 if (c == '-' || c == '+') {
804 obstack_1grow(&symbol_obstack, (char) c);
808 while (isxdigit(c)) {
809 obstack_1grow(&symbol_obstack, (char) c);
815 obstack_1grow(&symbol_obstack, '\0');
816 char *string = obstack_finish(&symbol_obstack);
817 if(*string == '\0') {
818 parse_error("invalid hex number");
819 lexer_token.type = T_ERROR;
820 obstack_free(&symbol_obstack, string);
826 lexer_token.type = T_FLOATINGPOINT;
827 lexer_token.v.floatvalue = strtold(string, &endptr);
829 if(*endptr != '\0') {
830 parse_error("invalid hex float literal");
833 parse_floating_suffix();
836 lexer_token.type = T_INTEGER;
837 lexer_token.v.intvalue = parse_int_string(string + 2, &endptr, 16);
838 if(*endptr != '\0') {
839 parse_error("hex number literal too long");
841 parse_integer_suffix(true);
844 obstack_free(&symbol_obstack, string);
848 * Returns true if the given char is a octal digit.
850 * @param char the character to check
852 static inline bool is_octal_digit(utf32 chr)
870 * Parses a octal number and set the lexer_token.
872 static void parse_number_oct(void)
874 while(is_octal_digit(c)) {
875 obstack_1grow(&symbol_obstack, (char) c);
878 obstack_1grow(&symbol_obstack, '\0');
879 char *string = obstack_finish(&symbol_obstack);
882 lexer_token.type = T_INTEGER;
883 lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
884 if(*endptr != '\0') {
885 parse_error("octal number literal too long");
888 obstack_free(&symbol_obstack, string);
889 parse_integer_suffix(true);
893 * Parses a decimal including float number and set the
896 static void parse_number_dec(void)
898 bool is_float = false;
900 obstack_1grow(&symbol_obstack, (char) c);
905 obstack_1grow(&symbol_obstack, '.');
909 obstack_1grow(&symbol_obstack, (char) c);
914 if(c == 'e' || c == 'E') {
915 obstack_1grow(&symbol_obstack, (char) c);
918 if(c == '-' || c == '+') {
919 obstack_1grow(&symbol_obstack, (char) c);
924 obstack_1grow(&symbol_obstack, (char) c);
930 obstack_1grow(&symbol_obstack, '\0');
931 char *string = obstack_finish(&symbol_obstack);
935 lexer_token.type = T_FLOATINGPOINT;
936 lexer_token.v.floatvalue = strtold(string, &endptr);
938 if(*endptr != '\0') {
939 parse_error("invalid number literal");
942 parse_floating_suffix();
945 lexer_token.type = T_INTEGER;
946 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
948 if(*endptr != '\0') {
949 parse_error("invalid number literal");
952 parse_integer_suffix(false);
954 obstack_free(&symbol_obstack, string);
958 * Parses a number and sets the lexer_token.
960 static void parse_number(void)
982 parse_error("invalid octal number");
983 lexer_token.type = T_ERROR;
989 obstack_1grow(&symbol_obstack, '0');
999 * Returns the value of a digit.
1000 * The only portable way to do it ...
1002 static int digit_value(utf32 const digit)
1016 case 'A': return 10;
1018 case 'B': return 11;
1020 case 'C': return 12;
1022 case 'D': return 13;
1024 case 'E': return 14;
1026 case 'F': return 15;
1028 internal_error("wrong character given");
1033 * Parses an octal character sequence.
1035 * @param first_digit the already read first digit
1037 static utf32 parse_octal_sequence(utf32 const first_digit)
1039 assert(is_octal_digit(first_digit));
1040 utf32 value = digit_value(first_digit);
1041 if (!is_octal_digit(c)) return value;
1042 value = 8 * value + digit_value(c);
1044 if (!is_octal_digit(c)) return value;
1045 value = 8 * value + digit_value(c);
1051 * Parses a hex character sequence.
1053 static utf32 parse_hex_sequence(void)
1056 while(isxdigit(c)) {
1057 value = 16 * value + digit_value(c);
1064 * Parse an escape sequence.
1066 static utf32 parse_escape_sequence(void)
1074 case '"': return '"';
1075 case '\'': return '\'';
1076 case '\\': return '\\';
1077 case '?': return '\?';
1078 case 'a': return '\a';
1079 case 'b': return '\b';
1080 case 'f': return '\f';
1081 case 'n': return '\n';
1082 case 'r': return '\r';
1083 case 't': return '\t';
1084 case 'v': return '\v';
1086 return parse_hex_sequence();
1095 return parse_octal_sequence(ec);
1097 parse_error("reached end of file while parsing escape sequence");
1099 /* \E is not documented, but handled, by GCC. It is acceptable according
1100 * to §6.11.4, whereas \e is not. */
1104 return 27; /* hopefully 27 is ALWAYS the code for ESCAPE */
1107 /* §6.4.4.4:8 footnote 64 */
1108 parse_error("unknown escape sequence");
1114 * Concatenate two strings.
1116 string_t concat_strings(const string_t *const s1, const string_t *const s2)
1118 const size_t len1 = s1->size - 1;
1119 const size_t len2 = s2->size - 1;
1121 char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
1122 memcpy(concat, s1->begin, len1);
1123 memcpy(concat + len1, s2->begin, len2 + 1);
1125 if (warning.traditional) {
1126 warningf(&lexer_token.source_position,
1127 "traditional C rejects string constant concatenation");
1129 #if 0 /* TODO hash */
1130 const char *result = strset_insert(&stringset, concat);
1131 if(result != concat) {
1132 obstack_free(&symbol_obstack, concat);
1137 return (string_t){ concat, len1 + len2 + 1 };
1142 * Concatenate a string and a wide string.
1144 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
1146 const size_t len1 = s1->size - 1;
1147 const size_t len2 = s2->size - 1;
1149 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1150 const char *const src = s1->begin;
1151 for (size_t i = 0; i != len1; ++i) {
1154 memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1155 if (warning.traditional) {
1156 warningf(&lexer_token.source_position,
1157 "traditional C rejects string constant concatenation");
1160 return (wide_string_t){ concat, len1 + len2 + 1 };
1164 * Concatenate two wide strings.
1166 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
1168 const size_t len1 = s1->size - 1;
1169 const size_t len2 = s2->size - 1;
1171 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1172 memcpy(concat, s1->begin, len1 * sizeof(*concat));
1173 memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1174 if (warning.traditional) {
1175 warningf(&lexer_token.source_position,
1176 "traditional C rejects string constant concatenation");
1179 return (wide_string_t){ concat, len1 + len2 + 1 };
1183 * Concatenate a wide string and a string.
1185 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
1187 const size_t len1 = s1->size - 1;
1188 const size_t len2 = s2->size - 1;
1190 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1191 memcpy(concat, s1->begin, len1 * sizeof(*concat));
1192 const char *const src = s2->begin;
1193 wchar_rep_t *const dst = concat + len1;
1194 for (size_t i = 0; i != len2 + 1; ++i) {
1197 if (warning.traditional) {
1198 warningf(&lexer_token.source_position,
1199 "traditional C rejects string constant concatenation");
1202 return (wide_string_t){ concat, len1 + len2 + 1 };
1205 static void grow_symbol(utf32 const tc)
1207 struct obstack *const o = &symbol_obstack;
1209 obstack_1grow(o, tc);
1210 } else if (tc < 0x800) {
1211 obstack_1grow(o, 0xC0 | (tc >> 6));
1212 obstack_1grow(o, 0x80 | (tc & 0x3F));
1213 } else if (tc < 0x10000) {
1214 obstack_1grow(o, 0xE0 | ( tc >> 12));
1215 obstack_1grow(o, 0x80 | ((tc >> 6) & 0x3F));
1216 obstack_1grow(o, 0x80 | ( tc & 0x3F));
1218 obstack_1grow(o, 0xF0 | ( tc >> 18));
1219 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
1220 obstack_1grow(o, 0x80 | ((tc >> 6) & 0x3F));
1221 obstack_1grow(o, 0x80 | ( tc & 0x3F));
1226 * Parse a string literal and set lexer_token.
1228 static void parse_string_literal(void)
1230 const unsigned start_linenr = lexer_token.source_position.linenr;
1237 utf32 const tc = parse_escape_sequence();
1239 warningf(&lexer_token.source_position,
1240 "escape sequence out of range");
1242 obstack_1grow(&symbol_obstack, tc);
1247 source_position_t source_position;
1248 source_position.input_name = lexer_token.source_position.input_name;
1249 source_position.linenr = start_linenr;
1250 errorf(&source_position, "string has no end");
1251 lexer_token.type = T_ERROR;
1268 /* TODO: concatenate multiple strings separated by whitespace... */
1270 /* add finishing 0 to the string */
1271 obstack_1grow(&symbol_obstack, '\0');
1272 const size_t size = (size_t)obstack_object_size(&symbol_obstack);
1273 const char *const string = obstack_finish(&symbol_obstack);
1275 #if 0 /* TODO hash */
1276 /* check if there is already a copy of the string */
1277 result = strset_insert(&stringset, string);
1278 if(result != string) {
1279 obstack_free(&symbol_obstack, string);
1282 const char *const result = string;
1285 lexer_token.type = T_STRING_LITERAL;
1286 lexer_token.v.string.begin = result;
1287 lexer_token.v.string.size = size;
1291 * Parse a wide character constant and set lexer_token.
1293 static void parse_wide_character_constant(void)
1295 const unsigned start_linenr = lexer_token.source_position.linenr;
1302 wchar_rep_t tc = parse_escape_sequence();
1303 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1308 parse_error("newline while parsing character constant");
1314 goto end_of_wide_char_constant;
1317 source_position_t source_position = lexer_token.source_position;
1318 source_position.linenr = start_linenr;
1319 errorf(&source_position, "EOF while parsing character constant");
1320 lexer_token.type = T_ERROR;
1325 wchar_rep_t tc = (wchar_rep_t) c;
1326 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1333 end_of_wide_char_constant:;
1334 size_t size = (size_t) obstack_object_size(&symbol_obstack);
1335 assert(size % sizeof(wchar_rep_t) == 0);
1336 size /= sizeof(wchar_rep_t);
1338 const wchar_rep_t *string = obstack_finish(&symbol_obstack);
1340 lexer_token.type = T_WIDE_CHARACTER_CONSTANT;
1341 lexer_token.v.wide_string.begin = string;
1342 lexer_token.v.wide_string.size = size;
1343 lexer_token.datatype = type_wchar_t;
1347 * Parse a wide string literal and set lexer_token.
1349 static void parse_wide_string_literal(void)
1351 const unsigned start_linenr = lexer_token.source_position.linenr;
1359 wchar_rep_t tc = parse_escape_sequence();
1360 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1365 source_position_t source_position;
1366 source_position.input_name = lexer_token.source_position.input_name;
1367 source_position.linenr = start_linenr;
1368 errorf(&source_position, "string has no end");
1369 lexer_token.type = T_ERROR;
1379 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1388 /* TODO: concatenate multiple strings separated by whitespace... */
1390 /* add finishing 0 to the string */
1391 wchar_rep_t nul = L'\0';
1392 obstack_grow(&symbol_obstack, &nul, sizeof(nul));
1393 const size_t size = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
1394 const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
1396 #if 0 /* TODO hash */
1397 /* check if there is already a copy of the string */
1398 const wchar_rep_t *const result = strset_insert(&stringset, string);
1399 if(result != string) {
1400 obstack_free(&symbol_obstack, string);
1403 const wchar_rep_t *const result = string;
1406 lexer_token.type = T_WIDE_STRING_LITERAL;
1407 lexer_token.v.wide_string.begin = result;
1408 lexer_token.v.wide_string.size = size;
1412 * Parse a character constant and set lexer_token.
1414 static void parse_character_constant(void)
1416 const unsigned start_linenr = lexer_token.source_position.linenr;
1423 utf32 const tc = parse_escape_sequence();
1425 warningf(&lexer_token.source_position,
1426 "escape sequence out of range");
1428 obstack_1grow(&symbol_obstack, tc);
1433 parse_error("newline while parsing character constant");
1439 goto end_of_char_constant;
1442 source_position_t source_position;
1443 source_position.input_name = lexer_token.source_position.input_name;
1444 source_position.linenr = start_linenr;
1445 errorf(&source_position, "EOF while parsing character constant");
1446 lexer_token.type = T_ERROR;
1458 end_of_char_constant:;
1459 const size_t size = (size_t)obstack_object_size(&symbol_obstack);
1460 const char *const string = obstack_finish(&symbol_obstack);
1462 lexer_token.type = T_CHARACTER_CONSTANT;
1463 lexer_token.v.string.begin = string;
1464 lexer_token.v.string.size = size;
1465 lexer_token.datatype = c_mode & _CXX && size == 1 ? type_char : type_int;
1469 * Skip a multiline comment.
1471 static void skip_multiline_comment(void)
1473 unsigned start_linenr = lexer_token.source_position.linenr;
1480 /* nested comment, warn here */
1481 if (warning.comment) {
1482 warningf(&lexer_token.source_position, "'/*' within comment");
1494 MATCH_NEWLINE(break;)
1497 source_position_t source_position;
1498 source_position.input_name = lexer_token.source_position.input_name;
1499 source_position.linenr = start_linenr;
1500 errorf(&source_position, "at end of file while looking for comment end");
1512 * Skip a single line comment.
1514 static void skip_line_comment(void)
1527 if (c == '\n' || c == '\r') {
1528 if (warning.comment)
1529 warningf(&lexer_token.source_position, "multi-line comment");
1541 /** The current preprocessor token. */
1542 static token_t pp_token;
1545 * Read the next preprocessor token.
1547 static inline void next_pp_token(void)
1549 lexer_next_preprocessing_token();
1550 pp_token = lexer_token;
1554 * Eat all preprocessor tokens until newline.
1556 static void eat_until_newline(void)
1558 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1564 * Handle the define directive.
1566 static void define_directive(void)
1568 lexer_next_preprocessing_token();
1569 if(lexer_token.type != T_IDENTIFIER) {
1570 parse_error("expected identifier after #define\n");
1571 eat_until_newline();
1576 * Handle the ifdef directive.
1578 static void ifdef_directive(int is_ifndef)
1581 lexer_next_preprocessing_token();
1582 //expect_identifier();
1587 * Handle the endif directive.
1589 static void endif_directive(void)
1595 * Parse the line directive.
1597 static void parse_line_directive(void)
1599 if(pp_token.type != T_INTEGER) {
1600 parse_error("expected integer");
1602 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1605 if(pp_token.type == T_STRING_LITERAL) {
1606 lexer_token.source_position.input_name = pp_token.v.string.begin;
1610 eat_until_newline();
1616 typedef enum stdc_pragma_kind_t {
1620 STDC_CX_LIMITED_RANGE
1621 } stdc_pragma_kind_t;
1624 * STDC pragma values.
1626 typedef enum stdc_pragma_value_kind_t {
1631 } stdc_pragma_value_kind_t;
1634 * Parse a pragma directive.
1636 static void parse_pragma(void)
1638 bool unknown_pragma = true;
1641 if (pp_token.v.symbol->pp_ID == TP_STDC) {
1642 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1644 if (c_mode & _C99) {
1647 switch (pp_token.v.symbol->pp_ID) {
1648 case TP_FP_CONTRACT:
1649 kind = STDC_FP_CONTRACT;
1651 case TP_FENV_ACCESS:
1652 kind = STDC_FENV_ACCESS;
1654 case TP_CX_LIMITED_RANGE:
1655 kind = STDC_CX_LIMITED_RANGE;
1660 if (kind != STDC_UNKNOWN) {
1661 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1663 switch (pp_token.v.symbol->pp_ID) {
1665 value = STDC_VALUE_ON;
1668 value = STDC_VALUE_OFF;
1671 value = STDC_VALUE_DEFAULT;
1676 if (value != STDC_VALUE_UNKNOWN) {
1677 unknown_pragma = false;
1679 errorf(&pp_token.source_position, "bad STDC pragma argument");
1684 unknown_pragma = true;
1686 eat_until_newline();
1687 if (unknown_pragma && warning.unknown_pragmas) {
1688 warningf(&pp_token.source_position, "encountered unknown #pragma");
1693 * Parse a preprocessor non-null directive.
1695 static void parse_preprocessor_identifier(void)
1697 assert(pp_token.type == T_IDENTIFIER);
1698 symbol_t *symbol = pp_token.v.symbol;
1700 switch(symbol->pp_ID) {
1702 printf("include - enable header name parsing!\n");
1718 parse_line_directive();
1725 /* TODO; output the rest of the line */
1726 parse_error("#error directive: ");
1735 * Parse a preprocessor directive.
1737 static void parse_preprocessor_directive(void)
1741 switch(pp_token.type) {
1743 parse_preprocessor_identifier();
1746 parse_line_directive();
1749 /* NULL directive, see §6.10.7 */
1752 parse_error("invalid preprocessor directive");
1753 eat_until_newline();
1758 #define MAYBE_PROLOG \
1763 #define MAYBE(ch, set_type) \
1766 lexer_token.type = set_type; \
1769 #define ELSE_CODE(code) \
1773 } /* end of while(1) */ \
1776 #define ELSE(set_type) \
1778 lexer_token.type = set_type; \
1782 void lexer_next_preprocessing_token(void)
1792 lexer_token.type = '\n';
1798 /* might be a wide string ( L"string" ) */
1799 if(lexer_token.type == T_IDENTIFIER &&
1800 lexer_token.v.symbol == symbol_L) {
1802 parse_wide_string_literal();
1803 } else if(c == '\'') {
1804 parse_wide_character_constant();
1814 parse_string_literal();
1818 parse_character_constant();
1831 MAYBE('.', T_DOTDOTDOT)
1835 lexer_token.type = '.';
1841 MAYBE('&', T_ANDAND)
1842 MAYBE('=', T_ANDEQUAL)
1846 MAYBE('=', T_ASTERISKEQUAL)
1850 MAYBE('+', T_PLUSPLUS)
1851 MAYBE('=', T_PLUSEQUAL)
1855 MAYBE('>', T_MINUSGREATER)
1856 MAYBE('-', T_MINUSMINUS)
1857 MAYBE('=', T_MINUSEQUAL)
1861 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1865 MAYBE('=', T_SLASHEQUAL)
1868 skip_multiline_comment();
1869 lexer_next_preprocessing_token();
1873 skip_line_comment();
1874 lexer_next_preprocessing_token();
1880 MAYBE('=', T_PERCENTEQUAL)
1885 MAYBE(':', T_HASHHASH)
1889 lexer_token.type = '#';
1898 MAYBE('=', T_LESSEQUAL)
1901 MAYBE('=', T_LESSLESSEQUAL)
1906 MAYBE('=', T_GREATEREQUAL)
1909 MAYBE('=', T_GREATERGREATEREQUAL)
1910 ELSE(T_GREATERGREATER)
1914 MAYBE('=', T_CARETEQUAL)
1918 MAYBE('=', T_PIPEEQUAL)
1919 MAYBE('|', T_PIPEPIPE)
1927 MAYBE('=', T_EQUALEQUAL)
1931 MAYBE('#', T_HASHHASH)
1945 lexer_token.type = c;
1950 lexer_token.type = T_EOF;
1955 errorf(&lexer_token.source_position, "unknown character '%c' found", c);
1957 lexer_token.type = T_ERROR;
1963 void lexer_next_token(void)
1965 lexer_next_preprocessing_token();
1967 while (lexer_token.type == '\n') {
1969 lexer_next_preprocessing_token();
1972 if (lexer_token.type == '#') {
1973 parse_preprocessor_directive();
1978 void init_lexer(void)
1980 strset_init(&stringset);
1981 symbol_L = symbol_table_insert("L");
1984 void lexer_open_stream(FILE *stream, const char *input_name)
1987 lexer_token.source_position.linenr = 0;
1988 lexer_token.source_position.input_name = input_name;
1993 /* place a virtual \n at the beginning so the lexer knows that we're
1994 * at the beginning of a line */
1998 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
2001 lexer_token.source_position.linenr = 0;
2002 lexer_token.source_position.input_name = input_name;
2006 bufend = buffer + len;
2010 panic("builtin lexing not done yet");
2013 /* place a virtual \n at the beginning so the lexer knows that we're
2014 * at the beginning of a line */
2018 void exit_lexer(void)
2020 strset_destroy(&stringset);
2023 static __attribute__((unused))
2024 void dbg_pos(const source_position_t source_position)
2026 fprintf(stdout, "%s:%u\n", source_position.input_name,
2027 source_position.linenr);