2 * This file is part of cparser.
3 * Copyright (C) 2007-2008 Matthias Braun <matze@braunis.de>
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
22 #include "diagnostic.h"
26 #include "symbol_table_t.h"
27 #include "adt/error.h"
28 #include "adt/strset.h"
32 #include "target_architecture.h"
35 #include "lang_features.h"
/* Windows and Cygwin builds: the two library calls below are mapped onto substitutes. */
51 #if defined(_WIN32) || defined(__CYGWIN__)
52 /* No strtold on Windows and no replacement yet, so strtod is used instead */
53 #define strtold(s, e) strtod(s, e)
/* strcasecmp() is spelled stricmp() on these platforms */
54 #define strcasecmp(a, b) stricmp(a, b)
/* One decoded input character; the decoders below store Unicode code points in it. */
57 typedef unsigned int utf32;
/*
 * Decoded input window. The decoders write starting at buf + MAX_PUTBACK, which
 * leaves the first MAX_PUTBACK slots free for put_back().
 */
63 static utf32 buf[BUF_SIZE + MAX_PUTBACK];
/* One past the last valid element of buf. */
64 static const utf32 *bufend;
/* Current read position inside buf. */
65 static const utf32 *bufpos;
/* Set up in init_lexer() and torn down in exit_lexer(); its other visible uses sit in "#if 0" sections. */
66 static strset_t stringset;
/* Consulted by SYMBOL_CHARS: when false, a '$' is routed to the dollar_sign label instead. */
67 bool allow_dollar_in_symbol = true;
70 * Prints a parse error message at the current token.
72 * @param msg the error message
/* Reports msg through errorf() at lexer_token.source_position; msg is passed as a "%s" argument, never as the format itself. */
74 static void parse_error(const char *msg)
76 errorf(&lexer_token.source_position, "%s", msg);
80 * Prints an internal error message at the current token.
82 * @param msg the error message
/* Reports msg through internal_errorf() at lexer_token.source_position; declared NORETURN, so callers do not expect it to come back. */
84 static NORETURN internal_error(const char *msg)
86 internal_errorf(&lexer_token.source_position, "%s", msg);
/*
 * Fills read_buf with up to n bytes from the global `input` stream using
 * fread() and keeps the byte count in s.
 * The visible lines also report "read from input failed" and park a single EOF
 * marker at buf[MAX_PUTBACK], with bufpos/bufend spanning exactly that element.
 * NOTE(review): the conditions guarding those statements and the return
 * statement are not visible in this excerpt; presumably the marker is placed
 * whenever nothing was read and s is returned. Confirm against the full file.
 */
89 static size_t read_block(unsigned char *const read_buf, size_t const n)
91 size_t const s = fread(read_buf, 1, n, input);
94 parse_error("read from input failed");
95 buf[MAX_PUTBACK] = EOF;
96 bufpos = buf + MAX_PUTBACK;
97 bufend = buf + MAX_PUTBACK + 1;
/*
 * ISO-8859-1 decoder: pulls up to BUF_SIZE bytes through read_block() and
 * produces output starting at buf + MAX_PUTBACK, where bufpos is reset as well.
 * NOTE(review): the copy loop is not visible here; presumably every byte is
 * widened unchanged. Confirm.
 */
102 static void decode_iso_8859_1(void)
104 unsigned char read_buf[BUF_SIZE];
105 size_t const s = read_block(read_buf, sizeof(read_buf));
109 unsigned char const *src = read_buf;
110 unsigned char const *end = read_buf + s;
111 utf32 *dst = buf + MAX_PUTBACK;
115 bufpos = buf + MAX_PUTBACK;
/*
 * ISO-8859-15 decoder: same buffering as the other single-byte decoders
 * (read_block() into a local buffer, output from buf + MAX_PUTBACK), with the
 * eight byte values listed below translated to the code points given.
 * NOTE(review): the loop and the default case are not visible here; presumably
 * all other bytes pass through unchanged. Confirm.
 */
119 static void decode_iso_8859_15(void)
121 unsigned char read_buf[BUF_SIZE];
122 size_t const s = read_block(read_buf, sizeof(read_buf));
126 unsigned char const *src = read_buf;
127 unsigned char const *end = read_buf + s;
128 utf32 *dst = buf + MAX_PUTBACK;
132 case 0xA4: tc = 0x20AC; break; // €
133 case 0xA6: tc = 0x0160; break; // Š
134 case 0xA8: tc = 0x0161; break; // š
135 case 0xB4: tc = 0x017D; break; // Ž
136 case 0xB8: tc = 0x017E; break; // ž
137 case 0xBC: tc = 0x0152; break; // Œ
138 case 0xBD: tc = 0x0153; break; // œ
139 case 0xBE: tc = 0x0178; break; // Ÿ
144 bufpos = buf + MAX_PUTBACK;
/*
 * UTF-8 decoder: pulls up to BUF_SIZE bytes through read_block() and decodes
 * them into buf starting at buf + MAX_PUTBACK.
 *
 * A multi-byte sequence that is cut off by the end of the read buffer is
 * carried over to the next call in the static part_decoded_* variables
 * (smallest legal code for the sequence length, bits decoded so far, number
 * of continuation bytes still missing) and resumed through the three_more /
 * two_more / one_more labels. A saved rest length of 4 resumes at realign.
 *
 * A decoded value is rejected with "invalid byte sequence in input" when it
 * is below min_code (overlong form), above 0x10FFFF, a surrogate, or one of
 * the noncharacters. The realign loop skips bytes that look like
 * continuation bytes (10xxxxxx) or match 11111xxx.
 *
 * NOTE(review): many statements (labels, the stores through dst, the bufend
 * update) are not visible in this excerpt; the description above covers only
 * what the visible lines establish.
 */
148 static void decode_utf8(void)
/* Decoder state that survives between calls. */
150 static utf32 part_decoded_min_code;
151 static utf32 part_decoded_char;
152 static size_t part_decoded_rest_len;
155 unsigned char read_buf[BUF_SIZE];
156 size_t const s = read_block(read_buf, sizeof(read_buf));
/* A sequence is still pending: the input stopped in the middle of a character. */
158 if (part_decoded_rest_len > 0)
159 parse_error("incomplete input char at end of input");
163 unsigned char const *src = read_buf;
164 unsigned char const *end = read_buf + s;
165 utf32 *dst = buf + MAX_PUTBACK;
/* Resume a sequence interrupted by the end of the previous block. */
169 if (part_decoded_rest_len != 0) {
170 min_code = part_decoded_min_code;
171 decoded = part_decoded_char;
172 size_t const rest_len = part_decoded_rest_len;
173 part_decoded_rest_len = 0;
175 case 4: goto realign;
176 case 3: goto three_more;
177 case 2: goto two_more;
178 default: goto one_more;
/* 0xxxxxxx: single byte. */
183 if ((*src & 0x80) == 0) {
/* 110xxxxx: lead byte carrying 5 payload bits. */
185 } else if ((*src & 0xE0) == 0xC0) {
187 decoded = *src++ & 0x1F;
190 part_decoded_min_code = min_code;
191 part_decoded_char = decoded;
192 part_decoded_rest_len = 1;
/* 10xxxxxx: continuation byte, contributes 6 payload bits. */
195 if ((*src & 0xC0) == 0x80) {
196 decoded = (decoded << 6) | (*src++ & 0x3F);
200 if (decoded < min_code ||
201 decoded > 0x10FFFF ||
202 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
203 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
204 (decoded & 0xFFFE) == 0xFFFE) { // noncharacters
205 parse_error("invalid byte sequence in input");
/* 1110xxxx: lead byte carrying 4 payload bits. */
207 } else if ((*src & 0xF0) == 0xE0) {
209 decoded = *src++ & 0x0F;
212 part_decoded_min_code = min_code;
213 part_decoded_char = decoded;
214 part_decoded_rest_len = 2;
217 if ((*src & 0xC0) == 0x80) {
218 decoded = (decoded << 6) | (*src++ & 0x3F);
/* 11110xxx: lead byte carrying 3 payload bits. */
223 } else if ((*src & 0xF8) == 0xF0) {
225 decoded = *src++ & 0x07;
228 part_decoded_min_code = min_code;
229 part_decoded_char = decoded;
230 part_decoded_rest_len = 3;
233 if ((*src & 0xC0) == 0x80) {
234 decoded = (decoded << 6) | (*src++ & 0x3F);
241 parse_error("invalid byte sequence in input");
246 part_decoded_rest_len = 4;
249 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
255 bufpos = buf + MAX_PUTBACK;
/* Repeat while the block produced no output at all. */
257 } while (bufpos == bufend);
/*
 * Windows-1252 decoder: same buffering as the other single-byte decoders
 * (read_block() into a local buffer, output from buf + MAX_PUTBACK). The byte
 * values 0x80..0x9F listed below are translated to the code points given.
 * NOTE(review): the loop and the default case are not visible here; presumably
 * all other bytes pass through unchanged. Confirm.
 */
260 static void decode_windows_1252(void)
262 unsigned char read_buf[BUF_SIZE];
263 size_t const s = read_block(read_buf, sizeof(read_buf));
267 unsigned char const *src = read_buf;
268 unsigned char const *end = read_buf + s;
269 utf32 *dst = buf + MAX_PUTBACK;
273 case 0x80: tc = 0x20AC; break; // €
274 case 0x82: tc = 0x201A; break; // ‚
275 case 0x83: tc = 0x0192; break; // ƒ
276 case 0x84: tc = 0x201E; break; // „
277 case 0x85: tc = 0x2026; break; // …
278 case 0x86: tc = 0x2020; break; // †
279 case 0x87: tc = 0x2021; break; // ‡
280 case 0x88: tc = 0x02C6; break; // ˆ
281 case 0x89: tc = 0x2030; break; // ‰
282 case 0x8A: tc = 0x0160; break; // Š
283 case 0x8B: tc = 0x2039; break; // ‹
284 case 0x8C: tc = 0x0152; break; // Œ
285 case 0x8E: tc = 0x017D; break; // Ž
286 case 0x91: tc = 0x2018; break; // ‘
287 case 0x92: tc = 0x2019; break; // ’
288 case 0x93: tc = 0x201C; break; // “
289 case 0x94: tc = 0x201D; break; // ”
290 case 0x95: tc = 0x2022; break; // •
291 case 0x96: tc = 0x2013; break; // –
292 case 0x97: tc = 0x2014; break; // —
293 case 0x98: tc = 0x02DC; break; // ˜
294 case 0x99: tc = 0x2122; break; // ™
295 case 0x9A: tc = 0x0161; break; // š
296 case 0x9B: tc = 0x203A; break; // ›
297 case 0x9C: tc = 0x0153; break; // œ
298 case 0x9E: tc = 0x017E; break; // ž
299 case 0x9F: tc = 0x0178; break; // Ÿ
304 bufpos = buf + MAX_PUTBACK;
/* Signature shared by all input decoders: no arguments, no result; they communicate through buf/bufpos/bufend. */
308 typedef void (*decoder_t)(void);
/* The active decoder; UTF-8 unless select_input_encoding() picks another one. */
310 static decoder_t decoder = decode_utf8;
/* Pairs an encoding name with its decoder; select_input_encoding() reads the members `name` and `decoder`. */
312 typedef struct named_decoder_t {
/*
 * Encoding names accepted by select_input_encoding(), which compares them
 * case-insensitively and stops at an entry whose name is NULL.
 */
317 static named_decoder_t const decoders[] = {
318 { "CP819", decode_iso_8859_1 }, // official alias
319 { "IBM819", decode_iso_8859_1 }, // official alias
320 { "ISO-8859-1", decode_iso_8859_1 }, // official alias
321 { "ISO-8859-15", decode_iso_8859_15 }, // official name
322 { "ISO8859-1", decode_iso_8859_1 },
323 { "ISO8859-15", decode_iso_8859_15 },
324 { "ISO_8859-1", decode_iso_8859_1 }, // official alias
325 { "ISO_8859-15", decode_iso_8859_15 }, // official alias
326 { "ISO_8859-1:1987", decode_iso_8859_1 }, // official name
327 { "Latin-9", decode_iso_8859_15 }, // official alias
328 { "UTF-8", decode_utf8 }, // official name
329 { "csISOLatin1", decode_iso_8859_1 }, // official alias
330 { "cp1252", decode_windows_1252 },
331 { "iso-ir-100", decode_iso_8859_1 }, // official alias
332 { "l1", decode_iso_8859_1 }, // official alias
333 { "latin1", decode_iso_8859_1 }, // official alias
334 { "windows-1252", decode_windows_1252 }, // official name
/*
 * Selects the input decoder by name.
 *
 * Walks the decoders table up to its NULL-named terminator, comparing
 * `encoding` case-insensitively with each entry, and installs the matching
 * entry's function in `decoder`. The last visible line prints
 * "input encoding ... not supported" to stderr.
 * NOTE(review): the statements between the assignment and the fprintf() are
 * not visible; presumably the function returns after a successful match.
 *
 * @param encoding  encoding name to look up
 */
339 void select_input_encoding(char const* const encoding)
341 for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
342 if (strcasecmp(encoding, i->name) != 0)
344 decoder = i->decoder;
347 fprintf(stderr, "error: input encoding \"%s\" not supported\n", encoding);
/*
 * Advances to the next decoded character of the input window.
 * The read position must never be beyond bufend; reaching bufend triggers the
 * branch opened on the last visible line.
 * NOTE(review): the body of that branch is not visible; presumably it asks the
 * active decoder for more input. Confirm.
 */
350 static inline void next_real_char(void)
352 assert(bufpos <= bufend);
353 if (bufpos >= bufend) {
364 * Put a character back into the buffer.
366 * @param pc the character to put back
/* Steps bufpos back by one element and stores pc there; there must be room in front of bufpos. */
368 static inline void put_back(utf32 const pc)
370 assert(bufpos > buf);
/* bufpos points to const; going through the offset from buf yields a writable pointer to the same slot. */
371 *(--bufpos - buf + buf) = pc;
/* Trace output (NOTE(review): presumably inside a debug-only guard that is not visible here). */
374 printf("putback '%lc'\n", pc);
/* Forward declaration: next_char() is used by the eat() macro and by helpers defined before it. */
378 static inline void next_char(void);
/*
 * MATCH_NEWLINE(code): newline handling shared by several scanners. Both
 * visible branches bump lexer_token.source_position.linenr; `code` is the
 * statement supplied by the user of the macro (callers pass "return;" or
 * "break;").
 * eat(c_type): asserts that the current character c equals c_type, then
 * advances with next_char().
 */
380 #define MATCH_NEWLINE(code) \
386 lexer_token.source_position.linenr++; \
390 lexer_token.source_position.linenr++; \
393 #define eat(c_type) do { assert(c == c_type); next_char(); } while(0)
/*
 * Called by next_char() when a backslash (or its trigraph) has been read.
 * Uses MATCH_NEWLINE with "return;" as the action, so a newline at this point
 * bumps the line counter and leaves the function.
 * NOTE(review): the remaining statements are not visible in this excerpt.
 */
395 static void maybe_concat_lines(void)
400 MATCH_NEWLINE(return;)
411 * Set c to the next input character, i.e.
412 * after expanding trigraphs.
/*
 * Produces the next character in c after trigraph replacement and backslash
 * handling. A backslash is handed to maybe_concat_lines(); any character other
 * than '?' takes the fast path to end_of_next_char; the switch below maps the
 * third character of a trigraph to its replacement.
 * NOTE(review): the reads of the follow-up characters and the put_back() calls
 * for non-trigraph sequences are not visible in this excerpt.
 */
414 static inline void next_char(void)
418 /* filter trigraphs */
419 if(UNLIKELY(c == '\\')) {
420 maybe_concat_lines();
421 goto end_of_next_char;
425 goto end_of_next_char;
428 if(LIKELY(c != '?')) {
431 goto end_of_next_char;
436 case '=': c = '#'; break;
437 case '(': c = '['; break;
/* The backslash trigraph gets the same treatment as a literal backslash. */
438 case '/': c = '\\'; maybe_concat_lines(); break;
439 case ')': c = ']'; break;
440 case '\'': c = '^'; break;
441 case '<': c = '{'; break;
442 case '!': c = '|'; break;
443 case '>': c = '}'; break;
444 case '-': c = '~'; break;
/* Trace output (NOTE(review): presumably inside a debug-only guard that is not visible here). */
454 printf("nchar '%c'\n", c);
/*
 * Case labels for characters that may appear in an identifier. '$' is only
 * accepted while allow_dollar_in_symbol is set; otherwise control jumps to the
 * dollar_sign label of the enclosing function.
 */
458 #define SYMBOL_CHARS \
459 case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
527 * Read a symbol from the input and build
/*
 * Collects the characters of a symbol on symbol_obstack, terminates the text
 * with NUL and looks it up with symbol_table_insert(). The token type is taken
 * from symbol->ID and the symbol itself is stored in lexer_token.v.symbol.
 * When the table hands back a symbol whose string is not the freshly built
 * one, the new copy is released again.
 * NOTE(review): the scanning loop around the two obstack_1grow() calls is not
 * visible in this excerpt.
 */
530 static void parse_symbol(void)
535 obstack_1grow(&symbol_obstack, (char) c);
542 obstack_1grow(&symbol_obstack, (char) c);
553 obstack_1grow(&symbol_obstack, '\0');
555 string = obstack_finish(&symbol_obstack);
556 symbol = symbol_table_insert(string);
558 lexer_token.type = symbol->ID;
559 lexer_token.v.symbol = symbol;
/* The symbol already existed: drop the duplicate text. */
561 if(symbol->string != string) {
562 obstack_free(&symbol_obstack, string);
/*
 * Reads an optional integer suffix (U, L, LL in either order, any case) and
 * chooses lexer_token.datatype for the value in lexer_token.v.intvalue.
 *
 * The accepted letters are collected upper-cased in suffix[] so that the
 * "traditional C rejects" warning can quote them.
 *
 * Without an unsigned suffix the visible tiers are, in order: int, unsigned
 * int (octal/hex only), long, unsigned long (octal/hex only), unsigned long
 * long (octal/hex only, value above TARGET_LONGLONG_MAX), long long.
 * With an unsigned suffix: unsigned int unless min_long, unsigned long unless
 * min_longlong, otherwise unsigned long long.
 *
 * NOTE(review): the statements that set is_unsigned/min_long/min_longlong and
 * the guards around the signed tiers are not visible in this excerpt.
 *
 * @param is_oct_hex  true for octal and hexadecimal literals, which may take
 *                    an unsigned type without a suffix
 */
566 static void parse_integer_suffix(bool is_oct_hex)
568 bool is_unsigned = false;
569 bool min_long = false;
570 bool min_longlong = false;
571 bool not_traditional = false;
575 if (c == 'U' || c == 'u') {
576 not_traditional = true;
577 suffix[pos++] = toupper(c);
580 if (c == 'L' || c == 'l') {
581 suffix[pos++] = toupper(c);
584 if (c == 'L' || c == 'l') {
585 suffix[pos++] = toupper(c);
590 } else if (c == 'l' || c == 'L') {
591 suffix[pos++] = toupper(c);
594 if (c == 'l' || c == 'L') {
595 not_traditional = true;
596 suffix[pos++] = toupper(c);
599 if (c == 'u' || c == 'U') {
600 suffix[pos++] = toupper(c);
604 } else if (c == 'u' || c == 'U') {
605 not_traditional = true;
606 suffix[pos++] = toupper(c);
609 lexer_token.datatype = type_unsigned_long;
613 if (warning.traditional && not_traditional) {
615 warningf(&lexer_token.source_position,
616 "traditional C rejects the '%s' suffix", suffix);
/* Type selection for literals without an unsigned suffix. */
619 long long v = lexer_token.v.intvalue;
621 if (v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
622 lexer_token.datatype = type_int;
624 } else if (is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
625 lexer_token.datatype = type_unsigned_int;
630 if (v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
631 lexer_token.datatype = type_long;
633 } else if (is_oct_hex && v >= 0 && (unsigned long long)v <= (unsigned long long)TARGET_ULONG_MAX) {
634 lexer_token.datatype = type_unsigned_long;
638 unsigned long long uv = (unsigned long long) v;
639 if (is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
640 lexer_token.datatype = type_unsigned_long_long;
644 lexer_token.datatype = type_long_long;
/* Type selection for literals with an unsigned suffix. */
646 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
647 if (!min_long && v <= TARGET_UINT_MAX) {
648 lexer_token.datatype = type_unsigned_int;
651 if (!min_longlong && v <= TARGET_ULONG_MAX) {
652 lexer_token.datatype = type_unsigned_long;
655 lexer_token.datatype = type_unsigned_long_long;
/*
 * Chooses lexer_token.datatype for a floating-point literal from its suffix:
 * type_float in the branch that warns about 'F', type_long_double in the
 * branch that warns about 'L', type_double otherwise. Both suffix branches
 * emit a warning when warning.traditional is set.
 * NOTE(review): the dispatch on the suffix character and the consumption of
 * the suffix are not visible in this excerpt.
 */
659 static void parse_floating_suffix(void)
662 /* TODO: do something useful with the suffixes... */
665 if (warning.traditional) {
666 warningf(&lexer_token.source_position,
667 "traditional C rejects the 'F' suffix");
670 lexer_token.datatype = type_float;
/* This branch yields long double, so the diagnostic has to name the 'L' suffix. */
674 if (warning.traditional) {
675 warningf(&lexer_token.source_position,
676 "traditional C rejects the 'L' suffix");
679 lexer_token.datatype = type_long_double;
682 lexer_token.datatype = type_double;
688 * A replacement for strtoull. Only those parts needed for
689 * our parser are implemented.
/*
 * Converts the digits at s to an unsigned long long. The visible branches
 * handle hexadecimal (shift by 4), octal (shift by 3) and decimal (multiply
 * by 10) input; each checks for overrun before consuming another digit.
 * Callers inspect *endptr afterwards and treat anything other than the
 * terminating NUL as "literal too long" or "invalid".
 * NOTE(review): the base dispatch, the loop constructs, the stores to *endptr
 * and the return statement are not visible in this excerpt.
 *
 * @param s       digit string (without a "0x" prefix for base 16)
 * @param endptr  receives the position where conversion stopped
 * @param base    numeric base; 16, 8 and 10 have visible handling
 */
691 static unsigned long long parse_int_string(const char *s, const char **endptr, int base)
693 unsigned long long v = 0;
698 /* check for overrun */
699 if (v >= 0x1000000000000000ULL)
701 switch (tolower(*s)) {
702 case '0': v <<= 4; break;
703 case '1': v <<= 4; v |= 0x1; break;
704 case '2': v <<= 4; v |= 0x2; break;
705 case '3': v <<= 4; v |= 0x3; break;
706 case '4': v <<= 4; v |= 0x4; break;
707 case '5': v <<= 4; v |= 0x5; break;
708 case '6': v <<= 4; v |= 0x6; break;
709 case '7': v <<= 4; v |= 0x7; break;
710 case '8': v <<= 4; v |= 0x8; break;
711 case '9': v <<= 4; v |= 0x9; break;
712 case 'a': v <<= 4; v |= 0xa; break;
713 case 'b': v <<= 4; v |= 0xb; break;
714 case 'c': v <<= 4; v |= 0xc; break;
715 case 'd': v <<= 4; v |= 0xd; break;
716 case 'e': v <<= 4; v |= 0xe; break;
717 case 'f': v <<= 4; v |= 0xf; break;
725 /* check for overrun */
726 if (v >= 0x2000000000000000ULL)
728 switch (tolower(*s)) {
729 case '0': v <<= 3; break;
730 case '1': v <<= 3; v |= 1; break;
731 case '2': v <<= 3; v |= 2; break;
732 case '3': v <<= 3; v |= 3; break;
733 case '4': v <<= 3; v |= 4; break;
734 case '5': v <<= 3; v |= 5; break;
735 case '6': v <<= 3; v |= 6; break;
736 case '7': v <<= 3; v |= 7; break;
/*
 * NOTE(review): with a 64-bit unsigned long long this guard is not tight.
 * v == 0x1999999999999999 passes it, v * 10 is then 0xFFFFFFFFFFFFFFFA, and
 * adding a digit of 6..9 wraps around without being reported.
 */
744 /* check for overrun */
745 if (v > 0x1999999999999999ULL)
747 switch (tolower(*s)) {
748 case '0': v *= 10; break;
749 case '1': v *= 10; v += 1; break;
750 case '2': v *= 10; v += 2; break;
751 case '3': v *= 10; v += 3; break;
752 case '4': v *= 10; v += 4; break;
753 case '5': v *= 10; v += 5; break;
754 case '6': v *= 10; v += 6; break;
755 case '7': v *= 10; v += 7; break;
756 case '8': v *= 10; v += 8; break;
757 case '9': v *= 10; v += 9; break;
773 * Parses a hex number, including hex floats, and sets the
/*
 * Scans a hexadecimal literal after the leading 0. The text is rebuilt on
 * symbol_obstack with a "0x" prefix. A literal classified as floating point
 * is converted with strtold() and gets T_FLOATINGPOINT plus a floating
 * suffix; otherwise the digits after "0x" go through parse_int_string() and
 * the token becomes T_INTEGER with an integer suffix. Text that the
 * conversion does not consume completely is reported as an error.
 * NOTE(review): the character-advancing calls, the updates of is_float and
 * several closing braces are not visible in this excerpt.
 */
776 static void parse_number_hex(void)
778 bool is_float = false;
779 assert(c == 'x' || c == 'X');
782 obstack_1grow(&symbol_obstack, '0');
783 obstack_1grow(&symbol_obstack, 'x');
786 obstack_1grow(&symbol_obstack, (char) c);
791 obstack_1grow(&symbol_obstack, (char) c);
794 while (isxdigit(c)) {
795 obstack_1grow(&symbol_obstack, (char) c);
800 if (c == 'p' || c == 'P') {
801 obstack_1grow(&symbol_obstack, (char) c);
804 if (c == '-' || c == '+') {
805 obstack_1grow(&symbol_obstack, (char) c);
/*
 * The binary exponent is a decimal digit sequence. Scanning it with
 * isxdigit() would swallow a following 'f'/'F' suffix (as in 0x1p3f) into the
 * number text and make strtold() reject an otherwise valid literal.
 */
809 while (isdigit(c)) {
810 obstack_1grow(&symbol_obstack, (char) c);
816 obstack_1grow(&symbol_obstack, '\0');
817 char *string = obstack_finish(&symbol_obstack);
818 if(*string == '\0') {
819 parse_error("invalid hex number");
820 lexer_token.type = T_ERROR;
821 obstack_free(&symbol_obstack, string);
827 lexer_token.type = T_FLOATINGPOINT;
828 lexer_token.v.floatvalue = strtold(string, &endptr);
830 if(*endptr != '\0') {
831 parse_error("invalid hex float literal");
834 parse_floating_suffix();
837 lexer_token.type = T_INTEGER;
/* Skip the "0x" prefix that was added above. */
838 lexer_token.v.intvalue = parse_int_string(string + 2, &endptr, 16);
839 if(*endptr != '\0') {
840 parse_error("hex number literal too long");
842 parse_integer_suffix(true);
845 obstack_free(&symbol_obstack, string);
849 * Returns true if the given char is an octal digit.
851 * @param chr the character to check
/* Takes a utf32 so that callers can pass the current character c directly. */
853 static inline bool is_octal_digit(utf32 chr)
871 * Parses an octal number and sets the lexer_token.
/*
 * Collects octal digits on symbol_obstack, converts them with
 * parse_int_string() in base 8 and produces a T_INTEGER token; unconsumed text
 * is reported as "octal number literal too long". The digit text is released
 * before the integer suffix is parsed.
 * NOTE(review): the character-advancing call inside the loop is not visible.
 */
873 static void parse_number_oct(void)
875 while(is_octal_digit(c)) {
876 obstack_1grow(&symbol_obstack, (char) c);
879 obstack_1grow(&symbol_obstack, '\0');
880 char *string = obstack_finish(&symbol_obstack);
883 lexer_token.type = T_INTEGER;
884 lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
885 if(*endptr != '\0') {
886 parse_error("octal number literal too long");
889 obstack_free(&symbol_obstack, string);
890 parse_integer_suffix(true);
894 * Parses a decimal number, including floats, and sets the
/*
 * Scans a decimal literal: digits, an optional '.' with further digits and an
 * optional exponent introduced by e/E with an optional sign. The text is built
 * on symbol_obstack. A floating-point literal is converted with strtold() and
 * gets T_FLOATINGPOINT plus a floating suffix; otherwise parse_int_string()
 * in base 10 is used and the token becomes T_INTEGER with an integer suffix.
 * Text left unconsumed by either conversion is reported as
 * "invalid number literal".
 * NOTE(review): the loops, the updates of is_float and the
 * character-advancing calls are not visible in this excerpt.
 */
897 static void parse_number_dec(void)
899 bool is_float = false;
901 obstack_1grow(&symbol_obstack, (char) c);
906 obstack_1grow(&symbol_obstack, '.');
910 obstack_1grow(&symbol_obstack, (char) c);
915 if(c == 'e' || c == 'E') {
916 obstack_1grow(&symbol_obstack, (char) c);
919 if(c == '-' || c == '+') {
920 obstack_1grow(&symbol_obstack, (char) c);
925 obstack_1grow(&symbol_obstack, (char) c);
931 obstack_1grow(&symbol_obstack, '\0');
932 char *string = obstack_finish(&symbol_obstack);
936 lexer_token.type = T_FLOATINGPOINT;
937 lexer_token.v.floatvalue = strtold(string, &endptr);
939 if(*endptr != '\0') {
940 parse_error("invalid number literal");
943 parse_floating_suffix();
946 lexer_token.type = T_INTEGER;
947 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
949 if(*endptr != '\0') {
950 parse_error("invalid number literal");
953 parse_integer_suffix(false);
955 obstack_free(&symbol_obstack, string);
959 * Parses a number and sets the lexer_token.
/*
 * Entry point for numeric literals. The visible lines report
 * "invalid octal number" with a T_ERROR token and, on another path, seed
 * symbol_obstack with a '0' character.
 * NOTE(review): the dispatch to the hex, octal and decimal parsers is not
 * visible in this excerpt.
 */
961 static void parse_number(void)
983 parse_error("invalid octal number");
984 lexer_token.type = T_ERROR;
990 obstack_1grow(&symbol_obstack, '0');
1000 * Returns the value of a digit.
1001 * The only portable way to do it ...
/*
 * Maps a digit character to its numeric value through explicit case labels;
 * the visible ones cover 'A'..'F' as 10..15. Any other character ends in
 * internal_error("wrong character given").
 * NOTE(review): the labels for '0'..'9' and the lower-case letters are not
 * visible in this excerpt.
 */
1003 static int digit_value(utf32 const digit)
1017 case 'A': return 10;
1019 case 'B': return 11;
1021 case 'C': return 12;
1023 case 'D': return 13;
1025 case 'E': return 14;
1027 case 'F': return 15;
1029 internal_error("wrong character given");
1034 * Parses an octal character sequence.
1036 * @param first_digit the already read first digit
/*
 * Evaluates an octal escape of up to three digits: first_digit plus at most
 * two further octal digits taken from the current character c. Stops at the
 * first character that is not an octal digit.
 * NOTE(review): the character-advancing calls and the final return are not
 * visible in this excerpt.
 */
1038 static utf32 parse_octal_sequence(utf32 const first_digit)
1040 assert(is_octal_digit(first_digit));
1041 utf32 value = digit_value(first_digit);
1042 if (!is_octal_digit(c)) return value;
1043 value = 8 * value + digit_value(c);
1045 if (!is_octal_digit(c)) return value;
1046 value = 8 * value + digit_value(c);
1052 * Parses a hex character sequence.
/*
 * Accumulates the value of a hexadecimal escape for as long as the current
 * character is a hex digit; the visible code puts no limit on the digit count.
 * NOTE(review): the initialisation of value, the character-advancing call and
 * the return are not visible in this excerpt.
 */
1054 static utf32 parse_hex_sequence(void)
1057 while(isxdigit(c)) {
1058 value = 16 * value + digit_value(c);
1065 * Parse an escape sequence.
/*
 * Translates the escape sequence that follows a backslash into its character
 * value. Simple escapes map to the corresponding C character constants,
 * hexadecimal and octal forms are delegated to parse_hex_sequence() and
 * parse_octal_sequence(), and one branch yields 27 for ESCAPE. End of file and
 * unknown escapes are reported through parse_error().
 * NOTE(review): the case labels for the hex, octal, EOF and ESCAPE branches
 * are not visible in this excerpt.
 */
1067 static utf32 parse_escape_sequence(void)
1075 case '"': return '"';
1076 case '\'': return '\'';
1077 case '\\': return '\\';
1078 case '?': return '\?';
1079 case 'a': return '\a';
1080 case 'b': return '\b';
1081 case 'f': return '\f';
1082 case 'n': return '\n';
1083 case 'r': return '\r';
1084 case 't': return '\t';
1085 case 'v': return '\v';
1087 return parse_hex_sequence();
1096 return parse_octal_sequence(ec);
1098 parse_error("reached end of file while parsing escape sequence");
1100 /* \E is not documented, but handled, by GCC. It is acceptable according
1101 * to §6.11.4, whereas \e is not. */
1105 return 27; /* hopefully 27 is ALWAYS the code for ESCAPE */
1108 /* §6.4.4.4:8 footnote 64 */
1109 parse_error("unknown escape sequence");
1115 * Concatenate two strings.
/*
 * Builds the concatenation of two narrow strings on symbol_obstack.
 * Both sizes count the terminating NUL, so len1/len2 are the payload lengths;
 * the second copy takes len2 + 1 bytes and thereby carries the terminator
 * over. Warns when warning.traditional is set.
 *
 * @param s1  left operand
 * @param s2  right operand
 * @return    the new string with size len1 + len2 + 1
 */
1117 string_t concat_strings(const string_t *const s1, const string_t *const s2)
1119 const size_t len1 = s1->size - 1;
1120 const size_t len2 = s2->size - 1;
1122 char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
1123 memcpy(concat, s1->begin, len1);
1124 memcpy(concat + len1, s2->begin, len2 + 1);
1126 if (warning.traditional) {
1127 warningf(&lexer_token.source_position,
1128 "traditional C rejects string constant concatenation");
/* Disabled: de-duplication of the result through stringset. */
1130 #if 0 /* TODO hash */
1131 const char *result = strset_insert(&stringset, concat);
1132 if(result != concat) {
1133 obstack_free(&symbol_obstack, concat);
1138 return (string_t){ concat, len1 + len2 + 1 };
1143 * Concatenate a string and a wide string.
/*
 * Builds the concatenation of a narrow string and a wide string as a wide
 * string on symbol_obstack. Sizes count the terminator, so len1/len2 are the
 * payload lengths; the wide part is copied together with its terminator.
 * Warns when warning.traditional is set.
 * NOTE(review): the body of the loop that converts the narrow part is not
 * visible in this excerpt.
 *
 * @param s1  narrow left operand
 * @param s2  wide right operand
 * @return    the new wide string with size len1 + len2 + 1
 */
1145 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
1147 const size_t len1 = s1->size - 1;
1148 const size_t len2 = s2->size - 1;
1150 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1151 const char *const src = s1->begin;
1152 for (size_t i = 0; i != len1; ++i) {
1155 memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1156 if (warning.traditional) {
1157 warningf(&lexer_token.source_position,
1158 "traditional C rejects string constant concatenation");
1161 return (wide_string_t){ concat, len1 + len2 + 1 };
1165 * Concatenate two wide strings.
/*
 * Builds the concatenation of two wide strings on symbol_obstack. Sizes count
 * the terminator, so len1/len2 are the payload lengths; the second copy takes
 * len2 + 1 elements and thereby carries the terminator over. Warns when
 * warning.traditional is set.
 *
 * @param s1  left operand
 * @param s2  right operand
 * @return    the new wide string with size len1 + len2 + 1
 */
1167 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
1169 const size_t len1 = s1->size - 1;
1170 const size_t len2 = s2->size - 1;
1172 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1173 memcpy(concat, s1->begin, len1 * sizeof(*concat));
1174 memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
1175 if (warning.traditional) {
1176 warningf(&lexer_token.source_position,
1177 "traditional C rejects string constant concatenation");
1180 return (wide_string_t){ concat, len1 + len2 + 1 };
1184 * Concatenate a wide string and a string.
/*
 * Builds the concatenation of a wide string and a narrow string as a wide
 * string on symbol_obstack. Sizes count the terminator, so len1/len2 are the
 * payload lengths. The narrow part is processed for len2 + 1 elements, i.e.
 * including its terminator. Warns when warning.traditional is set.
 * NOTE(review): the body of the conversion loop is not visible in this
 * excerpt.
 *
 * @param s1  wide left operand
 * @param s2  narrow right operand
 * @return    the new wide string with size len1 + len2 + 1
 */
1186 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
1188 const size_t len1 = s1->size - 1;
1189 const size_t len2 = s2->size - 1;
1191 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
1192 memcpy(concat, s1->begin, len1 * sizeof(*concat));
1193 const char *const src = s2->begin;
1194 wchar_rep_t *const dst = concat + len1;
1195 for (size_t i = 0; i != len2 + 1; ++i) {
1198 if (warning.traditional) {
1199 warningf(&lexer_token.source_position,
1200 "traditional C rejects string constant concatenation");
1203 return (wide_string_t){ concat, len1 + len2 + 1 };
/*
 * Appends the code point tc to symbol_obstack in UTF-8 form: two bytes below
 * 0x800, three bytes below 0x10000, four bytes otherwise.
 * NOTE(review): the condition of the first branch, which appends tc as a
 * single byte, is not visible in this excerpt.
 */
1206 static void grow_symbol(utf32 const tc)
1208 struct obstack *const o = &symbol_obstack;
1210 obstack_1grow(o, tc);
1211 } else if (tc < 0x800) {
1212 obstack_1grow(o, 0xC0 | (tc >> 6));
1213 obstack_1grow(o, 0x80 | (tc & 0x3F));
1214 } else if (tc < 0x10000) {
1215 obstack_1grow(o, 0xE0 | ( tc >> 12));
1216 obstack_1grow(o, 0x80 | ((tc >> 6) & 0x3F));
1217 obstack_1grow(o, 0x80 | ( tc & 0x3F));
1219 obstack_1grow(o, 0xF0 | ( tc >> 18));
1220 obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F));
1221 obstack_1grow(o, 0x80 | ((tc >> 6) & 0x3F));
1222 obstack_1grow(o, 0x80 | ( tc & 0x3F));
1227 * Parse a string literal and set lexer_token.
/*
 * Scans a narrow string literal into symbol_obstack and produces a
 * T_STRING_LITERAL token whose size includes the terminating NUL. Escape
 * sequences go through parse_escape_sequence(). A literal that never ends is
 * reported at the line where it started and yields T_ERROR.
 * NOTE(review): the scanning loop, the range check guarding the
 * "escape sequence out of range" warning and the handling of ordinary
 * characters are not visible in this excerpt.
 */
1229 static void parse_string_literal(void)
/* Remembered so that an unterminated literal is reported where it began. */
1231 const unsigned start_linenr = lexer_token.source_position.linenr;
1238 utf32 const tc = parse_escape_sequence();
1240 warningf(&lexer_token.source_position,
1241 "escape sequence out of range");
1243 obstack_1grow(&symbol_obstack, tc);
1248 source_position_t source_position;
1249 source_position.input_name = lexer_token.source_position.input_name;
1250 source_position.linenr = start_linenr;
1251 errorf(&source_position, "string has no end");
1252 lexer_token.type = T_ERROR;
1269 /* TODO: concatenate multiple strings separated by whitespace... */
1271 /* add finishing 0 to the string */
1272 obstack_1grow(&symbol_obstack, '\0');
1273 const size_t size = (size_t)obstack_object_size(&symbol_obstack);
1274 const char *const string = obstack_finish(&symbol_obstack);
/* Disabled: de-duplication of the literal through stringset. */
1276 #if 0 /* TODO hash */
1277 /* check if there is already a copy of the string */
1278 result = strset_insert(&stringset, string);
1279 if(result != string) {
1280 obstack_free(&symbol_obstack, string);
1283 const char *const result = string;
1286 lexer_token.type = T_STRING_LITERAL;
1287 lexer_token.v.string.begin = result;
1288 lexer_token.v.string.size = size;
1292 * Parse a wide character constant and set lexer_token.
/*
 * Scans a wide character constant into symbol_obstack as wchar_rep_t
 * elements and produces a T_WIDE_CHARACTER_CONSTANT token of type
 * type_wchar_t; the stored size is the element count. A newline inside the
 * constant is reported as an error; end of file is reported at the starting
 * line and yields T_ERROR.
 * NOTE(review): the scanning loop and its case labels are not visible in this
 * excerpt.
 */
1294 static void parse_wide_character_constant(void)
1296 const unsigned start_linenr = lexer_token.source_position.linenr;
1303 wchar_rep_t tc = parse_escape_sequence();
1304 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1309 parse_error("newline while parsing character constant");
1315 goto end_of_wide_char_constant;
1318 source_position_t source_position = lexer_token.source_position;
1319 source_position.linenr = start_linenr;
1320 errorf(&source_position, "EOF while parsing character constant");
1321 lexer_token.type = T_ERROR;
1326 wchar_rep_t tc = (wchar_rep_t) c;
1327 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1334 end_of_wide_char_constant:;
/* The obstack size is in bytes; convert it to a number of elements. */
1335 size_t size = (size_t) obstack_object_size(&symbol_obstack);
1336 assert(size % sizeof(wchar_rep_t) == 0);
1337 size /= sizeof(wchar_rep_t);
1339 const wchar_rep_t *string = obstack_finish(&symbol_obstack);
1341 lexer_token.type = T_WIDE_CHARACTER_CONSTANT;
1342 lexer_token.v.wide_string.begin = string;
1343 lexer_token.v.wide_string.size = size;
1344 lexer_token.datatype = type_wchar_t;
1348 * Parse a wide string literal and set lexer_token.
/*
 * Scans a wide string literal into symbol_obstack as wchar_rep_t elements and
 * produces a T_WIDE_STRING_LITERAL token; the stored size is the element
 * count including the terminating zero element. A literal that never ends is
 * reported at the line where it started and yields T_ERROR.
 * NOTE(review): the scanning loop and its case labels are not visible in this
 * excerpt.
 */
1350 static void parse_wide_string_literal(void)
1352 const unsigned start_linenr = lexer_token.source_position.linenr;
1360 wchar_rep_t tc = parse_escape_sequence();
1361 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1366 source_position_t source_position;
1367 source_position.input_name = lexer_token.source_position.input_name;
1368 source_position.linenr = start_linenr;
1369 errorf(&source_position, "string has no end");
1370 lexer_token.type = T_ERROR;
1380 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
1389 /* TODO: concatenate multiple strings separated by whitespace... */
1391 /* add finishing 0 to the string */
1392 wchar_rep_t nul = L'\0';
1393 obstack_grow(&symbol_obstack, &nul, sizeof(nul));
1394 const size_t size = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
1395 const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
/* Disabled: de-duplication of the literal through stringset. */
1397 #if 0 /* TODO hash */
1398 /* check if there is already a copy of the string */
1399 const wchar_rep_t *const result = strset_insert(&stringset, string);
1400 if(result != string) {
1401 obstack_free(&symbol_obstack, string);
1404 const wchar_rep_t *const result = string;
1407 lexer_token.type = T_WIDE_STRING_LITERAL;
1408 lexer_token.v.wide_string.begin = result;
1409 lexer_token.v.wide_string.size = size;
1413 * Parse a character constant and set lexer_token.
/*
 * Scans a narrow character constant into symbol_obstack and produces a
 * T_CHARACTER_CONSTANT token; no terminator is appended, so size is the
 * number of bytes collected. A newline inside the constant is reported as an
 * error; end of file is reported at the starting line and yields T_ERROR.
 * NOTE(review): the scanning loop, the range check guarding the
 * "escape sequence out of range" warning and the handling of ordinary
 * characters are not visible in this excerpt.
 */
1415 static void parse_character_constant(void)
1417 const unsigned start_linenr = lexer_token.source_position.linenr;
1424 utf32 const tc = parse_escape_sequence();
1426 warningf(&lexer_token.source_position,
1427 "escape sequence out of range");
1429 obstack_1grow(&symbol_obstack, tc);
1434 parse_error("newline while parsing character constant");
1440 goto end_of_char_constant;
1443 source_position_t source_position;
1444 source_position.input_name = lexer_token.source_position.input_name;
1445 source_position.linenr = start_linenr;
1446 errorf(&source_position, "EOF while parsing character constant");
1447 lexer_token.type = T_ERROR;
1459 end_of_char_constant:;
1460 const size_t size = (size_t)obstack_object_size(&symbol_obstack);
1461 const char *const string = obstack_finish(&symbol_obstack);
1463 lexer_token.type = T_CHARACTER_CONSTANT;
1464 lexer_token.v.string.begin = string;
1465 lexer_token.v.string.size = size;
/* Only a one-byte constant with the _CXX mode bit set gets type char; everything else is int. */
1466 lexer_token.datatype = c_mode & _CXX && size == 1 ? type_char : type_int;
1470 * Skip a multiline comment.
/*
 * Skips a block comment. A comment opener found inside the comment triggers a
 * warning when warning.comment is set. Newlines keep the line counter up to
 * date through MATCH_NEWLINE. Reaching end of file is reported at the line
 * where the comment started.
 * NOTE(review): the scanning loop and its case labels are not visible in this
 * excerpt.
 */
1472 static void skip_multiline_comment(void)
1474 unsigned start_linenr = lexer_token.source_position.linenr;
1481 /* nested comment, warn here */
1482 if (warning.comment) {
1483 warningf(&lexer_token.source_position, "'/*' within comment");
1495 MATCH_NEWLINE(break;)
1498 source_position_t source_position;
1499 source_position.input_name = lexer_token.source_position.input_name;
1500 source_position.linenr = start_linenr;
1501 errorf(&source_position, "at end of file while looking for comment end");
1513 * Skip a single line comment.
/*
 * Skips a line comment. The visible branch warns with "multi-line comment"
 * (when warning.comment is set) if a newline character is seen at that point.
 * NOTE(review): the scanning loop is not visible; presumably this branch is
 * reached after a backslash, i.e. when the comment continues on the next
 * line. Confirm.
 */
1515 static void skip_line_comment(void)
1528 if (c == '\n' || c == '\r') {
1529 if (warning.comment)
1530 warningf(&lexer_token.source_position, "multi-line comment");
1542 /** The current preprocessor token; next_pp_token() refreshes it with a copy of lexer_token. */
1543 static token_t pp_token;
1546 * Read the next preprocessor token.
/* Scans one preprocessing token and copies the resulting lexer_token into pp_token. */
1548 static inline void next_pp_token(void)
1550 lexer_next_preprocessing_token();
1551 pp_token = lexer_token;
1555 * Eat all preprocessor tokens until newline.
/*
 * Loops until pp_token is a newline or T_EOF.
 * NOTE(review): the loop body is not visible; presumably it calls
 * next_pp_token(). Confirm.
 */
1557 static void eat_until_newline(void)
1559 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
1565 * Handle the define directive.
/*
 * Handles #define: scans the next preprocessing token and requires it to be an
 * identifier; otherwise an error is reported and the rest of the line is
 * skipped.
 * NOTE(review): this function inspects lexer_token directly, whereas
 * eat_until_newline() looks at pp_token; confirm that pp_token is current at
 * this point. The statements after the error branch are not visible in this
 * excerpt.
 */
1567 static void define_directive(void)
1569 lexer_next_preprocessing_token();
1570 if(lexer_token.type != T_IDENTIFIER) {
/* No trailing newline in the message: none of the other diagnostics in this file carries one. */
1571 parse_error("expected identifier after #define");
1572 eat_until_newline();
1577 * Handle the ifdef directive.
/*
 * Handles #ifdef / #ifndef. The visible code only scans the next preprocessing
 * token; the identifier check is commented out.
 * NOTE(review): no visible line uses is_ifndef; the remaining statements are
 * not visible in this excerpt.
 *
 * @param is_ifndef  non-zero when handling #ifndef
 */
1579 static void ifdef_directive(int is_ifndef)
1582 lexer_next_preprocessing_token();
1583 //expect_identifier();
1588 * Handle the endif directive.
/* NOTE(review): the body of this handler is not visible in this excerpt. */
1590 static void endif_directive(void)
1596 * Parse the line directive.
/*
 * Handles a line directive: an integer token supplies the new line number and
 * an optional string literal supplies the new input name. A non-integer token
 * is reported as "expected integer". The rest of the line is skipped.
 * NOTE(review): the else branch and the token-advancing calls are not visible
 * in this excerpt.
 */
1598 static void parse_line_directive(void)
1600 if(pp_token.type != T_INTEGER) {
1601 parse_error("expected integer");
/* One less than the given number is stored (presumably because the following newline bumps the counter; confirm). */
1603 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1606 if(pp_token.type == T_STRING_LITERAL) {
1607 lexer_token.source_position.input_name = pp_token.v.string.begin;
1611 eat_until_newline();
/* Kinds of STDC pragma distinguished by parse_pragma(); STDC_UNKNOWN, STDC_FP_CONTRACT and STDC_FENV_ACCESS are used there as well. */
1617 typedef enum stdc_pragma_kind_t {
1621 STDC_CX_LIMITED_RANGE
1622 } stdc_pragma_kind_t;
1625 * STDC pragma values.
/* Argument values of an STDC pragma; parse_pragma() uses STDC_VALUE_UNKNOWN, STDC_VALUE_ON, STDC_VALUE_OFF and STDC_VALUE_DEFAULT. */
1627 typedef enum stdc_pragma_value_kind_t {
1632 } stdc_pragma_value_kind_t;
1635 * Parse a pragma directive.
/*
 * Handles #pragma. Only pragmas introduced by the STDC symbol are examined,
 * and their kind is only decoded when the _C99 mode bit is set:
 * FP_CONTRACT, FENV_ACCESS and CX_LIMITED_RANGE are recognised, followed by a
 * value that maps to ON, OFF or DEFAULT. A recognised kind with an
 * unrecognised value is reported as "bad STDC pragma argument". After the rest
 * of the line has been skipped, anything still classified as unknown produces
 * a warning if warning.unknown_pragmas is set.
 * NOTE(review): the token-advancing calls, the case labels for the values and
 * several branch boundaries are not visible in this excerpt.
 */
1637 static void parse_pragma(void)
1639 bool unknown_pragma = true;
1642 if (pp_token.v.symbol->pp_ID == TP_STDC) {
1643 stdc_pragma_kind_t kind = STDC_UNKNOWN;
1645 if (c_mode & _C99) {
1648 switch (pp_token.v.symbol->pp_ID) {
1649 case TP_FP_CONTRACT:
1650 kind = STDC_FP_CONTRACT;
1652 case TP_FENV_ACCESS:
1653 kind = STDC_FENV_ACCESS;
1655 case TP_CX_LIMITED_RANGE:
1656 kind = STDC_CX_LIMITED_RANGE;
1661 if (kind != STDC_UNKNOWN) {
1662 stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN;
1664 switch (pp_token.v.symbol->pp_ID) {
1666 value = STDC_VALUE_ON;
1669 value = STDC_VALUE_OFF;
1672 value = STDC_VALUE_DEFAULT;
1677 if (value != STDC_VALUE_UNKNOWN) {
1678 unknown_pragma = false;
1680 errorf(&pp_token.source_position, "bad STDC pragma argument");
1685 unknown_pragma = true;
1687 eat_until_newline();
1688 if (unknown_pragma && warning.unknown_pragmas) {
1689 warningf(&pp_token.source_position, "encountered unknown #pragma");
1694 * Parse a preprocessor non-null directive.
/*
 * Dispatches on the preprocessor ID of the identifier in pp_token. The visible
 * branches print a placeholder message for include handling, forward to
 * parse_line_directive(), and report "#error directive: ".
 * NOTE(review): the case labels and the remaining branches are not visible in
 * this excerpt.
 */
1696 static void parse_preprocessor_identifier(void)
1698 assert(pp_token.type == T_IDENTIFIER);
1699 symbol_t *symbol = pp_token.v.symbol;
1701 switch(symbol->pp_ID) {
1703 printf("include - enable header name parsing!\n");
1719 parse_line_directive();
1726 /* TODO; output the rest of the line */
1727 parse_error("#error directive: ");
1736 * Parse a preprocessor directive.
/*
 * Dispatches on the type of the token in pp_token after a '#': one branch goes
 * to parse_preprocessor_identifier(), one to parse_line_directive(), one
 * accepts the null directive, and everything else is reported as
 * "invalid preprocessor directive" with the rest of the line skipped.
 * NOTE(review): the case labels and the initial token fetch are not visible in
 * this excerpt.
 */
1738 static void parse_preprocessor_directive(void)
1742 switch(pp_token.type) {
1744 parse_preprocessor_identifier();
1747 parse_line_directive();
1750 /* NULL directive, see §6.10.7 */
1753 parse_error("invalid preprocessor directive");
1754 eat_until_newline();
/*
 * Helper macros for multi-character operators in
 * lexer_next_preprocessing_token(): MAYBE(ch, set_type) and ELSE(set_type)
 * both assign set_type to lexer_token.type, and ELSE_CODE closes a while(1)
 * loop.
 * NOTE(review): most of the macro bodies are not visible in this excerpt;
 * presumably MAYBE_PROLOG opens that loop and reads the next character.
 */
1759 #define MAYBE_PROLOG \
1764 #define MAYBE(ch, set_type) \
1767 lexer_token.type = set_type; \
1770 #define ELSE_CODE(code) \
1774 } /* end of while(1) */ \
1777 #define ELSE(set_type) \
1779 lexer_token.type = set_type; \
1783 void lexer_next_preprocessing_token(void)
/*
 * Scans one preprocessing token into lexer_token.
 * - A newline is delivered as a token of type '\n'.
 * - An identifier equal to symbol_L that is followed by a quote character
 *   starts a wide string literal or wide character constant.
 * - Operators that share a first character are resolved with the MAYBE/ELSE
 *   macros.
 * - After a block or line comment has been skipped, scanning restarts through
 *   a recursive call.
 * - End of input gives T_EOF; an unrecognised character is reported and gives
 *   T_ERROR.
 * NOTE(review): the switch on the current character and its case labels are
 * not visible in this excerpt.
 */
1793 lexer_token.type = '\n';
1799 /* might be a wide string ( L"string" ) */
1800 if(lexer_token.type == T_IDENTIFIER &&
1801 lexer_token.v.symbol == symbol_L) {
1803 parse_wide_string_literal();
1804 } else if(c == '\'') {
1805 parse_wide_character_constant();
1815 parse_string_literal();
1819 parse_character_constant();
1832 MAYBE('.', T_DOTDOTDOT)
1836 lexer_token.type = '.';
1842 MAYBE('&', T_ANDAND)
1843 MAYBE('=', T_ANDEQUAL)
1847 MAYBE('=', T_ASTERISKEQUAL)
1851 MAYBE('+', T_PLUSPLUS)
1852 MAYBE('=', T_PLUSEQUAL)
1856 MAYBE('>', T_MINUSGREATER)
1857 MAYBE('-', T_MINUSMINUS)
1858 MAYBE('=', T_MINUSEQUAL)
1862 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1866 MAYBE('=', T_SLASHEQUAL)
1869 skip_multiline_comment();
1870 lexer_next_preprocessing_token();
1874 skip_line_comment();
1875 lexer_next_preprocessing_token();
1881 MAYBE('=', T_PERCENTEQUAL)
1886 MAYBE(':', T_HASHHASH)
1890 lexer_token.type = '#';
1899 MAYBE('=', T_LESSEQUAL)
1902 MAYBE('=', T_LESSLESSEQUAL)
1907 MAYBE('=', T_GREATEREQUAL)
1910 MAYBE('=', T_GREATERGREATEREQUAL)
1911 ELSE(T_GREATERGREATER)
1915 MAYBE('=', T_CARETEQUAL)
1919 MAYBE('=', T_PIPEEQUAL)
1920 MAYBE('|', T_PIPEPIPE)
1928 MAYBE('=', T_EQUALEQUAL)
1932 MAYBE('#', T_HASHHASH)
1946 lexer_token.type = c;
1951 lexer_token.type = T_EOF;
1956 errorf(&lexer_token.source_position, "unknown character '%c' found", c);
1958 lexer_token.type = T_ERROR;
/*
 * Delivers the next token for the parser: newline tokens are skipped by
 * scanning again, and a '#' token hands control to
 * parse_preprocessor_directive().
 * NOTE(review): the control flow after the directive has been handled is not
 * visible in this excerpt.
 */
1964 void lexer_next_token(void)
1966 lexer_next_preprocessing_token();
1968 while (lexer_token.type == '\n') {
1970 lexer_next_preprocessing_token();
1973 if (lexer_token.type == '#') {
1974 parse_preprocessor_directive();
/* Initialises stringset and interns the symbol "L", which marks wide literals during scanning. */
1979 void init_lexer(void)
1981 strset_init(&stringset);
1982 symbol_L = symbol_table_insert("L");
/*
 * Starts lexing a new input: the line counter is reset to 0 and input_name is
 * recorded in lexer_token.source_position.
 * NOTE(review): the statements that store `stream` and prime the character
 * buffer are not visible in this excerpt.
 *
 * @param stream      the stream to read from
 * @param input_name  name recorded in source positions
 */
1985 void lexer_open_stream(FILE *stream, const char *input_name)
1988 lexer_token.source_position.linenr = 0;
1989 lexer_token.source_position.input_name = input_name;
1994 /* place a virtual \n at the beginning so the lexer knows that we're
1995 * at the beginning of a line */
/*
 * Starts lexing from an in-memory buffer: the line counter is reset to 0 and
 * input_name is recorded. The visible panic() call says this mode is not
 * implemented yet.
 * NOTE(review): `buffer` is a const char pointer while bufend points to
 * utf32, so the assignment below presumably sits in a disabled preprocessor
 * branch that is not visible here. Confirm.
 *
 * @param buffer      start of the text
 * @param len         number of bytes in buffer
 * @param input_name  name recorded in source positions
 */
1999 void lexer_open_buffer(const char *buffer, size_t len, const char *input_name)
2002 lexer_token.source_position.linenr = 0;
2003 lexer_token.source_position.input_name = input_name;
2007 bufend = buffer + len;
2011 panic("builtin lexing not done yet");
2014 /* place a virtual \n at the beginning so the lexer knows that we're
2015 * at the beginning of a line */
/* Releases stringset, the counterpart of the strset_init() call in init_lexer(). */
2019 void exit_lexer(void)
2021 strset_destroy(&stringset);
/* Debugging aid: prints a source position as "name:line" to stdout. Marked unused so that the missing caller does not trigger a warning. */
2024 static __attribute__((unused))
2025 void dbg_pos(const source_position_t source_position)
2027 fprintf(stdout, "%s:%u\n", source_position.input_name,
2028 source_position.linenr);