3 #include "diagnostic.h"
6 #include "symbol_table_t.h"
8 #include "adt/strset.h"
12 #include "target_architecture.h"
/* strtold() is mapped onto strtod() by this macro, so wherever the definition
 * is active long double literals are converted with double precision only
 * (the enclosing platform check is not part of this excerpt). */
26 /* No strtold on windows and no replacement yet */
27 #define strtold(s, e) strtod(s, e)
/* char_type follows the HAS_SIGNED_CHAR / HAS_UNSIGNED_CHAR configuration
 * macros; the #error reports a build in which the signedness is unknown. */
30 #if defined HAS_SIGNED_CHAR
31 typedef signed char char_type;
32 #elif defined HAS_UNSIGNED_CHAR
33 typedef unsigned char char_type;
35 # error signedness of char not determined
/* Input buffer. next_real_char() stores freshly read data at
 * buf + MAX_PUTBACK, which leaves MAX_PUTBACK bytes in front of it for
 * put_back(). bufpos is the read cursor, bufend is one past the last byte
 * read. */
42 static char buf[1024 + MAX_PUTBACK];
43 static const char *bufend;
44 static const char *bufpos;
45 static strset_t stringset;
48 * Prints a parse error message at the current token.
50 * @param msg the error message
/* Reports msg through errorf() at lexer_token.source_position. msg is passed
 * as the argument of a "%s" format, so it is never used as a format string
 * itself. */
52 static void parse_error(const char *msg)
54 errorf(lexer_token.source_position, "%s", msg);
/* Refills the input buffer once the read cursor has reached bufend: fread()
 * writes behind the MAX_PUTBACK reserve at the start of buf, then bufpos and
 * bufend are reset to delimit the s bytes that were read. The statements that
 * consume a character are not part of this excerpt. */
57 static inline void next_real_char(void)
59 assert(bufpos <= bufend);
60 if (bufpos >= bufend) {
61 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
67 bufpos = buf + MAX_PUTBACK;
68 bufend = buf + MAX_PUTBACK + s;
/* Pushes pc back in front of the read cursor: bufpos is moved one byte
 * backwards and pc is stored at the new position. */
73 static inline void put_back(int pc)
/* bufpos points to const char; "- buf + buf" re-derives the same address
 * from the writable array buf, so the store needs no const-removing cast. */
76 *(--bufpos - buf + buf) = (char) pc;
/* debug trace; the condition enabling it is not part of this excerpt */
79 printf("putback '%c'\n", pc);
83 static inline void next_char(void);
/* MATCH_NEWLINE(code): both visible arms increment lexer_token.source_position.linenr; `code` is the action supplied by the user of the macro (`return;` and `break;` in this file). The matching conditions are not part of this excerpt. */
85 #define MATCH_NEWLINE(code) \
91 lexer_token.source_position.linenr++; \
95 lexer_token.source_position.linenr++; \
/* eat(c_type): asserts that the current character c equals c_type, then advances with next_char(). */
98 #define eat(c_type) do { assert(c == c_type); next_char(); } while(0)
/* Called by next_char() when the current character is a backslash (including
 * one produced by the '/' trigraph case). The action handed to MATCH_NEWLINE
 * is `return;`; what happens when no line break follows is not part of this
 * excerpt. */
100 static void maybe_concat_lines(void)
105 MATCH_NEWLINE(return;)
/* Produces the next character in c with backslash and trigraph handling:
 * a backslash is handed to maybe_concat_lines(), any character other than
 * '?' is taken as is, and after a '?' the switch below replaces the final
 * character of a trigraph by the character it stands for. */
115 static inline void next_char(void)
119 /* filter trigraphs */
120 if(UNLIKELY(c == '\\')) {
121 maybe_concat_lines();
122 goto end_of_next_char;
126 goto end_of_next_char;
129 if(LIKELY(c != '?')) {
132 goto end_of_next_char;
/* trigraph table; the '/' entry yields a backslash and therefore goes
 * through maybe_concat_lines() exactly like a literal backslash */
137 case '=': c = '#'; break;
138 case '(': c = '['; break;
139 case '/': c = '\\'; maybe_concat_lines(); break;
140 case ')': c = ']'; break;
141 case '\'': c = '^'; break;
142 case '<': c = '{'; break;
143 case '!': c = '|'; break;
144 case '>': c = '}'; break;
145 case '-': c = '~'; break;
/* debug trace; the condition enabling it is not part of this excerpt */
155 printf("nchar '%c'\n", c);
159 #define SYMBOL_CHARS \
/* Lexes a symbol: its characters are collected on symbol_obstack,
 * NUL-terminated and handed to symbol_table_insert(). The token type becomes
 * the symbol's ID and the symbol itself is stored in lexer_token.v.symbol. */
226 static void parse_symbol(void)
231 obstack_1grow(&symbol_obstack, (char) c);
238 obstack_1grow(&symbol_obstack, (char) c);
248 obstack_1grow(&symbol_obstack, '\0');
250 string = obstack_finish(&symbol_obstack);
251 symbol = symbol_table_insert(string);
253 lexer_token.type = symbol->ID;
254 lexer_token.v.symbol = symbol;
/* the symbol refers to a different string than the one just built
 * (presumably an already existing entry), so the new copy is released */
256 if(symbol->string != string) {
257 obstack_free(&symbol_obstack, string);
/* Reads the u/U and l/L/ll/LL suffix characters of an integer literal and
 * chooses lexer_token.datatype from the suffix and the magnitude of
 * lexer_token.v.intvalue.
 *
 * @param is_oct_hex  true for octal and hexadecimal literals; only those may
 *                    fall into an unsigned type without an explicit suffix
 *                    (see the is_oct_hex tests below)
 */
261 static void parse_integer_suffix(bool is_oct_hex)
263 bool is_unsigned = false;
264 bool min_long = false;
265 bool min_longlong = false;
/* suffix spellings: 'u' first, optionally followed by 'l' or 'll' ... */
267 if(c == 'U' || c == 'u') {
270 if(c == 'L' || c == 'l') {
273 if(c == 'L' || c == 'l') {
/* ... or 'l' / 'll' first, optionally followed by 'u' */
278 } else if(c == 'l' || c == 'L') {
281 if(c == 'l' || c == 'L') {
284 if(c == 'u' || c == 'U') {
288 } else if(c == 'u' || c == 'U') {
291 lexer_token.datatype = type_unsigned_long;
/* classification of the value viewed as long long (presumably the path
 * without an unsigned suffix): int, unsigned int, long, unsigned long,
 * then unsigned long long or long long */
296 long long v = lexer_token.v.intvalue;
298 if(v >= TARGET_INT_MIN && v <= TARGET_INT_MAX) {
299 lexer_token.datatype = type_int;
301 } else if(is_oct_hex && v >= 0 && v <= TARGET_UINT_MAX) {
302 lexer_token.datatype = type_unsigned_int;
307 if(v >= TARGET_LONG_MIN && v <= TARGET_LONG_MAX) {
308 lexer_token.datatype = type_long;
310 } else if(is_oct_hex && v >= 0 && v <= TARGET_ULONG_MAX) {
311 lexer_token.datatype = type_unsigned_long;
315 unsigned long long uv = (unsigned long long) v;
316 if(is_oct_hex && uv > (unsigned long long) TARGET_LONGLONG_MAX) {
317 lexer_token.datatype = type_unsigned_long_long;
321 lexer_token.datatype = type_long_long;
/* classification of the value viewed as unsigned long long: min_long and
 * min_longlong rule out the types smaller than the suffix demands */
323 unsigned long long v = (unsigned long long) lexer_token.v.intvalue;
324 if(!min_long && v <= TARGET_UINT_MAX) {
325 lexer_token.datatype = type_unsigned_int;
328 if(!min_longlong && v <= TARGET_ULONG_MAX) {
329 lexer_token.datatype = type_unsigned_long;
332 lexer_token.datatype = type_unsigned_long_long;
/* Sets lexer_token.datatype for a floating point literal to type_float,
 * type_long_double or type_double. The tests selecting between the three
 * are not part of this excerpt (presumably an f/F or l/L suffix, with
 * double as the default). */
336 static void parse_floating_suffix(void)
339 /* TODO: do something useful with the suffixes... */
343 lexer_token.datatype = type_float;
348 lexer_token.datatype = type_long_double;
351 lexer_token.datatype = type_double;
/**
 * A replacement for strtoull. Only those parts needed for
 * our parser are implemented.
 *
 * Converts the longest prefix of @p s that consists of digits valid in
 * @p base and whose value still fits into an unsigned long long.
 *
 * @param s       NUL-terminated digit string (no sign, no base prefix)
 * @param endptr  receives the address of the first character that was not
 *                consumed. It points at the terminating NUL only if the
 *                whole string was converted, so a caller detects stray
 *                characters as well as overrun by testing *endptr != '\0'.
 * @param base    8, 10 or 16; for any other base nothing is consumed
 * @return the value of the consumed prefix
 */
static unsigned long long parse_int_string(const char *s, const char **endptr, int base)
{
	const unsigned long long max_value = ~0ULL;
	unsigned long long       v         = 0;

	assert(base == 8 || base == 10 || base == 16);
	if (base == 8 || base == 10 || base == 16) {
		const unsigned radix = (unsigned)base;

		for (;; ++s) {
			/* the <ctype.h> functions must not be fed a negative char */
			const int ch = tolower((unsigned char)*s);
			unsigned  digit;

			if (ch >= '0' && ch <= '9') {
				digit = (unsigned)(ch - '0');
			} else if (ch >= 'a' && ch <= 'f') {
				digit = (unsigned)(ch - 'a') + 10;
			} else {
				break;
			}
			if (digit >= radix)
				break;

			/* check for overrun: stop before v * radix + digit would wrap.
			 * For base 8 and 16 this is the same limit as a test on the
			 * top bits of v; for base 10 the digit has to take part in
			 * the test, because ULLONG_MAX / 10 may only be followed by
			 * the digits 0..5. */
			if (v > (max_value - digit) / radix)
				break;

			v = v * radix + digit;
		}
	}

	*endptr = s;
	return v;
}
/* Lexes a hexadecimal literal; on entry c is the 'x' or 'X' (asserted).
 * The digits are collected on symbol_obstack and converted with
 * parse_int_string(..., 16). Hexadecimal floating point literals ('.', 'p'
 * or 'P' after the digits) end in panic(); an empty digit string is reported
 * and yields T_ERROR. The temporary string is freed before
 * parse_integer_suffix(true) determines the type. */
440 static void parse_number_hex(void)
442 assert(c == 'x' || c == 'X');
446 obstack_1grow(&symbol_obstack, (char) c);
449 obstack_1grow(&symbol_obstack, '\0');
450 char *string = obstack_finish(&symbol_obstack);
452 if(c == '.' || c == 'p' || c == 'P') {
454 panic("Hex floating point numbers not implemented yet");
456 if(*string == '\0') {
457 parse_error("invalid hex number");
458 lexer_token.type = T_ERROR;
462 lexer_token.type = T_INTEGER;
463 lexer_token.v.intvalue = parse_int_string(string, &endptr, 16);
/* endptr not at the terminator: the conversion stopped before the end */
464 if(*endptr != '\0') {
465 parse_error("hex number literal too long");
468 obstack_free(&symbol_obstack, string);
469 parse_integer_suffix(true);
/* Returns true iff chr is one of the characters '0' .. '7'. */
472 static inline bool is_octal_digit(int chr)
474 return '0' <= chr && chr <= '7';
/* Lexes an octal literal: all octal digits starting at c are collected on
 * symbol_obstack and converted with parse_int_string(..., 8). The token is
 * T_INTEGER; the temporary string is freed before parse_integer_suffix(true)
 * determines the type. */
477 static void parse_number_oct(void)
479 while(is_octal_digit(c)) {
480 obstack_1grow(&symbol_obstack, (char) c);
483 obstack_1grow(&symbol_obstack, '\0');
484 char *string = obstack_finish(&symbol_obstack);
487 lexer_token.type = T_INTEGER;
488 lexer_token.v.intvalue = parse_int_string(string, &endptr, 8);
/* endptr not at the terminator: the conversion stopped before the end */
489 if(*endptr != '\0') {
490 parse_error("octal number literal too long");
493 obstack_free(&symbol_obstack, string);
494 parse_integer_suffix(true);
/* Lexes a decimal integer or floating point literal. The text (digits, an
 * optional '.', an optional exponent with sign) is collected on
 * symbol_obstack. The result is either a T_FLOATINGPOINT token converted with
 * strtold() or a T_INTEGER token converted with parse_int_string(..., 10);
 * presumably is_float selects between the two. The temporary string is freed
 * at the end. */
497 static void parse_number_dec(void)
499 bool is_float = false;
501 obstack_1grow(&symbol_obstack, (char) c);
506 obstack_1grow(&symbol_obstack, '.');
510 obstack_1grow(&symbol_obstack, (char) c);
/* the exponent marker is always stored as a lower case 'e' */
515 if(c == 'e' || c == 'E') {
516 obstack_1grow(&symbol_obstack, 'e');
519 if(c == '-' || c == '+') {
520 obstack_1grow(&symbol_obstack, (char) c);
525 obstack_1grow(&symbol_obstack, (char) c);
531 obstack_1grow(&symbol_obstack, '\0');
532 char *string = obstack_finish(&symbol_obstack);
536 lexer_token.type = T_FLOATINGPOINT;
537 lexer_token.v.floatvalue = strtold(string, &endptr);
539 if(*endptr != '\0') {
540 parse_error("invalid number literal");
543 parse_floating_suffix();
546 lexer_token.type = T_INTEGER;
547 lexer_token.v.intvalue = parse_int_string(string, &endptr, 10);
549 if(*endptr != '\0') {
550 parse_error("invalid number literal");
553 parse_integer_suffix(false);
555 obstack_free(&symbol_obstack, string);
/* Entry point for numeric literals; presumably it dispatches to the hex,
 * octal and decimal parsers, but the branch structure is not part of this
 * excerpt. Visible fragments: an "invalid octal number" diagnostic that
 * yields T_ERROR, and a '0' being pushed onto symbol_obstack. */
558 static void parse_number(void)
580 parse_error("invalid octal number");
581 lexer_token.type = T_ERROR;
587 obstack_1grow(&symbol_obstack, '0');
/* Evaluates an octal escape sequence of up to three digits.
 *
 * @param first_digit  the first digit character (asserted to be octal)
 * @return the accumulated value; the sequence ends early at the first
 *         character in c that is not an octal digit
 *
 * NOTE(review): only the three digit path converts the result through
 * char_type, the two early returns hand back the plain int value.
 */
596 static int parse_octal_sequence(const int first_digit)
598 assert(is_octal_digit(first_digit));
599 int value = first_digit - '0';
600 if (!is_octal_digit(c)) return value;
601 value = 8 * value + c - '0';
603 if (!is_octal_digit(c)) return value;
604 value = 8 * value + c - '0';
606 return (char_type)value;
/* Evaluates a hexadecimal escape sequence: each digit in c ('0'-'9',
 * 'A'-'F', 'a'-'f') is folded into value as 16 * value + digit. The result
 * is converted through char_type before it is returned. The loop around the
 * digit tests is not part of this excerpt. */
609 static int parse_hex_sequence(void)
613 if (c >= '0' && c <= '9') {
614 value = 16 * value + c - '0';
615 } else if ('A' <= c && c <= 'F') {
616 value = 16 * value + c - 'A' + 10;
617 } else if ('a' <= c && c <= 'f') {
618 value = 16 * value + c - 'a' + 10;
625 return (char_type)value;
/* Translates the escape sequence following a backslash into its character
 * value. The simple escapes are mapped directly below; other sequences are
 * delegated to parse_hex_sequence() and parse_octal_sequence(). Reaching the
 * end of the file and unknown escapes are reported with parse_error(). */
628 static int parse_escape_sequence(void)
636 case '"': return '"';
637 case '\'': return '\'';
638 case '\\': return '\\';
639 case '?': return '\?';
640 case 'a': return '\a';
641 case 'b': return '\b';
642 case 'f': return '\f';
643 case 'n': return '\n';
644 case 'r': return '\r';
645 case 't': return '\t';
646 case 'v': return '\v';
648 return parse_hex_sequence();
/* ec is handed over as the first digit of the octal sequence */
657 return parse_octal_sequence(ec);
659 parse_error("reached end of file while parsing escape sequence");
662 parse_error("unknown escape sequence");
/* Concatenates two strings into a new string allocated on symbol_obstack.
 *
 * The size field of a string_t counts the terminating NUL, hence the
 * "- 1" when computing the lengths. s1 is copied without its terminator,
 * s2 with it, and the result has size len1 + len2 + 1.
 */
667 string_t concat_strings(const string_t *const s1, const string_t *const s2)
669 const size_t len1 = s1->size - 1;
670 const size_t len2 = s2->size - 1;
672 char *const concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
673 memcpy(concat, s1->begin, len1);
674 memcpy(concat + len1, s2->begin, len2 + 1);
/* disabled: sharing identical strings through stringset */
676 #if 0 /* TODO hash */
677 const char *result = strset_insert(&stringset, concat);
678 if(result != concat) {
679 obstack_free(&symbol_obstack, concat);
684 return (string_t){ concat, len1 + len2 + 1 };
/* Concatenates a narrow string s1 and a wide string s2 into a new wide
 * string on symbol_obstack. Both size fields count the terminator. The loop
 * walks the len1 characters of s1 (its body, presumably the widening copy,
 * is not part of this excerpt); s2 is appended with memcpy() including its
 * terminator. */
688 wide_string_t concat_string_wide_string(const string_t *const s1, const wide_string_t *const s2)
690 const size_t len1 = s1->size - 1;
691 const size_t len2 = s2->size - 1;
693 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
694 const char *const src = s1->begin;
695 for (size_t i = 0; i != len1; ++i) {
698 memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
700 return (wide_string_t){ concat, len1 + len2 + 1 };
/* Concatenates two wide strings into a new wide string on symbol_obstack.
 * Both size fields count the terminator: s1 is copied without it, s2 with
 * it. All byte counts are scaled by sizeof(wchar_rep_t). */
703 wide_string_t concat_wide_strings(const wide_string_t *const s1, const wide_string_t *const s2)
705 const size_t len1 = s1->size - 1;
706 const size_t len2 = s2->size - 1;
708 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
709 memcpy(concat, s1->begin, len1 * sizeof(*concat));
710 memcpy(concat + len1, s2->begin, (len2 + 1) * sizeof(*concat));
712 return (wide_string_t){ concat, len1 + len2 + 1 };
/* Concatenates a wide string s1 and a narrow string s2 into a new wide
 * string on symbol_obstack. s1 is copied with memcpy() without its
 * terminator; the loop then walks len2 + 1 characters of s2, i.e. including
 * the terminator (its body, presumably the widening copy, is not part of
 * this excerpt). */
715 wide_string_t concat_wide_string_string(const wide_string_t *const s1, const string_t *const s2)
717 const size_t len1 = s1->size - 1;
718 const size_t len2 = s2->size - 1;
720 wchar_rep_t *const concat = obstack_alloc(&symbol_obstack, (len1 + len2 + 1) * sizeof(*concat));
721 memcpy(concat, s1->begin, len1 * sizeof(*concat));
722 const char *const src = s2->begin;
723 for (size_t i = 0; i != len2 + 1; ++i) {
727 return (wide_string_t){ concat, len1 + len2 + 1 };
/* Lexes a string literal into a T_STRING_LITERAL token. The characters,
 * with escape sequences already translated, are collected on symbol_obstack
 * and NUL-terminated; the token's size counts that terminator. An
 * unterminated string is reported at the line where the literal started and
 * yields T_ERROR. */
730 static void parse_string_literal(void)
/* remembered for the "string has no end" diagnostic */
732 const unsigned start_linenr = lexer_token.source_position.linenr;
740 tc = parse_escape_sequence();
741 obstack_1grow(&symbol_obstack, (char) tc);
745 source_position_t source_position;
746 source_position.input_name = lexer_token.source_position.input_name;
747 source_position.linenr = start_linenr;
748 errorf(source_position, "string has no end");
749 lexer_token.type = T_ERROR;
758 obstack_1grow(&symbol_obstack, (char) c);
766 /* TODO: concatenate multiple strings separated by whitespace... */
768 /* add finishing 0 to the string */
769 obstack_1grow(&symbol_obstack, '\0');
770 const size_t size = (size_t)obstack_object_size(&symbol_obstack);
771 const char *const string = obstack_finish(&symbol_obstack);
/* disabled: sharing identical strings through stringset */
773 #if 0 /* TODO hash */
774 /* check if there is already a copy of the string */
775 result = strset_insert(&stringset, string);
776 if(result != string) {
777 obstack_free(&symbol_obstack, string);
780 const char *const result = string;
783 lexer_token.type = T_STRING_LITERAL;
784 lexer_token.v.string.begin = result;
785 lexer_token.v.string.size = size;
/* Lexes a wide character constant. The result is a T_INTEGER token of type
 * type_wchar_t whose value is found_char. A newline inside the constant and
 * more than one character are reported but still produce that token; end of
 * file is reported and yields T_ERROR. */
788 static void parse_wide_character_constant(void)
796 found_char = parse_escape_sequence();
800 parse_error("newline while parsing character constant");
806 goto end_of_wide_char_constant;
809 parse_error("EOF while parsing character constant");
810 lexer_token.type = T_ERROR;
/* a non-zero found_char is taken to mean that a character was already read */
814 if(found_char != 0) {
815 parse_error("more than 1 characters in character "
817 goto end_of_wide_char_constant;
826 end_of_wide_char_constant:
827 lexer_token.type = T_INTEGER;
828 lexer_token.v.intvalue = found_char;
829 lexer_token.datatype = type_wchar_t;
/* Lexes a wide string literal into a T_WIDE_STRING_LITERAL token. The
 * characters are appended to symbol_obstack as wchar_rep_t elements and
 * terminated with a wide NUL; the token's size is the element count
 * including that terminator. An unterminated string is reported at the line
 * where the literal started and yields T_ERROR. */
832 static void parse_wide_string_literal(void)
/* remembered for the "string has no end" diagnostic */
834 const unsigned start_linenr = lexer_token.source_position.linenr;
842 wchar_rep_t tc = parse_escape_sequence();
843 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
848 source_position_t source_position;
849 source_position.input_name = lexer_token.source_position.input_name;
850 source_position.linenr = start_linenr;
851 errorf(source_position, "string has no end");
852 lexer_token.type = T_ERROR;
862 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
871 /* TODO: concatenate multiple strings separated by whitespace... */
873 /* add finishing 0 to the string */
874 wchar_rep_t nul = L'\0';
875 obstack_grow(&symbol_obstack, &nul, sizeof(nul));
/* the obstack reports bytes; convert to a number of wide characters */
876 const size_t size = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
877 const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
/* disabled: sharing identical strings through stringset */
879 #if 0 /* TODO hash */
880 /* check if there is already a copy of the string */
881 const wchar_rep_t *const result = strset_insert(&stringset, string);
882 if(result != string) {
883 obstack_free(&symbol_obstack, string);
886 const wchar_rep_t *const result = string;
889 lexer_token.type = T_WIDE_STRING_LITERAL;
890 lexer_token.v.wide_string.begin = result;
891 lexer_token.v.wide_string.size = size;
/* Lexes a character constant into a T_CHARS token of type type_int. The
 * characters, with escape sequences already translated, are collected on
 * symbol_obstack; the token refers to them through v.string (begin and size
 * as reported by the obstack). A newline inside the constant is reported;
 * end of file is reported at the line where the constant started and yields
 * T_ERROR. */
894 static void parse_character_constant(void)
/* remembered for the end-of-file diagnostic */
896 const unsigned start_linenr = lexer_token.source_position.linenr;
904 tc = parse_escape_sequence();
905 obstack_1grow(&symbol_obstack, (char) tc);
909 parse_error("newline while parsing character constant");
914 source_position_t source_position;
915 source_position.input_name = lexer_token.source_position.input_name;
916 source_position.linenr = start_linenr;
917 errorf(source_position, "EOF while parsing character constant");
918 lexer_token.type = T_ERROR;
924 goto end_of_char_constant;
927 obstack_1grow(&symbol_obstack, (char) c);
934 end_of_char_constant:;
935 const size_t size = (size_t)obstack_object_size(&symbol_obstack);
936 const char *const string = obstack_finish(&symbol_obstack);
938 lexer_token.type = T_CHARS;
939 lexer_token.v.string.begin = string;
940 lexer_token.v.string.size = size;
941 lexer_token.datatype = type_int;
/* Skips a multi-line comment. Line breaks inside the comment go through
 * MATCH_NEWLINE, so the line counter stays correct. If the file ends before
 * the comment does, the error is reported at the line where the comment
 * started. */
944 static void skip_multiline_comment(void)
946 unsigned start_linenr = lexer_token.source_position.linenr;
953 /* TODO: nested comment, warn here */
964 MATCH_NEWLINE(break;)
967 source_position_t source_position;
968 source_position.input_name = lexer_token.source_position.input_name;
969 source_position.linenr = start_linenr;
970 errorf(source_position, "at end of file while looking for comment end");
981 static void skip_line_comment(void)
/* Copy of the most recent token, used by the preprocessor directive code. */
999 static token_t pp_token;
/* Lexes the next preprocessing token and copies it from lexer_token into
 * pp_token. */
1001 static inline void next_pp_token(void)
1003 lexer_next_preprocessing_token();
1004 pp_token = lexer_token;
/* Loops until pp_token is a newline token or T_EOF (the loop body,
 * presumably a call that advances pp_token, is not part of this excerpt). */
1007 static void eat_until_newline(void)
1009 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
/* Handles #define: the directive name must be followed by an identifier,
 * otherwise an error is reported and the rest of the line is skipped.
 *
 * NOTE(review): this function advances with lexer_next_preprocessing_token()
 * and tests lexer_token, while eat_until_newline() tests pp_token, which
 * only next_pp_token() updates - confirm that pp_token is current here.
 */
1014 static void define_directive(void)
1016 lexer_next_preprocessing_token();
1017 if(lexer_token.type != T_IDENTIFIER) {
1018 parse_error("expected identifier after #define\n");
1019 eat_until_newline();
/* Handles #ifdef and #ifndef (is_ifndef presumably tells the two apart; its
 * use is not part of this excerpt). Only the step to the next preprocessing
 * token is visible; the identifier check is commented out. */
1023 static void ifdef_directive(int is_ifndef)
1026 lexer_next_preprocessing_token();
1027 //expect_identifier();
1031 static void endif_directive(void)
/* Handles a line directive: pp_token must be an integer, which becomes the
 * current line number, and an optional string literal after it becomes the
 * current input name. The rest of the line is skipped. */
1036 static void parse_line_directive(void)
1038 if(pp_token.type != T_INTEGER) {
1039 parse_error("expected integer");
/* stored one lower than requested, presumably because the newline that ends
 * the directive increments the counter again */
1041 lexer_token.source_position.linenr = (unsigned int)(pp_token.v.intvalue - 1);
1044 if(pp_token.type == T_STRING_LITERAL) {
1045 lexer_token.source_position.input_name = pp_token.v.string.begin;
1049 eat_until_newline();
/* Dispatches a preprocessor directive that starts with an identifier, based
 * on the pp_ID of the identifier's symbol. The case labels are not part of
 * this excerpt; the visible actions are an include placeholder message, the
 * line directive, an #error report, and a warning for unknown pragmas when
 * warning.unknown_pragmas is set. */
1052 static void parse_preprocessor_identifier(void)
1054 assert(pp_token.type == T_IDENTIFIER);
1055 symbol_t *symbol = pp_token.v.symbol;
1057 switch(symbol->pp_ID) {
1059 printf("include - enable header name parsing!\n");
1075 parse_line_directive();
1082 /* TODO; output the rest of the line */
1083 parse_error("#error directive: ");
1086 if (warning.unknown_pragmas) {
1087 warningf(lexer_token.source_position, "encountered unknown #pragma");
1089 eat_until_newline();
/* Parses one preprocessor directive, dispatching on the type of pp_token:
 * one case goes to parse_preprocessor_identifier(), one directly to
 * parse_line_directive(); anything else is reported as invalid and the rest
 * of the line is skipped. The case labels are not part of this excerpt. */
1094 static void parse_preprocessor_directive(void)
1098 switch(pp_token.type) {
1100 parse_preprocessor_identifier();
1103 parse_line_directive();
1106 parse_error("invalid preprocessor directive");
1107 eat_until_newline();
/* Helper macros for multi-character operators in lexer_next_preprocessing_token(): MAYBE(ch, set_type) and ELSE(set_type) both assign set_type to lexer_token.type, and ELSE_CODE(code) closes a while(1) loop. Most of the macro bodies are not part of this excerpt. */
1112 #define MAYBE_PROLOG \
1117 #define MAYBE(ch, set_type) \
1120 lexer_token.type = set_type; \
1123 #define ELSE_CODE(code) \
1127 } /* end of while(1) */ \
1130 #define ELSE(set_type) \
1132 lexer_token.type = set_type; \
/* Lexes the next preprocessing token into lexer_token. Newlines are
 * delivered as '\n' tokens. String and character literals go to their
 * parse_* functions. Operators are recognised with the MAYBE / ELSE macros.
 * Other characters become tokens whose type is the character itself. The end
 * of the input yields T_EOF and an unknown character is reported and yields
 * T_ERROR. The case labels of the dispatch are not part of this excerpt. */
1136 void lexer_next_preprocessing_token(void)
1146 lexer_token.type = '\n';
1152 /* might be a wide string ( L"string" ) */
1153 if(lexer_token.type == T_IDENTIFIER &&
1154 lexer_token.v.symbol == symbol_L) {
1156 parse_wide_string_literal();
1157 } else if(c == '\'') {
1158 parse_wide_character_constant();
1168 parse_string_literal();
1172 parse_character_constant();
1194 MAYBE('.', T_DOTDOTDOT)
1198 lexer_token.type = '.';
1204 MAYBE('&', T_ANDAND)
1205 MAYBE('=', T_ANDEQUAL)
1209 MAYBE('=', T_ASTERISKEQUAL)
1213 MAYBE('+', T_PLUSPLUS)
1214 MAYBE('=', T_PLUSEQUAL)
1218 MAYBE('>', T_MINUSGREATER)
1219 MAYBE('-', T_MINUSMINUS)
1220 MAYBE('=', T_MINUSEQUAL)
1224 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
1228 MAYBE('=', T_SLASHEQUAL)
/* a comment produces no token: after skipping it the function calls itself
 * to deliver the token that follows */
1231 skip_multiline_comment();
1232 lexer_next_preprocessing_token();
1236 skip_line_comment();
1237 lexer_next_preprocessing_token();
1242 MAYBE('>', T_PERCENTGREATER)
1243 MAYBE('=', T_PERCENTEQUAL)
1248 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
1252 lexer_token.type = T_PERCENTCOLON;
1255 ELSE(T_PERCENTCOLON)
1259 MAYBE(':', T_LESSCOLON)
1260 MAYBE('%', T_LESSPERCENT)
1261 MAYBE('=', T_LESSEQUAL)
1264 MAYBE('=', T_LESSLESSEQUAL)
1269 MAYBE('=', T_GREATEREQUAL)
1272 MAYBE('=', T_GREATERGREATEREQUAL)
1273 ELSE(T_GREATERGREATER)
1277 MAYBE('=', T_CARETEQUAL)
1281 MAYBE('=', T_PIPEEQUAL)
1282 MAYBE('|', T_PIPEPIPE)
1286 MAYBE('>', T_COLONGREATER)
1290 MAYBE('=', T_EQUALEQUAL)
1294 MAYBE('#', T_HASHHASH)
/* single character tokens use the character itself as token type */
1308 lexer_token.type = c;
1313 lexer_token.type = T_EOF;
1318 errorf(lexer_token.source_position, "unknown character '%c' found\n", c);
1319 lexer_token.type = T_ERROR;
/* Delivers the next token to the parser. Newline tokens are not passed on:
 * after one or more of them the following token is examined, and a '#' in
 * that position starts a preprocessor directive. */
1325 void lexer_next_token(void)
1327 lexer_next_preprocessing_token();
1328 if(lexer_token.type != '\n')
1333 lexer_next_preprocessing_token();
1334 } while(lexer_token.type == '\n');
1336 if(lexer_token.type == '#') {
1337 parse_preprocessor_directive();
/* One-time lexer setup: initializes the string set stringset. */
1342 void init_lexer(void)
1344 strset_init(&stringset);
/* Prepares the lexer for a new input: the line counter restarts at 0, the
 * reported input name becomes input_name, and the symbol "L" (used to
 * recognise wide literals) is looked up.
 *
 * @param stream      the input to lex (its use is not part of this excerpt)
 * @param input_name  name used in source positions
 */
1347 void lexer_open_stream(FILE *stream, const char *input_name)
1350 lexer_token.source_position.linenr = 0;
1351 lexer_token.source_position.input_name = input_name;
1353 symbol_L = symbol_table_insert("L");
1357 /* place a virtual \n at the beginning so the lexer knows that we're
1358 * at the beginning of a line */
/* Lexer teardown: destroys the string set set up by init_lexer(). */
1362 void exit_lexer(void)
1364 strset_destroy(&stringset);
/* Prints a source position as "input_name:linenr" on stdout. The unused
 * attribute keeps the compiler quiet while no code calls it. */
1367 static __attribute__((unused))
1368 void dbg_pos(const source_position_t source_position)
1370 fprintf(stdout, "%s:%u\n", source_position.input_name,
1371 source_position.linenr);