5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
24 static char buf[1024 + MAX_PUTBACK];
25 static const char *bufend;
26 static const char *bufpos;
27 static strset_t stringset;
29 static type_t *type_int = NULL;
30 static type_t *type_uint = NULL;
31 static type_t *type_long = NULL;
32 static type_t *type_ulong = NULL;
33 static type_t *type_longlong = NULL;
34 static type_t *type_ulonglong = NULL;
35 static type_t *type_float = NULL;
36 static type_t *type_double = NULL;
37 static type_t *type_longdouble = NULL;
/**
 * Writes the "<input_name>:<linenr>: Error: " prefix to stderr.
 * No trailing newline is written; the caller prints the message itself.
 *
 * @param input_name  name of the input to mention in the prefix
 * @param linenr      line number to mention in the prefix
 */
static void error_prefix_at(const char *input_name, unsigned linenr)
{
	fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
}
44 static void error_prefix(void)
46 error_prefix_at(lexer_token.source_position.input_name,
47 lexer_token.source_position.linenr);
50 static void parse_error(const char *msg)
53 fprintf(stderr, "%s\n", msg);
56 static inline void next_real_char(void)
59 if(bufpos >= bufend) {
60 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
66 bufpos = buf + MAX_PUTBACK;
67 bufend = buf + MAX_PUTBACK + s;
72 static inline void put_back(int pc)
74 assert(bufpos >= buf);
75 //assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
77 char *p = buf + (bufpos - buf);
80 /* going backwards in the buffer is legal as long as it's not more often
85 printf("putback '%c'\n", pc);
89 static inline void next_char(void);
91 #define MATCH_NEWLINE(code) \
97 lexer_token.source_position.linenr++; \
101 lexer_token.source_position.linenr++; \
104 #define eat(c_type) do { assert(c == c_type); next_char(); } while(0)
106 static void maybe_concat_lines(void)
111 MATCH_NEWLINE(return;)
121 static inline void next_char(void)
125 /* filter trigraphs */
126 if(UNLIKELY(c == '\\')) {
127 maybe_concat_lines();
128 goto end_of_next_char;
132 goto end_of_next_char;
135 if(LIKELY(c != '?')) {
138 goto end_of_next_char;
143 case '=': c = '#'; break;
144 case '(': c = '['; break;
145 case '/': c = '\\'; maybe_concat_lines(); break;
146 case ')': c = ']'; break;
147 case '\'': c = '^'; break;
148 case '<': c = '{'; break;
149 case '!': c = '|'; break;
150 case '>': c = '}'; break;
151 case '-': c = '~'; break;
161 printf("nchar '%c'\n", c);
165 #define SYMBOL_CHARS \
232 static void parse_symbol(void)
237 obstack_1grow(&symbol_obstack, c);
244 obstack_1grow(&symbol_obstack, c);
254 obstack_1grow(&symbol_obstack, '\0');
256 string = obstack_finish(&symbol_obstack);
257 symbol = symbol_table_insert(string);
259 lexer_token.type = symbol->ID;
260 lexer_token.v.symbol = symbol;
262 if(symbol->string != string) {
263 obstack_free(&symbol_obstack, string);
267 static void parse_integer_suffix(void)
269 if(c == 'U' || c == 'U') {
271 if(c == 'L' || c == 'l') {
273 if(c == 'L' || c == 'l') {
275 lexer_token.datatype = type_ulonglong;
277 lexer_token.datatype = type_ulong;
280 lexer_token.datatype = type_uint;
282 } else if(c == 'l' || c == 'L') {
284 if(c == 'l' || c == 'L') {
286 if(c == 'u' || c == 'U') {
288 lexer_token.datatype = type_ulonglong;
290 lexer_token.datatype = type_longlong;
292 } else if(c == 'u' || c == 'U') {
294 lexer_token.datatype = type_ulong;
296 lexer_token.datatype = type_int;
299 lexer_token.datatype = type_int;
303 static void parse_floating_suffix(void)
306 /* TODO: do something useful with the suffixes... */
310 lexer_token.datatype = type_float;
315 lexer_token.datatype = type_longdouble;
318 lexer_token.datatype = type_double;
/**
 * Tells whether a character is a hexadecimal digit.
 *
 * @param c  character code to classify
 * @return   true for '0'..'9', 'a'..'f' and 'A'..'F', false otherwise
 */
static inline bool is_hex_digit(int c)
{
	if(c >= '0' && c <= '9') {
		return true;
	}
	if(c >= 'a' && c <= 'f') {
		return true;
	}
	return c >= 'A' && c <= 'F';
}
330 static void parse_number_hex(void)
332 assert(c == 'x' || c == 'X');
335 while(is_hex_digit(c)) {
336 obstack_1grow(&symbol_obstack, c);
339 obstack_1grow(&symbol_obstack, '\0');
340 char *string = obstack_finish(&symbol_obstack);
342 if(c == '.' || c == 'p' || c == 'P') {
344 panic("Hex floating point numbers not implemented yet");
346 if(*string == '\0') {
347 parse_error("invalid hex number");
348 lexer_token.type = T_ERROR;
352 lexer_token.type = T_INTEGER;
353 lexer_token.v.intvalue = strtoull(string, &endptr, 16);
354 if(*endptr != '\0') {
355 parse_error("hex number literal too long");
358 obstack_free(&symbol_obstack, string);
/**
 * Tells whether a character is an octal digit.
 *
 * @param chr  character code to classify
 * @return     true for '0'..'7', false otherwise
 */
static inline bool is_octal_digit(int chr)
{
	return !(chr < '0' || chr > '7');
}
366 static void parse_number_oct(void)
368 while(is_octal_digit(c)) {
369 obstack_1grow(&symbol_obstack, c);
372 obstack_1grow(&symbol_obstack, '\0');
373 char *string = obstack_finish(&symbol_obstack);
376 lexer_token.type = T_INTEGER;
377 lexer_token.v.intvalue = strtoull(string, &endptr, 8);
378 if(*endptr != '\0') {
379 parse_error("octal number literal too long");
382 obstack_free(&symbol_obstack, string);
383 parse_integer_suffix();
386 static void parse_number_dec(void)
388 bool is_float = false;
390 obstack_1grow(&symbol_obstack, c);
395 obstack_1grow(&symbol_obstack, '.');
399 obstack_1grow(&symbol_obstack, c);
404 if(c == 'e' || c == 'E') {
405 obstack_1grow(&symbol_obstack, 'e');
408 if(c == '-' || c == '+') {
409 obstack_1grow(&symbol_obstack, c);
414 obstack_1grow(&symbol_obstack, c);
420 obstack_1grow(&symbol_obstack, '\0');
421 char *string = obstack_finish(&symbol_obstack);
425 lexer_token.type = T_FLOATINGPOINT;
426 lexer_token.v.floatvalue = strtold(string, &endptr);
428 if(*endptr != '\0') {
429 parse_error("invalid number literal");
432 parse_floating_suffix();
434 lexer_token.type = T_INTEGER;
435 lexer_token.v.intvalue = strtoull(string, &endptr, 10);
437 if(*endptr != '\0') {
438 parse_error("invalid number literal");
441 parse_integer_suffix();
443 obstack_free(&symbol_obstack, string);
446 static void parse_number(void)
468 parse_error("invalid octal number");
469 lexer_token.type = T_ERROR;
475 obstack_1grow(&symbol_obstack, '0');
484 static int parse_octal_sequence(const int first_digit)
486 assert(is_octal_digit(first_digit));
487 int value = first_digit - '0';
488 if (!is_octal_digit(c)) return value;
489 value = 8 * value + c - '0';
491 if (!is_octal_digit(c)) return value;
492 value = 8 * value + c - '0';
497 static int parse_hex_sequence(void)
501 if (c >= '0' && c <= '9') {
502 value = 16 * value + c - '0';
503 } else if ('A' <= c && c <= 'F') {
504 value = 16 * value + c - 'A' + 10;
505 } else if ('a' <= c && c <= 'f') {
506 value = 16 * value + c - 'a' + 10;
516 static int parse_escape_sequence(void)
524 case '"': return '"';
525 case '\'': return '\'';
526 case '\\': return '\\';
527 case '?': return '\?';
528 case 'a': return '\a';
529 case 'b': return '\b';
530 case 'f': return '\f';
531 case 'n': return '\n';
532 case 'r': return '\r';
533 case 't': return '\t';
534 case 'v': return '\v';
536 return parse_hex_sequence();
545 return parse_octal_sequence(ec);
547 parse_error("reached end of file while parsing escape sequence");
550 parse_error("unknown escape sequence");
555 const char *concat_strings(const char *s1, const char *s2)
557 size_t len1 = strlen(s1);
558 size_t len2 = strlen(s2);
560 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
561 memcpy(concat, s1, len1);
562 memcpy(concat + len1, s2, len2 + 1);
564 const char *result = strset_insert(&stringset, concat);
565 if(result != concat) {
566 obstack_free(&symbol_obstack, concat);
572 static void parse_string_literal(void)
574 unsigned start_linenr = lexer_token.source_position.linenr;
585 tc = parse_escape_sequence();
586 obstack_1grow(&symbol_obstack, tc);
590 error_prefix_at(lexer_token.source_position.input_name,
592 fprintf(stderr, "string has no end\n");
593 lexer_token.type = T_ERROR;
601 obstack_1grow(&symbol_obstack, c);
609 /* TODO: concatenate multiple strings separated by whitespace... */
611 /* add finishing 0 to the string */
612 obstack_1grow(&symbol_obstack, '\0');
613 string = obstack_finish(&symbol_obstack);
615 /* check if there is already a copy of the string */
616 result = strset_insert(&stringset, string);
617 if(result != string) {
618 obstack_free(&symbol_obstack, string);
621 lexer_token.type = T_STRING_LITERAL;
622 lexer_token.v.string = result;
625 static void parse_character_constant(void)
633 found_char = parse_escape_sequence();
637 parse_error("newline while parsing character constant");
643 goto end_of_char_constant;
646 parse_error("EOF while parsing character constant");
647 lexer_token.type = T_ERROR;
651 if(found_char != 0) {
652 parse_error("more than 1 characters in character "
654 goto end_of_char_constant;
663 end_of_char_constant:
664 lexer_token.type = T_INTEGER;
665 lexer_token.v.intvalue = found_char;
668 static void skip_multiline_comment(void)
670 unsigned start_linenr = lexer_token.source_position.linenr;
682 MATCH_NEWLINE(break;)
685 error_prefix_at(lexer_token.source_position.input_name,
687 fprintf(stderr, "at end of file while looking for comment end\n");
697 static void skip_line_comment(void)
715 static token_t pp_token;
717 static inline void next_pp_token(void)
719 lexer_next_preprocessing_token();
720 pp_token = lexer_token;
723 static void eat_until_newline(void)
725 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
730 static void error_directive(void)
733 fprintf(stderr, "#error directive: \n");
735 /* parse pp-tokens until new-line */
738 static void define_directive(void)
740 lexer_next_preprocessing_token();
741 if(lexer_token.type != T_IDENTIFIER) {
742 parse_error("expected identifier after #define\n");
747 static void ifdef_directive(int is_ifndef)
750 lexer_next_preprocessing_token();
751 //expect_identifier();
755 static void endif_directive(void)
760 static void parse_line_directive(void)
762 if(pp_token.type != T_INTEGER) {
763 parse_error("expected integer");
765 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
768 if(pp_token.type == T_STRING_LITERAL) {
769 lexer_token.source_position.input_name = pp_token.v.string;
776 static void parse_preprocessor_identifier(void)
778 assert(pp_token.type == T_IDENTIFIER);
779 symbol_t *symbol = pp_token.v.symbol;
781 switch(symbol->pp_ID) {
783 printf("include - enable header name parsing!\n");
799 parse_line_directive();
813 static void parse_preprocessor_directive(void)
817 switch(pp_token.type) {
819 parse_preprocessor_identifier();
822 parse_line_directive();
825 parse_error("invalid preprocessor directive");
831 #define MAYBE_PROLOG \
836 #define MAYBE(ch, set_type) \
839 lexer_token.type = set_type; \
842 #define ELSE_CODE(code) \
846 } /* end of while(1) */ \
849 #define ELSE(set_type) \
851 lexer_token.type = set_type; \
855 void lexer_next_preprocessing_token(void)
865 lexer_token.type = '\n';
871 /* might be a wide string ( L"string" ) */
872 if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
873 lexer_token.v.symbol == symbol_L)) {
874 parse_string_literal();
884 parse_string_literal();
888 parse_character_constant();
895 MAYBE('.', T_DOTDOTDOT)
899 lexer_token.type = '.';
906 MAYBE('=', T_ANDEQUAL)
910 MAYBE('=', T_ASTERISKEQUAL)
914 MAYBE('+', T_PLUSPLUS)
915 MAYBE('=', T_PLUSEQUAL)
919 MAYBE('>', T_MINUSGREATER)
920 MAYBE('-', T_MINUSMINUS)
921 MAYBE('=', T_MINUSEQUAL)
925 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
929 MAYBE('=', T_SLASHEQUAL)
932 skip_multiline_comment();
933 lexer_next_preprocessing_token();
938 lexer_next_preprocessing_token();
943 MAYBE('>', T_PERCENTGREATER)
944 MAYBE('=', T_PERCENTEQUAL)
949 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
953 lexer_token.type = T_PERCENTCOLON;
960 MAYBE(':', T_LESSCOLON)
961 MAYBE('%', T_LESSPERCENT)
962 MAYBE('=', T_LESSEQUAL)
965 MAYBE('=', T_LESSLESSEQUAL)
970 MAYBE('=', T_GREATEREQUAL)
973 MAYBE('=', T_GREATERGREATEREQUAL)
974 ELSE(T_GREATERGREATER)
978 MAYBE('=', T_CARETEQUAL)
982 MAYBE('=', T_PIPEEQUAL)
983 MAYBE('|', T_PIPEPIPE)
987 MAYBE('>', T_COLONGREATER)
991 MAYBE('=', T_EQUALEQUAL)
995 MAYBE('#', T_HASHHASH)
1009 lexer_token.type = c;
1014 lexer_token.type = T_EOF;
1020 fprintf(stderr, "unknown character '%c' found\n", c);
1021 lexer_token.type = T_ERROR;
1027 void lexer_next_token(void)
1029 lexer_next_preprocessing_token();
1030 if(lexer_token.type != '\n')
1035 lexer_next_preprocessing_token();
1036 } while(lexer_token.type == '\n');
1038 if(lexer_token.type == '#') {
1039 parse_preprocessor_directive();
1044 void init_lexer(void)
1046 strset_init(&stringset);
1048 type_int = make_atomic_type(ATOMIC_TYPE_INT, TYPE_QUALIFIER_CONST);
1049 type_uint = make_atomic_type(ATOMIC_TYPE_UINT, TYPE_QUALIFIER_CONST);
1050 type_long = make_atomic_type(ATOMIC_TYPE_LONG, TYPE_QUALIFIER_CONST);
1051 type_ulong = make_atomic_type(ATOMIC_TYPE_ULONG, TYPE_QUALIFIER_CONST);
1052 type_longlong = make_atomic_type(ATOMIC_TYPE_LONGLONG,
1053 TYPE_QUALIFIER_CONST);
1054 type_ulonglong = make_atomic_type(ATOMIC_TYPE_ULONGLONG,
1055 TYPE_QUALIFIER_CONST);
1057 type_float = make_atomic_type(ATOMIC_TYPE_FLOAT, TYPE_QUALIFIER_CONST);
1058 type_double = make_atomic_type(ATOMIC_TYPE_DOUBLE,
1059 TYPE_QUALIFIER_CONST);
1060 type_longdouble = make_atomic_type(ATOMIC_TYPE_LONG_DOUBLE,
1061 TYPE_QUALIFIER_CONST);
1064 void lexer_open_stream(FILE *stream, const char *input_name)
1067 lexer_token.source_position.linenr = 0;
1068 lexer_token.source_position.input_name = input_name;
1070 symbol_L = symbol_table_insert("L");
1072 /* place a virtual \n at the beginning so the lexer knows that we're
1073 * at the beginning of a line */
1077 void exit_lexer(void)
1079 strset_destroy(&stringset);
1082 static __attribute__((unused))
1083 void dbg_pos(const source_position_t source_position)
1085 fprintf(stdout, "%s:%d\n", source_position.input_name,
1086 source_position.linenr);