5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
/* Input buffer. Data read from the stream is placed at buf + MAX_PUTBACK,
 * so the first MAX_PUTBACK bytes stay free in front of it. */
24 static char buf[1024 + MAX_PUTBACK];
/* One past the last valid byte currently held in buf. */
25 static const char *bufend;
/* Current read position inside buf. */
26 static const char *bufpos;
/* Set of interned strings; string literals are inserted here. */
27 static strset_t stringset;
/* Types assigned to numeric literal tokens; created in init_lexer(). */
29 static type_t *type_int = NULL;
30 static type_t *type_uint = NULL;
31 static type_t *type_long = NULL;
32 static type_t *type_ulong = NULL;
33 static type_t *type_longlong = NULL;
34 static type_t *type_ulonglong = NULL;
35 static type_t *type_float = NULL;
36 static type_t *type_double = NULL;
37 static type_t *type_longdouble = NULL;
/* Writes the diagnostic prefix "<input_name>:<linenr>: Error: " to stderr.
 * No newline is emitted; the message text is printed by the caller. */
39 static void error_prefix_at(const char *input_name, unsigned linenr)
41 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
/* Emits the diagnostic prefix for the source position stored in the current
 * lexer_token. */
44 static void error_prefix(void)
46 error_prefix_at(lexer_token.source_position.input_name,
47 lexer_token.source_position.linenr);
/* Reports a lexer error on stderr.
 * msg is printed followed by a newline, so messages passed in should not
 * end with a "\n" of their own. */
50 static void parse_error(const char *msg)
53 fprintf(stderr, "%s\n", msg);
/* Low-level character fetch. When the read position has reached the end of
 * the buffered data, the buffer is refilled with fread() and bufpos/bufend
 * are reset to delimit the newly read bytes. */
56 static inline void next_real_char(void)
59 if(bufpos >= bufend) {
/* read behind the first MAX_PUTBACK bytes, which are left untouched
 * (presumably reserved for put_back() - confirm) */
60 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
66 bufpos = buf + MAX_PUTBACK;
67 bufend = buf + MAX_PUTBACK + s;
72 static inline void put_back(int pc)
74 assert(bufpos >= buf);
75 //assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
77 char *p = buf + (bufpos - buf);
80 /* going backwards in the buffer is legal as long as it's not more often
85 printf("putback '%c'\n", pc);
89 static inline void next_char(void);
91 #define MATCH_NEWLINE(code) \
97 lexer_token.source_position.linenr++; \
101 lexer_token.source_position.linenr++; \
104 #define eat(c_type) do { assert(c == c_type); next_char(); } while(0)
106 static void maybe_concat_lines(void)
111 MATCH_NEWLINE(return;)
/* Fetches the next input character into c and applies two translations:
 *  - a backslash is handed to maybe_concat_lines();
 *  - a trigraph introduced by '?' is replaced by the character it stands
 *    for, using the table in the switch below ('=' -> '#', '(' -> '[', ...).
 * The trigraph for a backslash ('/') also goes through
 * maybe_concat_lines(), like a literal backslash. */
121 static inline void next_char(void)
125 /* filter trigraphs */
126 if(UNLIKELY(c == '\\')) {
127 maybe_concat_lines();
128 goto end_of_next_char;
132 goto end_of_next_char;
/* fast path: anything that is not '?' cannot start a trigraph */
135 if(LIKELY(c != '?')) {
138 goto end_of_next_char;
143 case '=': c = '#'; break;
144 case '(': c = '['; break;
145 case '/': c = '\\'; maybe_concat_lines(); break;
146 case ')': c = ']'; break;
147 case '\'': c = '^'; break;
148 case '<': c = '{'; break;
149 case '!': c = '|'; break;
150 case '>': c = '}'; break;
151 case '-': c = '~'; break;
/* trace output of the character that was produced */
161 printf("nchar '%c'\n", c);
165 #define SYMBOL_CHARS \
/* Lexes a symbol. Its characters are collected on symbol_obstack, the
 * NUL-terminated text is looked up or entered with symbol_table_insert(),
 * and lexer_token receives the symbol (v.symbol) with the symbol's ID as
 * token type. */
232 static void parse_symbol(void)
237 obstack_1grow(&symbol_obstack, c);
244 obstack_1grow(&symbol_obstack, c);
254 obstack_1grow(&symbol_obstack, '\0');
256 string = obstack_finish(&symbol_obstack);
257 symbol = symbol_table_insert(string);
259 lexer_token.type = symbol->ID;
260 lexer_token.v.symbol = symbol;
/* the table's symbol does not reference the string just built (presumably
 * because the symbol already existed), so the temporary copy is released */
262 if(symbol->string != string) {
263 obstack_free(&symbol_obstack, string);
/* Parses an optional integer-literal suffix starting at the current
 * character c and records the literal's type in lexer_token.datatype:
 *
 *   u            -> type_uint
 *   ul, lu       -> type_ulong
 *   ull, llu     -> type_ulonglong
 *   l            -> type_long
 *   ll           -> type_longlong
 *   (no suffix)  -> type_int
 *
 * Upper and lower case are accepted for every suffix letter. */
267 static void parse_integer_suffix(void)
269 if(c == 'U' || c == 'u') {
271 if(c == 'L' || c == 'l') {
273 if(c == 'L' || c == 'l') {
275 lexer_token.datatype = type_ulonglong;
277 lexer_token.datatype = type_ulong;
280 lexer_token.datatype = type_uint;
282 } else if(c == 'l' || c == 'L') {
284 if(c == 'l' || c == 'L') {
286 if(c == 'u' || c == 'U') {
288 lexer_token.datatype = type_ulonglong;
290 lexer_token.datatype = type_longlong;
292 } else if(c == 'u' || c == 'U') {
294 lexer_token.datatype = type_ulong;
/* a lone 'l'/'L' suffix makes the literal long, not int */
296 lexer_token.datatype = type_long;
299 lexer_token.datatype = type_int;
/* Sets lexer_token.datatype for a floating-point literal to type_float,
 * type_longdouble or type_double.
 * NOTE(review): presumably selected by an f/F or l/L suffix, with double as
 * the default - confirm against the conditions in the body. */
303 static void parse_floating_suffix(void)
306 /* TODO: do something useful with the suffixes... */
310 lexer_token.datatype = type_float;
315 lexer_token.datatype = type_longdouble;
318 lexer_token.datatype = type_double;
/* Returns true if c is one of the characters 0-9, a-f or A-F. */
323 static inline bool is_hex_digit(int c)
325 return (c >= '0' && c <= '9')
326 || (c >= 'a' && c <= 'f')
327 || (c >= 'A' && c <= 'F');
/* Lexes a hexadecimal integer literal; on entry c must be the 'x' or 'X' of
 * the prefix. The hex digits are collected on symbol_obstack, converted with
 * strtoull(..., 16) into lexer_token.v.intvalue, and the token type becomes
 * T_INTEGER; a type suffix is then handled by parse_integer_suffix().
 * An empty digit sequence is reported as "invalid hex number" (T_ERROR).
 * Hex floating-point literals ('.', 'p' or 'P' after the digits) are not
 * implemented and end in panic(). */
330 static void parse_number_hex(void)
332 assert(c == 'x' || c == 'X');
335 while(is_hex_digit(c)) {
336 obstack_1grow(&symbol_obstack, c);
339 obstack_1grow(&symbol_obstack, '\0');
340 char *string = obstack_finish(&symbol_obstack);
342 if(c == '.' || c == 'p' || c == 'P') {
344 panic("Hex floating point numbers not implemented yet");
346 if(*string == '\0') {
347 parse_error("invalid hex number");
348 lexer_token.type = T_ERROR;
352 lexer_token.type = T_INTEGER;
353 lexer_token.v.intvalue = strtoull(string, &endptr, 16);
/* NOTE(review): string holds hex digits only, so strtoull() consumes all of
 * it; an out-of-range value is signalled through errno (ERANGE), not
 * through endptr - this check presumably never fires. */
354 if(*endptr != '\0') {
355 parse_error("hex number literal too long");
/* the digit text was only needed for the conversion */
358 obstack_free(&symbol_obstack, string);
359 parse_integer_suffix();
/* Returns true if chr is one of the characters '0' to '7'. */
362 static inline bool is_octal_digit(int chr)
364 return '0' <= chr && chr <= '7';
/* Lexes an octal integer literal. The octal digits starting at c are
 * collected on symbol_obstack and converted with strtoull(..., 8) into
 * lexer_token.v.intvalue; the token type becomes T_INTEGER and a type suffix
 * is handled by parse_integer_suffix(). */
367 static void parse_number_oct(void)
369 while(is_octal_digit(c)) {
370 obstack_1grow(&symbol_obstack, c);
373 obstack_1grow(&symbol_obstack, '\0');
374 char *string = obstack_finish(&symbol_obstack);
377 lexer_token.type = T_INTEGER;
378 lexer_token.v.intvalue = strtoull(string, &endptr, 8);
/* NOTE(review): string holds octal digits only, so endptr always reaches
 * the terminator; overflow would have to be detected via errno (ERANGE). */
379 if(*endptr != '\0') {
380 parse_error("octal number literal too long");
/* the digit text was only needed for the conversion */
383 obstack_free(&symbol_obstack, string);
384 parse_integer_suffix();
/* Lexes a decimal number, either an integer or a floating-point literal.
 * The text is collected on symbol_obstack: digits, optionally a '.' with
 * further digits, and optionally an exponent (an 'e' or 'E', always stored
 * as 'e', with an optional sign and digits).
 * A floating-point literal (presumably when is_float has been set - confirm)
 * is converted with strtold() into a T_FLOATINGPOINT token and its suffix is
 * handled by parse_floating_suffix(); otherwise the text is converted with
 * strtoull(..., 10) into a T_INTEGER token followed by
 * parse_integer_suffix(). Text the conversion did not consume is reported
 * as "invalid number literal". */
387 static void parse_number_dec(void)
389 bool is_float = false;
391 obstack_1grow(&symbol_obstack, c);
396 obstack_1grow(&symbol_obstack, '.');
400 obstack_1grow(&symbol_obstack, c);
405 if(c == 'e' || c == 'E') {
406 obstack_1grow(&symbol_obstack, 'e');
409 if(c == '-' || c == '+') {
410 obstack_1grow(&symbol_obstack, c);
415 obstack_1grow(&symbol_obstack, c);
421 obstack_1grow(&symbol_obstack, '\0');
422 char *string = obstack_finish(&symbol_obstack);
426 lexer_token.type = T_FLOATINGPOINT;
427 lexer_token.v.floatvalue = strtold(string, &endptr);
429 if(*endptr != '\0') {
430 parse_error("invalid number literal");
433 parse_floating_suffix();
435 lexer_token.type = T_INTEGER;
436 lexer_token.v.intvalue = strtoull(string, &endptr, 10);
438 if(*endptr != '\0') {
439 parse_error("invalid number literal");
442 parse_integer_suffix();
/* the literal's text was only needed for the conversion */
444 obstack_free(&symbol_obstack, string);
447 static void parse_number(void)
469 parse_error("invalid octal number");
470 lexer_token.type = T_ERROR;
476 obstack_1grow(&symbol_obstack, '0');
/* Evaluates an octal escape sequence and returns its numeric value.
 * first_digit is the first octal digit of the sequence (asserted); further
 * octal digits are taken from the current character c, and the value
 * accumulated so far is returned as soon as c is not an octal digit. */
485 static int parse_octal_sequence(const int first_digit)
487 assert(is_octal_digit(first_digit));
488 int value = first_digit - '0';
489 if (!is_octal_digit(c)) return value;
490 value = 8 * value + c - '0';
492 if (!is_octal_digit(c)) return value;
493 value = 8 * value + c - '0';
/* Evaluates the digits of a hexadecimal escape sequence: each digit found in
 * c (0-9, A-F or a-f) is folded into a base-16 value.
 * NOTE(review): presumably returns the accumulated value once c is no longer
 * a hex digit - confirm. */
498 static int parse_hex_sequence(void)
502 if (c >= '0' && c <= '9') {
503 value = 16 * value + c - '0';
504 } else if ('A' <= c && c <= 'F') {
505 value = 16 * value + c - 'A' + 10;
506 } else if ('a' <= c && c <= 'f') {
507 value = 16 * value + c - 'a' + 10;
/* Decodes one escape sequence and returns the character value it denotes.
 * The simple escapes listed below map directly to their characters, a
 * hexadecimal escape is evaluated by parse_hex_sequence() and an octal one
 * by parse_octal_sequence(). Reaching end of file inside the sequence and
 * unknown escape characters are reported through parse_error(). */
517 static int parse_escape_sequence(void)
525 case '"': return '"';
526 case '\'': return '\'';
527 case '\\': return '\\';
528 case '?': return '\?';
529 case 'a': return '\a';
530 case 'b': return '\b';
531 case 'f': return '\f';
532 case 'n': return '\n';
533 case 'r': return '\r';
534 case 't': return '\t';
535 case 'v': return '\v';
537 return parse_hex_sequence();
546 return parse_octal_sequence(ec);
548 parse_error("reached end of file while parsing escape sequence");
551 parse_error("unknown escape sequence");
/* Builds the concatenation of the NUL-terminated strings s1 and s2 on
 * symbol_obstack and inserts it into stringset.
 * NOTE(review): presumably returns the pointer handed back by
 * strset_insert() - confirm. */
556 const char *concat_strings(const char *s1, const char *s2)
558 size_t len1 = strlen(s1);
559 size_t len2 = strlen(s2);
/* room for both strings plus one terminating NUL */
561 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
562 memcpy(concat, s1, len1);
/* len2 + 1 also copies the terminator of s2 */
563 memcpy(concat + len1, s2, len2 + 1);
565 const char *result = strset_insert(&stringset, concat);
/* the set returned a different pointer, so the new copy is not needed */
566 if(result != concat) {
567 obstack_free(&symbol_obstack, concat);
/* Lexes a string literal into lexer_token.
 * The literal's characters are collected on symbol_obstack, with escape
 * sequences decoded by parse_escape_sequence(). The finished, NUL-terminated
 * text is inserted into stringset; when the set already holds a copy, the
 * new one is freed and the existing pointer is used. On success the token
 * has type T_STRING_LITERAL and v.string points to the interned text.
 * A literal without an end is reported as "string has no end" and yields
 * T_ERROR. */
573 static void parse_string_literal(void)
/* remembered at the start of the literal */
575 unsigned start_linenr = lexer_token.source_position.linenr;
586 tc = parse_escape_sequence();
587 obstack_1grow(&symbol_obstack, tc);
591 error_prefix_at(lexer_token.source_position.input_name,
593 fprintf(stderr, "string has no end\n");
594 lexer_token.type = T_ERROR;
602 obstack_1grow(&symbol_obstack, c);
610 /* TODO: concatenate multiple strings separated by whitespace... */
612 /* add finishing 0 to the string */
613 obstack_1grow(&symbol_obstack, '\0');
614 string = obstack_finish(&symbol_obstack);
616 /* check if there is already a copy of the string */
617 result = strset_insert(&stringset, string);
618 if(result != string) {
619 obstack_free(&symbol_obstack, string);
622 lexer_token.type = T_STRING_LITERAL;
623 lexer_token.v.string = result;
/* Lexes a character constant. The result is delivered as a T_INTEGER token
 * whose v.intvalue is the value of the character (found_char); escape
 * sequences are decoded by parse_escape_sequence().
 * Diagnostics: a newline inside the constant, end of file inside the
 * constant (token type T_ERROR) and more than one character. */
626 static void parse_character_constant(void)
634 found_char = parse_escape_sequence();
638 parse_error("newline while parsing character constant");
644 goto end_of_char_constant;
647 parse_error("EOF while parsing character constant");
648 lexer_token.type = T_ERROR;
/* a non-zero found_char means a character has already been stored */
652 if(found_char != 0) {
653 parse_error("more than 1 characters in character "
655 goto end_of_char_constant;
664 end_of_char_constant:
665 lexer_token.type = T_INTEGER;
666 lexer_token.v.intvalue = found_char;
/* Skips a multi-line comment. Newlines inside the comment go through
 * MATCH_NEWLINE; reaching end of file before the comment ends is reported
 * on stderr. */
669 static void skip_multiline_comment(void)
/* remembered at the start of the comment */
671 unsigned start_linenr = lexer_token.source_position.linenr;
683 MATCH_NEWLINE(break;)
686 error_prefix_at(lexer_token.source_position.input_name,
688 fprintf(stderr, "at end of file while looking for comment end\n");
698 static void skip_line_comment(void)
716 static token_t pp_token;
/* Fetches the next preprocessing token and copies it into pp_token. */
718 static inline void next_pp_token(void)
720 lexer_next_preprocessing_token();
721 pp_token = lexer_token;
/* Loops until pp_token is a newline token or T_EOF. */
724 static void eat_until_newline(void)
726 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
/* Handles an #error directive by printing "#error directive: " to stderr. */
731 static void error_directive(void)
734 fprintf(stderr, "#error directive: \n");
736 /* parse pp-tokens until new-line */
/* Handles a #define directive: the token following the directive name must
 * be an identifier, otherwise an error is reported. */
739 static void define_directive(void)
741 lexer_next_preprocessing_token();
742 if(lexer_token.type != T_IDENTIFIER) {
/* no trailing newline in the message: parse_error() appends one itself */
743 parse_error("expected identifier after #define");
748 static void ifdef_directive(int is_ifndef)
751 lexer_next_preprocessing_token();
752 //expect_identifier();
756 static void endif_directive(void)
/* Handles a line directive. pp_token must be a T_INTEGER holding the new
 * line number, otherwise "expected integer" is reported. If a string
 * literal token is seen as well, it becomes the new input name of the
 * source position. */
761 static void parse_line_directive(void)
763 if(pp_token.type != T_INTEGER) {
764 parse_error("expected integer");
/* stored one lower than requested (presumably because the newline ending
 * the directive increments the counter again - confirm) */
766 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
769 if(pp_token.type == T_STRING_LITERAL) {
770 lexer_token.source_position.input_name = pp_token.v.string;
777 static void parse_preprocessor_identifier(void)
779 assert(pp_token.type == T_IDENTIFIER);
780 symbol_t *symbol = pp_token.v.symbol;
782 switch(symbol->pp_ID) {
784 printf("include - enable header name parsing!\n");
800 parse_line_directive();
/* Dispatches a preprocessor directive on the type of pp_token: one case goes
 * to parse_preprocessor_identifier(), another to parse_line_directive(), and
 * a third reports "invalid preprocessor directive". */
814 static void parse_preprocessor_directive(void)
818 switch(pp_token.type) {
820 parse_preprocessor_identifier();
823 parse_line_directive();
826 parse_error("invalid preprocessor directive");
832 #define MAYBE_PROLOG \
837 #define MAYBE(ch, set_type) \
840 lexer_token.type = set_type; \
843 #define ELSE_CODE(code) \
847 } /* end of while(1) */ \
850 #define ELSE(set_type) \
852 lexer_token.type = set_type; \
/* Scans the next preprocessing token from the input into lexer_token.
 *  - A newline is reported as a token of type '\n'.
 *  - String literals and character constants are delegated to
 *    parse_string_literal() and parse_character_constant(); a '"' that
 *    directly follows the identifier L is lexed as a (wide) string literal.
 *  - Multi-character operators are recognised through the MAYBE/ELSE macro
 *    chains, which set the matching T_... token type.
 *  - After a multi-line comment has been skipped the function calls itself
 *    again to deliver the token that follows.
 *  - Some characters are their own token type (lexer_token.type = c).
 *  - End of input yields T_EOF; an unknown character is reported on stderr
 *    and yields T_ERROR. */
856 void lexer_next_preprocessing_token(void)
866 lexer_token.type = '\n';
872 /* might be a wide string ( L"string" ) */
873 if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
874 lexer_token.v.symbol == symbol_L)) {
875 parse_string_literal();
885 parse_string_literal();
889 parse_character_constant();
896 MAYBE('.', T_DOTDOTDOT)
900 lexer_token.type = '.';
907 MAYBE('=', T_ANDEQUAL)
911 MAYBE('=', T_ASTERISKEQUAL)
915 MAYBE('+', T_PLUSPLUS)
916 MAYBE('=', T_PLUSEQUAL)
920 MAYBE('>', T_MINUSGREATER)
921 MAYBE('-', T_MINUSMINUS)
922 MAYBE('=', T_MINUSEQUAL)
926 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
930 MAYBE('=', T_SLASHEQUAL)
933 skip_multiline_comment();
934 lexer_next_preprocessing_token();
939 lexer_next_preprocessing_token();
944 MAYBE('>', T_PERCENTGREATER)
945 MAYBE('=', T_PERCENTEQUAL)
950 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
954 lexer_token.type = T_PERCENTCOLON;
961 MAYBE(':', T_LESSCOLON)
962 MAYBE('%', T_LESSPERCENT)
963 MAYBE('=', T_LESSEQUAL)
966 MAYBE('=', T_LESSLESSEQUAL)
971 MAYBE('=', T_GREATEREQUAL)
974 MAYBE('=', T_GREATERGREATEREQUAL)
975 ELSE(T_GREATERGREATER)
979 MAYBE('=', T_CARETEQUAL)
983 MAYBE('=', T_PIPEEQUAL)
984 MAYBE('|', T_PIPEPIPE)
988 MAYBE('>', T_COLONGREATER)
992 MAYBE('=', T_EQUALEQUAL)
996 MAYBE('#', T_HASHHASH)
1010 lexer_token.type = c;
1015 lexer_token.type = T_EOF;
1021 fprintf(stderr, "unknown character '%c' found\n", c);
1022 lexer_token.type = T_ERROR;
/* Fetches the next token via lexer_next_preprocessing_token(). A run of
 * newline tokens is skipped, and a '#' token found after such a run is
 * handed to parse_preprocessor_directive().
 * NOTE(review): a first token that is not a newline presumably returns
 * right away - confirm. */
1028 void lexer_next_token(void)
1030 lexer_next_preprocessing_token();
1031 if(lexer_token.type != '\n')
1036 lexer_next_preprocessing_token();
1037 } while(lexer_token.type == '\n');
1039 if(lexer_token.type == '#') {
1040 parse_preprocessor_directive();
/* Initialises the lexer's global state: the string set, and the atomic types
 * (all created with TYPE_QUALIFIER_CONST) that are assigned to integer and
 * floating-point literal tokens. */
1045 void init_lexer(void)
1047 strset_init(&stringset);
1049 type_int = make_atomic_type(ATOMIC_TYPE_INT, TYPE_QUALIFIER_CONST);
1050 type_uint = make_atomic_type(ATOMIC_TYPE_UINT, TYPE_QUALIFIER_CONST);
1051 type_long = make_atomic_type(ATOMIC_TYPE_LONG, TYPE_QUALIFIER_CONST);
1052 type_ulong = make_atomic_type(ATOMIC_TYPE_ULONG, TYPE_QUALIFIER_CONST);
1053 type_longlong = make_atomic_type(ATOMIC_TYPE_LONGLONG,
1054 TYPE_QUALIFIER_CONST);
1055 type_ulonglong = make_atomic_type(ATOMIC_TYPE_ULONGLONG,
1056 TYPE_QUALIFIER_CONST);
1058 type_float = make_atomic_type(ATOMIC_TYPE_FLOAT, TYPE_QUALIFIER_CONST);
1059 type_double = make_atomic_type(ATOMIC_TYPE_DOUBLE,
1060 TYPE_QUALIFIER_CONST);
1061 type_longdouble = make_atomic_type(ATOMIC_TYPE_LONG_DOUBLE,
1062 TYPE_QUALIFIER_CONST);
/* Prepares the lexer for reading from stream. The source position is reset
 * to line 0 of input_name, and the symbol for "L" (used to recognise wide
 * string literals) is looked up in the symbol table. */
1065 void lexer_open_stream(FILE *stream, const char *input_name)
1068 lexer_token.source_position.linenr = 0;
1069 lexer_token.source_position.input_name = input_name;
1071 symbol_L = symbol_table_insert("L");
1073 /* place a virtual \n at the beginning so the lexer knows that we're
1074 * at the beginning of a line */
/* Releases the string set that init_lexer() initialised. */
1078 void exit_lexer(void)
1080 strset_destroy(&stringset);
/* Debug helper: prints source_position as "<input_name>:<linenr>" to stdout.
 * Marked unused so that no warning is issued while nothing calls it.
 * NOTE(review): linenr is formatted with %d, whereas error_prefix_at() takes
 * the line number as unsigned and prints it with %u - confirm the field's
 * type and align the format specifier. */
1083 static __attribute__((unused))
1084 void dbg_pos(const source_position_t source_position)
1086 fprintf(stdout, "%s:%d\n", source_position.input_name,
1087 source_position.linenr);