5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
/* Input buffer: the first MAX_PUTBACK bytes stay free in front of the data
 * that fread() stores starting at buf + MAX_PUTBACK. */
22 static char buf[1024 + MAX_PUTBACK];
/* one past the last valid byte currently held in buf */
23 static const char *bufend;
/* current read position inside buf */
24 static const char *bufpos;
/* set of string contents, used to share identical string literals */
25 static strset_t stringset;
/* Writes the diagnostic prefix "<input_name>:<linenr>: Error: " to stderr
 * (no trailing newline). */
27 static void error_prefix_at(const char *input_name, unsigned linenr)
29 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
/* Writes the diagnostic prefix for the source position stored in
 * lexer_token. */
32 static void error_prefix(void)
34 error_prefix_at(lexer_token.source_position.input_name,
35 lexer_token.source_position.linenr);
/* Reports msg on stderr and terminates it with a newline, so callers should
 * pass messages without a trailing '\n'.
 * NOTE(review): statements not visible in this excerpt presumably print the
 * position prefix first -- confirm. */
38 static void parse_error(const char *msg)
41 fprintf(stderr, "%s\n", msg);
/* Low-level character fetch. When the read position has reached bufend the
 * buffer is refilled with fread(): new data is placed at buf + MAX_PUTBACK
 * and bufpos/bufend are reset to delimit the s bytes that were read. */
44 static inline void next_real_char(void)
47 if(bufpos >= bufend) {
48 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
54 bufpos = buf + MAX_PUTBACK;
55 bufend = buf + MAX_PUTBACK + s;
/* Presumably returns character pc to the input so it is read again.
 * The asserts require that bufpos still lies inside buf and that, once
 * bufpos is at or beyond buf + MAX_PUTBACK, the byte at bufpos already
 * equals pc. p addresses the same offset as bufpos but without the const
 * qualifier, so the buffer can be written through it. */
60 static inline void put_back(int pc)
62 assert(bufpos >= buf);
63 assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
65 char *p = buf + (bufpos - buf);
68 /* going backwards in the buffer is legal as long as it's not more often
73 printf("putback '%c'\n", pc);
77 static inline void next_char(void);
79 #define MATCH_NEWLINE(code) \
85 lexer_token.source_position.linenr++; \
89 lexer_token.source_position.linenr++; \
/* Asserts that the current character c equals c_type, then advances to the
 * next character. The argument is parenthesised so that an expression
 * argument cannot change the meaning of the comparison. */
92 #define eat(c_type) do { assert(c == (c_type)); next_char(); } while(0)
94 static void maybe_concat_lines(void)
99 MATCH_NEWLINE(return;)
/* Advances the lexer to the next input character, which is kept in c.
 * A backslash is handed to maybe_concat_lines(). The switch lists the
 * trigraph replacements ('=' -> '#', '(' -> '[', '/' -> '\\', ')' -> ']',
 * '\'' -> '^', '<' -> '{', '!' -> '|', '>' -> '}', '-' -> '~'); the
 * replacement backslash is again passed through maybe_concat_lines().
 * NOTE(review): the control flow between the visible statements is not shown
 * in this excerpt -- confirm against the full function. */
109 static inline void next_char(void)
114 /* filter trigraphs */
115 if(UNLIKELY(c == '\\')) {
116 maybe_concat_lines();
117 goto end_of_next_char;
121 goto end_of_next_char;
124 if(LIKELY(c != '?')) {
127 goto end_of_next_char;
132 case '=': c = '#'; break;
133 case '(': c = '['; break;
134 case '/': c = '\\'; maybe_concat_lines(); break;
135 case ')': c = ']'; break;
136 case '\'': c = '^'; break;
137 case '<': c = '{'; break;
138 case '!': c = '|'; break;
139 case '>': c = '}'; break;
140 case '-': c = '~'; break;
150 (void) maybe_concat_lines;
152 printf("nchar '%c'\n", c);
158 #define SYMBOL_CHARS \
/* Lexes an identifier or keyword. The characters are accumulated on
 * symbol_obstack, NUL-terminated and looked up with symbol_table_insert().
 * The token type is taken from the symbol's ID and the symbol itself is
 * stored in the token. If the symbol's string is not the freshly built one,
 * the temporary copy is released from the obstack again. */
225 static void parse_symbol(void)
230 obstack_1grow(&symbol_obstack, c);
237 obstack_1grow(&symbol_obstack, c);
247 obstack_1grow(&symbol_obstack, '\0');
249 string = obstack_finish(&symbol_obstack);
250 symbol = symbol_table_insert(string);
252 lexer_token.type = symbol->ID;
253 lexer_token.v.symbol = symbol;
255 if(symbol->string != string) {
256 obstack_free(&symbol_obstack, string);
/* Recognises the suffix of an integer literal: combinations of 'u'/'U' and
 * one or two 'l'/'L', in either order. Per the TODO below the suffix is not
 * yet evaluated.
 * The first test accepts both 'u' and 'U'; it previously compared against
 * 'U' twice, so a leading lowercase 'u' was never matched by this branch. */
260 static void parse_integer_suffix(void)
262 if(c == 'u' || c == 'U') {
263 /* TODO do something with the suffixes... */
265 if(c == 'L' || c == 'l') {
267 if(c == 'L' || c == 'l') {
271 } else if(c == 'l' || c == 'L') {
273 if(c == 'l' || c == 'L') {
275 if(c == 'u' || c == 'U') {
278 } else if(c == 'u' || c == 'U') {
/* Handles the suffix of a floating-point literal.
 * NOTE(review): the body is not visible in this excerpt; per the TODO the
 * suffix is not yet evaluated. */
284 static void parse_floating_suffix(void)
287 /* TODO: do something useful with the suffixes... */
/* Lexes a hexadecimal integer literal; on entry c is the 'x'/'X' of the
 * prefix. If no hex digit follows, "premature end of hex number literal" is
 * reported and the token becomes T_ERROR. Otherwise the digits are folded
 * into value (base 16), the integer suffix is consumed and the token becomes
 * T_INTEGER carrying value. Hexadecimal floating-point literals ('.', 'p',
 * 'P') are not implemented and end in panic(). */
299 static void parse_number_hex(void)
301 assert(c == 'x' || c == 'X');
305 !('A' <= c && c <= 'F') &&
306 !('a' <= c && c <= 'f')) {
307 parse_error("premature end of hex number literal");
308 lexer_token.type = T_ERROR;
315 value = 16 * value + c - '0';
316 } else if ('A' <= c && c <= 'F') {
317 value = 16 * value + c - 'A' + 10;
318 } else if ('a' <= c && c <= 'f') {
319 value = 16 * value + c - 'a' + 10;
321 parse_integer_suffix();
323 lexer_token.type = T_INTEGER;
324 lexer_token.v.intvalue = value;
330 if(c == '.' || c == 'p' || c == 'P') {
332 panic("Hex floating point numbers not implemented yet");
/* Lexes an octal integer literal: digits '0'..'7' are folded into value
 * (base 8). A following '8' or '9' is reported as "invalid octal number"
 * and yields T_ERROR; otherwise the token becomes T_INTEGER carrying value
 * and the integer suffix is consumed. */
336 static void parse_number_oct(void)
339 while(c >= '0' && c <= '7') {
340 value = 8 * value + c - '0';
343 if (c == '8' || c == '9') {
344 parse_error("invalid octal number");
345 lexer_token.type = T_ERROR;
349 lexer_token.type = T_INTEGER;
350 lexer_token.v.intvalue = value;
352 parse_integer_suffix();
/* Lexes the exponent part of a floating-point literal whose mantissa is
 * passed in value. The decimal exponent digits are accumulated in expo; the
 * token becomes T_FLOATINGPOINT carrying value and the floating suffix is
 * consumed.
 * NOTE(review): the sign handling and the way expo/factor scale value are in
 * lines not visible in this excerpt -- confirm. */
355 static void parse_floatingpoint_exponent(long double value)
357 unsigned int expo = 0;
358 long double factor = 10.;
363 } else if(c == '+') {
367 while(c >= '0' && c <= '9') {
368 expo = 10 * expo + (c - '0');
381 lexer_token.type = T_FLOATINGPOINT;
382 lexer_token.v.floatvalue = value;
384 parse_floating_suffix();
/* Lexes the fractional digits of a floating-point literal whose integer
 * part is integer_part. Each digit is added to value weighted by factor
 * (NOTE(review): the update of factor is not visible here). An 'e'/'E'
 * hands over to parse_floatingpoint_exponent(); otherwise the token becomes
 * T_FLOATINGPOINT carrying value and the floating suffix is consumed. */
387 static void parse_floatingpoint_fract(int integer_part)
389 long double value = integer_part;
390 long double factor = 1.;
392 while(c >= '0' && c <= '9') {
394 value += (c - '0') * factor;
398 if(c == 'e' || c == 'E') {
400 parse_floatingpoint_exponent(value);
404 lexer_token.type = T_FLOATINGPOINT;
405 lexer_token.v.floatvalue = value;
407 parse_floating_suffix();
/* Lexes a decimal literal. The digits are folded into value (base 10); the
 * literal then continues as a fraction (parse_floatingpoint_fract) or as an
 * exponent on 'e'/'E' (parse_floatingpoint_exponent). Otherwise the integer
 * suffix is consumed and the token becomes T_INTEGER carrying value. */
410 static void parse_number_dec(void)
415 value = 10 * value + c - '0';
421 parse_floatingpoint_fract(value);
424 if(c == 'e' || c == 'E') {
426 parse_floatingpoint_exponent(value);
429 parse_integer_suffix();
431 lexer_token.type = T_INTEGER;
432 lexer_token.v.intvalue = value;
435 static void parse_number(void)
456 parse_floatingpoint_fract(0);
460 parse_floatingpoint_exponent(0);
465 parse_error("invalid octal number");
466 lexer_token.type = T_ERROR;
/* Returns non-zero iff chr is one of the characters '0'..'7'. */
479 static inline int is_octal_digit(int chr)
481 return '0' <= chr && chr <= '7';
/* Evaluates an octal escape sequence of at most three digits. first_digit
 * is the digit already read; up to two further octal digits are taken from
 * c. The value gathered so far is returned as soon as c is not an octal
 * digit. */
484 static int parse_octal_sequence(const int first_digit)
486 assert(is_octal_digit(first_digit));
487 int value = first_digit - '0';
488 if (!is_octal_digit(c)) return value;
489 value = 8 * value + c - '0';
491 if (!is_octal_digit(c)) return value;
492 value = 8 * value + c - '0';
/* Evaluates the hexadecimal digits of an escape sequence: each digit
 * ('0'-'9', 'A'-'F', 'a'-'f') held in c is folded into value (base 16).
 * NOTE(review): loop bounds and the return statement are not visible in
 * this excerpt. */
497 static int parse_hex_sequence(void)
501 if (c >= '0' && c <= '9') {
502 value = 16 * value + c - '0';
503 } else if ('A' <= c && c <= 'F') {
504 value = 16 * value + c - 'A' + 10;
505 } else if ('a' <= c && c <= 'f') {
506 value = 16 * value + c - 'a' + 10;
/* Translates an escape sequence inside a string or character literal into
 * the character value it denotes. Simple escapes map directly to their
 * character; hexadecimal and octal escapes are delegated to
 * parse_hex_sequence() and parse_octal_sequence(). End of file and unknown
 * escapes are reported through parse_error().
 * NOTE(review): the values returned on the error paths are not visible in
 * this excerpt. */
516 static int parse_escape_sequence(void)
524 case '"': return '"';
525 case '\'': return '\'';
526 case '\\': return '\\';
527 case '?': return '\?';
528 case 'a': return '\a';
529 case 'b': return '\b';
530 case 'f': return '\f';
531 case 'n': return '\n';
532 case 'r': return '\r';
533 case 't': return '\t';
534 case 'v': return '\v';
536 return parse_hex_sequence();
545 return parse_octal_sequence(ec);
547 parse_error("reached end of file while parsing escape sequence");
550 parse_error("unknown escape sequence");
/* Builds the concatenation of the NUL-terminated strings s1 and s2.
 * The result is allocated on symbol_obstack (len1 + len2 + 1 bytes; the
 * terminator is copied along with s2) and then inserted into stringset. If
 * the set hands back a different pointer, the fresh copy is released from
 * the obstack again.
 * NOTE(review): the return statement is not visible here; presumably result
 * is returned. */
555 const char *concat_strings(const char *s1, const char *s2)
557 size_t len1 = strlen(s1);
558 size_t len2 = strlen(s2);
560 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
561 memcpy(concat, s1, len1);
562 memcpy(concat + len1, s2, len2 + 1);
564 const char *result = strset_insert(&stringset, concat);
565 if(result != concat) {
566 obstack_free(&symbol_obstack, concat);
/* Lexes a string literal. Characters (escape sequences already translated
 * by parse_escape_sequence()) are collected on symbol_obstack. If the
 * literal is not terminated, "string has no end" is reported and the token
 * becomes T_ERROR. Otherwise the string is NUL-terminated, de-duplicated
 * through stringset and delivered as a T_STRING_LITERAL token.
 * start_linenr remembers the line on which the literal began. */
572 static void parse_string_literal(void)
574 unsigned start_linenr = lexer_token.source_position.linenr;
585 tc = parse_escape_sequence();
586 obstack_1grow(&symbol_obstack, tc);
590 error_prefix_at(lexer_token.source_position.input_name,
592 fprintf(stderr, "string has no end\n");
593 lexer_token.type = T_ERROR;
601 obstack_1grow(&symbol_obstack, c);
609 /* TODO: concatenate multiple strings separated by whitespace... */
611 /* add finishing 0 to the string */
612 obstack_1grow(&symbol_obstack, '\0');
613 string = obstack_finish(&symbol_obstack);
615 /* check if there is already a copy of the string */
616 result = strset_insert(&stringset, string);
617 if(result != string) {
618 obstack_free(&symbol_obstack, string);
621 lexer_token.type = T_STRING_LITERAL;
622 lexer_token.v.string = result;
/* Lexes a character constant. The character (or translated escape
 * sequence) is kept in found_char. A newline inside the constant and a
 * second character are reported as errors; end of file is reported and
 * yields T_ERROR. On the regular path the constant is delivered as a
 * T_INTEGER token whose value is found_char. */
625 static void parse_character_constant(void)
633 found_char = parse_escape_sequence();
637 parse_error("newline while parsing character constant");
643 goto end_of_char_constant;
646 parse_error("EOF while parsing character constant");
647 lexer_token.type = T_ERROR;
651 if(found_char != 0) {
652 parse_error("more than 1 characters in character "
654 goto end_of_char_constant;
663 end_of_char_constant:
664 lexer_token.type = T_INTEGER;
665 lexer_token.v.intvalue = found_char;
/* Skips a multi-line comment. Newlines inside the comment go through
 * MATCH_NEWLINE. If the input ends first, "at end of file while looking for
 * comment end" is reported. start_linenr remembers the line on which the
 * comment began. */
668 static void skip_multiline_comment(void)
670 unsigned start_linenr = lexer_token.source_position.linenr;
682 MATCH_NEWLINE(break;)
685 error_prefix_at(lexer_token.source_position.input_name,
687 fprintf(stderr, "at end of file while looking for comment end\n");
697 static void skip_line_comment(void)
/* Token currently examined by the preprocessor-directive routines; a copy
 * of lexer_token taken by next_pp_token(). */
715 static token_t pp_token;
/* Reads the next preprocessing token and copies it from lexer_token into
 * pp_token. */
717 static inline void next_pp_token(void)
719 lexer_next_preprocessing_token();
720 pp_token = lexer_token;
/* Loops until pp_token is a newline token or T_EOF, i.e. to the end of the
 * current line.
 * NOTE(review): the loop body is not visible in this excerpt. */
723 static void eat_until_newline(void)
725 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
/* Handles an #error directive: prints "#error directive: " to stderr.
 * NOTE(review): handling of the remaining tokens on the line is not visible
 * in this excerpt. */
730 static void error_directive(void)
733 fprintf(stderr, "#error directive: \n");
735 /* parse pp-tokens until new-line */
/* Handles a #define directive: reads the next preprocessing token and
 * reports an error unless it is an identifier.
 * The message carries no trailing newline because parse_error() appends one
 * itself; the previous "\n" produced an empty line after the diagnostic. */
738 static void define_directive(void)
740 lexer_next_preprocessing_token();
741 if(lexer_token.type != T_IDENTIFIER) {
742 parse_error("expected identifier after #define");
747 static void ifdef_directive(int is_ifndef)
750 lexer_next_preprocessing_token();
751 //expect_identifier();
755 static void endif_directive(void)
/* Handles a line directive. The current pp_token must be an integer,
 * otherwise "expected integer" is reported. The line number of the lexer
 * position is set to that integer minus one (presumably because the newline
 * ending the directive increments it again -- confirm). If a string literal
 * follows, it becomes the new input name. */
760 static void parse_line_directive(void)
762 if(pp_token.type != T_INTEGER) {
763 parse_error("expected integer");
765 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
768 if(pp_token.type == T_STRING_LITERAL) {
769 lexer_token.source_position.input_name = pp_token.v.string;
/* Dispatches a directive whose name is the identifier in pp_token, based on
 * the pp_ID of its symbol. Among the visible cases, one only prints a
 * placeholder message for include handling and one forwards to
 * parse_line_directive().
 * NOTE(review): the case labels are not visible in this excerpt. */
776 static void parse_preprocessor_identifier(void)
778 assert(pp_token.type == T_IDENTIFIER);
779 symbol_t *symbol = pp_token.v.symbol;
781 switch(symbol->pp_ID) {
783 printf("include - enable header name parsing!\n");
799 parse_line_directive();
/* Handles a preprocessor directive by switching on the type of pp_token:
 * the visible branches forward to parse_preprocessor_identifier() and
 * parse_line_directive(); anything else is reported as "invalid
 * preprocessor directive".
 * NOTE(review): the case labels are not visible in this excerpt. */
813 static void parse_preprocessor_directive(void)
817 switch(pp_token.type) {
819 parse_preprocessor_identifier();
822 parse_line_directive();
825 parse_error("invalid preprocessor directive");
831 #define MAYBE_PROLOG \
836 #define MAYBE(ch, set_type) \
839 lexer_token.type = set_type; \
842 #define ELSE_CODE(code) \
846 } /* end of while(1) */ \
849 #define ELSE(set_type) \
851 lexer_token.type = set_type; \
/* Scans the next preprocessing token into lexer_token.
 * Newlines are delivered as '\n' tokens. String literals (including the
 * wide form L"...") and character constants are delegated to their helper
 * routines. Multi-character punctuators are recognised with the
 * MAYBE/ELSE macros. After a comment has been skipped the function calls
 * itself to deliver the token that follows. Single-character tokens use the
 * character itself as type, end of input yields T_EOF, and an unknown
 * character is reported and yields T_ERROR. */
855 void lexer_next_preprocessing_token(void)
865 lexer_token.type = '\n';
871 /* might be a wide string ( L"string" ) */
872 if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
873 lexer_token.v.symbol == symbol_L)) {
874 parse_string_literal();
884 parse_string_literal();
888 parse_character_constant();
895 MAYBE('.', T_DOTDOTDOT)
899 lexer_token.type = '.';
906 MAYBE('=', T_ANDEQUAL)
910 MAYBE('=', T_ASTERISKEQUAL)
914 MAYBE('+', T_PLUSPLUS)
915 MAYBE('=', T_PLUSEQUAL)
919 MAYBE('>', T_MINUSGREATER)
920 MAYBE('-', T_MINUSMINUS)
921 MAYBE('=', T_MINUSEQUAL)
925 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
929 MAYBE('=', T_SLASHEQUAL)
932 skip_multiline_comment();
933 lexer_next_preprocessing_token();
938 lexer_next_preprocessing_token();
943 MAYBE('>', T_PERCENTGREATER)
944 MAYBE('=', T_PERCENTEQUAL)
949 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
953 lexer_token.type = T_PERCENTCOLON;
960 MAYBE(':', T_LESSCOLON)
961 MAYBE('%', T_LESSPERCENT)
962 MAYBE('=', T_LESSEQUAL)
965 MAYBE('=', T_LESSLESSEQUAL)
970 MAYBE('=', T_GREATEREQUAL)
973 MAYBE('=', T_GREATERGREATEREQUAL)
974 ELSE(T_GREATERGREATER)
978 MAYBE('=', T_CARETEQUAL)
982 MAYBE('=', T_PIPEEQUAL)
983 MAYBE('|', T_PIPEPIPE)
987 MAYBE('>', T_COLONGREATER)
991 MAYBE('=', T_EQUALEQUAL)
995 MAYBE('#', T_HASHHASH)
1009 lexer_token.type = c;
1014 lexer_token.type = T_EOF;
1020 fprintf(stderr, "unknown character '%c' found\n", c);
1021 lexer_token.type = T_ERROR;
/* Delivers the next token in lexer_token. Preprocessing tokens are fetched
 * and newline tokens are skipped; a '#' token starts a preprocessor
 * directive, which is handled by parse_preprocessor_directive().
 * NOTE(review): the exact interplay of the visible statements (e.g. that
 * '#' is only checked after a newline) is not shown in this excerpt. */
1027 void lexer_next_token(void)
1029 lexer_next_preprocessing_token();
1030 if(lexer_token.type != '\n')
1035 lexer_next_preprocessing_token();
1036 } while(lexer_token.type == '\n');
1038 if(lexer_token.type == '#') {
1039 parse_preprocessor_directive();
/* Initialises the lexer's string set; its counterpart is exit_lexer(). */
1044 void init_lexer(void)
1046 strset_init(&stringset);
/* Prepares the lexer for a new input: the line number is reset to 0,
 * input_name is recorded as the name used in source positions, and the
 * symbol "L" (checked when detecting wide string literals) is interned.
 * NOTE(review): how stream is stored is not visible in this excerpt. */
1049 void lexer_open_stream(FILE *stream, const char *input_name)
1052 lexer_token.source_position.linenr = 0;
1053 lexer_token.source_position.input_name = input_name;
1055 symbol_L = symbol_table_insert("L");
1057 /* place a virtual \n at the beginning so the lexer knows that we're
1058 * at the beginning of a line */
/* Destroys the lexer's string set that init_lexer() set up. */
1062 void exit_lexer(void)
1064 strset_destroy(&stringset);
/* Debugging aid: prints source_position as "<input_name>:<linenr>" to
 * stdout. The line number is printed with %u because it is handled as an
 * unsigned value throughout this file; %d did not match that type. */
1067 static __attribute__((unused))
1068 void dbg_pos(const source_position_t source_position)
1070 fprintf(stdout, "%s:%u\n", source_position.input_name,
1071 source_position.linenr);