5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
21 static char buf[1024 + MAX_PUTBACK];
22 static const char *bufend;
23 static const char *bufpos;
24 static strset_t stringset;
26 static void error_prefix_at(const char *input_name, unsigned linenr)
28 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
31 static void error_prefix(void)
33 error_prefix_at(lexer_token.source_position.input_name,
34 lexer_token.source_position.linenr);
37 static void parse_error(const char *msg)
40 fprintf(stderr, "%s\n", msg);
43 static inline void next_real_char(void)
46 if(bufpos >= bufend) {
47 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
53 bufpos = buf + MAX_PUTBACK;
54 bufend = buf + MAX_PUTBACK + s;
59 static inline void put_back(int pc)
61 assert(bufpos >= buf);
62 assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
64 char *p = buf + (bufpos - buf);
67 /* going backwards in the buffer is legal as long as it's not more often
72 printf("putback '%c'\n", pc);
76 static inline void next_char(void);
78 #define MATCH_NEWLINE(code) \
84 lexer_token.source_position.linenr++; \
88 lexer_token.source_position.linenr++; \
91 #define eat(c_type) do { assert(c == c_type); next_char(); } while(0)
93 static void maybe_concat_lines(void)
98 MATCH_NEWLINE(return;)
108 static inline void next_char(void)
113 /* filter trigraphs */
114 if(UNLIKELY(c == '\\')) {
115 maybe_concat_lines();
116 goto end_of_next_char;
120 goto end_of_next_char;
123 if(LIKELY(c != '?')) {
126 goto end_of_next_char;
131 case '=': c = '#'; break;
132 case '(': c = '['; break;
133 case '/': c = '\\'; maybe_concat_lines(); break;
134 case ')': c = ']'; break;
135 case '\'': c = '^'; break;
136 case '<': c = '{'; break;
137 case '!': c = '|'; break;
138 case '>': c = '}'; break;
139 case '-': c = '~'; break;
149 (void) maybe_concat_lines;
151 printf("nchar '%c'\n", c);
157 #define SYMBOL_CHARS \
224 static void parse_symbol(void)
229 obstack_1grow(&symbol_obstack, c);
236 obstack_1grow(&symbol_obstack, c);
246 obstack_1grow(&symbol_obstack, '\0');
248 string = obstack_finish(&symbol_obstack);
249 symbol = symbol_table_insert(string);
251 lexer_token.type = symbol->ID;
252 lexer_token.v.symbol = symbol;
254 if(symbol->string != string) {
255 obstack_free(&symbol_obstack, string);
/**
 * Recognises the optional suffix that may follow an integer literal.
 *
 * Tests the current input character `c` against the unsigned ('u'/'U') and
 * long ('l'/'L') suffix letters, accepting an unsigned marker followed by up
 * to two long markers, or up to two long markers followed by an unsigned
 * marker. The suffix is only matched; nothing is recorded in the token yet
 * (see the TODO below).
 *
 * NOTE(review): the statements between the tests are not part of this
 * excerpt; presumably each matched letter is consumed there - confirm.
 */
259 static void parse_integer_suffix(void)
/* accept both cases of the unsigned marker, like the tests further down */
261 if(c == 'U' || c == 'u') {
262 /* TODO do something with the suffixes... */
264 if(c == 'L' || c == 'l') {
266 if(c == 'L' || c == 'l') {
270 } else if(c == 'l' || c == 'L') {
272 if(c == 'l' || c == 'L') {
274 if(c == 'u' || c == 'U') {
277 } else if(c == 'u' || c == 'U') {
283 static void parse_floating_suffix(void)
286 /* TODO: do something useful with the suffixes... */
298 static void parse_number_hex(void)
300 assert(c == 'x' || c == 'X');
304 !('A' <= c && c <= 'F') &&
305 !('a' <= c && c <= 'f')) {
306 parse_error("premature end of hex number literal");
307 lexer_token.type = T_ERROR;
314 value = 16 * value + c - '0';
315 } else if ('A' <= c && c <= 'F') {
316 value = 16 * value + c - 'A' + 10;
317 } else if ('a' <= c && c <= 'f') {
318 value = 16 * value + c - 'a' + 10;
320 parse_integer_suffix();
322 lexer_token.type = T_INTEGER;
323 lexer_token.v.intvalue = value;
329 if(c == '.' || c == 'p' || c == 'P') {
331 panic("Hex floating point numbers not implemented yet");
335 static void parse_number_oct(void)
338 while(c >= '0' && c <= '7') {
339 value = 8 * value + c - '0';
342 if (c == '8' || c == '9') {
343 parse_error("invalid octal number");
344 lexer_token.type = T_ERROR;
348 lexer_token.type = T_INTEGER;
349 lexer_token.v.intvalue = value;
351 parse_integer_suffix();
354 static void parse_floatingpoint_exponent(long double value)
356 unsigned int expo = 0;
357 long double factor = 10.;
362 } else if(c == '+') {
366 while(c >= '0' && c <= '9') {
367 expo = 10 * expo + (c - '0');
380 lexer_token.type = T_FLOATINGPOINT;
381 lexer_token.v.floatvalue = value;
383 parse_floating_suffix();
386 static void parse_floatingpoint_fract(int integer_part)
388 long double value = integer_part;
389 long double factor = 1.;
391 while(c >= '0' && c <= '9') {
393 value += (c - '0') * factor;
397 if(c == 'e' || c == 'E') {
399 parse_floatingpoint_exponent(value);
403 lexer_token.type = T_FLOATINGPOINT;
404 lexer_token.v.floatvalue = value;
406 parse_floating_suffix();
409 static void parse_number_dec(void)
414 value = 10 * value + c - '0';
420 parse_floatingpoint_fract(value);
423 if(c == 'e' || c == 'E') {
425 parse_floatingpoint_exponent(value);
428 parse_integer_suffix();
430 lexer_token.type = T_INTEGER;
431 lexer_token.v.intvalue = value;
434 static void parse_number(void)
455 parse_floatingpoint_fract(0);
459 parse_floatingpoint_exponent(0);
464 parse_error("invalid octal number");
465 lexer_token.type = T_ERROR;
478 static int parse_octal_sequence(void)
482 if(c < '0' || c > '7')
484 value = 8 * value + c - '0';
491 static int parse_hex_sequence(void)
495 if (c >= '0' && c <= '9') {
496 value = 16 * value + c - '0';
497 } else if ('A' <= c && c <= 'F') {
498 value = 16 * value + c - 'A' + 10;
499 } else if ('a' <= c && c <= 'f') {
500 value = 16 * value + c - 'a' + 10;
510 static int parse_escape_sequence(void)
518 case '"': return '"';
519 case '\'': return'\'';
520 case '\\': return '\\';
521 case '?': return '\?';
522 case 'a': return '\a';
523 case 'b': return '\b';
524 case 'f': return '\f';
525 case 'n': return '\n';
526 case 'r': return '\r';
527 case 't': return '\t';
528 case 'v': return '\v';
530 return parse_hex_sequence();
539 return parse_octal_sequence();
541 parse_error("reached end of file while parsing escape sequence");
544 parse_error("unknown escape sequence");
/**
 * Builds the concatenation of two NUL-terminated strings and registers it
 * in the lexer's string set.
 *
 * @param s1  first string (copied, not modified)
 * @param s2  second string (copied, not modified)
 *
 * NOTE(review): the return statement is outside this excerpt; presumably
 * the pointer obtained from strset_insert() is returned - confirm.
 */
549 const char *concat_strings(const char *s1, const char *s2)
551 size_t len1 = strlen(s1);
552 size_t len2 = strlen(s2);
/* room for both strings plus one terminating '\0' */
554 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
555 memcpy(concat, s1, len1);
/* len2 + 1 also copies the terminator of s2 */
556 memcpy(concat + len1, s2, len2 + 1);
/* a different pointer coming back means the freshly built copy is not the
 * one kept by the set, so it is released from the obstack again */
558 const char *result = strset_insert(&stringset, concat);
559 if(result != concat) {
560 obstack_free(&symbol_obstack, concat);
566 static void parse_string_literal(void)
568 unsigned start_linenr = lexer_token.source_position.linenr;
579 tc = parse_escape_sequence();
580 obstack_1grow(&symbol_obstack, tc);
584 error_prefix_at(lexer_token.source_position.input_name,
586 fprintf(stderr, "string has no end\n");
587 lexer_token.type = T_ERROR;
595 obstack_1grow(&symbol_obstack, c);
603 /* TODO: concatenate multiple strings separated by whitespace... */
605 /* add finishing 0 to the string */
606 obstack_1grow(&symbol_obstack, '\0');
607 string = obstack_finish(&symbol_obstack);
609 /* check if there is already a copy of the string */
610 result = strset_insert(&stringset, string);
611 if(result != string) {
612 obstack_free(&symbol_obstack, string);
615 lexer_token.type = T_STRING_LITERAL;
616 lexer_token.v.string = result;
619 static void parse_character_constant(void)
627 found_char = parse_escape_sequence();
631 parse_error("newline while parsing character constant");
637 goto end_of_char_constant;
640 parse_error("EOF while parsing character constant");
641 lexer_token.type = T_ERROR;
645 if(found_char != 0) {
646 parse_error("more than 1 characters in character "
648 goto end_of_char_constant;
657 end_of_char_constant:
658 lexer_token.type = T_INTEGER;
659 lexer_token.v.intvalue = found_char;
662 static void skip_multiline_comment(void)
664 unsigned start_linenr = lexer_token.source_position.linenr;
676 MATCH_NEWLINE(break;)
679 error_prefix_at(lexer_token.source_position.input_name,
681 fprintf(stderr, "at end of file while looking for comment end\n");
691 static void skip_line_comment(void)
709 static token_t pp_token;
711 static inline void next_pp_token(void)
713 lexer_next_preprocessing_token();
714 pp_token = lexer_token;
717 static void eat_until_newline(void)
719 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
724 static void error_directive(void)
727 fprintf(stderr, "#error directive: \n");
729 /* parse pp-tokens until new-line */
/**
 * Handles a #define directive.
 *
 * Reads the next preprocessing token and reports an error when it is not
 * an identifier.
 *
 * NOTE(review): whatever follows the identifier check is not part of this
 * excerpt.
 */
732 static void define_directive(void)
734 lexer_next_preprocessing_token();
735 if(lexer_token.type != T_IDENTIFIER) {
/* no trailing newline here: parse_error() appends one itself */
736 parse_error("expected identifier after #define");
741 static void ifdef_directive(int is_ifndef)
744 lexer_next_preprocessing_token();
745 //expect_identifier();
749 static void endif_directive(void)
754 static void parse_line_directive(void)
756 if(pp_token.type != T_INTEGER) {
757 parse_error("expected integer");
759 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
762 if(pp_token.type == T_STRING_LITERAL) {
763 lexer_token.source_position.input_name = pp_token.v.string;
770 static void parse_preprocessor_identifier(void)
772 assert(pp_token.type == T_IDENTIFIER);
773 symbol_t *symbol = pp_token.v.symbol;
775 switch(symbol->pp_ID) {
777 printf("include - enable header name parsing!\n");
793 parse_line_directive();
/**
 * Dispatches a preprocessor directive based on the type of the current
 * preprocessing token (pp_token).
 *
 * Depending on pp_token.type it calls parse_preprocessor_identifier() or
 * parse_line_directive(); otherwise it reports "invalid preprocessor
 * directive".
 *
 * NOTE(review): the case labels are not part of this excerpt; presumably
 * identifier tokens select the first call and integer tokens the second
 * (those callees check for T_IDENTIFIER and T_INTEGER respectively) -
 * confirm.
 */
807 static void parse_preprocessor_directive(void)
811 switch(pp_token.type) {
813 parse_preprocessor_identifier();
816 parse_line_directive();
819 parse_error("invalid preprocessor directive");
825 #define MAYBE_PROLOG \
830 #define MAYBE(ch, set_type) \
833 lexer_token.type = set_type; \
836 #define ELSE_CODE(code) \
840 } /* end of while(1) */ \
843 #define ELSE(set_type) \
845 lexer_token.type = set_type; \
849 void lexer_next_preprocessing_token(void)
859 lexer_token.type = '\n';
872 parse_string_literal();
876 parse_character_constant();
883 MAYBE('.', T_DOTDOTDOT)
887 lexer_token.type = '.';
894 MAYBE('=', T_ANDEQUAL)
898 MAYBE('=', T_ASTERISKEQUAL)
902 MAYBE('+', T_PLUSPLUS)
903 MAYBE('=', T_PLUSEQUAL)
907 MAYBE('>', T_MINUSGREATER)
908 MAYBE('-', T_MINUSMINUS)
909 MAYBE('=', T_MINUSEQUAL)
913 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
917 MAYBE('=', T_SLASHEQUAL)
920 skip_multiline_comment();
921 lexer_next_preprocessing_token();
926 lexer_next_preprocessing_token();
931 MAYBE('>', T_PERCENTGREATER)
932 MAYBE('=', T_PERCENTEQUAL)
937 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
941 lexer_token.type = T_PERCENTCOLON;
948 MAYBE(':', T_LESSCOLON)
949 MAYBE('%', T_LESSPERCENT)
950 MAYBE('=', T_LESSEQUAL)
953 MAYBE('=', T_LESSLESSEQUAL)
958 MAYBE('=', T_GREATEREQUAL)
961 MAYBE('=', T_GREATERGREATEREQUAL)
962 ELSE(T_GREATERGREATER)
966 MAYBE('=', T_CARETEQUAL)
970 MAYBE('=', T_PIPEEQUAL)
971 MAYBE('|', T_PIPEPIPE)
975 MAYBE('>', T_COLONGREATER)
979 MAYBE('=', T_EQUALEQUAL)
983 MAYBE('#', T_HASHHASH)
997 lexer_token.type = c;
1002 lexer_token.type = T_EOF;
1008 fprintf(stderr, "unknown character '%c' found\n", c);
1009 lexer_token.type = T_ERROR;
1015 void lexer_next_token(void)
1017 lexer_next_preprocessing_token();
1018 if(lexer_token.type != '\n')
1023 lexer_next_preprocessing_token();
1024 } while(lexer_token.type == '\n');
1026 if(lexer_token.type == '#') {
1027 parse_preprocessor_directive();
1032 void init_lexer(void)
1034 strset_init(&stringset);
1037 void lexer_open_stream(FILE *stream, const char *input_name)
1040 lexer_token.source_position.linenr = 0;
1041 lexer_token.source_position.input_name = input_name;
1043 /* place a virtual \n at the beginning so the lexer knows that we're
1044 * at the beginning of a line */
1048 void exit_lexer(void)
1050 strset_destroy(&stringset);
1053 static __attribute__((unused))
1054 void dbg_pos(const source_position_t source_position)
1056 fprintf(stdout, "%s:%d\n", source_position.input_name,
1057 source_position.linenr);