5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
22 static char buf[1024 + MAX_PUTBACK];
23 static const char *bufend;
24 static const char *bufpos;
25 static strset_t stringset;
/* Write the diagnostic prefix "<input_name>:<linenr>: Error: " to stderr.
 * No newline is emitted, so the caller can append the message text. */
27 static void error_prefix_at(const char *input_name, unsigned linenr)
29 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
/* Emit the diagnostic prefix for the source position currently stored in
 * lexer_token. */
32 static void error_prefix(void)
34 error_prefix_at(lexer_token.source_position.input_name,
35 lexer_token.source_position.linenr);
/* Report a lexer error on stderr.
 * msg is printed followed by a newline, so callers should pass the message
 * text without a trailing '\n'. */
38 static void parse_error(const char *msg)
41 fprintf(stderr, "%s\n", msg);
44 static inline void next_real_char(void)
47 if(bufpos >= bufend) {
48 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
54 bufpos = buf + MAX_PUTBACK;
55 bufend = buf + MAX_PUTBACK + s;
60 static inline void put_back(int pc)
62 assert(bufpos >= buf);
63 assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
65 char *p = buf + (bufpos - buf);
68 /* going backwards in the buffer is legal as long as it's not more often
73 printf("putback '%c'\n", pc);
77 static inline void next_char(void);
79 #define MATCH_NEWLINE(code) \
85 lexer_token.source_position.linenr++; \
89 lexer_token.source_position.linenr++; \
92 #define eat(c_type) do { assert(c == c_type); next_char(); } while(0)
94 static void maybe_concat_lines(void)
99 MATCH_NEWLINE(return;)
109 static inline void next_char(void)
114 /* filter trigraphs */
115 if(UNLIKELY(c == '\\')) {
116 maybe_concat_lines();
117 goto end_of_next_char;
121 goto end_of_next_char;
124 if(LIKELY(c != '?')) {
127 goto end_of_next_char;
132 case '=': c = '#'; break;
133 case '(': c = '['; break;
134 case '/': c = '\\'; maybe_concat_lines(); break;
135 case ')': c = ']'; break;
136 case '\'': c = '^'; break;
137 case '<': c = '{'; break;
138 case '!': c = '|'; break;
139 case '>': c = '}'; break;
140 case '-': c = '~'; break;
150 (void) maybe_concat_lines;
152 printf("nchar '%c'\n", c);
158 #define SYMBOL_CHARS \
/* Collect the characters of a symbol on symbol_obstack, look the resulting
 * string up in the symbol table and store the symbol in lexer_token. */
225 static void parse_symbol(void)
230 obstack_1grow(&symbol_obstack, c);
237 obstack_1grow(&symbol_obstack, c);
/* terminate the collected characters so they form a C string */
247 obstack_1grow(&symbol_obstack, '\0');
249 string = obstack_finish(&symbol_obstack);
250 symbol = symbol_table_insert(string);
/* the token type is taken from the symbol's ID field */
252 lexer_token.type = symbol->ID;
253 lexer_token.v.symbol = symbol;
/* the symbol does not point at our freshly built string (presumably it was
 * already in the table), so give the obstack memory back */
255 if(symbol->string != string) {
256 obstack_free(&symbol_obstack, string);
/* Examine the current character c for an integer-literal suffix.
 * The visible branches test for 'u'/'U' and 'l'/'L' (including a doubled l)
 * in either order. The suffix is not recorded anywhere yet (see TODO). */
260 static void parse_integer_suffix(void)
/* both spellings must be listed: a lowercase 'u' is a valid unsigned
 * suffix just like 'U' */
262 if(c == 'u' || c == 'U') {
263 /* TODO do something with the suffixes... */
265 if(c == 'L' || c == 'l') {
267 if(c == 'L' || c == 'l') {
271 } else if(c == 'l' || c == 'L') {
273 if(c == 'l' || c == 'L') {
275 if(c == 'u' || c == 'U') {
278 } else if(c == 'u' || c == 'U') {
284 static void parse_floating_suffix(void)
287 /* TODO: do something useful with the suffixes... */
299 static void parse_number_hex(void)
301 assert(c == 'x' || c == 'X');
305 !('A' <= c && c <= 'F') &&
306 !('a' <= c && c <= 'f')) {
307 parse_error("premature end of hex number literal");
308 lexer_token.type = T_ERROR;
315 value = 16 * value + c - '0';
316 } else if ('A' <= c && c <= 'F') {
317 value = 16 * value + c - 'A' + 10;
318 } else if ('a' <= c && c <= 'f') {
319 value = 16 * value + c - 'a' + 10;
321 parse_integer_suffix();
323 lexer_token.type = T_INTEGER;
324 lexer_token.v.intvalue = value;
330 if(c == '.' || c == 'p' || c == 'P') {
332 panic("Hex floating point numbers not implemented yet");
/* Lex an octal integer literal starting at the current character c.
 * On success lexer_token becomes T_INTEGER carrying the accumulated value;
 * a following '8' or '9' is reported and yields T_ERROR. */
336 static void parse_number_oct(void)
/* accumulate octal digits, most significant first */
339 while(c >= '0' && c <= '7') {
340 value = 8 * value + c - '0';
/* a decimal digit that is not an octal digit directly after the number */
343 if (c == '8' || c == '9') {
344 parse_error("invalid octal number");
345 lexer_token.type = T_ERROR;
349 lexer_token.type = T_INTEGER;
350 lexer_token.v.intvalue = value;
352 parse_integer_suffix();
355 static void parse_floatingpoint_exponent(long double value)
357 unsigned int expo = 0;
358 long double factor = 10.;
363 } else if(c == '+') {
367 while(c >= '0' && c <= '9') {
368 expo = 10 * expo + (c - '0');
381 lexer_token.type = T_FLOATINGPOINT;
382 lexer_token.v.floatvalue = value;
384 parse_floating_suffix();
387 static void parse_floatingpoint_fract(int integer_part)
389 long double value = integer_part;
390 long double factor = 1.;
392 while(c >= '0' && c <= '9') {
394 value += (c - '0') * factor;
398 if(c == 'e' || c == 'E') {
400 parse_floatingpoint_exponent(value);
404 lexer_token.type = T_FLOATINGPOINT;
405 lexer_token.v.floatvalue = value;
407 parse_floating_suffix();
410 static void parse_number_dec(void)
415 value = 10 * value + c - '0';
421 parse_floatingpoint_fract(value);
424 if(c == 'e' || c == 'E') {
426 parse_floatingpoint_exponent(value);
429 parse_integer_suffix();
431 lexer_token.type = T_INTEGER;
432 lexer_token.v.intvalue = value;
435 static void parse_number(void)
456 parse_floatingpoint_fract(0);
460 parse_floatingpoint_exponent(0);
465 parse_error("invalid octal number");
466 lexer_token.type = T_ERROR;
479 static int parse_octal_sequence(void)
483 if(c < '0' || c > '7')
485 value = 8 * value + c - '0';
492 static int parse_hex_sequence(void)
496 if (c >= '0' && c <= '9') {
497 value = 16 * value + c - '0';
498 } else if ('A' <= c && c <= 'F') {
499 value = 16 * value + c - 'A' + 10;
500 } else if ('a' <= c && c <= 'f') {
501 value = 16 * value + c - 'a' + 10;
511 static int parse_escape_sequence(void)
519 case '"': return '"';
520 case '\'': return'\'';
521 case '\\': return '\\';
522 case '?': return '\?';
523 case 'a': return '\a';
524 case 'b': return '\b';
525 case 'f': return '\f';
526 case 'n': return '\n';
527 case 'r': return '\r';
528 case 't': return '\t';
529 case 'v': return '\v';
531 return parse_hex_sequence();
540 return parse_octal_sequence();
542 parse_error("reached end of file while parsing escape sequence");
545 parse_error("unknown escape sequence");
/* Build the concatenation of s1 and s2 on symbol_obstack and insert it into
 * stringset.
 * s1 and s2 must be NUL-terminated strings (strlen is applied to both). */
550 const char *concat_strings(const char *s1, const char *s2)
552 size_t len1 = strlen(s1);
553 size_t len2 = strlen(s2);
/* room for both strings plus one terminating NUL */
555 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
556 memcpy(concat, s1, len1);
/* len2 + 1 also copies the NUL terminator of s2 */
557 memcpy(concat + len1, s2, len2 + 1);
559 const char *result = strset_insert(&stringset, concat);
/* the set handed back a different pointer (presumably an equal string was
 * already stored), so the new copy is not needed and is released again */
560 if(result != concat) {
561 obstack_free(&symbol_obstack, concat);
567 static void parse_string_literal(void)
569 unsigned start_linenr = lexer_token.source_position.linenr;
580 tc = parse_escape_sequence();
581 obstack_1grow(&symbol_obstack, tc);
585 error_prefix_at(lexer_token.source_position.input_name,
587 fprintf(stderr, "string has no end\n");
588 lexer_token.type = T_ERROR;
596 obstack_1grow(&symbol_obstack, c);
604 /* TODO: concatenate multiple strings separated by whitespace... */
606 /* add finishing 0 to the string */
607 obstack_1grow(&symbol_obstack, '\0');
608 string = obstack_finish(&symbol_obstack);
610 /* check if there is already a copy of the string */
611 result = strset_insert(&stringset, string);
612 if(result != string) {
613 obstack_free(&symbol_obstack, string);
616 lexer_token.type = T_STRING_LITERAL;
617 lexer_token.v.string = result;
620 static void parse_character_constant(void)
628 found_char = parse_escape_sequence();
632 parse_error("newline while parsing character constant");
638 goto end_of_char_constant;
641 parse_error("EOF while parsing character constant");
642 lexer_token.type = T_ERROR;
646 if(found_char != 0) {
647 parse_error("more than 1 characters in character "
649 goto end_of_char_constant;
658 end_of_char_constant:
659 lexer_token.type = T_INTEGER;
660 lexer_token.v.intvalue = found_char;
663 static void skip_multiline_comment(void)
665 unsigned start_linenr = lexer_token.source_position.linenr;
677 MATCH_NEWLINE(break;)
680 error_prefix_at(lexer_token.source_position.input_name,
682 fprintf(stderr, "at end of file while looking for comment end\n");
692 static void skip_line_comment(void)
710 static token_t pp_token;
712 static inline void next_pp_token(void)
714 lexer_next_preprocessing_token();
715 pp_token = lexer_token;
718 static void eat_until_newline(void)
720 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
725 static void error_directive(void)
728 fprintf(stderr, "#error directive: \n");
730 /* parse pp-tokens until new-line */
/* Handle a #define directive: fetch the next preprocessing token and report
 * an error unless it is an identifier. */
733 static void define_directive(void)
735 lexer_next_preprocessing_token();
736 if(lexer_token.type != T_IDENTIFIER) {
/* no trailing newline here: parse_error() appends one itself */
737 parse_error("expected identifier after #define");
742 static void ifdef_directive(int is_ifndef)
745 lexer_next_preprocessing_token();
746 //expect_identifier();
750 static void endif_directive(void)
755 static void parse_line_directive(void)
757 if(pp_token.type != T_INTEGER) {
758 parse_error("expected integer");
760 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
763 if(pp_token.type == T_STRING_LITERAL) {
764 lexer_token.source_position.input_name = pp_token.v.string;
771 static void parse_preprocessor_identifier(void)
773 assert(pp_token.type == T_IDENTIFIER);
774 symbol_t *symbol = pp_token.v.symbol;
776 switch(symbol->pp_ID) {
778 printf("include - enable header name parsing!\n");
794 parse_line_directive();
808 static void parse_preprocessor_directive(void)
812 switch(pp_token.type) {
814 parse_preprocessor_identifier();
817 parse_line_directive();
820 parse_error("invalid preprocessor directive");
826 #define MAYBE_PROLOG \
831 #define MAYBE(ch, set_type) \
834 lexer_token.type = set_type; \
837 #define ELSE_CODE(code) \
841 } /* end of while(1) */ \
844 #define ELSE(set_type) \
846 lexer_token.type = set_type; \
850 void lexer_next_preprocessing_token(void)
860 lexer_token.type = '\n';
866 /* might be a wide string ( L"string" ) */
867 if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
868 lexer_token.v.symbol == symbol_L)) {
869 parse_string_literal();
879 parse_string_literal();
883 parse_character_constant();
890 MAYBE('.', T_DOTDOTDOT)
894 lexer_token.type = '.';
901 MAYBE('=', T_ANDEQUAL)
905 MAYBE('=', T_ASTERISKEQUAL)
909 MAYBE('+', T_PLUSPLUS)
910 MAYBE('=', T_PLUSEQUAL)
914 MAYBE('>', T_MINUSGREATER)
915 MAYBE('-', T_MINUSMINUS)
916 MAYBE('=', T_MINUSEQUAL)
920 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
924 MAYBE('=', T_SLASHEQUAL)
927 skip_multiline_comment();
928 lexer_next_preprocessing_token();
933 lexer_next_preprocessing_token();
938 MAYBE('>', T_PERCENTGREATER)
939 MAYBE('=', T_PERCENTEQUAL)
944 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
948 lexer_token.type = T_PERCENTCOLON;
955 MAYBE(':', T_LESSCOLON)
956 MAYBE('%', T_LESSPERCENT)
957 MAYBE('=', T_LESSEQUAL)
960 MAYBE('=', T_LESSLESSEQUAL)
965 MAYBE('=', T_GREATEREQUAL)
968 MAYBE('=', T_GREATERGREATEREQUAL)
969 ELSE(T_GREATERGREATER)
973 MAYBE('=', T_CARETEQUAL)
977 MAYBE('=', T_PIPEEQUAL)
978 MAYBE('|', T_PIPEPIPE)
982 MAYBE('>', T_COLONGREATER)
986 MAYBE('=', T_EQUALEQUAL)
990 MAYBE('#', T_HASHHASH)
1004 lexer_token.type = c;
1009 lexer_token.type = T_EOF;
1015 fprintf(stderr, "unknown character '%c' found\n", c);
1016 lexer_token.type = T_ERROR;
/* Advance lexer_token to the next token.
 * Built on lexer_next_preprocessing_token(): '\n' tokens are not handed
 * out, and a '#' found after skipping newlines is passed to
 * parse_preprocessor_directive(). */
1022 void lexer_next_token(void)
1024 lexer_next_preprocessing_token();
/* tokens other than a newline take the short path */
1025 if(lexer_token.type != '\n')
/* skip over any run of consecutive newline tokens */
1030 lexer_next_preprocessing_token();
1031 } while(lexer_token.type == '\n');
/* '#' as first token after a newline introduces a preprocessor directive */
1033 if(lexer_token.type == '#') {
1034 parse_preprocessor_directive();
1039 void init_lexer(void)
1041 strset_init(&stringset);
1044 void lexer_open_stream(FILE *stream, const char *input_name)
1047 lexer_token.source_position.linenr = 0;
1048 lexer_token.source_position.input_name = input_name;
1050 symbol_L = symbol_table_insert("L");
1052 /* place a virtual \n at the beginning so the lexer knows that we're
1053 * at the beginning of a line */
1057 void exit_lexer(void)
1059 strset_destroy(&stringset);
/* Debug helper: print "<input_name>:<linenr>" for source_position on
 * stdout. Marked unused so it can stay in the file without a caller. */
1062 static __attribute__((unused))
1063 void dbg_pos(const source_position_t source_position)
/* %u matches how linenr is printed by error_prefix_at() and held in the
 * unsigned start_linenr locals -- NOTE(review): confirm the field is
 * declared unsigned */
1065 fprintf(stdout, "%s:%u\n", source_position.input_name,
1066 source_position.linenr);