5 #include "symbol_table_t.h"
17 source_position_t source_position;
19 static char buf[1027];
20 static const char *bufend;
21 static const char *bufpos;
22 static strset_t stringset;
23 //static FILE **input_stack;
24 //static char **buf_stack;
27 void error_prefix_at(const char *input_name, unsigned linenr)
29 fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
35 error_prefix_at(source_position.input_name, source_position.linenr);
39 void parse_error(const char *msg)
42 fprintf(stderr, "%s\n", msg);
49 if(bufpos >= bufend) {
50 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
56 bufpos = buf + MAX_PUTBACK;
57 bufend = buf + MAX_PUTBACK + s;
61 printf("nchar '%c'\n", c);
68 char *p = (char*) bufpos - 1;
74 printf("putback '%c'\n", pc);
80 int replace_trigraph(void)
82 #define MATCH_TRIGRAPH(ch,replacement) \
88 MATCH_TRIGRAPH('=', '#')
89 MATCH_TRIGRAPH('(', '[')
90 MATCH_TRIGRAPH('/', '\\')
91 MATCH_TRIGRAPH(')', ']')
92 MATCH_TRIGRAPH('\'', '^')
93 MATCH_TRIGRAPH('<', '{')
94 MATCH_TRIGRAPH('!', '|')
95 MATCH_TRIGRAPH('>', '}')
96 MATCH_TRIGRAPH('-', '~')
104 #define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
114 if(replace_trigraph()) { \
123 #define EAT_NEWLINE(newline_code) \
128 source_position.linenr++; \
130 } else if(c == '\n') { \
132 source_position.linenr++; \
136 #define SYMBOL_CHARS \
204 void parse_symbol(token_t *token)
209 obstack_1grow(&symbol_obstack, c);
221 obstack_1grow(&symbol_obstack, c);
233 if(replace_trigraph())
245 obstack_1grow(&symbol_obstack, '\0');
247 string = obstack_finish(&symbol_obstack);
248 symbol = symbol_table_insert(string);
250 token->type = symbol->ID;
251 token->v.symbol = symbol;
253 if(symbol->string != string) {
254 obstack_free(&symbol_obstack, string);
259 void parse_number_hex(token_t *token)
261 assert(c == 'x' || c == 'X');
265 !('A' <= c && c <= 'F') &&
266 !('a' <= c && c <= 'f')) {
267 parse_error("premature end of hex number literal");
268 token->type = T_ERROR;
275 value = 16 * value + c - '0';
276 } else if ('A' <= c && c <= 'F') {
277 value = 16 * value + c - 'A' + 10;
278 } else if ('a' <= c && c <= 'f') {
279 value = 16 * value + c - 'a' + 10;
281 token->type = T_INTEGER;
282 token->v.intvalue = value;
290 void parse_number_oct(token_t *token)
292 assert(c == 'o' || c == 'O');
297 if ('0' <= c && c <= '7') {
298 value = 8 * value + c - '0';
300 token->type = T_INTEGER;
301 token->v.intvalue = value;
309 void parse_number_dec(token_t *token, int first_char)
313 assert(first_char >= '0' && first_char <= '9');
314 value = first_char - '0';
319 value = 10 * value + c - '0';
321 token->type = T_INTEGER;
322 token->v.intvalue = value;
330 void parse_number(token_t *token)
332 // TODO check for overflow
333 // TODO check for various invalid input sequences
339 case 'x': parse_number_hex(token); break;
341 case 'O': parse_number_oct(token); break;
342 default: parse_number_dec(token, '0');
345 parse_number_dec(token, 0);
350 int parse_escape_sequence()
357 case '"': return '"';
358 case '\'': return'\'';
362 case 'a': return '\a';
363 case 'b': return '\b';
364 case 'f': return '\f';
365 case 'n': return '\n';
366 case 'r': return '\r';
367 case 't': return '\t';
368 case 'v': return '\v';
369 case 'x': /* TODO parse hex number ... */
370 parse_error("hex escape sequences not implemented yet");
380 /* TODO parse octal number ... */
381 parse_error("octal escape sequences not implemented yet");
387 /* might be a trigraph */
389 if(replace_trigraph()) {
397 parse_error("reached end of file while parsing escape sequence");
400 parse_error("unknown escape sequence");
407 void parse_string_literal(token_t *token)
409 unsigned start_linenr = source_position.linenr;
419 obstack_1grow(&symbol_obstack, '?');
427 int ec = parse_escape_sequence();
428 obstack_1grow(&symbol_obstack, ec);
432 error_prefix_at(source_position.input_name, start_linenr);
433 fprintf(stderr, "string has no end\n");
434 token->type = T_ERROR;
442 obstack_1grow(&symbol_obstack, c);
450 /* TODO: concatenate multiple strings separated by whitespace... */
452 /* add finishing 0 to the string */
453 obstack_1grow(&symbol_obstack, '\0');
454 string = obstack_finish(&symbol_obstack);
456 /* check if there is already a copy of the string */
457 result = strset_insert(&stringset, string);
458 if(result != string) {
459 obstack_free(&symbol_obstack, string);
462 token->type = T_STRING_LITERAL;
463 token->v.string = result;
466 #define MATCH_NEWLINE(code) \
472 source_position.linenr++; \
476 source_position.linenr++; \
480 void parse_character_constant(token_t *token)
500 parse_error("newline while parsing character constant");
506 goto end_of_char_constant;
509 parse_error("EOF while parsing character constant");
510 token->type = T_ERROR;
514 if(found_char != 0) {
515 parse_error("more than 1 characters in character "
517 goto end_of_char_constant;
526 end_of_char_constant:
527 token->type = T_INTEGER;
528 token->v.intvalue = found_char;
532 void skip_multiline_comment(void)
534 unsigned start_linenr = source_position.linenr;
565 if(replace_trigraph())
570 /* we don't put back the 2nd ? as the comment text is discarded
574 MATCH_NEWLINE(had_star = 0; break;)
577 error_prefix_at(source_position.input_name, start_linenr);
578 fprintf(stderr, "at end of file while looking for comment end\n");
589 void skip_line_comment(void)
598 if(replace_trigraph())
601 /* we don't put back the 2nd ? as the comment text is discarded
609 source_position.linenr++;
625 static token_t pp_token;
628 void next_pp_token(void)
630 lexer_next_preprocessing_token(&pp_token);
634 void eat_until_newline(void)
636 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
642 void error_directive(void)
645 fprintf(stderr, "#error directive: \n");
647 /* parse pp-tokens until new-line */
651 void define_directive(void)
655 lexer_next_preprocessing_token(&temptoken);
656 if(temptoken.type != T_IDENTIFIER) {
657 parse_error("expected identifier after #define\n");
663 void ifdef_directive(int is_ifndef)
667 lexer_next_preprocessing_token(&temptoken);
668 //expect_identifier();
673 void endif_directive(void)
679 void parse_line_directive(void)
681 if(pp_token.type != T_INTEGER) {
682 parse_error("expected integer");
684 source_position.linenr = pp_token.v.intvalue - 1;
687 if(pp_token.type == T_STRING_LITERAL) {
688 source_position.input_name = pp_token.v.string;
696 void parse_preprocessor_identifier(void)
698 assert(pp_token.type == T_IDENTIFIER);
699 symbol_t *symbol = pp_token.v.symbol;
701 switch(symbol->pp_ID) {
703 printf("include - enable header name parsing!\n");
719 parse_line_directive();
734 void parse_preprocessor_directive()
738 switch(pp_token.type) {
740 parse_preprocessor_identifier();
743 parse_line_directive();
746 parse_error("invalid preprocessor directive");
752 #define MAYBE_PROLOG \
757 #define MAYBE(ch, set_type) \
760 token->type = set_type; \
763 #define ELSE_CODE(code) \
770 EAT_NEWLINE(break;) \
775 } /* end of while(1) */ \
778 #define ELSE(set_type) \
780 token->type = set_type; \
784 void lexer_next_preprocessing_token(token_t *token)
807 parse_string_literal(token);
811 parse_character_constant(token);
818 source_position.linenr++;
821 parse_error("unexpected '\\' found");
822 token->type = T_ERROR;
830 MAYBE('.', T_DOTDOTDOT)
841 MAYBE('=', T_ANDEQUAL)
845 MAYBE('=', T_ASTERISKEQUAL)
849 MAYBE('+', T_PLUSPLUS)
850 MAYBE('=', T_PLUSEQUAL)
854 MAYBE('-', T_MINUSMINUS)
855 MAYBE('=', T_MINUSEQUAL)
859 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
863 MAYBE('=', T_SLASHEQUAL)
866 skip_multiline_comment();
867 lexer_next_preprocessing_token(token);
872 lexer_next_preprocessing_token(token);
877 MAYBE('>', T_PERCENTGREATER)
878 MAYBE('=', T_PERCENTEQUAL)
883 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
887 token->type = T_PERCENTCOLON;
894 MAYBE(':', T_LESSCOLON)
895 MAYBE('%', T_LESSPERCENT)
898 MAYBE('=', T_LESSLESSEQUAL)
905 MAYBE('=', T_GREATERGREATEREQUAL)
906 ELSE(T_GREATERGREATER)
910 MAYBE('=', T_CARETEQUAL)
914 MAYBE('=', T_PIPEEQUAL)
915 MAYBE('|', T_PIPEPIPE)
919 MAYBE('>', T_COLONGREATER)
923 MAYBE('=', T_EQUALEQUAL)
927 MAYBE('#', T_HASHHASH)
932 /* just a simple ? */
937 /* might be a trigraph */
939 if(replace_trigraph()) {
967 fprintf(stderr, "unknown character '%c' found\n", c);
968 token->type = T_ERROR;
974 void lexer_next_token(token_t *token)
976 lexer_next_preprocessing_token(token);
977 if(token->type != '\n')
982 lexer_next_preprocessing_token(token);
983 } while(token->type == '\n');
985 if(token->type == '#') {
986 parse_preprocessor_directive();
991 void init_lexer(void)
993 strset_init(&stringset);
996 void lexer_open_stream(FILE *stream, const char *input_name)
999 source_position.linenr = 0;
1000 source_position.input_name = input_name;
1002 /* we place a virtual '\n' at the beginning so the lexer knows we're at the
1003 * beginning of a line */
1007 void exit_lexer(void)
1009 strset_destroy(&stringset);
1012 static __attribute__((unused))
1013 void dbg_pos(const source_position_t source_position)
1015 fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);