5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
/* Read buffer: MAX_PUTBACK bytes are reserved in front of the 1024 bytes
 * that fread() fills (see the refill code further down). */
20 static char buf[1024 + MAX_PUTBACK];
/* One past the last valid byte currently held in buf. */
21 static const char *bufend;
/* Read position inside buf; a refill happens once it reaches bufend. */
22 static const char *bufpos;
/* Set of strings; strset_insert() is used on it so that an already known
 * string is returned instead of a fresh copy. */
23 static strset_t stringset;
24 //static FILE **input_stack;
25 //static char **buf_stack;
/**
 * Prints the "<input_name>:<linenr>: Error: " prefix of a diagnostic to
 * stderr. No newline is written; the message text is printed by the caller.
 *
 * @param input_name  name of the input, printed before the first colon
 * @param linenr      line number; it is unsigned and therefore printed
 *                    with the %u conversion
 */
28 void error_prefix_at(const char *input_name, unsigned linenr)
30 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
/**
 * Prints the error prefix for the position recorded in
 * lexer_token.source_position (its input name and line number).
 */
34 void error_prefix(void)
36 error_prefix_at(lexer_token.source_position.input_name,
37 lexer_token.source_position.linenr);
/**
 * Reports a parse error on stderr. The message is followed by a newline
 * written here, so msg needs no trailing newline of its own.
 *
 * @param msg  text of the error message
 */
41 void parse_error(const char *msg)
44 fprintf(stderr, "%s\n", msg);
/* Buffer refill (the enclosing function header is not visible in this
 * excerpt): once the read position has reached the end of the buffered
 * data, read up to sizeof(buf) - MAX_PUTBACK bytes behind the first
 * MAX_PUTBACK bytes of buf and point bufpos/bufend at the new data. */
51 if(bufpos >= bufend) {
52 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
58 bufpos = buf + MAX_PUTBACK;
59 bufend = buf + MAX_PUTBACK + s;
/* trace output of the character held in c */
63 printf("nchar '%c'\n", c);
/* Fragment (function header not visible in this excerpt): p addresses the
 * byte directly in front of bufpos; the cast removes the const qualifier
 * that bufpos carries. */
70 char *p = (char*) bufpos - 1;
/* trace output of the character held in pc */
76 printf("putback '%c'\n", pc);
/**
 * Trigraph replacement. Each MATCH_TRIGRAPH entry pairs a character with
 * its replacement:
 *   '=' -> '#'    '(' -> '['    '/' -> backslash    ')' -> ']'    ''' -> '^'
 *   '<' -> '{'    '!' -> '|'    '>' -> '}'          '-' -> '~'
 * Callers use the int result as a truth value.
 * NOTE(review): the return statements and the macro body are not visible
 * here -- presumably non-zero means a replacement took place; confirm.
 */
82 int replace_trigraph(void)
84 #define MATCH_TRIGRAPH(ch,replacement) \
90 MATCH_TRIGRAPH('=', '#')
91 MATCH_TRIGRAPH('(', '[')
92 MATCH_TRIGRAPH('/', '\\')
93 MATCH_TRIGRAPH(')', ']')
94 MATCH_TRIGRAPH('\'', '^')
95 MATCH_TRIGRAPH('<', '{')
96 MATCH_TRIGRAPH('!', '|')
97 MATCH_TRIGRAPH('>', '}')
98 MATCH_TRIGRAPH('-', '~')
/* Macro wrapping a call to replace_trigraph() and branching on its result.
 * Only part of the body is visible; judging by the parameter names,
 * custom_putback is the caller's put-back action and no_trigraph_code the
 * code run when no trigraph was found -- TODO confirm. */
106 #define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
116 if(replace_trigraph()) { \
/* Macro for line ends: both visible branches increment
 * lexer_token.source_position.linenr, the second one for c == '\n' (the
 * condition of the first branch is not visible in this excerpt).
 * newline_code is supplied by the caller; where it is expanded is not
 * visible here. */
125 #define EAT_NEWLINE(newline_code) \
130 lexer_token.source_position.linenr++; \
132 } else if(c == '\n') { \
134 lexer_token.source_position.linenr++; \
/* NOTE(review): the body of this macro is not visible in this excerpt;
 * presumably it lists the characters a symbol may contain -- confirm. */
138 #define SYMBOL_CHARS \
/**
 * Lexes a symbol. Characters are collected on symbol_obstack, the text is
 * NUL-terminated and passed to symbol_table_insert(). The token type is
 * taken from the ID of the resulting symbol and the symbol itself is stored
 * in lexer_token.v.symbol.
 */
206 void parse_symbol(void)
211 obstack_1grow(&symbol_obstack, c);
223 obstack_1grow(&symbol_obstack, c);
235 if(replace_trigraph())
247 obstack_1grow(&symbol_obstack, '\0');
249 string = obstack_finish(&symbol_obstack);
250 symbol = symbol_table_insert(string);
252 lexer_token.type = symbol->ID;
253 lexer_token.v.symbol = symbol;
/* the symbol refers to a different string than the one just built, so the
 * freshly built copy is released again */
255 if(symbol->string != string) {
256 obstack_free(&symbol_obstack, string);
/**
 * Parses a hexadecimal number. On entry c must hold the 'x' or 'X' of the
 * prefix (asserted). A character that is neither A-F nor a-f (the start of
 * that condition is not visible in this excerpt) is reported as a premature
 * end and the token becomes T_ERROR. Otherwise the digits are accumulated
 * base 16 and delivered as T_INTEGER in lexer_token.v.intvalue.
 */
261 void parse_number_hex(void)
263 assert(c == 'x' || c == 'X');
267 !('A' <= c && c <= 'F') &&
268 !('a' <= c && c <= 'f')) {
269 parse_error("premature end of hex number literal");
270 lexer_token.type = T_ERROR;
277 value = 16 * value + c - '0';
278 } else if ('A' <= c && c <= 'F') {
279 value = 16 * value + c - 'A' + 10;
280 } else if ('a' <= c && c <= 'f') {
281 value = 16 * value + c - 'a' + 10;
283 lexer_token.type = T_INTEGER;
284 lexer_token.v.intvalue = value;
/**
 * Parses an octal number. On entry c must hold 'o' or 'O' (asserted).
 * Digits '0' to '7' are accumulated base 8 and the result is delivered as
 * T_INTEGER in lexer_token.v.intvalue.
 */
292 void parse_number_oct(void)
294 assert(c == 'o' || c == 'O');
299 if ('0' <= c && c <= '7') {
300 value = 8 * value + c - '0';
302 lexer_token.type = T_INTEGER;
303 lexer_token.v.intvalue = value;
/**
 * Parses a decimal number whose first digit has already been read.
 *
 * @param first_char  the first digit; must be in '0'..'9' (asserted). It
 *                    seeds the value, further digits are accumulated base 10.
 *
 * The result is delivered as T_INTEGER in lexer_token.v.intvalue.
 */
311 void parse_number_dec(int first_char)
315 assert(first_char >= '0' && first_char <= '9');
316 value = first_char - '0';
321 value = 10 * value + c - '0';
323 lexer_token.type = T_INTEGER;
324 lexer_token.v.intvalue = value;
/**
 * Dispatches to the parser for the number's base: 'x' selects
 * parse_number_hex(), 'O' selects parse_number_oct(), anything else is
 * parsed as decimal with '0' as the first digit. The switch head and the
 * remaining case labels are not visible in this excerpt.
 */
332 void parse_number(void)
334 // TODO check for overflow
335 // TODO check for various invalid inputs sequences
341 case 'x': parse_number_hex(); break;
343 case 'O': parse_number_oct(); break;
344 default: parse_number_dec('0');
/**
 * Maps an escape character to the character value it denotes and returns
 * that value. The quote characters and a, b, f, n, r, t, v are translated
 * directly. Hex and octal escapes are not implemented yet and only produce
 * an error message; end of file and unknown escape characters are reported
 * as errors as well.
 * NOTE(review): the values returned on the error paths are not visible in
 * this excerpt.
 */
352 int parse_escape_sequence(void)
359 case '"': return '"';
360 case '\'': return'\'';
364 case 'a': return '\a';
365 case 'b': return '\b';
366 case 'f': return '\f';
367 case 'n': return '\n';
368 case 'r': return '\r';
369 case 't': return '\t';
370 case 'v': return '\v';
371 case 'x': /* TODO parse hex number ... */
372 parse_error("hex escape sequences not implemented yet");
382 /* TODO parse octal number ... */
383 parse_error("octal escape sequences not implemented yet");
389 /* might be a trigraph */
391 if(replace_trigraph()) {
399 parse_error("reached end of file while parsing escape sequence");
402 parse_error("unknown escape sequence");
/**
 * Concatenates two NUL-terminated strings.
 *
 * @param s1  first string
 * @param s2  second string, appended to s1
 *
 * The concatenation is built on symbol_obstack and inserted into stringset.
 * If the set hands back a pointer other than the new buffer, the buffer is
 * released again.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the pointer obtained from strset_insert() is returned.
 */
408 const char *concat_strings(const char *s1, const char *s2)
410 size_t len1 = strlen(s1);
411 size_t len2 = strlen(s2);
413 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
414 memcpy(concat, s1, len1);
/* len2 + 1 also copies the terminating NUL of s2 */
415 memcpy(concat + len1, s2, len2 + 1);
417 const char *result = strset_insert(&stringset, concat);
418 if(result != concat) {
419 obstack_free(&symbol_obstack, concat);
/**
 * Lexes a string literal. The characters are grown on symbol_obstack;
 * escape sequences are translated by parse_escape_sequence(). The error
 * path reports "string has no end" and sets the token to T_ERROR. The
 * finished string is NUL-terminated and inserted into stringset; the token
 * becomes T_STRING_LITERAL with v.string set to the pointer returned by the
 * set.
 * NOTE(review): start_linenr records the line number on entry, presumably
 * for the error message -- the argument line of error_prefix_at is not
 * visible in this excerpt.
 */
426 void parse_string_literal(void)
428 unsigned start_linenr = lexer_token.source_position.linenr;
438 obstack_1grow(&symbol_obstack, '?');
446 int ec = parse_escape_sequence();
447 obstack_1grow(&symbol_obstack, ec);
451 error_prefix_at(lexer_token.source_position.input_name,
453 fprintf(stderr, "string has no end\n");
454 lexer_token.type = T_ERROR;
462 obstack_1grow(&symbol_obstack, c);
470 /* TODO: concatenate multiple strings separated by whitespace... */
472 /* add finishing 0 to the string */
473 obstack_1grow(&symbol_obstack, '\0');
474 string = obstack_finish(&symbol_obstack);
476 /* check if there is already a copy of the string */
477 result = strset_insert(&stringset, string);
478 if(result != string) {
479 obstack_free(&symbol_obstack, string);
482 lexer_token.type = T_STRING_LITERAL;
483 lexer_token.v.string = result;
/* Macro for line ends inside the skipping/parsing loops: both visible
 * branches increment lexer_token.source_position.linenr. The branch
 * conditions and the place where the caller-supplied code is expanded are
 * not visible in this excerpt. */
486 #define MATCH_NEWLINE(code) \
492 lexer_token.source_position.linenr++; \
496 lexer_token.source_position.linenr++; \
/**
 * Lexes a character constant. Visible error cases: a newline inside the
 * constant, end of file (the token becomes T_ERROR), and more than one
 * character in the constant. At end_of_char_constant the token is delivered
 * as T_INTEGER with found_char as its value.
 */
500 void parse_character_constant(void)
520 parse_error("newline while parsing character constant");
526 goto end_of_char_constant;
529 parse_error("EOF while parsing character constant");
530 lexer_token.type = T_ERROR;
/* found_char is already non-zero when a further character arrives */
534 if(found_char != 0) {
535 parse_error("more than 1 characters in character "
537 goto end_of_char_constant;
546 end_of_char_constant:
547 lexer_token.type = T_INTEGER;
548 lexer_token.v.intvalue = found_char;
/**
 * Skips a multi-line comment. The line number on entry is remembered in
 * start_linenr. Trigraphs are passed through replace_trigraph(), a line end
 * resets had_star, and reaching the end of the file inside the comment is
 * reported as an error.
 */
552 void skip_multiline_comment(void)
554 unsigned start_linenr = lexer_token.source_position.linenr;
585 if(replace_trigraph())
590 /* we don't put back the 2nd ? as the comment text is discarded
594 MATCH_NEWLINE(had_star = 0; break;)
597 error_prefix_at(lexer_token.source_position.input_name,
599 fprintf(stderr, "at end of file while looking for comment end\n");
/**
 * Skips a line comment. Visible in this excerpt: trigraphs are passed
 * through replace_trigraph() and the current line number is incremented.
 */
610 void skip_line_comment(void)
619 if(replace_trigraph())
622 /* we don't put back the 2nd ? as the comment text is discarded
630 lexer_token.source_position.linenr++;
/* Token examined by the directive handlers; next_pp_token() fills it with a
 * copy of lexer_token. */
646 static token_t pp_token;
/**
 * Reads the next preprocessing token and copies it from lexer_token into
 * pp_token.
 */
649 void next_pp_token(void)
651 lexer_next_preprocessing_token();
652 pp_token = lexer_token;
/**
 * Loops until pp_token is a '\n' token or T_EOF. The loop body is not
 * visible in this excerpt.
 */
656 void eat_until_newline(void)
658 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
/**
 * Handles an #error directive: writes the "#error directive: " line to
 * stderr. How the following tokens are consumed is not visible in this
 * excerpt.
 */
664 void error_directive(void)
667 fprintf(stderr, "#error directive: \n");
669 /* parse pp-tokens until new-line */
/**
 * Handles a #define directive (only its beginning is visible in this
 * excerpt): fetches the next preprocessing token and reports an error
 * unless it is an identifier. The message carries no trailing newline
 * because parse_error() already terminates the line itself.
 */
673 void define_directive(void)
675 lexer_next_preprocessing_token();
676 if(lexer_token.type != T_IDENTIFIER) {
677 parse_error("expected identifier after #define");
/**
 * Directive handler that fetches the next preprocessing token; the check
 * for an identifier is commented out.
 *
 * @param is_ifndef  NOTE(review): not used in the visible lines; presumably
 *                   non-zero selects #ifndef semantics -- confirm.
 */
683 void ifdef_directive(int is_ifndef)
686 lexer_next_preprocessing_token();
687 //expect_identifier();
/* NOTE(review): the body is not visible in this excerpt; presumably the
 * handler for #endif -- confirm. */
692 void endif_directive(void)
/**
 * Applies a line directive from pp_token. A token other than T_INTEGER is
 * reported as "expected integer". The current line number is set to the
 * integer value minus one, and a T_STRING_LITERAL token replaces the
 * current input name.
 * NOTE(review): the "- 1" presumably compensates for the increment at the
 * line end that follows the directive -- confirm.
 */
698 void parse_line_directive(void)
700 if(pp_token.type != T_INTEGER) {
701 parse_error("expected integer");
703 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
706 if(pp_token.type == T_STRING_LITERAL) {
707 lexer_token.source_position.input_name = pp_token.v.string;
/**
 * Dispatches on the pp_ID of the symbol in pp_token, which must be a
 * T_IDENTIFIER token (asserted). Visible actions are a placeholder message
 * for include and a call to parse_line_directive(); the case labels are not
 * visible in this excerpt.
 */
715 void parse_preprocessor_identifier(void)
717 assert(pp_token.type == T_IDENTIFIER);
718 symbol_t *symbol = pp_token.v.symbol;
720 switch(symbol->pp_ID) {
722 printf("include - enable header name parsing!\n");
738 parse_line_directive();
/**
 * Dispatches on the type of pp_token. Visible actions are
 * parse_preprocessor_identifier(), parse_line_directive() and the
 * "invalid preprocessor directive" error; the case labels are not visible
 * in this excerpt.
 *
 * The parameter list is spelled (void) so that the definition provides a
 * prototype, like the other functions in this file.
 */
753 void parse_preprocessor_directive(void)
757 switch(pp_token.type) {
759 parse_preprocessor_identifier();
762 parse_line_directive();
765 parse_error("invalid preprocessor directive");
/*
 * Helper macros used by lexer_next_preprocessing_token() for punctuators
 * that may continue with a further character. Their bodies are only partly
 * visible in this excerpt:
 *   MAYBE_PROLOG         body not visible;
 *   MAYBE(ch, set_type)  sets lexer_token.type to set_type -- judging by its
 *                        uses, when the following character is ch (TODO
 *                        confirm);
 *   ELSE_CODE(code)      fallback containing EAT_NEWLINE and the end of a
 *                        while(1) loop;
 *   ELSE(set_type)       fallback that sets lexer_token.type to set_type.
 */
771 #define MAYBE_PROLOG \
776 #define MAYBE(ch, set_type) \
779 lexer_token.type = set_type; \
782 #define ELSE_CODE(code) \
789 EAT_NEWLINE(break;) \
794 } /* end of while(1) */ \
797 #define ELSE(set_type) \
799 lexer_token.type = set_type; \
/**
 * Produces the next preprocessing token in lexer_token. Line ends are
 * delivered as '\n' tokens, string literals and character constants are
 * handed to their parse functions, and multi-character punctuators are
 * recognised through the MAYBE/ELSE macros. After a comment has been
 * skipped the function calls itself to deliver the token that follows.
 * The end of the input yields T_EOF; an unexpected backslash or an unknown
 * character is reported and yields T_ERROR.
 * The case labels are not visible in this excerpt.
 */
803 void lexer_next_preprocessing_token(void)
813 lexer_token.type = '\n';
826 parse_string_literal();
830 parse_character_constant();
837 lexer_token.source_position.linenr++;
840 parse_error("unexpected '\\' found");
841 lexer_token.type = T_ERROR;
849 MAYBE('.', T_DOTDOTDOT)
853 lexer_token.type = '.';
860 MAYBE('=', T_ANDEQUAL)
864 MAYBE('=', T_ASTERISKEQUAL)
868 MAYBE('+', T_PLUSPLUS)
869 MAYBE('=', T_PLUSEQUAL)
873 MAYBE('-', T_MINUSMINUS)
874 MAYBE('=', T_MINUSEQUAL)
878 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
882 MAYBE('=', T_SLASHEQUAL)
/* the comment is discarded; continue with the token behind it */
885 skip_multiline_comment();
886 lexer_next_preprocessing_token();
891 lexer_next_preprocessing_token();
896 MAYBE('>', T_PERCENTGREATER)
897 MAYBE('=', T_PERCENTEQUAL)
902 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
906 lexer_token.type = T_PERCENTCOLON;
913 MAYBE(':', T_LESSCOLON)
914 MAYBE('%', T_LESSPERCENT)
917 MAYBE('=', T_LESSLESSEQUAL)
924 MAYBE('=', T_GREATERGREATEREQUAL)
925 ELSE(T_GREATERGREATER)
929 MAYBE('=', T_CARETEQUAL)
933 MAYBE('=', T_PIPEEQUAL)
934 MAYBE('|', T_PIPEPIPE)
938 MAYBE('>', T_COLONGREATER)
942 MAYBE('=', T_EQUALEQUAL)
946 MAYBE('#', T_HASHHASH)
951 /* just a simple ? */
953 lexer_token.type = '?';
956 /* might be a trigraph */
958 if(replace_trigraph()) {
963 lexer_token.type = '?';
/* single-character token: the character itself is the token type */
975 lexer_token.type = c;
980 lexer_token.type = T_EOF;
986 fprintf(stderr, "unknown character '%c' found\n", c);
987 lexer_token.type = T_ERROR;
/**
 * Fetches the next token. Newline tokens are not delivered: after one, the
 * function keeps reading preprocessing tokens until a non-newline token
 * turns up, and a '#' token found there starts
 * parse_preprocessor_directive().
 * NOTE(review): the statement guarded by the first check and the loop
 * around the directive handling are not visible in this excerpt.
 */
993 void lexer_next_token(void)
995 lexer_next_preprocessing_token();
996 if(lexer_token.type != '\n')
1001 lexer_next_preprocessing_token();
1002 } while(lexer_token.type == '\n');
1004 if(lexer_token.type == '#') {
1005 parse_preprocessor_directive();
/**
 * Initialises the lexer's string set. exit_lexer() destroys it again.
 */
1010 void init_lexer(void)
1012 strset_init(&stringset);
/**
 * Starts lexing a new input.
 *
 * @param stream      input to read from (its use is not visible in this
 *                    excerpt)
 * @param input_name  name recorded as the input name of the source position
 *
 * The line number of the source position is reset to 0.
 */
1015 void lexer_open_stream(FILE *stream, const char *input_name)
1018 lexer_token.source_position.linenr = 0;
1019 lexer_token.source_position.input_name = input_name;
1021 /* we place a virtual '\n' at the beginning so the lexer knows we're at the
1022 * beginning of a line */
/**
 * Destroys the string set that init_lexer() initialised.
 */
1026 void exit_lexer(void)
1028 strset_destroy(&stringset);
/**
 * Debug helper: prints a source position as "<input_name>:<linenr>" on
 * stdout. Marked unused so that a build without callers does not warn.
 * NOTE(review): elsewhere the line number is handled as unsigned
 * (error_prefix_at, start_linenr); if the linenr field is unsigned, %u is
 * the matching conversion -- confirm against the struct declaration.
 */
1031 static __attribute__((unused))
1032 void dbg_pos(const source_position_t source_position)
1034 fprintf(stdout, "%s:%d\n", source_position.input_name,
1035 source_position.linenr);