5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
// Lexer input buffer. The refill code reads file data starting at
// buf + MAX_PUTBACK, which leaves MAX_PUTBACK bytes free in front of it.
20 static char buf[1024 + MAX_PUTBACK];
// One past the last valid byte in buf; set again after every refill.
21 static const char *bufend;
// Current read position inside buf.
22 static const char *bufpos;
// String set that string literals and concatenated strings are inserted into,
// so equal strings share a single copy.
23 static strset_t stringset;
24 //static FILE **input_stack;
25 //static char **buf_stack;
// Write the prefix "<input_name>:<linenr>: Error: " to stderr.
// No newline is emitted; the caller prints the message text afterwards.
28 void error_prefix_at(const char *input_name, unsigned linenr)
30 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
// Write the error prefix for the source position currently stored in
// lexer_token (input name and line number).
34 void error_prefix(void)
36 error_prefix_at(lexer_token.source_position.input_name,
37 lexer_token.source_position.linenr);
// Report a lexer error: msg is written to stderr followed by a newline, so
// callers pass the message without a trailing line break.
// NOTE(review): statements before the fprintf are not visible in this
// excerpt; presumably the error prefix is printed first - confirm.
41 void parse_error(const char *msg)
44 fprintf(stderr, "%s\n", msg);
// Refill step of the character reader (the function header is not visible in
// this excerpt). Once bufpos has reached bufend, more input is read with
// fread into the area behind the MAX_PUTBACK reserve, and bufpos/bufend are
// reset to delimit the s bytes that were read.
51 if(bufpos >= bufend) {
52 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
58 bufpos = buf + MAX_PUTBACK;
59 bufend = buf + MAX_PUTBACK + s;
// Debug trace of the character held in c.
63 printf("nchar '%c'\n", c);
// Put-back step (the function header is not visible in this excerpt): p
// addresses the byte directly in front of the current read position.
// NOTE(review): presumably pc is stored through p and bufpos is moved back
// to p - confirm against the full function.
70 char *p = (char*) bufpos - 1;
// Debug trace of the character pc.
76 printf("putback '%c'\n", pc);
// Trigraph replacement table. Each MATCH_TRIGRAPH(ch, replacement) entry
// pairs a character with the character it is replaced by:
//   '=' -> '#'    '(' -> '['    '/' -> backslash    ')' -> ']'    '\'' -> '^'
//   '<' -> '{'    '!' -> '|'    '>' -> '}'    '-' -> '~'
// Callers use the int result as a condition, e.g. if(replace_trigraph()).
// NOTE(review): the macro body and the return statements are not visible
// here; presumably a nonzero result means a trigraph was replaced - confirm.
82 int replace_trigraph(void)
84 #define MATCH_TRIGRAPH(ch,replacement) \
90 MATCH_TRIGRAPH('=', '#')
91 MATCH_TRIGRAPH('(', '[')
92 MATCH_TRIGRAPH('/', '\\')
93 MATCH_TRIGRAPH(')', ']')
94 MATCH_TRIGRAPH('\'', '^')
95 MATCH_TRIGRAPH('<', '{')
96 MATCH_TRIGRAPH('!', '|')
97 MATCH_TRIGRAPH('>', '}')
98 MATCH_TRIGRAPH('-', '~')
// Trigraph helper macro built on replace_trigraph(). Going by its parameter
// names it takes a put-back action and the code to run when no trigraph is
// present; most of the macro body is not visible in this excerpt.
106 #define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
116 if(replace_trigraph()) { \
// Newline handling macro: the line counter of the current source position is
// incremented in two branches, the second of which is taken for c == '\n'.
// NOTE(review): the condition of the first branch is not visible here;
// presumably it covers a carriage return - confirm.
125 #define EAT_NEWLINE(newline_code) \
130 lexer_token.source_position.linenr++; \
132 } else if(c == '\n') { \
134 lexer_token.source_position.linenr++; \
// NOTE(review): the macro body is not visible in this excerpt; presumably it
// lists the case labels for characters that may appear in a symbol - confirm.
138 #define SYMBOL_CHARS \
// Lex a symbol. Its characters are collected on symbol_obstack, the string is
// NUL-terminated and passed to symbol_table_insert(). The token type is set
// to the ID of the returned symbol and the symbol itself is stored in the
// token value.
206 void parse_symbol(void)
211 obstack_1grow(&symbol_obstack, c);
223 obstack_1grow(&symbol_obstack, c);
235 if(replace_trigraph())
247 obstack_1grow(&symbol_obstack, '\0');
249 string = obstack_finish(&symbol_obstack);
250 symbol = symbol_table_insert(string);
252 lexer_token.type = symbol->ID;
253 lexer_token.v.symbol = symbol;
// The returned symbol refers to a different string than the one just built,
// so the fresh copy on the obstack is released again.
255 if(symbol->string != string) {
256 obstack_free(&symbol_obstack, string);
// Lex the digits of a hexadecimal integer literal. On entry c must be the
// 'x' or 'X' of the prefix (asserted). When no hex digit follows, an error is
// reported and the token type becomes T_ERROR. Otherwise the digits are
// accumulated base 16 and the token becomes T_INTEGER with the result in
// v.intvalue. No overflow check is visible in this excerpt.
261 void parse_number_hex(void)
263 assert(c == 'x' || c == 'X');
267 !('A' <= c && c <= 'F') &&
268 !('a' <= c && c <= 'f')) {
269 parse_error("premature end of hex number literal");
270 lexer_token.type = T_ERROR;
277 value = 16 * value + c - '0';
278 } else if ('A' <= c && c <= 'F') {
279 value = 16 * value + c - 'A' + 10;
280 } else if ('a' <= c && c <= 'f') {
281 value = 16 * value + c - 'a' + 10;
283 lexer_token.type = T_INTEGER;
284 lexer_token.v.intvalue = value;
// Lex the digits of an octal integer literal. On entry c must be the 'o' or
// 'O' marker (asserted). Digits '0'..'7' are accumulated base 8 and the token
// becomes T_INTEGER with the result in v.intvalue.
292 void parse_number_oct(void)
294 assert(c == 'o' || c == 'O');
299 if ('0' <= c && c <= '7') {
300 value = 8 * value + c - '0';
302 lexer_token.type = T_INTEGER;
303 lexer_token.v.intvalue = value;
// Lex a decimal integer literal. first_char is the digit that was already
// consumed (asserted to be '0'..'9') and seeds the value; further digits are
// accumulated base 10. The token becomes T_INTEGER with the result in
// v.intvalue.
311 void parse_number_dec(int first_char)
315 assert(first_char >= '0' && first_char <= '9');
316 value = first_char - '0';
321 value = 10 * value + c - '0';
323 lexer_token.type = T_INTEGER;
324 lexer_token.v.intvalue = value;
// Lex an integer literal. The visible dispatch hands 'x' to
// parse_number_hex(), 'O' to parse_number_oct() and everything else to
// parse_number_dec('0'). Afterwards the integer suffix is consumed: an
// unsigned marker (u/U) optionally followed by one or two long markers (l/L),
// or the long markers first, optionally followed by the unsigned marker.
// The suffix is only skipped for now, see the TODO below.
332 void parse_number(void)
334 // TODO check for overflow
335 // TODO check for various invalid input sequences
341 case 'x': parse_number_hex(); break;
343 case 'O': parse_number_oct(); break;
344 default: parse_number_dec('0');
// Accept both spellings of the unsigned marker, like the checks further down.
349 if(c == 'u' || c == 'U') {
350 /* TODO do something with the suffixes... */
352 if(c == 'L' || c == 'l') {
354 if(c == 'L' || c == 'l') {
358 } else if(c == 'l' || c == 'L') {
360 if(c == 'l' || c == 'L') {
362 if(c == 'u' || c == 'U') {
365 } else if(c == 'u' || c == 'U') {
// Read the digits of an octal escape sequence and return their value.
// Accumulation is base 8; a character outside '0'..'7' ends the digits.
371 static int parse_octal_sequence(void)
375 if(c < '0' || c > '7')
377 value = 8 * value + c - '0';
// Read the digits of a hexadecimal escape sequence and return their value.
// Decimal digits and the letters a-f / A-F are accumulated base 16.
384 static int parse_hex_sequence(void)
388 if (c >= '0' && c <= '9') {
389 value = 16 * value + c - '0';
390 } else if ('A' <= c && c <= 'F') {
391 value = 16 * value + c - 'A' + 10;
392 } else if ('a' <= c && c <= 'f') {
393 value = 16 * value + c - 'a' + 10;
// Decode the escape sequence that follows a backslash and return the
// character value it denotes. Simple escapes map directly to their character;
// hexadecimal and octal escapes are delegated to parse_hex_sequence() and
// parse_octal_sequence(). A question mark may turn out to be a trigraph.
// End of file and unknown escapes are reported through parse_error().
403 static int parse_escape_sequence(void)
410 case '"': return '"';
411 case '\'': return'\'';
415 case 'a': return '\a';
416 case 'b': return '\b';
417 case 'f': return '\f';
418 case 'n': return '\n';
419 case 'r': return '\r';
420 case 't': return '\t';
421 case 'v': return '\v';
423 return parse_hex_sequence();
432 return parse_octal_sequence();
437 /* might be a trigraph */
439 if(replace_trigraph()) {
447 parse_error("reached end of file while parsing escape sequence");
450 parse_error("unknown escape sequence");
// Build the concatenation of s1 and s2 in memory taken from symbol_obstack
// and insert it into stringset.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the pointer obtained from strset_insert() is returned - confirm.
456 const char *concat_strings(const char *s1, const char *s2)
458 size_t len1 = strlen(s1);
459 size_t len2 = strlen(s2);
461 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
462 memcpy(concat, s1, len1);
// len2 + 1 also copies the terminating NUL of s2.
463 memcpy(concat + len1, s2, len2 + 1);
465 const char *result = strset_insert(&stringset, concat);
// The set handed back a different pointer, so the new copy is not needed.
466 if(result != concat) {
467 obstack_free(&symbol_obstack, concat);
// Lex a string literal. Characters and decoded escape sequences are collected
// on symbol_obstack; the finished string is NUL-terminated and inserted into
// stringset so that equal literals share storage. On success the token
// becomes T_STRING_LITERAL with the shared string in v.string. If the string
// is never closed, "string has no end" is reported and the token becomes
// T_ERROR. The line number at entry is saved in start_linenr.
474 void parse_string_literal(void)
476 unsigned start_linenr = lexer_token.source_position.linenr;
486 obstack_1grow(&symbol_obstack, '?');
494 int ec = parse_escape_sequence();
495 obstack_1grow(&symbol_obstack, ec);
499 error_prefix_at(lexer_token.source_position.input_name,
501 fprintf(stderr, "string has no end\n");
502 lexer_token.type = T_ERROR;
510 obstack_1grow(&symbol_obstack, c);
518 /* TODO: concatenate multiple strings separated by whitespace... */
520 /* add finishing 0 to the string */
521 obstack_1grow(&symbol_obstack, '\0');
522 string = obstack_finish(&symbol_obstack);
524 /* check if there is already a copy of the string */
525 result = strset_insert(&stringset, string);
526 if(result != string) {
527 obstack_free(&symbol_obstack, string);
530 lexer_token.type = T_STRING_LITERAL;
531 lexer_token.v.string = result;
// Newline matching macro: the visible parts increment the line counter of the
// current source position in two places. code is the macro's only parameter.
// NOTE(review): the branch conditions are not visible in this excerpt.
534 #define MATCH_NEWLINE(code) \
540 lexer_token.source_position.linenr++; \
544 lexer_token.source_position.linenr++; \
// Lex a character constant. Escape sequences are decoded through
// parse_escape_sequence(). A newline, more than one character, or end of file
// inside the constant is reported through parse_error(); end of file also
// sets the token type to T_ERROR. At end_of_char_constant the token becomes
// T_INTEGER carrying found_char as its value.
548 void parse_character_constant(void)
565 found_char = parse_escape_sequence();
569 parse_error("newline while parsing character constant");
575 goto end_of_char_constant;
578 parse_error("EOF while parsing character constant");
579 lexer_token.type = T_ERROR;
583 if(found_char != 0) {
584 parse_error("more than 1 characters in character "
586 goto end_of_char_constant;
595 end_of_char_constant:
596 lexer_token.type = T_INTEGER;
597 lexer_token.v.intvalue = found_char;
// Skip the text of a block comment. Trigraphs are checked through
// replace_trigraph(), newlines go through MATCH_NEWLINE and reset had_star,
// and an error is printed when the file ends before the comment is closed.
// The line number at entry is saved in start_linenr.
601 void skip_multiline_comment(void)
603 unsigned start_linenr = lexer_token.source_position.linenr;
634 if(replace_trigraph())
639 /* we don't put back the 2nd ? as the comment text is discarded
643 MATCH_NEWLINE(had_star = 0; break;)
646 error_prefix_at(lexer_token.source_position.input_name,
648 fprintf(stderr, "at end of file while looking for comment end\n");
// Skip the text of a line comment. Trigraphs are checked through
// replace_trigraph() and the line counter of the current source position is
// incremented.
659 void skip_line_comment(void)
668 if(replace_trigraph())
671 /* we don't put back the 2nd ? as the comment text is discarded
679 lexer_token.source_position.linenr++;
// Copy of the most recent token fetched through next_pp_token(); the
// directive handling code inspects this instead of lexer_token.
695 static token_t pp_token;
// Scan the next preprocessing token and copy it from lexer_token to pp_token.
698 void next_pp_token(void)
700 lexer_next_preprocessing_token();
701 pp_token = lexer_token;
// Loop for as long as pp_token is neither a '\n' token nor T_EOF.
// NOTE(review): the loop body is not visible in this excerpt; presumably it
// fetches the next preprocessing token - confirm.
705 void eat_until_newline(void)
707 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
// Handle an #error directive: prints a fixed "#error directive: " line to
// stderr; the rest of the handling is not visible in this excerpt.
713 void error_directive(void)
716 fprintf(stderr, "#error directive: \n");
718 /* parse pp-tokens until new-line */
// Handle a #define directive: the next preprocessing token must be an
// identifier, otherwise an error is reported.
722 void define_directive(void)
724 lexer_next_preprocessing_token();
725 if(lexer_token.type != T_IDENTIFIER) {
// parse_error() appends the newline itself, so the message carries none.
726 parse_error("expected identifier after #define");
// Handle a conditional directive; the is_ifndef parameter name suggests it
// selects the negated form. The visible part only fetches the next
// preprocessing token; the identifier check is still commented out.
732 void ifdef_directive(int is_ifndef)
735 lexer_next_preprocessing_token();
736 //expect_identifier();
// NOTE(review): the body is not visible in this excerpt; going by its name
// this is presumably the handler for the end of a conditional block.
741 void endif_directive(void)
// Handle a line directive. pp_token must be an integer, otherwise
// "expected integer" is reported. The current line number is set to that
// integer minus one, and if a string literal token is then seen it replaces
// the current input name.
// NOTE(review): the minus one presumably compensates for the newline that
// ends the directive - confirm.
747 void parse_line_directive(void)
749 if(pp_token.type != T_INTEGER) {
750 parse_error("expected integer");
752 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
755 if(pp_token.type == T_STRING_LITERAL) {
756 lexer_token.source_position.input_name = pp_token.v.string;
// Dispatch a directive whose name is an identifier. pp_token must be
// T_IDENTIFIER (asserted); the switch selects the handler from the symbol's
// pp_ID. The visible cases print a placeholder message for include and call
// parse_line_directive().
764 void parse_preprocessor_identifier(void)
766 assert(pp_token.type == T_IDENTIFIER);
767 symbol_t *symbol = pp_token.v.symbol;
769 switch(symbol->pp_ID) {
771 printf("include - enable header name parsing!\n");
787 parse_line_directive();
// Dispatch a preprocessor directive on the type of pp_token. The visible
// handlers are parse_preprocessor_identifier() and parse_line_directive();
// any other token type is reported as an invalid directive.
// The parameter list is spelled (void) like every other function in this
// file, so the definition provides a proper prototype.
802 void parse_preprocessor_directive(void)
806 switch(pp_token.type) {
808 parse_preprocessor_identifier();
811 parse_line_directive();
814 parse_error("invalid preprocessor directive");
// First macro of the MAYBE / ELSE family used by
// lexer_next_preprocessing_token(); its body is not visible in this excerpt.
820 #define MAYBE_PROLOG \
// MAYBE(ch, set_type): the visible part sets the token type to set_type.
// NOTE(review): presumably this happens only when the next character equals
// ch - the condition is not visible in this excerpt.
825 #define MAYBE(ch, set_type) \
828 lexer_token.type = set_type; \
// ELSE_CODE(code): macro with a caller-supplied code parameter. The visible
// part handles newlines through EAT_NEWLINE and closes a while(1) loop.
831 #define ELSE_CODE(code) \
838 EAT_NEWLINE(break;) \
843 } /* end of while(1) */ \
// ELSE(set_type): sets the token type to set_type; in the scanner it is
// written after the MAYBE entries of a chain.
846 #define ELSE(set_type) \
848 lexer_token.type = set_type; \
// Scan the next preprocessing token into lexer_token.
// - A newline is reported as a '\n' token.
// - String literals and character constants are handed to their parsers.
// - After a comment has been skipped the function calls itself again to
//   deliver the token that follows.
// - Operators are resolved with MAYBE / ELSE chains, e.g. the chain that ends
//   in ELSE(T_GREATERGREATER) first offers T_GREATERGREATEREQUAL.
// - A question mark is either a plain '?' token or the start of a trigraph.
// - A stray backslash and an unknown character are reported as errors and
//   produce T_ERROR; end of input produces T_EOF.
852 void lexer_next_preprocessing_token(void)
862 lexer_token.type = '\n';
875 parse_string_literal();
879 parse_character_constant();
886 lexer_token.source_position.linenr++;
889 parse_error("unexpected '\\' found");
890 lexer_token.type = T_ERROR;
898 MAYBE('.', T_DOTDOTDOT)
902 lexer_token.type = '.';
909 MAYBE('=', T_ANDEQUAL)
913 MAYBE('=', T_ASTERISKEQUAL)
917 MAYBE('+', T_PLUSPLUS)
918 MAYBE('=', T_PLUSEQUAL)
922 MAYBE('>', T_MINUSGREATER)
923 MAYBE('-', T_MINUSMINUS)
924 MAYBE('=', T_MINUSEQUAL)
928 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
932 MAYBE('=', T_SLASHEQUAL)
935 skip_multiline_comment();
936 lexer_next_preprocessing_token();
941 lexer_next_preprocessing_token();
946 MAYBE('>', T_PERCENTGREATER)
947 MAYBE('=', T_PERCENTEQUAL)
952 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
956 lexer_token.type = T_PERCENTCOLON;
963 MAYBE(':', T_LESSCOLON)
964 MAYBE('%', T_LESSPERCENT)
965 MAYBE('=', T_LESSEQUAL)
968 MAYBE('=', T_LESSLESSEQUAL)
973 MAYBE('=', T_GREATEREQUAL)
976 MAYBE('=', T_GREATERGREATEREQUAL)
977 ELSE(T_GREATERGREATER)
981 MAYBE('=', T_CARETEQUAL)
985 MAYBE('=', T_PIPEEQUAL)
986 MAYBE('|', T_PIPEPIPE)
990 MAYBE('>', T_COLONGREATER)
994 MAYBE('=', T_EQUALEQUAL)
998 MAYBE('#', T_HASHHASH)
1003 /* just a simple ? */
1005 lexer_token.type = '?';
1008 /* might be a trigraph */
1010 if(replace_trigraph()) {
1015 lexer_token.type = '?';
1027 lexer_token.type = c;
1032 lexer_token.type = T_EOF;
1038 fprintf(stderr, "unknown character '%c' found\n", c);
1039 lexer_token.type = T_ERROR;
// Fetch the next token on top of the preprocessing-token scanner. When a
// '\n' token is seen, further tokens are scanned until one is not a newline;
// if that token is '#', preprocessor directive handling is started.
// NOTE(review): the statement guarded by the first if is not visible here;
// presumably a non-newline token is returned right away - confirm.
1045 void lexer_next_token(void)
1047 lexer_next_preprocessing_token();
1048 if(lexer_token.type != '\n')
1053 lexer_next_preprocessing_token();
1054 } while(lexer_token.type == '\n');
1056 if(lexer_token.type == '#') {
1057 parse_preprocessor_directive();
// Initialise the lexer's string set; paired with exit_lexer().
1062 void init_lexer(void)
1064 strset_init(&stringset);
// Prepare the lexer for a new input: the line counter is reset to 0 and
// input_name is recorded as the name used in source positions.
// NOTE(review): how stream is stored is not visible in this excerpt.
1067 void lexer_open_stream(FILE *stream, const char *input_name)
1070 lexer_token.source_position.linenr = 0;
1071 lexer_token.source_position.input_name = input_name;
1073 /* we place a virtual '\n' at the beginning so the lexer knows we're at the
1074 * beginning of a line */
// Destroy the lexer's string set; counterpart of init_lexer().
1078 void exit_lexer(void)
1080 strset_destroy(&stringset);
// Debug helper that prints a source position as "<input_name>:<linenr>" to
// stdout. It is marked unused so that no warning is raised while nothing
// calls it.
// NOTE(review): linenr is printed with %d, whereas error_prefix_at() takes an
// unsigned line number and prints it with %u - confirm the field type and
// align the conversion specifier.
1083 static __attribute__((unused))
1084 void dbg_pos(const source_position_t source_position)
1086 fprintf(stdout, "%s:%d\n", source_position.input_name,
1087 source_position.linenr);