5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
/* Input buffer. next_real_char() fills it with fread() starting at
 * buf + MAX_PUTBACK, so the first MAX_PUTBACK bytes stay in front of the
 * data that was read. */
22 static char buf[1024 + MAX_PUTBACK];
/* One past the last byte delivered by the most recent fread(). */
23 static const char *bufend;
/* Current read position inside buf. */
24 static const char *bufpos;
/* Set of strings; string literals are inserted here so that equal strings
 * share one stored copy (see parse_string_literal and concat_strings). */
25 static strset_t stringset;
/*
 * Writes the diagnostic prefix <input_name>:<linenr>: Error: to stderr.
 * The prefix ends with a space and no newline, so the caller is expected to
 * print the message text right after it.
 *
 * input_name: name of the input that is reported
 * linenr:     line number that is reported
 */
27 static void error_prefix_at(const char *input_name, unsigned linenr)
29 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
/*
 * Writes the error prefix for the source position currently stored in
 * lexer_token (input name and line number).
 */
32 static void error_prefix(void)
34 error_prefix_at(lexer_token.source_position.input_name,
35 lexer_token.source_position.linenr);
/*
 * Reports a lexer error on stderr: msg is printed followed by a newline, so
 * callers pass the message without a trailing newline.
 * NOTE(review): the lines between the signature and the fprintf are not part
 * of this excerpt; presumably the position prefix is printed there first.
 *
 * msg: message text
 */
38 static void parse_error(const char *msg)
41 fprintf(stderr, "%s\n", msg);
/*
 * Raw character reader. The visible part is the buffer refill: once bufpos
 * has reached bufend, up to sizeof(buf) - MAX_PUTBACK bytes are read with
 * fread() into the area behind the first MAX_PUTBACK bytes of buf, and
 * bufpos/bufend are reset to delimit the bytes that were read.
 * The stream argument of fread() and the handling of a zero-length read are
 * in lines that are not part of this excerpt.
 */
44 static inline void next_real_char(void)
47 if(bufpos >= bufend) {
48 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
/* new data always starts behind the MAX_PUTBACK bytes at the buffer start */
54 bufpos = buf + MAX_PUTBACK;
/* s is the number of bytes fread() actually delivered */
55 bufend = buf + MAX_PUTBACK + s;
/*
 * Puts the character pc back into the input buffer.
 *
 * The asserts document the preconditions: bufpos must not lie before buf,
 * and either bufpos lies within the first MAX_PUTBACK bytes of buf or the
 * character at bufpos equals pc.
 * p is a writable pointer to the same position as bufpos; it is computed
 * from the offset because bufpos itself is a pointer to const.
 * NOTE(review): the statements that store pc and move bufpos are not part of
 * this excerpt.
 *
 * pc: the character to put back
 */
60 static inline void put_back(int pc)
62 assert(bufpos >= buf);
63 assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
65 char *p = buf + (bufpos - buf);
68 /* going backwards in the buffer is legal as long as it's not more often
73 printf("putback '%c'\n", pc);
// next_char() is declared ahead of its definition because the eat() macro
// below expands to a call of it.
//
// MATCH_NEWLINE(code): only fragments of the macro body are visible in this
// excerpt; both visible fragments increment
// lexer_token.source_position.linenr. The code argument is presumably run
// once a newline sequence has been matched (callers pass return; and break;)
// - confirm against the full macro.
//
// eat(c_type): asserts that the current character c equals c_type and then
// advances with next_char(). The do/while(0) wrapper makes the expansion
// behave as a single statement.
77 static inline void next_char(void);
79 #define MATCH_NEWLINE(code) \
85 lexer_token.source_position.linenr++; \
89 lexer_token.source_position.linenr++; \
92 #define eat(c_type) do { assert(c == c_type); next_char(); } while(0)
// Called by next_char() after a backslash has been seen, either directly or
// as the result of the backslash trigraph. The visible MATCH_NEWLINE(return;)
// shows that the function returns as soon as a newline sequence is matched.
// NOTE(review): presumably this joins a line ending in a backslash with the
// following line; the remaining statements are not part of this excerpt.
94 static void maybe_concat_lines(void)
99 MATCH_NEWLINE(return;)
// Character reader used by the token functions; presumably it stores the
// next input character in c (the read itself is not part of this excerpt).
// Visible translation steps:
//  - a backslash is handed to maybe_concat_lines();
//  - any character other than a question mark is passed through unchanged;
//  - in the trigraph switch the selector character is replaced by the
//    character the trigraph stands for.
// The control flow between the visible statements should be checked against
// the full file.
109 static inline void next_char(void)
114 /* filter trigraphs */
115 if(UNLIKELY(c == '\\')) {
116 maybe_concat_lines();
117 goto end_of_next_char;
121 goto end_of_next_char;
/* common case: not a question mark, so no trigraph can start here */
124 if(LIKELY(c != '?')) {
127 goto end_of_next_char;
/* trigraph selector character -> replacement character */
132 case '=': c = '#'; break;
133 case '(': c = '['; break;
/* the trigraph for a backslash may also start a line continuation */
134 case '/': c = '\\'; maybe_concat_lines(); break;
135 case ')': c = ']'; break;
136 case '\'': c = '^'; break;
137 case '<': c = '{'; break;
138 case '!': c = '|'; break;
139 case '>': c = '}'; break;
140 case '-': c = '~'; break;
/* names the function without calling it; NOTE(review): presumably silences
 * an unused-function warning in a build variant where the calls above are
 * compiled out - confirm */
150 (void) maybe_concat_lines;
/* debug trace of the character that was produced */
152 printf("nchar '%c'\n", c);
/* SYMBOL_CHARS: the macro body is not part of this excerpt.
 * NOTE(review): presumably the list of case labels for characters that may
 * appear in an identifier - confirm against the full file. */
156 #define SYMBOL_CHARS \
/*
 * Lexes an identifier or keyword.
 *
 * The characters are collected on symbol_obstack, terminated with a 0 byte
 * and looked up in (or added to) the symbol table. The token type is taken
 * from the ID of the symbol and the symbol itself is stored in the token.
 * The loop that decides which characters belong to the symbol is not part
 * of this excerpt.
 */
223 static void parse_symbol(void)
228 obstack_1grow(&symbol_obstack, c);
235 obstack_1grow(&symbol_obstack, c);
/* terminate the collected name */
245 obstack_1grow(&symbol_obstack, '\0');
247 string = obstack_finish(&symbol_obstack);
248 symbol = symbol_table_insert(string);
250 lexer_token.type = symbol->ID;
251 lexer_token.v.symbol = symbol;
/* the symbol refers to a different string object, so the copy that was just
 * built is not referenced by it and is released again */
253 if(symbol->string != string) {
254 obstack_free(&symbol_obstack, string);
/*
 * Consumes an optional integer suffix that starts at the current character c.
 *
 * Visible checks: an unsigned marker (u or U) that may be followed by up to
 * two long markers (l or L), or a long marker that may be followed by a
 * second long marker and/or an unsigned marker. The suffix is only skipped;
 * as the TODO notes, it does not yet influence the token.
 * The statements that advance the input after each check are not part of
 * this excerpt.
 */
258 static void parse_integer_suffix(void)
/* both spellings of the unsigned marker are accepted; the test used to
 * compare against 'U' twice, so a lower-case u was never recognised here */
260 if(c == 'u' || c == 'U') {
261 /* TODO do something with the suffixes... */
263 if(c == 'L' || c == 'l') {
265 if(c == 'L' || c == 'l') {
269 } else if(c == 'l' || c == 'L') {
271 if(c == 'l' || c == 'L') {
273 if(c == 'u' || c == 'U') {
276 } else if(c == 'u' || c == 'U') {
/*
 * Handles the suffix of a floating point literal. Apart from the TODO the
 * body is not part of this excerpt; according to the TODO the suffix does
 * not yet influence the token.
 */
282 static void parse_floating_suffix(void)
285 /* TODO: do something useful with the suffixes... */
/*
 * Lexes a hexadecimal integer literal.
 *
 * On entry c is the x or X of the prefix (asserted). If the following
 * character is not a hexadecimal digit, an error is reported and the token
 * type becomes T_ERROR. Otherwise the digits are accumulated in base 16,
 * parse_integer_suffix() is called and the token becomes T_INTEGER with the
 * accumulated value.
 * Hexadecimal floating point literals (a following '.', p or P) are not
 * supported and end in panic().
 * The loop structure and the declaration of value are not part of this
 * excerpt.
 */
297 static void parse_number_hex(void)
299 assert(c == 'x' || c == 'X');
/* tail of the check that at least one hexadecimal digit follows the prefix */
303 !('A' <= c && c <= 'F') &&
304 !('a' <= c && c <= 'f')) {
305 parse_error("premature end of hex number literal");
306 lexer_token.type = T_ERROR;
/* digit accumulation: decimal digits, then upper and lower case A-F */
313 value = 16 * value + c - '0';
314 } else if ('A' <= c && c <= 'F') {
315 value = 16 * value + c - 'A' + 10;
316 } else if ('a' <= c && c <= 'f') {
317 value = 16 * value + c - 'a' + 10;
319 parse_integer_suffix();
321 lexer_token.type = T_INTEGER;
322 lexer_token.v.intvalue = value;
328 if(c == '.' || c == 'p' || c == 'P') {
330 panic("Hex floating point numbers not implemented yet");
/*
 * Lexes an octal integer literal.
 *
 * Digits 0 to 7 are accumulated in base 8. A digit 8 or 9 directly after the
 * octal digits is reported as an invalid octal number and the token type is
 * set to T_ERROR. The visible success path stores a T_INTEGER token with the
 * accumulated value and then calls parse_integer_suffix().
 * How the error branch leaves the function is not part of this excerpt.
 */
334 static void parse_number_oct(void)
337 while(c >= '0' && c <= '7') {
338 value = 8 * value + c - '0';
341 if (c == '8' || c == '9') {
342 parse_error("invalid octal number");
343 lexer_token.type = T_ERROR;
347 lexer_token.type = T_INTEGER;
348 lexer_token.v.intvalue = value;
350 parse_integer_suffix();
/*
 * Parses the exponent of a decimal floating point literal and stores the
 * resulting T_FLOATINGPOINT token, then handles the floating suffix.
 *
 * value: the number parsed before the exponent
 *
 * Visible steps: an explicit plus sign is recognised and the decimal
 * exponent digits are accumulated in expo. The handling of a minus sign and
 * the code that applies expo and factor to value are in lines that are not
 * part of this excerpt.
 */
353 static void parse_floatingpoint_exponent(long double value)
355 unsigned int expo = 0;
356 long double factor = 10.;
361 } else if(c == '+') {
365 while(c >= '0' && c <= '9') {
366 expo = 10 * expo + (c - '0');
379 lexer_token.type = T_FLOATINGPOINT;
380 lexer_token.v.floatvalue = value;
382 parse_floating_suffix();
/*
 * Parses the fractional digits of a decimal floating point literal.
 *
 * integer_part: value of the digits in front of the decimal point
 *
 * Each fractional digit is added to value scaled by factor (the update of
 * factor per digit is not part of this excerpt). If an e or E follows, the
 * rest is delegated to parse_floatingpoint_exponent(); the other visible
 * path stores a T_FLOATINGPOINT token with the value and handles the
 * floating suffix.
 */
385 static void parse_floatingpoint_fract(int integer_part)
387 long double value = integer_part;
388 long double factor = 1.;
390 while(c >= '0' && c <= '9') {
392 value += (c - '0') * factor;
396 if(c == 'e' || c == 'E') {
398 parse_floatingpoint_exponent(value);
402 lexer_token.type = T_FLOATINGPOINT;
403 lexer_token.v.floatvalue = value;
405 parse_floating_suffix();
/*
 * Lexes a decimal number.
 *
 * The digits are accumulated in base 10. The number is then continued as a
 * floating point literal through parse_floatingpoint_fract() or, when an e
 * or E follows, through parse_floatingpoint_exponent(). The remaining
 * visible path handles the integer suffix and stores a T_INTEGER token.
 * The conditions in front of the fraction branch and the declaration of
 * value are not part of this excerpt.
 */
408 static void parse_number_dec(void)
413 value = 10 * value + c - '0';
419 parse_floatingpoint_fract(value);
422 if(c == 'e' || c == 'E') {
424 parse_floatingpoint_exponent(value);
427 parse_integer_suffix();
429 lexer_token.type = T_INTEGER;
430 lexer_token.v.intvalue = value;
/*
 * Entry point for numeric literals. Most of the dispatch logic is not part
 * of this excerpt. The visible lines show that a literal can continue as a
 * fraction or as an exponent with the value 0 in front of it, and that an
 * invalid octal number is reported and produces a T_ERROR token.
 * NOTE(review): presumably this function chooses between the hexadecimal,
 * octal and decimal parsers - confirm against the full file.
 */
433 static void parse_number(void)
454 parse_floatingpoint_fract(0);
458 parse_floatingpoint_exponent(0);
463 parse_error("invalid octal number");
464 lexer_token.type = T_ERROR;
/*
 * Returns non-zero if chr is one of the characters 0 to 7, else zero.
 *
 * chr: the character to test
 */
477 static inline int is_octal_digit(int chr)
479 return '0' <= chr && chr <= '7';
/*
 * Parses the remainder of an octal escape sequence and returns its value.
 *
 * first_digit: the first octal digit of the sequence (asserted to be one)
 *
 * The value starts with first_digit; the visible code then takes up to two
 * further octal digits from the input and returns early as soon as the
 * current character is not an octal digit. The statements that advance the
 * input and the final return are not part of this excerpt.
 */
482 static int parse_octal_sequence(const int first_digit)
484 assert(is_octal_digit(first_digit));
485 int value = first_digit - '0';
486 if (!is_octal_digit(c)) return value;
487 value = 8 * value + c - '0';
489 if (!is_octal_digit(c)) return value;
490 value = 8 * value + c - '0';
/*
 * Parses the digits of a hexadecimal escape sequence and accumulates them in
 * base 16; decimal digits and both upper and lower case letters a to f are
 * accepted. The surrounding loop, the declaration of value and the return
 * statement are not part of this excerpt.
 */
495 static int parse_hex_sequence(void)
499 if (c >= '0' && c <= '9') {
500 value = 16 * value + c - '0';
501 } else if ('A' <= c && c <= 'F') {
502 value = 16 * value + c - 'A' + 10;
503 } else if ('a' <= c && c <= 'f') {
504 value = 16 * value + c - 'a' + 10;
/*
 * Parses an escape sequence inside a string literal or character constant
 * and returns the character value it denotes.
 *
 * The simple escapes are mapped directly to their character values;
 * hexadecimal and octal escapes are delegated to parse_hex_sequence() and
 * parse_octal_sequence(). End of input and unknown escapes are reported with
 * parse_error(). The switch head, the case labels of the numeric escapes and
 * the values returned after an error are not part of this excerpt.
 */
514 static int parse_escape_sequence(void)
/* simple escapes: the character after the backslash selects the value */
522 case '"': return '"';
523 case '\'': return '\'';
524 case '\\': return '\\';
525 case '?': return '\?';
526 case 'a': return '\a';
527 case 'b': return '\b';
528 case 'f': return '\f';
529 case 'n': return '\n';
530 case 'r': return '\r';
531 case 't': return '\t';
532 case 'v': return '\v';
/* numeric escapes */
534 return parse_hex_sequence();
/* ec holds the first octal digit of the sequence */
543 return parse_octal_sequence(ec);
545 parse_error("reached end of file while parsing escape sequence");
548 parse_error("unknown escape sequence");
/*
 * Builds the concatenation of s1 and s2 and enters it into the string set.
 *
 * s1, s2: 0-terminated strings (both are passed to strlen)
 *
 * The concatenation is allocated on symbol_obstack. If the string set hands
 * back a different pointer than the new copy, the new copy is released again.
 * NOTE(review): the return statement is not part of this excerpt;
 * presumably result is returned.
 */
553 const char *concat_strings(const char *s1, const char *s2)
555 size_t len1 = strlen(s1);
556 size_t len2 = strlen(s2);
/* room for both strings plus one terminating 0 byte */
558 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
559 memcpy(concat, s1, len1);
/* len2 + 1 also copies the terminating 0 byte of s2 */
560 memcpy(concat + len1, s2, len2 + 1);
562 const char *result = strset_insert(&stringset, concat);
563 if(result != concat) {
564 obstack_free(&symbol_obstack, concat);
/*
 * Lexes a string literal into a T_STRING_LITERAL token.
 *
 * The characters are collected on symbol_obstack; escape sequences are
 * translated by parse_escape_sequence(). When the string is not closed, an
 * error is reported and the token type becomes T_ERROR. The finished string
 * is entered into the string set so that equal literals share one copy.
 * start_linenr records the line on which the literal starts;
 * NOTE(review): presumably it is the line number passed to error_prefix_at(),
 * whose second argument is not part of this excerpt.
 */
570 static void parse_string_literal(void)
572 unsigned start_linenr = lexer_token.source_position.linenr;
583 tc = parse_escape_sequence();
584 obstack_1grow(&symbol_obstack, tc);
588 error_prefix_at(lexer_token.source_position.input_name,
590 fprintf(stderr, "string has no end\n");
591 lexer_token.type = T_ERROR;
/* ordinary character: copied unchanged */
599 obstack_1grow(&symbol_obstack, c);
607 /* TODO: concatenate multiple strings separated by whitespace... */
609 /* add finishing 0 to the string */
610 obstack_1grow(&symbol_obstack, '\0');
611 string = obstack_finish(&symbol_obstack);
613 /* check if there is already a copy of the string */
614 result = strset_insert(&stringset, string);
615 if(result != string) {
616 obstack_free(&symbol_obstack, string);
619 lexer_token.type = T_STRING_LITERAL;
620 lexer_token.v.string = result;
/*
 * Lexes a character constant. At the label end_of_char_constant the result
 * is stored as a T_INTEGER token whose value is the character that was found.
 *
 * Escape sequences are translated by parse_escape_sequence(). A newline
 * inside the constant and a constant with more than one character are
 * reported and then lead to end_of_char_constant; end of input is reported
 * and sets the token type to T_ERROR.
 * The loop structure is not part of this excerpt.
 */
623 static void parse_character_constant(void)
631 found_char = parse_escape_sequence();
635 parse_error("newline while parsing character constant");
641 goto end_of_char_constant;
644 parse_error("EOF while parsing character constant");
645 lexer_token.type = T_ERROR;
/* a non-zero found_char means a character has already been stored */
649 if(found_char != 0) {
650 parse_error("more than 1 characters in character "
652 goto end_of_char_constant;
661 end_of_char_constant:
662 lexer_token.type = T_INTEGER;
663 lexer_token.v.intvalue = found_char;
/*
 * Skips a block comment. Newlines inside the comment go through
 * MATCH_NEWLINE; its visible fragments increment the line counter. When the
 * input ends before the comment is closed, an error is written to stderr.
 * start_linenr records the line on which the comment starts;
 * NOTE(review): presumably it is the line number passed to error_prefix_at(),
 * whose second argument is not part of this excerpt.
 */
666 static void skip_multiline_comment(void)
668 unsigned start_linenr = lexer_token.source_position.linenr;
680 MATCH_NEWLINE(break;)
683 error_prefix_at(lexer_token.source_position.input_name,
685 fprintf(stderr, "at end of file while looking for comment end\n");
/* Skips a line comment; the body is not part of this excerpt.
 * NOTE(review): presumably input is consumed up to the end of the line. */
695 static void skip_line_comment(void)
/* Copy of the most recent preprocessing token; it is set by next_pp_token()
 * and inspected by the directive parsing functions. */
713 static token_t pp_token;
/*
 * Reads the next preprocessing token and copies it from lexer_token into
 * pp_token.
 */
715 static inline void next_pp_token(void)
717 lexer_next_preprocessing_token();
718 pp_token = lexer_token;
/*
 * Loops as long as pp_token is neither a newline token nor T_EOF. The loop
 * body is not part of this excerpt;
 * NOTE(review): presumably it fetches the next preprocessing token.
 */
721 static void eat_until_newline(void)
723 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
/*
 * Handles an #error directive: a notice is written to stderr. According to
 * the existing comment the pp-tokens up to the end of the line are parsed
 * afterwards; that code is not part of this excerpt.
 */
728 static void error_directive(void)
731 fprintf(stderr, "#error directive: \n");
733 /* parse pp-tokens until new-line */
/*
 * Handles a #define directive. The token after the directive name must be
 * an identifier; otherwise an error is reported. The handling of the macro
 * body is not part of this excerpt.
 */
736 static void define_directive(void)
738 lexer_next_preprocessing_token();
739 if(lexer_token.type != T_IDENTIFIER) {
/* no trailing newline in the message: parse_error() prints one itself */
740 parse_error("expected identifier after #define");
/*
 * Handles an #ifdef or #ifndef directive. The visible code only fetches the
 * next preprocessing token; the identifier check is commented out.
 *
 * is_ifndef: NOTE(review): presumably non-zero for #ifndef; its use is not
 *            part of this excerpt.
 */
745 static void ifdef_directive(int is_ifndef)
748 lexer_next_preprocessing_token();
749 //expect_identifier();
/* Handles an #endif directive; the body is not part of this excerpt. */
753 static void endif_directive(void)
/*
 * Handles a line directive: pp_token must be an integer, otherwise an error
 * is reported. The line number of the current source position is set to the
 * integer minus one, and a following string literal replaces the input name.
 * NOTE(review): the minus one presumably compensates for the newline that
 * ends the directive line and is counted afterwards - confirm.
 */
758 static void parse_line_directive(void)
760 if(pp_token.type != T_INTEGER) {
761 parse_error("expected integer");
763 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
/* optional file name after the line number */
766 if(pp_token.type == T_STRING_LITERAL) {
767 lexer_token.source_position.input_name = pp_token.v.string;
/*
 * Dispatches a preprocessor directive whose name is an identifier. pp_token
 * must be an identifier token (asserted); the pp_ID of its symbol selects
 * the directive. The case labels are not part of this excerpt; the visible
 * branches print a placeholder message for include and call
 * parse_line_directive().
 */
774 static void parse_preprocessor_identifier(void)
776 assert(pp_token.type == T_IDENTIFIER);
777 symbol_t *symbol = pp_token.v.symbol;
779 switch(symbol->pp_ID) {
781 printf("include - enable header name parsing!\n");
797 parse_line_directive();
/*
 * Parses one preprocessor directive; the type of pp_token selects the
 * handler. The visible branches call parse_preprocessor_identifier() and
 * parse_line_directive(), and report an invalid preprocessor directive
 * otherwise. The case labels are not part of this excerpt.
 */
811 static void parse_preprocessor_directive(void)
815 switch(pp_token.type) {
817 parse_preprocessor_identifier();
820 parse_line_directive();
823 parse_error("invalid preprocessor directive");
/*
 * Helper macros used by lexer_next_preprocessing_token() to recognise
 * punctuators that consist of several characters. Only fragments of the
 * macro bodies are part of this excerpt:
 *  - MAYBE(ch, set_type) contains an assignment of set_type to the token
 *    type; NOTE(review): presumably executed when the next character is ch.
 *  - ELSE_CODE(code) closes a while(1) loop (see its own comment).
 *  - ELSE(set_type) contains an assignment of set_type to the token type;
 *    NOTE(review): presumably the fallback when no MAYBE matched.
 *  - MAYBE_PROLOG: body not visible.
 */
829 #define MAYBE_PROLOG \
834 #define MAYBE(ch, set_type) \
837 lexer_token.type = set_type; \
840 #define ELSE_CODE(code) \
844 } /* end of while(1) */ \
847 #define ELSE(set_type) \
849 lexer_token.type = set_type; \
/*
 * Lexes the next preprocessing token into the global lexer_token.
 *
 * Behaviour visible in this excerpt:
 *  - a newline produces a token whose type is the newline character;
 *  - string literals and character constants are delegated to
 *    parse_string_literal() and parse_character_constant(); a string that
 *    follows the identifier L is handled as a wide string (see the comment
 *    in the body);
 *  - punctuators that consist of several characters are recognised through
 *    chains of MAYBE and ELSE macro invocations;
 *  - after a block comment has been skipped the function calls itself to
 *    deliver the next token;
 *  - there are paths that store the character itself as the token type,
 *    that produce T_EOF, and that report an unknown character and produce
 *    T_ERROR.
 *
 * The switch statement and the case labels that connect these pieces are
 * not part of this excerpt.
 */
853 void lexer_next_preprocessing_token(void)
863 lexer_token.type = '\n';
869 /* might be a wide string ( L"string" ) */
870 if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
871 lexer_token.v.symbol == symbol_L)) {
872 parse_string_literal();
882 parse_string_literal();
886 parse_character_constant();
893 MAYBE('.', T_DOTDOTDOT)
897 lexer_token.type = '.';
904 MAYBE('=', T_ANDEQUAL)
908 MAYBE('=', T_ASTERISKEQUAL)
912 MAYBE('+', T_PLUSPLUS)
913 MAYBE('=', T_PLUSEQUAL)
917 MAYBE('>', T_MINUSGREATER)
918 MAYBE('-', T_MINUSMINUS)
919 MAYBE('=', T_MINUSEQUAL)
923 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
927 MAYBE('=', T_SLASHEQUAL)
/* a comment produces no token: skip it, then lex the token that follows */
930 skip_multiline_comment();
931 lexer_next_preprocessing_token();
936 lexer_next_preprocessing_token();
941 MAYBE('>', T_PERCENTGREATER)
942 MAYBE('=', T_PERCENTEQUAL)
947 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
951 lexer_token.type = T_PERCENTCOLON;
958 MAYBE(':', T_LESSCOLON)
959 MAYBE('%', T_LESSPERCENT)
960 MAYBE('=', T_LESSEQUAL)
963 MAYBE('=', T_LESSLESSEQUAL)
968 MAYBE('=', T_GREATEREQUAL)
971 MAYBE('=', T_GREATERGREATEREQUAL)
972 ELSE(T_GREATERGREATER)
976 MAYBE('=', T_CARETEQUAL)
980 MAYBE('=', T_PIPEEQUAL)
981 MAYBE('|', T_PIPEPIPE)
985 MAYBE('>', T_COLONGREATER)
989 MAYBE('=', T_EQUALEQUAL)
993 MAYBE('#', T_HASHHASH)
/* the character value itself is used as the token type */
1007 lexer_token.type = c;
1012 lexer_token.type = T_EOF;
1018 fprintf(stderr, "unknown character '%c' found\n", c);
1019 lexer_token.type = T_ERROR;
/*
 * Delivers the next token to the user of the lexer.
 *
 * One preprocessing token is read first. If it is a newline token, further
 * tokens are read until one is not a newline; if that token is a '#', the
 * preprocessor directive parser is called.
 * The statement controlled by the first if and the code after the directive
 * call are not part of this excerpt.
 */
1025 void lexer_next_token(void)
1027 lexer_next_preprocessing_token();
1028 if(lexer_token.type != '\n')
/* skip consecutive newline tokens */
1033 lexer_next_preprocessing_token();
1034 } while(lexer_token.type == '\n');
/* reached after at least one newline token, so a '#' here is the first
 * token of its line */
1036 if(lexer_token.type == '#') {
1037 parse_preprocessor_directive();
/* Initialises the lexer module; the visible step sets up the string set
 * that stores the string literals. */
1042 void init_lexer(void)
1044 strset_init(&stringset);
/*
 * Prepares the lexer for reading a new input.
 *
 * stream:     the input stream (its use is not part of this excerpt)
 * input_name: name stored in the source position for diagnostics
 *
 * The line counter is reset to 0, and the symbol for the identifier L is
 * looked up; lexer_next_preprocessing_token() compares against it to detect
 * wide strings. As the existing comment explains, a virtual newline is
 * placed at the start of the input; that code is not part of this excerpt.
 */
1047 void lexer_open_stream(FILE *stream, const char *input_name)
1050 lexer_token.source_position.linenr = 0;
1051 lexer_token.source_position.input_name = input_name;
1053 symbol_L = symbol_table_insert("L");
1055 /* place a virtual \n at the beginning so the lexer knows that we're
1056 * at the beginning of a line */
/* Shuts the lexer module down; the visible step destroys the string set
 * that init_lexer() initialised. */
1060 void exit_lexer(void)
1062 strset_destroy(&stringset);
/*
 * Debugging helper: prints a source position as <input_name>:<linenr> to
 * stdout. It is marked unused so that the compiler does not warn when no
 * code calls it.
 *
 * source_position: the position to print
 */
1065 static __attribute__((unused))
1066 void dbg_pos(const source_position_t source_position)
/* the line number is printed as unsigned, like error_prefix_at() does; the
 * cast keeps the argument in step with the %u conversion */
1068 fprintf(stdout, "%s:%u\n", source_position.input_name,
1069 (unsigned) source_position.linenr);