5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
21 static char buf[1024 + MAX_PUTBACK];
22 static const char *bufend;
23 static const char *bufpos;
24 static strset_t stringset;
26 static void error_prefix_at(const char *input_name, unsigned linenr)
28 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
31 static void error_prefix(void)
33 error_prefix_at(lexer_token.source_position.input_name,
34 lexer_token.source_position.linenr);
37 static void parse_error(const char *msg)
40 fprintf(stderr, "%s\n", msg);
43 static inline void next_real_char(void)
46 if(bufpos >= bufend) {
47 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
53 bufpos = buf + MAX_PUTBACK;
54 bufend = buf + MAX_PUTBACK + s;
59 static inline void put_back(int pc)
61 assert(bufpos >= buf);
62 assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
64 char *p = buf + (bufpos - buf);
67 /* going backwards in the buffer is legal as long as it's not more often
72 printf("putback '%c'\n", pc);
76 static inline void next_char(void);
78 #define MATCH_NEWLINE(code) \
84 lexer_token.source_position.linenr++; \
88 lexer_token.source_position.linenr++; \
91 static inline void eat(char c_type)
97 static void maybe_concat_lines(void)
102 MATCH_NEWLINE(return;)
112 static inline void next_char(void)
116 /* filter trigraphs */
117 if(UNLIKELY(c == '\\')) {
118 maybe_concat_lines();
119 goto end_of_next_char;
123 goto end_of_next_char;
126 if(LIKELY(c != '?')) {
129 goto end_of_next_char;
134 case '=': c = '#'; break;
135 case '(': c = '['; break;
136 case '/': c = '\\'; maybe_concat_lines(); break;
137 case ')': c = ']'; break;
138 case '\'': c = '^'; break;
139 case '<': c = '{'; break;
140 case '!': c = '|'; break;
141 case '>': c = '}'; break;
142 case '-': c = '~'; break;
152 printf("nchar '%c'\n", c);
158 #define SYMBOL_CHARS \
225 static void parse_symbol(void)
230 obstack_1grow(&symbol_obstack, c);
237 obstack_1grow(&symbol_obstack, c);
247 obstack_1grow(&symbol_obstack, '\0');
249 string = obstack_finish(&symbol_obstack);
250 symbol = symbol_table_insert(string);
252 lexer_token.type = symbol->ID;
253 lexer_token.v.symbol = symbol;
255 if(symbol->string != string) {
256 obstack_free(&symbol_obstack, string);
260 static void parse_integer_suffix(void)
262 if(c == 'U' || c == 'u') { /* unsigned suffix: accept both 'U' and 'u', like the 'L'/'l' tests below */
263 /* TODO do something with the suffixes... */
265 if(c == 'L' || c == 'l') {
267 if(c == 'L' || c == 'l') {
271 } else if(c == 'l' || c == 'L') {
273 if(c == 'l' || c == 'L') {
275 if(c == 'u' || c == 'U') {
278 } else if(c == 'u' || c == 'U') {
284 static void parse_number_hex(void)
286 assert(c == 'x' || c == 'X');
290 !('A' <= c && c <= 'F') &&
291 !('a' <= c && c <= 'f')) {
292 parse_error("premature end of hex number literal");
293 lexer_token.type = T_ERROR;
300 value = 16 * value + c - '0';
301 } else if ('A' <= c && c <= 'F') {
302 value = 16 * value + c - 'A' + 10;
303 } else if ('a' <= c && c <= 'f') {
304 value = 16 * value + c - 'a' + 10;
306 parse_integer_suffix();
308 lexer_token.type = T_INTEGER;
309 lexer_token.v.intvalue = value;
315 if(c == '.' || c == 'p' || c == 'P') {
317 panic("Hex floating point numbers not implemented yet");
321 static void parse_number_oct(void)
324 while(c >= '0' && c <= '7') {
325 value = 8 * value + c - '0';
328 if (c == '8' || c == '9') {
329 parse_error("invalid octal number");
330 lexer_token.type = T_ERROR;
334 lexer_token.type = T_INTEGER;
335 lexer_token.v.intvalue = value;
337 parse_integer_suffix();
340 static void parse_floatingpoint_exponent(long double value)
342 unsigned int expo = 0;
343 long double factor = 10.;
348 } else if(c == '+') {
352 while(c >= '0' && c <= '9') {
353 expo = 10 * expo + (c - '0');
366 lexer_token.type = T_FLOATINGPOINT;
367 lexer_token.v.floatvalue = value;
370 static void parse_floatingpoint_fract(int integer_part)
372 long double value = integer_part;
373 long double factor = 1.;
375 while(c >= '0' && c <= '9') {
377 value += (c - '0') * factor;
381 if(c == 'e' || c == 'E') {
383 parse_floatingpoint_exponent(value);
387 lexer_token.type = T_FLOATINGPOINT;
388 lexer_token.v.floatvalue = value;
391 static void parse_number_dec(void)
396 value = 10 * value + c - '0';
402 parse_floatingpoint_fract(value);
405 if(c == 'e' || c == 'E') {
407 parse_floatingpoint_exponent(value);
410 parse_integer_suffix();
412 lexer_token.type = T_INTEGER;
413 lexer_token.v.intvalue = value;
416 static void parse_number(void)
437 parse_floatingpoint_fract(0);
441 parse_floatingpoint_exponent(0);
446 parse_error("invalid octal number");
447 lexer_token.type = T_ERROR;
460 static int parse_octal_sequence(void)
464 if(c < '0' || c > '7')
466 value = 8 * value + c - '0';
473 static int parse_hex_sequence(void)
477 if (c >= '0' && c <= '9') {
478 value = 16 * value + c - '0';
479 } else if ('A' <= c && c <= 'F') {
480 value = 16 * value + c - 'A' + 10;
481 } else if ('a' <= c && c <= 'f') {
482 value = 16 * value + c - 'a' + 10;
492 static int parse_escape_sequence(void)
500 case '"': return '"';
501 case '\'': return'\'';
502 case '\\': return '\\';
503 case '?': return '\?';
504 case 'a': return '\a';
505 case 'b': return '\b';
506 case 'f': return '\f';
507 case 'n': return '\n';
508 case 'r': return '\r';
509 case 't': return '\t';
510 case 'v': return '\v';
512 return parse_hex_sequence();
521 return parse_octal_sequence();
523 parse_error("reached end of file while parsing escape sequence");
526 parse_error("unknown escape sequence");
531 const char *concat_strings(const char *s1, const char *s2)
533 size_t len1 = strlen(s1);
534 size_t len2 = strlen(s2);
536 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
537 memcpy(concat, s1, len1);
538 memcpy(concat + len1, s2, len2 + 1);
540 const char *result = strset_insert(&stringset, concat);
541 if(result != concat) {
542 obstack_free(&symbol_obstack, concat);
548 static void parse_string_literal(void)
550 unsigned start_linenr = lexer_token.source_position.linenr;
561 tc = parse_escape_sequence();
562 obstack_1grow(&symbol_obstack, tc);
566 error_prefix_at(lexer_token.source_position.input_name,
568 fprintf(stderr, "string has no end\n");
569 lexer_token.type = T_ERROR;
577 obstack_1grow(&symbol_obstack, c);
585 /* TODO: concatenate multiple strings separated by whitespace... */
587 /* add finishing 0 to the string */
588 obstack_1grow(&symbol_obstack, '\0');
589 string = obstack_finish(&symbol_obstack);
591 /* check if there is already a copy of the string */
592 result = strset_insert(&stringset, string);
593 if(result != string) {
594 obstack_free(&symbol_obstack, string);
597 lexer_token.type = T_STRING_LITERAL;
598 lexer_token.v.string = result;
601 static void parse_character_constant(void)
609 found_char = parse_escape_sequence();
613 parse_error("newline while parsing character constant");
619 goto end_of_char_constant;
622 parse_error("EOF while parsing character constant");
623 lexer_token.type = T_ERROR;
627 if(found_char != 0) {
628 parse_error("more than 1 characters in character "
630 goto end_of_char_constant;
639 end_of_char_constant:
640 lexer_token.type = T_INTEGER;
641 lexer_token.v.intvalue = found_char;
644 static void skip_multiline_comment(void)
646 unsigned start_linenr = lexer_token.source_position.linenr;
658 MATCH_NEWLINE(break;)
661 error_prefix_at(lexer_token.source_position.input_name,
663 fprintf(stderr, "at end of file while looking for comment end\n");
673 static void skip_line_comment(void)
691 static token_t pp_token;
693 static inline void next_pp_token(void)
695 lexer_next_preprocessing_token();
696 pp_token = lexer_token;
699 static void eat_until_newline(void)
701 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
706 static void error_directive(void)
709 fprintf(stderr, "#error directive: \n");
711 /* parse pp-tokens until new-line */
714 static void define_directive(void)
716 lexer_next_preprocessing_token();
717 if(lexer_token.type != T_IDENTIFIER) {
718 parse_error("expected identifier after #define"); /* parse_error() appends the newline itself */
723 static void ifdef_directive(int is_ifndef)
726 lexer_next_preprocessing_token();
727 //expect_identifier();
731 static void endif_directive(void)
736 static void parse_line_directive(void)
738 if(pp_token.type != T_INTEGER) {
739 parse_error("expected integer");
741 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
744 if(pp_token.type == T_STRING_LITERAL) {
745 lexer_token.source_position.input_name = pp_token.v.string;
752 static void parse_preprocessor_identifier(void)
754 assert(pp_token.type == T_IDENTIFIER);
755 symbol_t *symbol = pp_token.v.symbol;
757 switch(symbol->pp_ID) {
759 printf("include - enable header name parsing!\n");
775 parse_line_directive();
789 static void parse_preprocessor_directive(void) /* called by lexer_next_token() after a '#' token */
793 switch(pp_token.type) {
795 parse_preprocessor_identifier();
798 parse_line_directive();
801 parse_error("invalid preprocessor directive");
807 #define MAYBE_PROLOG \
812 #define MAYBE(ch, set_type) \
815 lexer_token.type = set_type; \
818 #define ELSE_CODE(code) \
822 } /* end of while(1) */ \
825 #define ELSE(set_type) \
827 lexer_token.type = set_type; \
831 void lexer_next_preprocessing_token(void)
841 lexer_token.type = '\n';
854 parse_string_literal();
858 parse_character_constant();
865 MAYBE('.', T_DOTDOTDOT)
869 lexer_token.type = '.';
876 MAYBE('=', T_ANDEQUAL)
880 MAYBE('=', T_ASTERISKEQUAL)
884 MAYBE('+', T_PLUSPLUS)
885 MAYBE('=', T_PLUSEQUAL)
889 MAYBE('>', T_MINUSGREATER)
890 MAYBE('-', T_MINUSMINUS)
891 MAYBE('=', T_MINUSEQUAL)
895 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
899 MAYBE('=', T_SLASHEQUAL)
902 skip_multiline_comment();
903 lexer_next_preprocessing_token();
908 lexer_next_preprocessing_token();
913 MAYBE('>', T_PERCENTGREATER)
914 MAYBE('=', T_PERCENTEQUAL)
919 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
923 lexer_token.type = T_PERCENTCOLON;
930 MAYBE(':', T_LESSCOLON)
931 MAYBE('%', T_LESSPERCENT)
932 MAYBE('=', T_LESSEQUAL)
935 MAYBE('=', T_LESSLESSEQUAL)
940 MAYBE('=', T_GREATEREQUAL)
943 MAYBE('=', T_GREATERGREATEREQUAL)
944 ELSE(T_GREATERGREATER)
948 MAYBE('=', T_CARETEQUAL)
952 MAYBE('=', T_PIPEEQUAL)
953 MAYBE('|', T_PIPEPIPE)
957 MAYBE('>', T_COLONGREATER)
961 MAYBE('=', T_EQUALEQUAL)
965 MAYBE('#', T_HASHHASH)
979 lexer_token.type = c;
984 lexer_token.type = T_EOF;
990 fprintf(stderr, "unknown character '%c' found\n", c);
991 lexer_token.type = T_ERROR;
997 void lexer_next_token(void)
999 lexer_next_preprocessing_token();
1000 if(lexer_token.type != '\n')
1005 lexer_next_preprocessing_token();
1006 } while(lexer_token.type == '\n');
1008 if(lexer_token.type == '#') {
1009 parse_preprocessor_directive();
1014 void init_lexer(void)
1016 strset_init(&stringset);
1019 void lexer_open_stream(FILE *stream, const char *input_name)
1022 lexer_token.source_position.linenr = 1;
1023 lexer_token.source_position.input_name = input_name;
1028 void exit_lexer(void)
1030 strset_destroy(&stringset);
1033 static __attribute__((unused))
1034 void dbg_pos(const source_position_t source_position)
1036 fprintf(stdout, "%s:%u\n", source_position.input_name,
1037 source_position.linenr); /* NOTE(review): %u assumes linenr is unsigned, as error_prefix_at() treats it -- confirm against source_position_t */