5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
/* Input buffer. next_real_char() reads file data to buf + MAX_PUTBACK, so the
 * first MAX_PUTBACK bytes stay free in front of the freshly read data. */
23 static char buf[1024 + MAX_PUTBACK];
/* End of the valid data in buf (set to buf + MAX_PUTBACK + bytes read). */
24 static const char *bufend;
/* Current read position inside buf. */
25 static const char *bufpos;
/* Set of string contents; strset_insert() hands back an already stored copy
 * when the same string was inserted before. */
26 static strset_t stringset;
/**
 * Writes the diagnostic prefix "<input_name>:<linenr>: Error: " to stderr.
 * No newline is written, so the message text can follow on the same line.
 *
 * @param input_name  name of the input the diagnostic refers to
 * @param linenr      line number the diagnostic refers to
 */
28 static void error_prefix_at(const char *input_name, unsigned linenr)
30 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
/**
 * Writes the diagnostic prefix for the source position currently stored in
 * lexer_token.
 */
33 static void error_prefix(void)
35 error_prefix_at(lexer_token.source_position.input_name,
36 lexer_token.source_position.linenr);
/**
 * Reports a lexer error: prints msg followed by a newline to stderr, so msg
 * itself should not end in a newline.
 * NOTE(review): presumably the position prefix is printed first via
 * error_prefix(); that statement is not visible in this excerpt -- confirm.
 *
 * @param msg  message text
 */
39 static void parse_error(const char *msg)
42 fprintf(stderr, "%s\n", msg);
/**
 * Raw input reader. Once the read position has reached bufend the buffer is
 * refilled with fread(); new data is stored at buf + MAX_PUTBACK and
 * bufpos/bufend are reset to delimit it.
 * NOTE(review): the rest of the fread() call, the handling of a zero-length
 * read and the update of c are not visible in this excerpt.
 */
45 static inline void next_real_char(void)
48 if(bufpos >= bufend) {
49 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
55 bufpos = buf + MAX_PUTBACK;
56 bufend = buf + MAX_PUTBACK + s;
/**
 * Puts the character pc back into the input buffer.
 * The visible lines assert that bufpos has not moved in front of buf, derive
 * a writable pointer p to the current buffer position and print a "putback"
 * trace line.
 * NOTE(review): the statements that move bufpos and store pc are not visible
 * in this excerpt, and the block comment opened below is not closed within
 * the visible lines.
 *
 * @param pc  character to put back
 */
61 static inline void put_back(int pc)
63 assert(bufpos >= buf);
64 //assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
66 char *p = buf + (bufpos - buf);
69 /* going backwards in the buffer is legal as long as it's not more often
74 printf("putback '%c'\n", pc);
/* Forward declaration; the definition follows further down in this file. */
78 static inline void next_char(void);
/*
 * MATCH_NEWLINE(code): helper for handling line endings. The visible lines
 * increment lexer_token.source_position.linenr; `code` is a statement
 * fragment supplied at the point of use (e.g. `return;` or `break;`).
 * NOTE(review): the remaining lines of the macro are not visible here.
 */
80 #define MATCH_NEWLINE(code) \
86 lexer_token.source_position.linenr++; \
90 lexer_token.source_position.linenr++; \
/* eat(c_type): asserts that the current character c equals c_type and then
 * advances with next_char(). The do/while(0) wrapper makes the expansion
 * behave as a single statement. */
93 #define eat(c_type) do { assert(c == c_type); next_char(); } while(0)
/**
 * Called from next_char() once a backslash has been seen, either literally
 * or through the ??/ trigraph. The only visible statement is
 * MATCH_NEWLINE(return;), i.e. the function returns after a line ending was
 * matched.
 * NOTE(review): presumably this implements backslash-newline line splicing;
 * the rest of the body is not visible in this excerpt -- confirm.
 */
95 static void maybe_concat_lines(void)
100 MATCH_NEWLINE(return;)
/**
 * Makes the next input character available in c, applying two filters:
 *  - a backslash is passed on to maybe_concat_lines();
 *  - trigraphs are replaced by the character they stand for.
 * Characters other than '?' take the fast path straight to the end label.
 * NOTE(review): the statements that read the raw characters and put back
 * look-ahead are not visible in this excerpt.
 */
110 static inline void next_char(void)
114 /* filter trigraphs */
115 if(UNLIKELY(c == '\\')) {
116 maybe_concat_lines();
117 goto end_of_next_char;
121 goto end_of_next_char;
124 if(LIKELY(c != '?')) {
127 goto end_of_next_char;
/* map the last character of a trigraph to its replacement character */
132 case '=': c = '#'; break;
133 case '(': c = '['; break;
/* the replacement is a backslash, so it gets the same treatment as a
 * literal backslash above */
134 case '/': c = '\\'; maybe_concat_lines(); break;
135 case ')': c = ']'; break;
136 case '\'': c = '^'; break;
137 case '<': c = '{'; break;
138 case '!': c = '|'; break;
139 case '>': c = '}'; break;
140 case '-': c = '~'; break;
/* trace output of the character that was produced */
150 printf("nchar '%c'\n", c);
/* SYMBOL_CHARS: multi-line macro; its continuation lines are not visible in
 * this excerpt.
 * NOTE(review): presumably the set of characters that may start a symbol --
 * confirm against the full definition. */
154 #define SYMBOL_CHARS \
/**
 * Lexes a symbol. Its characters are collected on symbol_obstack,
 * NUL-terminated and entered into the symbol table with
 * symbol_table_insert(). The token type is taken from the symbol's ID and
 * the symbol itself is stored in lexer_token.v.symbol.
 * NOTE(review): the loop that decides which characters belong to the symbol
 * is not visible in this excerpt.
 */
221 static void parse_symbol(void)
226 obstack_1grow(&symbol_obstack, c);
233 obstack_1grow(&symbol_obstack, c);
243 obstack_1grow(&symbol_obstack, '\0');
245 string = obstack_finish(&symbol_obstack);
246 symbol = symbol_table_insert(string);
248 lexer_token.type = symbol->ID;
249 lexer_token.v.symbol = symbol;
/* the returned symbol refers to a different string than the one just
 * built, so the temporary copy on the obstack is released again */
251 if(symbol->string != string) {
252 obstack_free(&symbol_obstack, string);
256 static void parse_integer_suffix(void)
258 if(c == 'U' || c == 'U') {
259 /* TODO do something with the suffixes... */
261 if(c == 'L' || c == 'l') {
263 if(c == 'L' || c == 'l') {
267 } else if(c == 'l' || c == 'L') {
269 if(c == 'l' || c == 'L') {
271 if(c == 'u' || c == 'U') {
274 } else if(c == 'u' || c == 'U') {
/**
 * Handles the suffix of a floating point literal; called by
 * parse_number_dec() after the value has been converted.
 * NOTE(review): the statements of the body are not visible in this excerpt.
 */
280 static void parse_floating_suffix(void)
283 /* TODO: do something useful with the suffixes... */
/**
 * Tells whether c is a hexadecimal digit: 0-9, a-f or A-F.
 *
 * The letter ranges used to run up to 'z'/'Z', which classified every
 * letter (including the 'p'/'P' hex-float exponent marker) as a hex digit.
 *
 * @param c  character to classify
 * @return   true if c is a hexadecimal digit
 */
static inline bool is_hex_digit(int c)
{
	return (c >= '0' && c <= '9')
	    || (c >= 'a' && c <= 'f')
	    || (c >= 'A' && c <= 'F');
}
/**
 * Lexes a hexadecimal integer literal. On entry c must be 'x' or 'X'
 * (asserted). The digits are collected on symbol_obstack, converted with
 * strtol() in base 16 and stored as a T_INTEGER token; afterwards an
 * optional integer suffix is consumed and the temporary string is freed.
 * An empty digit sequence produces a T_ERROR token.
 * NOTE(review): several statements (advancing the input, early returns,
 * the declaration of endptr) are not visible in this excerpt.
 */
302 static void parse_number_hex(void)
304 assert(c == 'x' || c == 'X');
307 while(is_hex_digit(c)) {
308 obstack_1grow(&symbol_obstack, c);
311 obstack_1grow(&symbol_obstack, '\0');
312 char *string = obstack_finish(&symbol_obstack);
/* hexadecimal floating point literals are not supported */
314 if(c == '.' || c == 'p' || c == 'P') {
316 panic("Hex floating point numbers not implemented yet");
/* no digit followed the prefix */
318 if(*string == '\0') {
319 parse_error("invalid hex number");
320 lexer_token.type = T_ERROR;
/* NOTE(review): strtol() returns long; storing it in an int may truncate. */
324 int value = strtol(string, &endptr, 16);
/* NOTE(review): a non-empty rest at endptr means strtol() stopped at a
 * character it could not convert; an out-of-range value is reported through
 * errno (ERANGE) instead -- the message below may be misleading. */
325 if(*endptr != '\0') {
326 parse_error("hex number literal too long");
329 lexer_token.type = T_INTEGER;
330 lexer_token.v.intvalue = value;
332 parse_integer_suffix();
333 obstack_free(&symbol_obstack, string);
/**
 * Tells whether chr is one of the octal digits '0' through '7'.
 *
 * @param chr  character to classify
 * @return     true if chr is an octal digit
 */
static inline bool is_octal_digit(int chr)
{
	if (chr < '0') {
		return false;
	}
	return chr <= '7';
}
/**
 * Lexes an octal integer literal. Octal digits are collected on
 * symbol_obstack, converted with strtol() in base 8 and stored as a
 * T_INTEGER token; afterwards an optional integer suffix is consumed and the
 * temporary string is freed.
 * NOTE(review): the statements advancing the input and the declaration of
 * endptr are not visible in this excerpt.
 */
341 static void parse_number_oct(void)
343 while(is_octal_digit(c)) {
344 obstack_1grow(&symbol_obstack, c);
347 obstack_1grow(&symbol_obstack, '\0');
348 char *string = obstack_finish(&symbol_obstack);
/* NOTE(review): strtol() returns long; storing it in an int may truncate. */
351 int value = strtol(string, &endptr, 8);
/* NOTE(review): a non-empty rest at endptr means an unconvertible character,
 * not an overlong literal -- the message below may be misleading. */
352 if(*endptr != '\0') {
353 parse_error("octal number literal too long");
356 lexer_token.type = T_INTEGER;
357 lexer_token.v.intvalue = value;
359 parse_integer_suffix();
360 obstack_free(&symbol_obstack, string);
/**
 * Lexes a decimal literal, either integer or floating point. The text of
 * the literal (digits, an optional '.' part and an optional exponent with
 * sign) is collected on symbol_obstack. It is converted with strtod() into
 * a T_FLOATINGPOINT token or with strtol() (base 10) into a T_INTEGER token;
 * the matching suffix parser runs afterwards and the temporary string is
 * freed at the end.
 * NOTE(review): the loops, the statements advancing the input and the
 * places where is_float is set are not visible in this excerpt.
 */
363 static void parse_number_dec(void)
365 bool is_float = false;
367 obstack_1grow(&symbol_obstack, c);
372 obstack_1grow(&symbol_obstack, '.');
376 obstack_1grow(&symbol_obstack, c);
/* exponent part; it is always stored with a lowercase 'e' */
381 if(c == 'e' || c == 'E') {
382 obstack_1grow(&symbol_obstack, 'e');
385 if(c == '-' || c == '+') {
386 obstack_1grow(&symbol_obstack, c);
391 obstack_1grow(&symbol_obstack, c);
397 obstack_1grow(&symbol_obstack, '\0');
398 char *string = obstack_finish(&symbol_obstack);
402 lexer_token.type = T_FLOATINGPOINT;
403 lexer_token.v.floatvalue = strtod(string, &endptr);
/* the conversion must have used up the whole collected text */
405 if(*endptr != '\0') {
406 parse_error("invalid number literal");
409 parse_floating_suffix();
411 lexer_token.type = T_INTEGER;
412 lexer_token.v.intvalue = strtol(string, &endptr, 10);
414 if(*endptr != '\0') {
415 parse_error("invalid number literal");
418 parse_integer_suffix();
420 obstack_free(&symbol_obstack, string);
/**
 * Entry point for lexing a numeric literal.
 * The visible lines report "invalid octal number" with a T_ERROR token and,
 * on another path, put a '0' on symbol_obstack.
 * NOTE(review): presumably this dispatches to parse_number_hex(),
 * parse_number_oct() and parse_number_dec(); the dispatch itself is not
 * visible in this excerpt -- confirm.
 */
423 static void parse_number(void)
445 parse_error("invalid octal number");
446 lexer_token.type = T_ERROR;
452 obstack_1grow(&symbol_obstack, '0');
/**
 * Computes the value of an octal escape sequence of up to three digits.
 *
 * @param first_digit  first digit of the sequence; must be an octal digit
 *                     (asserted)
 * @return the accumulated value; the function returns early as soon as the
 *         current character c is not an octal digit
 * NOTE(review): the statements advancing the input between the digits and
 * the final return are not visible in this excerpt.
 */
461 static int parse_octal_sequence(const int first_digit)
463 assert(is_octal_digit(first_digit));
464 int value = first_digit - '0';
465 if (!is_octal_digit(c)) return value;
466 value = 8 * value + c - '0';
468 if (!is_octal_digit(c)) return value;
469 value = 8 * value + c - '0';
/**
 * Computes the value of a hexadecimal escape sequence. Each accepted
 * character (0-9, A-F, a-f) multiplies the accumulated value by 16 and adds
 * the value of the digit.
 * NOTE(review): the loop, the initialisation of value and the return are
 * not visible in this excerpt.
 */
474 static int parse_hex_sequence(void)
478 if (c >= '0' && c <= '9') {
479 value = 16 * value + c - '0';
480 } else if ('A' <= c && c <= 'F') {
481 value = 16 * value + c - 'A' + 10;
482 } else if ('a' <= c && c <= 'f') {
483 value = 16 * value + c - 'a' + 10;
/**
 * Translates an escape sequence into the character value it denotes.
 * Simple escapes map directly to their character; hexadecimal and octal
 * escapes are handed to parse_hex_sequence() / parse_octal_sequence().
 * Reaching the end of the file or meeting an unknown escape is reported
 * through parse_error().
 * NOTE(review): the switch header, the case labels of the numeric escapes
 * and the values returned on the error paths are not visible in this
 * excerpt.
 *
 * @return value of the escaped character
 */
493 static int parse_escape_sequence(void)
501 case '"': return '"';
502 case '\'': return '\'';
503 case '\\': return '\\';
504 case '?': return '\?';
505 case 'a': return '\a';
506 case 'b': return '\b';
507 case 'f': return '\f';
508 case 'n': return '\n';
509 case 'r': return '\r';
510 case 't': return '\t';
511 case 'v': return '\v';
513 return parse_hex_sequence();
/* ec is the octal digit that started the sequence */
522 return parse_octal_sequence(ec);
524 parse_error("reached end of file while parsing escape sequence");
527 parse_error("unknown escape sequence");
/**
 * Builds the concatenation of two NUL-terminated strings on symbol_obstack
 * and registers it in stringset.
 *
 * @param s1  first string
 * @param s2  second string, appended to s1
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the pointer obtained from strset_insert() is returned --
 * confirm.
 */
532 const char *concat_strings(const char *s1, const char *s2)
534 size_t len1 = strlen(s1);
535 size_t len2 = strlen(s2);
/* room for both strings plus one terminating NUL */
537 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
538 memcpy(concat, s1, len1);
/* len2 + 1 also copies the terminating NUL of s2 */
539 memcpy(concat + len1, s2, len2 + 1);
541 const char *result = strset_insert(&stringset, concat);
/* the set handed back a different pointer, so the new copy is not needed */
542 if(result != concat) {
543 obstack_free(&symbol_obstack, concat);
/**
 * Lexes a string literal. Its characters (escape sequences already
 * translated by parse_escape_sequence()) are collected on symbol_obstack,
 * NUL-terminated and entered into stringset; the resulting pointer is
 * stored in a T_STRING_LITERAL token. A string without an end is reported
 * as "string has no end" and yields a T_ERROR token. start_linenr holds the
 * line number at which the literal started.
 * NOTE(review): the loop, the switch over the current character and the
 * remaining arguments of the error_prefix_at() call are not visible in
 * this excerpt.
 */
549 static void parse_string_literal(void)
551 unsigned start_linenr = lexer_token.source_position.linenr;
562 tc = parse_escape_sequence();
563 obstack_1grow(&symbol_obstack, tc);
567 error_prefix_at(lexer_token.source_position.input_name,
569 fprintf(stderr, "string has no end\n");
570 lexer_token.type = T_ERROR;
578 obstack_1grow(&symbol_obstack, c);
586 /* TODO: concatenate multiple strings separated by whitespace... */
588 /* add finishing 0 to the string */
589 obstack_1grow(&symbol_obstack, '\0');
590 string = obstack_finish(&symbol_obstack);
592 /* check if there is already a copy of the string */
593 result = strset_insert(&stringset, string);
594 if(result != string) {
595 obstack_free(&symbol_obstack, string);
598 lexer_token.type = T_STRING_LITERAL;
599 lexer_token.v.string = result;
/**
 * Lexes a character constant. The result is delivered as a T_INTEGER token
 * whose intvalue is the character found. A newline inside the constant and
 * more than one character are reported and still end at the common exit
 * label; reaching the end of the file yields a T_ERROR token.
 * NOTE(review): the loop, the switch over the current character and the
 * declaration of found_char are not visible in this excerpt.
 */
602 static void parse_character_constant(void)
610 found_char = parse_escape_sequence();
614 parse_error("newline while parsing character constant");
620 goto end_of_char_constant;
623 parse_error("EOF while parsing character constant");
624 lexer_token.type = T_ERROR;
/* a character has been stored already, so this one is a second one */
628 if(found_char != 0) {
629 parse_error("more than 1 characters in character "
631 goto end_of_char_constant;
640 end_of_char_constant:
641 lexer_token.type = T_INTEGER;
642 lexer_token.v.intvalue = found_char;
/**
 * Skips a multi-line comment. Line endings inside the comment go through
 * MATCH_NEWLINE; if the file ends before the comment does, an error "at end
 * of file while looking for comment end" is printed. start_linenr holds the
 * line number at which the comment started.
 * NOTE(review): the loop, the detection of the comment end and the
 * remaining arguments of the error_prefix_at() call are not visible in
 * this excerpt.
 */
645 static void skip_multiline_comment(void)
647 unsigned start_linenr = lexer_token.source_position.linenr;
659 MATCH_NEWLINE(break;)
662 error_prefix_at(lexer_token.source_position.input_name,
664 fprintf(stderr, "at end of file while looking for comment end\n");
/* NOTE(review): only the signature is visible in this excerpt; judging by
 * the name this skips a line comment up to the end of the line -- confirm
 * against the body. */
674 static void skip_line_comment(void)
/* Token the preprocessor directive code works on; next_pp_token() fills it
 * with a copy of lexer_token. */
692 static token_t pp_token;
/**
 * Reads the next preprocessing token and stores a copy of it in pp_token.
 */
694 static inline void next_pp_token(void)
696 lexer_next_preprocessing_token();
697 pp_token = lexer_token;
/**
 * Loops for as long as pp_token is neither a newline token nor T_EOF.
 * NOTE(review): the loop body is not visible in this excerpt; presumably it
 * fetches the next token with next_pp_token() -- confirm.
 */
700 static void eat_until_newline(void)
702 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
/**
 * Handles an #error directive: prints "#error directive: " and a newline to
 * stderr.
 * NOTE(review): the statements around the fprintf() are not visible in this
 * excerpt; the text of the directive itself is not printed by the visible
 * code.
 */
707 static void error_directive(void)
710 fprintf(stderr, "#error directive: \n");
712 /* parse pp-tokens until new-line */
/**
 * Handles a #define directive: reads the next preprocessing token and
 * reports an error unless it is an identifier.
 * NOTE(review): the rest of the body is not visible in this excerpt.
 */
715 static void define_directive(void)
717 lexer_next_preprocessing_token();
718 if(lexer_token.type != T_IDENTIFIER) {
/* NOTE(review): parse_error() already appends a newline to the message, so
 * the "\n" in this literal produces an extra empty line. */
719 parse_error("expected identifier after #define\n");
/**
 * Handles #ifdef / #ifndef: reads the next preprocessing token. The check
 * that it is an identifier is commented out.
 * NOTE(review): is_ifndef is not used by the visible lines; presumably it
 * is non-zero for #ifndef -- confirm. The rest of the body is not visible
 * in this excerpt.
 *
 * @param is_ifndef  selects between the two directives
 */
724 static void ifdef_directive(int is_ifndef)
727 lexer_next_preprocessing_token();
728 //expect_identifier();
/* NOTE(review): only the signature is visible in this excerpt; judging by
 * the name this handles an #endif directive -- confirm against the body. */
732 static void endif_directive(void)
/**
 * Handles a line directive: expects an integer token and sets the current
 * line number to that value minus one; a following string literal token, if
 * present, replaces the current input name.
 * NOTE(review): the control flow after the "expected integer" error and the
 * statements fetching the tokens are not visible in this excerpt.
 */
737 static void parse_line_directive(void)
739 if(pp_token.type != T_INTEGER) {
740 parse_error("expected integer");
742 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
745 if(pp_token.type == T_STRING_LITERAL) {
746 lexer_token.source_position.input_name = pp_token.v.string;
/**
 * Dispatches a preprocessor directive by the pp_ID of the identifier in
 * pp_token (which must be a T_IDENTIFIER token, asserted).
 * NOTE(review): the case labels are not visible in this excerpt; the two
 * visible actions are a printf() placeholder mentioning include and a call
 * to parse_line_directive().
 */
753 static void parse_preprocessor_identifier(void)
755 assert(pp_token.type == T_IDENTIFIER);
756 symbol_t *symbol = pp_token.v.symbol;
758 switch(symbol->pp_ID) {
760 printf("include - enable header name parsing!\n");
776 parse_line_directive();
/**
 * Handles a preprocessor directive according to the type of pp_token: it
 * either goes to parse_preprocessor_identifier(), to parse_line_directive()
 * or is reported as "invalid preprocessor directive".
 * NOTE(review): the case labels and the statement that fetches the token
 * are not visible in this excerpt.
 */
790 static void parse_preprocessor_directive(void)
794 switch(pp_token.type) {
796 parse_preprocessor_identifier();
799 parse_line_directive();
802 parse_error("invalid preprocessor directive");
/*
 * Helper macros for multi-character operator tokens, used by
 * lexer_next_preprocessing_token():
 *   MAYBE_PROLOG         - opening part of a MAYBE sequence
 *   MAYBE(ch, set_type)  - sets lexer_token.type to set_type (the comparison
 *                          against ch is not visible in this excerpt)
 *   ELSE_CODE(code)      - closing part; ends a while(1) block
 *   ELSE(set_type)       - closing part that sets lexer_token.type to
 *                          set_type
 * NOTE(review): most continuation lines of these macros are not visible in
 * this excerpt; confirm the details against the full definitions.
 */
808 #define MAYBE_PROLOG \
813 #define MAYBE(ch, set_type) \
816 lexer_token.type = set_type; \
819 #define ELSE_CODE(code) \
823 } /* end of while(1) */ \
826 #define ELSE(set_type) \
828 lexer_token.type = set_type; \
/**
 * Reads the next preprocessing token into lexer_token. Line endings are
 * delivered as '\n' tokens, string literals and character constants are
 * handed to their parsers, multi-character operators are recognised through
 * the MAYBE/ELSE macros, and comments are skipped by calling this function
 * again after the comment. Other characters become tokens whose type is
 * the character itself; the end of input yields T_EOF and an unknown
 * character yields T_ERROR.
 * NOTE(review): the switch header and the case labels are not visible in
 * this excerpt, so the operator each MAYBE line belongs to can only be
 * inferred from the token names.
 */
832 void lexer_next_preprocessing_token(void)
842 lexer_token.type = '\n';
848 /* might be a wide string ( L"string" ) */
849 if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
850 lexer_token.v.symbol == symbol_L)) {
851 parse_string_literal();
861 parse_string_literal();
865 parse_character_constant();
872 MAYBE('.', T_DOTDOTDOT)
876 lexer_token.type = '.';
883 MAYBE('=', T_ANDEQUAL)
887 MAYBE('=', T_ASTERISKEQUAL)
891 MAYBE('+', T_PLUSPLUS)
892 MAYBE('=', T_PLUSEQUAL)
896 MAYBE('>', T_MINUSGREATER)
897 MAYBE('-', T_MINUSMINUS)
898 MAYBE('=', T_MINUSEQUAL)
902 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
906 MAYBE('=', T_SLASHEQUAL)
/* a comment produces no token: skip it and lex the token that follows */
909 skip_multiline_comment();
910 lexer_next_preprocessing_token();
915 lexer_next_preprocessing_token();
920 MAYBE('>', T_PERCENTGREATER)
921 MAYBE('=', T_PERCENTEQUAL)
926 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
930 lexer_token.type = T_PERCENTCOLON;
937 MAYBE(':', T_LESSCOLON)
938 MAYBE('%', T_LESSPERCENT)
939 MAYBE('=', T_LESSEQUAL)
942 MAYBE('=', T_LESSLESSEQUAL)
947 MAYBE('=', T_GREATEREQUAL)
950 MAYBE('=', T_GREATERGREATEREQUAL)
951 ELSE(T_GREATERGREATER)
955 MAYBE('=', T_CARETEQUAL)
959 MAYBE('=', T_PIPEEQUAL)
960 MAYBE('|', T_PIPEPIPE)
964 MAYBE('>', T_COLONGREATER)
968 MAYBE('=', T_EQUALEQUAL)
972 MAYBE('#', T_HASHHASH)
/* single-character token: the character is its own token type */
986 lexer_token.type = c;
991 lexer_token.type = T_EOF;
997 fprintf(stderr, "unknown character '%c' found\n", c);
998 lexer_token.type = T_ERROR;
/**
 * Reads the next token for the parser. '\n' tokens are not handed out:
 * after a newline further tokens are read until one is not a newline, and
 * a '#' token found then starts a preprocessor directive that is handled
 * by parse_preprocessor_directive().
 * NOTE(review): the statements between the visible lines (early return,
 * loop header, what happens after a directive) are not visible in this
 * excerpt.
 */
1004 void lexer_next_token(void)
1006 lexer_next_preprocessing_token();
1007 if(lexer_token.type != '\n')
1012 lexer_next_preprocessing_token();
1013 } while(lexer_token.type == '\n');
1015 if(lexer_token.type == '#') {
1016 parse_preprocessor_directive();
/**
 * Initialises the lexer's string set. exit_lexer() destroys it again.
 */
1021 void init_lexer(void)
1023 strset_init(&stringset);
/**
 * Prepares the lexer for a new input: the line number is reset to 0, the
 * input name is recorded and the symbol "L" (used to recognise wide string
 * literals) is looked up.
 * NOTE(review): the statements that store stream and set up the buffer are
 * not visible in this excerpt.
 *
 * @param stream      input to read from
 * @param input_name  name used for this input in source positions
 */
1026 void lexer_open_stream(FILE *stream, const char *input_name)
1029 lexer_token.source_position.linenr = 0;
1030 lexer_token.source_position.input_name = input_name;
1032 symbol_L = symbol_table_insert("L");
1034 /* place a virtual \n at the beginning so the lexer knows that we're
1035 * at the beginning of a line */
/**
 * Destroys the string set that init_lexer() initialised.
 */
1039 void exit_lexer(void)
1041 strset_destroy(&stringset);
/**
 * Debugging helper: prints "<input_name>:<linenr>" and a newline to stdout.
 * Marked unused so that no warning is issued when nothing calls it.
 * NOTE(review): linenr is printed with %d, while error_prefix_at() takes
 * the line number as unsigned and prints it with %u -- confirm the type of
 * source_position_t.linenr. The end of this function lies beyond the
 * visible excerpt.
 *
 * @param source_position  position to print
 */
1044 static __attribute__((unused))
1045 void dbg_pos(const source_position_t source_position)
1047 fprintf(stdout, "%s:%d\n", source_position.input_name,
1048 source_position.linenr);