5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
23 static char buf[1024 + MAX_PUTBACK];
24 static const char *bufend;
25 static const char *bufpos;
26 static strset_t stringset;
/*
 * Writes the leading part of a diagnostic, "<input_name>:<linenr>: Error: ",
 * to stderr. No newline is printed; the caller appends the message text.
 *
 * input_name  name of the input the diagnostic refers to
 * linenr      line number the diagnostic refers to
 */
static void error_prefix_at(const char *input_name, unsigned linenr)
{
	fprintf(stderr,
	        "%s:%u: Error: ",
	        input_name,
	        linenr);
}
33 static void error_prefix(void)
35 error_prefix_at(lexer_token.source_position.input_name,
36 lexer_token.source_position.linenr);
39 static void parse_error(const char *msg)
42 fprintf(stderr, "%s\n", msg);
45 static inline void next_real_char(void)
48 if(bufpos >= bufend) {
49 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
55 bufpos = buf + MAX_PUTBACK;
56 bufend = buf + MAX_PUTBACK + s;
61 static inline void put_back(int pc)
63 assert(bufpos >= buf);
64 //assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
66 char *p = buf + (bufpos - buf);
69 /* going backwards in the buffer is legal as long as it's not more often
74 printf("putback '%c'\n", pc);
78 static inline void next_char(void);
80 #define MATCH_NEWLINE(code) \
86 lexer_token.source_position.linenr++; \
90 lexer_token.source_position.linenr++; \
93 #define eat(c_type) do { assert(c == c_type); next_char(); } while(0)
95 static void maybe_concat_lines(void)
100 MATCH_NEWLINE(return;)
110 static inline void next_char(void)
114 /* filter trigraphs */
115 if(UNLIKELY(c == '\\')) {
116 maybe_concat_lines();
117 goto end_of_next_char;
121 goto end_of_next_char;
124 if(LIKELY(c != '?')) {
127 goto end_of_next_char;
132 case '=': c = '#'; break;
133 case '(': c = '['; break;
134 case '/': c = '\\'; maybe_concat_lines(); break;
135 case ')': c = ']'; break;
136 case '\'': c = '^'; break;
137 case '<': c = '{'; break;
138 case '!': c = '|'; break;
139 case '>': c = '}'; break;
140 case '-': c = '~'; break;
150 printf("nchar '%c'\n", c);
154 #define SYMBOL_CHARS \
221 static void parse_symbol(void)
226 obstack_1grow(&symbol_obstack, c);
233 obstack_1grow(&symbol_obstack, c);
243 obstack_1grow(&symbol_obstack, '\0');
245 string = obstack_finish(&symbol_obstack);
246 symbol = symbol_table_insert(string);
248 lexer_token.type = symbol->ID;
249 lexer_token.v.symbol = symbol;
251 if(symbol->string != string) {
252 obstack_free(&symbol_obstack, string);
256 static void parse_integer_suffix(void)
258 if(c == 'U' || c == 'U') {
259 /* TODO do something with the suffixes... */
261 if(c == 'L' || c == 'l') {
263 if(c == 'L' || c == 'l') {
267 } else if(c == 'l' || c == 'L') {
269 if(c == 'l' || c == 'L') {
271 if(c == 'u' || c == 'U') {
274 } else if(c == 'u' || c == 'U') {
280 static void parse_floating_suffix(void)
283 /* TODO: do something useful with the suffixes... */
/*
 * Tells whether c is a hexadecimal digit, i.e. one of '0'-'9', 'a'-'f' or
 * 'A'-'F'.
 *
 * Letters past 'f'/'F' are rejected, so a loop that consumes hex digits
 * stops in front of characters such as 'p' or 'P' instead of swallowing
 * them.
 *
 * Returns true for a hexadecimal digit, false for anything else.
 */
static inline bool is_hex_digit(int c)
{
	return (c >= '0' && c <= '9')
		|| (c >= 'a' && c <= 'f')
		|| (c >= 'A' && c <= 'F');
}
302 static void parse_number_hex(void)
304 assert(c == 'x' || c == 'X');
307 while(is_hex_digit(c)) {
308 obstack_1grow(&symbol_obstack, c);
311 obstack_1grow(&symbol_obstack, '\0');
312 char *string = obstack_finish(&symbol_obstack);
314 if(c == '.' || c == 'p' || c == 'P') {
316 panic("Hex floating point numbers not implemented yet");
318 if(*string == '\0') {
319 parse_error("invalid hex number");
320 lexer_token.type = T_ERROR;
324 lexer_token.type = T_INTEGER;
325 lexer_token.v.intvalue = strtoll(string, &endptr, 16);
326 if(*endptr != '\0') {
327 parse_error("hex number literal too long");
330 obstack_free(&symbol_obstack, string);
/*
 * Tells whether chr is an octal digit ('0' through '7').
 *
 * Returns true for an octal digit, false for anything else.
 */
static inline bool is_octal_digit(int chr)
{
	if (chr < '0') {
		return false;
	}
	return chr <= '7';
}
338 static void parse_number_oct(void)
340 while(is_octal_digit(c)) {
341 obstack_1grow(&symbol_obstack, c);
344 obstack_1grow(&symbol_obstack, '\0');
345 char *string = obstack_finish(&symbol_obstack);
348 lexer_token.type = T_INTEGER;
349 lexer_token.v.intvalue = strtoll(string, &endptr, 8);
350 if(*endptr != '\0') {
351 parse_error("octal number literal too long");
354 obstack_free(&symbol_obstack, string);
355 parse_integer_suffix();
358 static void parse_number_dec(void)
360 bool is_float = false;
362 obstack_1grow(&symbol_obstack, c);
367 obstack_1grow(&symbol_obstack, '.');
371 obstack_1grow(&symbol_obstack, c);
376 if(c == 'e' || c == 'E') {
377 obstack_1grow(&symbol_obstack, 'e');
380 if(c == '-' || c == '+') {
381 obstack_1grow(&symbol_obstack, c);
386 obstack_1grow(&symbol_obstack, c);
392 obstack_1grow(&symbol_obstack, '\0');
393 char *string = obstack_finish(&symbol_obstack);
397 lexer_token.type = T_FLOATINGPOINT;
398 lexer_token.v.floatvalue = strtold(string, &endptr);
400 if(*endptr != '\0') {
401 parse_error("invalid number literal");
404 parse_floating_suffix();
406 lexer_token.type = T_INTEGER;
407 lexer_token.v.intvalue = strtoll(string, &endptr, 10);
409 if(*endptr != '\0') {
410 parse_error("invalid number literal");
413 parse_integer_suffix();
415 obstack_free(&symbol_obstack, string);
418 static void parse_number(void)
440 parse_error("invalid octal number");
441 lexer_token.type = T_ERROR;
447 obstack_1grow(&symbol_obstack, '0');
456 static int parse_octal_sequence(const int first_digit)
458 assert(is_octal_digit(first_digit));
459 int value = first_digit - '0';
460 if (!is_octal_digit(c)) return value;
461 value = 8 * value + c - '0';
463 if (!is_octal_digit(c)) return value;
464 value = 8 * value + c - '0';
469 static int parse_hex_sequence(void)
473 if (c >= '0' && c <= '9') {
474 value = 16 * value + c - '0';
475 } else if ('A' <= c && c <= 'F') {
476 value = 16 * value + c - 'A' + 10;
477 } else if ('a' <= c && c <= 'f') {
478 value = 16 * value + c - 'a' + 10;
488 static int parse_escape_sequence(void)
496 case '"': return '"';
497 case '\'': return '\'';
498 case '\\': return '\\';
499 case '?': return '\?';
500 case 'a': return '\a';
501 case 'b': return '\b';
502 case 'f': return '\f';
503 case 'n': return '\n';
504 case 'r': return '\r';
505 case 't': return '\t';
506 case 'v': return '\v';
508 return parse_hex_sequence();
517 return parse_octal_sequence(ec);
519 parse_error("reached end of file while parsing escape sequence");
522 parse_error("unknown escape sequence");
527 const char *concat_strings(const char *s1, const char *s2)
529 size_t len1 = strlen(s1);
530 size_t len2 = strlen(s2);
532 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
533 memcpy(concat, s1, len1);
534 memcpy(concat + len1, s2, len2 + 1);
536 const char *result = strset_insert(&stringset, concat);
537 if(result != concat) {
538 obstack_free(&symbol_obstack, concat);
544 static void parse_string_literal(void)
546 unsigned start_linenr = lexer_token.source_position.linenr;
557 tc = parse_escape_sequence();
558 obstack_1grow(&symbol_obstack, tc);
562 error_prefix_at(lexer_token.source_position.input_name,
564 fprintf(stderr, "string has no end\n");
565 lexer_token.type = T_ERROR;
573 obstack_1grow(&symbol_obstack, c);
581 /* TODO: concatenate multiple strings separated by whitespace... */
583 /* add finishing 0 to the string */
584 obstack_1grow(&symbol_obstack, '\0');
585 string = obstack_finish(&symbol_obstack);
587 /* check if there is already a copy of the string */
588 result = strset_insert(&stringset, string);
589 if(result != string) {
590 obstack_free(&symbol_obstack, string);
593 lexer_token.type = T_STRING_LITERAL;
594 lexer_token.v.string = result;
597 static void parse_character_constant(void)
605 found_char = parse_escape_sequence();
609 parse_error("newline while parsing character constant");
615 goto end_of_char_constant;
618 parse_error("EOF while parsing character constant");
619 lexer_token.type = T_ERROR;
623 if(found_char != 0) {
624 parse_error("more than 1 characters in character "
626 goto end_of_char_constant;
635 end_of_char_constant:
636 lexer_token.type = T_INTEGER;
637 lexer_token.v.intvalue = found_char;
640 static void skip_multiline_comment(void)
642 unsigned start_linenr = lexer_token.source_position.linenr;
654 MATCH_NEWLINE(break;)
657 error_prefix_at(lexer_token.source_position.input_name,
659 fprintf(stderr, "at end of file while looking for comment end\n");
669 static void skip_line_comment(void)
687 static token_t pp_token;
689 static inline void next_pp_token(void)
691 lexer_next_preprocessing_token();
692 pp_token = lexer_token;
695 static void eat_until_newline(void)
697 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
702 static void error_directive(void)
705 fprintf(stderr, "#error directive: \n");
707 /* parse pp-tokens until new-line */
710 static void define_directive(void)
712 lexer_next_preprocessing_token();
713 if(lexer_token.type != T_IDENTIFIER) {
714 parse_error("expected identifier after #define\n");
719 static void ifdef_directive(int is_ifndef)
722 lexer_next_preprocessing_token();
723 //expect_identifier();
727 static void endif_directive(void)
732 static void parse_line_directive(void)
734 if(pp_token.type != T_INTEGER) {
735 parse_error("expected integer");
737 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
740 if(pp_token.type == T_STRING_LITERAL) {
741 lexer_token.source_position.input_name = pp_token.v.string;
748 static void parse_preprocessor_identifier(void)
750 assert(pp_token.type == T_IDENTIFIER);
751 symbol_t *symbol = pp_token.v.symbol;
753 switch(symbol->pp_ID) {
755 printf("include - enable header name parsing!\n");
771 parse_line_directive();
785 static void parse_preprocessor_directive(void)
789 switch(pp_token.type) {
791 parse_preprocessor_identifier();
794 parse_line_directive();
797 parse_error("invalid preprocessor directive");
803 #define MAYBE_PROLOG \
808 #define MAYBE(ch, set_type) \
811 lexer_token.type = set_type; \
814 #define ELSE_CODE(code) \
818 } /* end of while(1) */ \
821 #define ELSE(set_type) \
823 lexer_token.type = set_type; \
827 void lexer_next_preprocessing_token(void)
837 lexer_token.type = '\n';
843 /* might be a wide string ( L"string" ) */
844 if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
845 lexer_token.v.symbol == symbol_L)) {
846 parse_string_literal();
856 parse_string_literal();
860 parse_character_constant();
867 MAYBE('.', T_DOTDOTDOT)
871 lexer_token.type = '.';
878 MAYBE('=', T_ANDEQUAL)
882 MAYBE('=', T_ASTERISKEQUAL)
886 MAYBE('+', T_PLUSPLUS)
887 MAYBE('=', T_PLUSEQUAL)
891 MAYBE('>', T_MINUSGREATER)
892 MAYBE('-', T_MINUSMINUS)
893 MAYBE('=', T_MINUSEQUAL)
897 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
901 MAYBE('=', T_SLASHEQUAL)
904 skip_multiline_comment();
905 lexer_next_preprocessing_token();
910 lexer_next_preprocessing_token();
915 MAYBE('>', T_PERCENTGREATER)
916 MAYBE('=', T_PERCENTEQUAL)
921 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
925 lexer_token.type = T_PERCENTCOLON;
932 MAYBE(':', T_LESSCOLON)
933 MAYBE('%', T_LESSPERCENT)
934 MAYBE('=', T_LESSEQUAL)
937 MAYBE('=', T_LESSLESSEQUAL)
942 MAYBE('=', T_GREATEREQUAL)
945 MAYBE('=', T_GREATERGREATEREQUAL)
946 ELSE(T_GREATERGREATER)
950 MAYBE('=', T_CARETEQUAL)
954 MAYBE('=', T_PIPEEQUAL)
955 MAYBE('|', T_PIPEPIPE)
959 MAYBE('>', T_COLONGREATER)
963 MAYBE('=', T_EQUALEQUAL)
967 MAYBE('#', T_HASHHASH)
981 lexer_token.type = c;
986 lexer_token.type = T_EOF;
992 fprintf(stderr, "unknown character '%c' found\n", c);
993 lexer_token.type = T_ERROR;
999 void lexer_next_token(void)
1001 lexer_next_preprocessing_token();
1002 if(lexer_token.type != '\n')
1007 lexer_next_preprocessing_token();
1008 } while(lexer_token.type == '\n');
1010 if(lexer_token.type == '#') {
1011 parse_preprocessor_directive();
1016 void init_lexer(void)
1018 strset_init(&stringset);
1021 void lexer_open_stream(FILE *stream, const char *input_name)
1024 lexer_token.source_position.linenr = 0;
1025 lexer_token.source_position.input_name = input_name;
1027 symbol_L = symbol_table_insert("L");
1029 /* place a virtual \n at the beginning so the lexer knows that we're
1030 * at the beginning of a line */
1034 void exit_lexer(void)
1036 strset_destroy(&stringset);
1039 static __attribute__((unused))
1040 void dbg_pos(const source_position_t source_position)
1042 fprintf(stdout, "%s:%d\n", source_position.input_name,
1043 source_position.linenr);