5 #include "symbol_table_t.h"
7 #include "adt/strset.h"
/* Input buffer. The first MAX_PUTBACK bytes are kept free; file data is
 * read to buf + MAX_PUTBACK (see next_real_char). */
21 static char buf[1024 + MAX_PUTBACK];
/* One past the last valid byte currently held in buf. */
22 static const char *bufend;
/* Current read position inside buf. */
23 static const char *bufpos;
/* Set of strings built by the lexer; used to share identical copies. */
24 static strset_t stringset;
25 //static FILE **input_stack;
26 //static char **buf_stack;
/* Writes the diagnostic prefix "<input_name>:<linenr>: Error: " to stderr.
 * No newline is written. */
28 static void error_prefix_at(const char *input_name, unsigned linenr)
30 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
/* Writes the diagnostic prefix for the source position stored in
 * lexer_token. */
33 static void error_prefix(void)
35 error_prefix_at(lexer_token.source_position.input_name,
36 lexer_token.source_position.linenr);
/* Reports a lexer error on stderr. msg is printed followed by a newline,
 * so callers pass the message without a trailing newline. */
39 static void parse_error(const char *msg)
42 fprintf(stderr, "%s\n", msg);
/* Low-level character fetch. When the read position has reached bufend the
 * buffer is refilled with fread; the data is placed behind the first
 * MAX_PUTBACK bytes and bufpos/bufend are reset to the new contents.
 * NOTE(review): presumably the reserved bytes give put_back room to step
 * backwards - confirm. */
45 static inline void next_real_char(void)
48 if(bufpos >= bufend) {
49 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
55 bufpos = buf + MAX_PUTBACK;
56 bufend = buf + MAX_PUTBACK + s;
/* Puts the character pc back into the input.
 * pc: the character to put back. */
61 static inline void put_back(int pc)
/* bufpos is a pointer to const; the cast yields a writable pointer to the
 * slot directly before the current read position. */
63 char *p = (char*) bufpos - 1;
/* Trace output on stdout. */
69 printf("putback '%c'\n", pc);
/* Forward declaration; next_char is defined further down. */
73 static inline void next_char(void);
/* MATCH_NEWLINE(code): newline handling shared by several scanners. Both
 * visible branches increment the current line number; `code` is a statement
 * supplied at the use site (e.g. `return;` or `break;`). */
75 #define MATCH_NEWLINE(code) \
81 lexer_token.source_position.linenr++; \
85 lexer_token.source_position.linenr++; \
/* Called by next_char after a backslash has been read, either directly or
 * as the result of a trigraph. Returns from within MATCH_NEWLINE when a
 * newline is matched.
 * NOTE(review): presumably implements backslash-newline line splicing -
 * confirm. */
88 static void maybe_concat_lines(void)
92 MATCH_NEWLINE(return;)
/* Character fetch with preprocessing: a backslash is handed to
 * maybe_concat_lines, and trigraph sequences are replaced by the character
 * they stand for (see the mapping in the switch below). */
102 static inline void next_char(void)
106 /* filter trigraphs */
107 if(UNLIKELY(c == '\\')) {
108 maybe_concat_lines();
109 goto end_of_next_char;
113 goto end_of_next_char;
/* Common case: no '?', so no trigraph can start here. */
116 if(LIKELY(c != '?')) {
119 goto end_of_next_char;
/* Mapping from the last character of a trigraph to its replacement. */
124 case '=': c = '#'; break;
125 case '(': c = '['; break;
/* The backslash trigraph gets the same treatment as a literal backslash. */
126 case '/': c = '\\'; maybe_concat_lines(); break;
127 case ')': c = ']'; break;
128 case '\'': c = '^'; break;
129 case '<': c = '{'; break;
130 case '!': c = '|'; break;
131 case '>': c = '}'; break;
132 case '-': c = '~'; break;
/* Trace output on stdout. */
142 printf("nchar '%c'\n", c);
/* NOTE(review): presumably the list of characters that may appear in a
 * symbol; the expansion is not shown here - confirm. */
148 #define SYMBOL_CHARS \
/* Scans a symbol. The characters are collected on symbol_obstack, the
 * finished string is looked up in / added to the symbol table, and
 * lexer_token is set to the symbol's ID with the symbol as its value. */
215 static void parse_symbol(void)
220 obstack_1grow(&symbol_obstack, c);
227 obstack_1grow(&symbol_obstack, c);
237 obstack_1grow(&symbol_obstack, '\0');
239 string = obstack_finish(&symbol_obstack);
240 symbol = symbol_table_insert(string);
242 lexer_token.type = symbol->ID;
243 lexer_token.v.symbol = symbol;
/* The symbol table already owned a copy of this name; release the
 * string that was just built. */
245 if(symbol->string != string) {
246 obstack_free(&symbol_obstack, string);
/* Scans a hexadecimal integer literal. Must be entered with c on the
 * 'x'/'X' of the prefix. On success lexer_token becomes T_INTEGER carrying
 * the value; if no hex digit is found the error
 * "premature end of hex number literal" is reported and the token type is
 * T_ERROR. */
250 static void parse_number_hex(void)
252 assert(c == 'x' || c == 'X');
256 !('A' <= c && c <= 'F') &&
257 !('a' <= c && c <= 'f')) {
258 parse_error("premature end of hex number literal");
259 lexer_token.type = T_ERROR;
/* Accumulate the value digit by digit, base 16. */
266 value = 16 * value + c - '0';
267 } else if ('A' <= c && c <= 'F') {
268 value = 16 * value + c - 'A' + 10;
269 } else if ('a' <= c && c <= 'f') {
270 value = 16 * value + c - 'a' + 10;
272 lexer_token.type = T_INTEGER;
273 lexer_token.v.intvalue = value;
/* Scans an octal integer literal. Digits 0-7 are accumulated base 8; an
 * '8' or '9' is reported as "invalid octal number" and yields T_ERROR.
 * Otherwise lexer_token becomes T_INTEGER carrying the value. */
280 static void parse_number_oct(void)
284 if ('0' <= c && c <= '7') {
285 value = 8 * value + c - '0';
286 } else if (c == '8' || c == '9') {
287 parse_error("invalid octal number");
288 lexer_token.type = T_ERROR;
291 lexer_token.type = T_INTEGER;
292 lexer_token.v.intvalue = value;
/* Scans a decimal integer literal: digits are accumulated base 10 and
 * lexer_token becomes T_INTEGER carrying the value. */
299 static void parse_number_dec(void)
305 value = 10 * value + c - '0';
307 lexer_token.type = T_INTEGER;
308 lexer_token.v.intvalue = value;
/* Recognizes an optional integer suffix after a number literal. Accepted
 * forms, in either letter case: u, ul, ull, l, ll, lu, llu. The suffix is
 * currently only recognized, not evaluated (see TODO). */
315 static void parse_integer_suffix(void)
/* Both cases of the letter are accepted, matching the tests below. */
317 if(c == 'u' || c == 'U') {
318 /* TODO do something with the suffixes... */
320 if(c == 'L' || c == 'l') {
322 if(c == 'L' || c == 'l') {
326 } else if(c == 'l' || c == 'L') {
328 if(c == 'l' || c == 'L') {
330 if(c == 'u' || c == 'U') {
333 } else if(c == 'u' || c == 'U') {
/* Scans an integer literal: dispatches to the hexadecimal or octal scanner
 * depending on the character after the prefix, then handles an optional
 * integer suffix. */
339 static void parse_number(void)
345 case 'x': parse_number_hex(); break;
346 default: parse_number_oct(); break;
352 parse_integer_suffix();
/* Evaluates the octal digits of an escape sequence: digits are accumulated
 * base 8 and the first character outside '0'..'7' ends the sequence.
 * Returns an int (NOTE(review): presumably the accumulated value -
 * confirm). */
355 static int parse_octal_sequence(void)
359 if(c < '0' || c > '7')
361 value = 8 * value + c - '0';
/* Evaluates the hex digits of an escape sequence: digits 0-9, A-F and a-f
 * are accumulated base 16.
 * Returns an int (NOTE(review): presumably the accumulated value -
 * confirm). */
368 static int parse_hex_sequence(void)
372 if (c >= '0' && c <= '9') {
373 value = 16 * value + c - '0';
374 } else if ('A' <= c && c <= 'F') {
375 value = 16 * value + c - 'A' + 10;
376 } else if ('a' <= c && c <= 'f') {
377 value = 16 * value + c - 'a' + 10;
/* Decodes one escape sequence and returns the character value it denotes.
 * Simple escapes are mapped directly; numeric escapes are delegated to
 * parse_hex_sequence and parse_octal_sequence. End of file inside the
 * escape and unknown escapes are reported through parse_error. */
387 static int parse_escape_sequence(void)
394 case '"': return '"';
395 case '\'': return'\'';
396 case '\\': return '\\';
397 case '?': return '\?';
398 case 'a': return '\a';
399 case 'b': return '\b';
400 case 'f': return '\f';
401 case 'n': return '\n';
402 case 'r': return '\r';
403 case 't': return '\t';
404 case 'v': return '\v';
406 return parse_hex_sequence();
415 return parse_octal_sequence();
417 parse_error("reached end of file while parsing escape sequence");
420 parse_error("unknown escape sequence");
/* Builds the concatenation of s1 and s2 on symbol_obstack and enters it
 * into the string set.
 * s1, s2: NUL-terminated strings.
 * NOTE(review): presumably returns the string-set entry (`result`) -
 * confirm. */
426 const char *concat_strings(const char *s1, const char *s2)
428 size_t len1 = strlen(s1);
429 size_t len2 = strlen(s2);
431 char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
432 memcpy(concat, s1, len1);
/* len2 + 1 also copies the terminating NUL of s2. */
433 memcpy(concat + len1, s2, len2 + 1);
435 const char *result = strset_insert(&stringset, concat);
/* The set returned a different pointer; release the string that was
 * just built. */
436 if(result != concat) {
437 obstack_free(&symbol_obstack, concat);
/* Scans a string literal. Characters are collected on symbol_obstack with
 * escape sequences decoded by parse_escape_sequence; the finished string is
 * entered into the string set and lexer_token becomes T_STRING_LITERAL.
 * An unterminated string is reported as "string has no end" and yields
 * T_ERROR. */
443 static void parse_string_literal(void)
/* Line on which the literal starts, saved before scanning moves on. */
445 unsigned start_linenr = lexer_token.source_position.linenr;
456 int ec = parse_escape_sequence();
457 obstack_1grow(&symbol_obstack, ec);
461 error_prefix_at(lexer_token.source_position.input_name,
463 fprintf(stderr, "string has no end\n");
464 lexer_token.type = T_ERROR;
472 obstack_1grow(&symbol_obstack, c);
480 /* TODO: concatenate multiple strings separated by whitespace... */
482 /* add finishing 0 to the string */
483 obstack_1grow(&symbol_obstack, '\0');
484 string = obstack_finish(&symbol_obstack);
486 /* check if there is already a copy of the string */
487 result = strset_insert(&stringset, string);
488 if(result != string) {
489 obstack_free(&symbol_obstack, string);
492 lexer_token.type = T_STRING_LITERAL;
493 lexer_token.v.string = result;
/* Scans a character constant. The result is delivered as a T_INTEGER token
 * whose value is the character found; escape sequences are decoded by
 * parse_escape_sequence. A newline, end of file, or more than one character
 * inside the constant is reported through parse_error. */
496 static void parse_character_constant(void)
506 found_char = parse_escape_sequence();
510 parse_error("newline while parsing character constant");
516 goto end_of_char_constant;
519 parse_error("EOF while parsing character constant");
520 lexer_token.type = T_ERROR;
/* A nonzero found_char means a character has already been stored. */
524 if(found_char != 0) {
525 parse_error("more than 1 characters in character "
527 goto end_of_char_constant;
536 end_of_char_constant:
537 lexer_token.type = T_INTEGER;
538 lexer_token.v.intvalue = found_char;
/* Skips a multi-line comment. Newlines inside the comment go through
 * MATCH_NEWLINE so the line count stays correct. Reaching end of file
 * before the comment ends is reported on stderr. */
541 static void skip_multiline_comment(void)
/* Line on which the comment starts, saved before scanning moves on. */
543 unsigned start_linenr = lexer_token.source_position.linenr;
555 MATCH_NEWLINE(break;)
558 error_prefix_at(lexer_token.source_position.input_name,
560 fprintf(stderr, "at end of file while looking for comment end\n");
/* NOTE(review): presumably skips the rest of a line comment; the body is
 * not shown here - confirm. */
570 static void skip_line_comment(void)
/* Current token of the directive parser; next_pp_token stores a copy of
 * lexer_token here. */
588 static token_t pp_token;
/* Reads the next preprocessing token and copies it into pp_token. */
590 static inline void next_pp_token(void)
592 lexer_next_preprocessing_token();
593 pp_token = lexer_token;
/* Loops for as long as pp_token is neither a '\n' token nor T_EOF. */
596 static void eat_until_newline(void)
598 while(pp_token.type != '\n' && pp_token.type != T_EOF) {
/* Handles an #error directive: prints "#error directive: " and a newline
 * on stderr. */
603 static void error_directive(void)
606 fprintf(stderr, "#error directive: \n");
608 /* parse pp-tokens until new-line */
/* Handles a #define directive: reads the next preprocessing token and
 * reports an error if it is not an identifier. */
611 static void define_directive(void)
613 lexer_next_preprocessing_token();
614 if(lexer_token.type != T_IDENTIFIER) {
/* parse_error appends the newline itself, so the message carries none. */
615 parse_error("expected identifier after #define");
/* Handles #ifdef / #ifndef: reads the next preprocessing token. The
 * identifier check is currently commented out.
 * is_ifndef: NOTE(review): presumably nonzero for #ifndef - confirm. */
620 static void ifdef_directive(int is_ifndef)
623 lexer_next_preprocessing_token();
624 //expect_identifier();
/* NOTE(review): presumably handles #endif; the body is not shown here -
 * confirm. */
628 static void endif_directive(void)
/* Handles a line directive: an integer token sets the current line number,
 * and a following string literal sets the current input name. A missing
 * integer is reported as "expected integer". */
633 static void parse_line_directive(void)
635 if(pp_token.type != T_INTEGER) {
636 parse_error("expected integer");
/* NOTE(review): the -1 presumably compensates for the increment caused by
 * the newline that ends the directive - confirm. */
638 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
641 if(pp_token.type == T_STRING_LITERAL) {
642 lexer_token.source_position.input_name = pp_token.v.string;
/* Dispatches a directive that starts with an identifier, selected by the
 * symbol's pp_ID. Must be entered with pp_token being a T_IDENTIFIER. */
649 static void parse_preprocessor_identifier(void)
651 assert(pp_token.type == T_IDENTIFIER);
652 symbol_t *symbol = pp_token.v.symbol;
654 switch(symbol->pp_ID) {
/* Placeholder output on stdout for the include directive. */
656 printf("include - enable header name parsing!\n");
672 parse_line_directive();
/* Parses one preprocessor directive, dispatching on the type of pp_token
 * to parse_preprocessor_identifier or parse_line_directive. Anything else
 * is reported as "invalid preprocessor directive". */
686 static void parse_preprocessor_directive(void)
690 switch(pp_token.type) {
692 parse_preprocessor_identifier();
695 parse_line_directive();
698 parse_error("invalid preprocessor directive");
/* Helper macros used by lexer_next_preprocessing_token for operators that
 * share a first character. MAYBE(ch, set_type) sets the token type to
 * set_type for the continuation character ch; ELSE(set_type) sets the token
 * type for the fallback case; ELSE_CODE(code) takes arbitrary fallback
 * code. MAYBE_PROLOG opens the construct (its ELSE_CODE counterpart closes
 * a while(1)). */
704 #define MAYBE_PROLOG \
709 #define MAYBE(ch, set_type) \
712 lexer_token.type = set_type; \
715 #define ELSE_CODE(code) \
719 } /* end of while(1) */ \
722 #define ELSE(set_type) \
724 lexer_token.type = set_type; \
/* Reads the next preprocessing token into lexer_token.
 * Newlines are delivered as '\n' tokens and end of input as T_EOF. String
 * literals and character constants are handled by their own scanners, and
 * multi-character operators are resolved through the MAYBE/ELSE macros.
 * After a comment has been skipped the function calls itself to deliver the
 * token that follows. An unknown character is reported on stderr and yields
 * T_ERROR. */
728 void lexer_next_preprocessing_token(void)
738 lexer_token.type = '\n';
751 parse_string_literal();
755 parse_character_constant();
762 MAYBE('.', T_DOTDOTDOT)
766 lexer_token.type = '.';
773 MAYBE('=', T_ANDEQUAL)
777 MAYBE('=', T_ASTERISKEQUAL)
781 MAYBE('+', T_PLUSPLUS)
782 MAYBE('=', T_PLUSEQUAL)
786 MAYBE('>', T_MINUSGREATER)
787 MAYBE('-', T_MINUSMINUS)
788 MAYBE('=', T_MINUSEQUAL)
792 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
796 MAYBE('=', T_SLASHEQUAL)
799 skip_multiline_comment();
800 lexer_next_preprocessing_token();
805 lexer_next_preprocessing_token();
810 MAYBE('>', T_PERCENTGREATER)
811 MAYBE('=', T_PERCENTEQUAL)
816 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
820 lexer_token.type = T_PERCENTCOLON;
827 MAYBE(':', T_LESSCOLON)
828 MAYBE('%', T_LESSPERCENT)
829 MAYBE('=', T_LESSEQUAL)
832 MAYBE('=', T_LESSLESSEQUAL)
837 MAYBE('=', T_GREATEREQUAL)
840 MAYBE('=', T_GREATERGREATEREQUAL)
841 ELSE(T_GREATERGREATER)
845 MAYBE('=', T_CARETEQUAL)
849 MAYBE('=', T_PIPEEQUAL)
850 MAYBE('|', T_PIPEPIPE)
854 MAYBE('>', T_COLONGREATER)
858 MAYBE('=', T_EQUALEQUAL)
862 MAYBE('#', T_HASHHASH)
/* Single-character tokens use the character itself as token type. */
876 lexer_token.type = c;
881 lexer_token.type = T_EOF;
887 fprintf(stderr, "unknown character '%c' found\n", c);
888 lexer_token.type = T_ERROR;
/* Reads the next token on top of lexer_next_preprocessing_token. Runs of
 * '\n' tokens are consumed, and a '#' token found after them is handed to
 * parse_preprocessor_directive. */
894 void lexer_next_token(void)
896 lexer_next_preprocessing_token();
897 if(lexer_token.type != '\n')
/* Consume consecutive newline tokens. */
902 lexer_next_preprocessing_token();
903 } while(lexer_token.type == '\n');
905 if(lexer_token.type == '#') {
906 parse_preprocessor_directive();
/* Initializes the lexer's string set; exit_lexer destroys it again. */
911 void init_lexer(void)
913 strset_init(&stringset);
/* Starts lexing a new input: the source position is reset to line 1 of
 * input_name.
 * stream:     input to read from.
 * input_name: name used in source positions and diagnostics; the pointer
 *             itself is stored, not a copy. */
916 void lexer_open_stream(FILE *stream, const char *input_name)
919 lexer_token.source_position.linenr = 1;
920 lexer_token.source_position.input_name = input_name;
/* Destroys the string set created by init_lexer. */
925 void exit_lexer(void)
927 strset_destroy(&stringset);
/* Debug helper: prints source_position as "<input_name>:<linenr>" on
 * stdout. Marked unused because it is meant to be called on demand only. */
930 static __attribute__((unused))
931 void dbg_pos(const source_position_t source_position)
/* %u matches the unsigned line number, as in error_prefix_at. */
933 fprintf(stdout, "%s:%u\n", source_position.input_name,
934 source_position.linenr);