5 #include "symbol_table_t.h"
/**
 * Writes the "<input_name>:<linenr>: Error: " prefix of a diagnostic to
 * stderr. No newline is emitted, so the caller appends the message text.
 *
 * @param this        lexer the diagnostic belongs to (not referenced on the
 *                    lines shown)
 * @param input_name  name of the input being lexed
 * @param linenr      line number to report
 */
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
/* linenr is unsigned, so it has to be formatted with %u rather than %d */
20 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
/**
 * Prints the diagnostic prefix for the lexer's current source position by
 * handing its input name and line number to error_prefix_at().
 */
24 void error_prefix(lexer_t *this)
26 error_prefix_at(this, this->source_position.input_name,
27 this->source_position.linenr);
/**
 * Reports a lexing error. The line shown writes msg followed by a newline to
 * stderr.
 * NOTE(review): presumably the position prefix is printed first via
 * error_prefix() -- that call is not visible here, confirm.
 */
31 void parse_error(lexer_t *this, const char *msg)
34 fprintf(stderr, "%s\n", msg);
/**
 * Loads the character at the current buffer position into this->c, refilling
 * the buffer from this->input first when it has been used up.
 */
38 void next_char(lexer_t *this)
/* read position reached the end of the buffered data: refill */
41 if(this->bufpos >= this->bufend) {
/* Data is read behind the first MAX_PUTBACK bytes of buf; presumably that
 * gap is what gives put_back() room in front of freshly read data. */
42 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43 sizeof(this->buf) - MAX_PUTBACK, this->input);
/* NOTE(review): handling of s == 0 (end of file or read error) is not
 * visible here. */
48 this->bufpos = this->buf + MAX_PUTBACK;
49 this->bufend = this->buf + MAX_PUTBACK + s;
51 this->c = *(this->bufpos);
/* trace output of the character just loaded */
53 printf("nchar '%c'\n", this->c);
/**
 * Pushes character c back in front of the current read position.
 * NOTE(review): the store of c and the update of bufpos are not visible
 * here -- presumably c is written to p and bufpos is moved to p.
 */
58 void put_back(lexer_t *this, int c)
/* slot directly in front of the current read position */
60 char *p = (char*) this->bufpos - 1;
/* must not move past the start of the buffer */
62 assert(p >= this->buf);
/* trace output of the character being put back */
66 printf("putback '%c'\n", c);
/**
 * Trigraph replacement. Each MATCH_TRIGRAPH(ch, replacement) entry pairs the
 * final character of a trigraph with the character it stands for and stores
 * the replacement in this->c. The table below lists the nine pairs:
 * = -> #, ( -> [, / -> \, ) -> ], ' -> ^, < -> {, ! -> |, > -> }, - -> ~.
 *
 * Callers test the result as a boolean (if(replace_trigraph(this))).
 * NOTE(review): the return statements are not visible here -- presumably
 * non-zero means a replacement took place.
 */
71 int replace_trigraph(lexer_t *this)
73 #define MATCH_TRIGRAPH(ch,replacement) \
75 this->c = replacement; \
79 MATCH_TRIGRAPH('=', '#')
80 MATCH_TRIGRAPH('(', '[')
81 MATCH_TRIGRAPH('/', '\\')
82 MATCH_TRIGRAPH(')', ']')
83 MATCH_TRIGRAPH('\'', '^')
84 MATCH_TRIGRAPH('<', '{')
85 MATCH_TRIGRAPH('!', '|')
86 MATCH_TRIGRAPH('>', '}')
87 MATCH_TRIGRAPH('-', '~')
/* SKIP_TRIGRAPHS(no_trigraph_code): trigraph handling shared by lexing code.
 * Visible parts: when the current character is not '?', it is put back;
 * otherwise replace_trigraph() is tried, and on another path a '?' and the
 * current character are both put back.
 * NOTE(review): where no_trigraph_code is expanded is not visible here --
 * presumably on the paths where no trigraph was recognised. */
95 #define SKIP_TRIGRAPHS(no_trigraph_code) \
98 if(this->c != '?') { \
99 put_back(this, this->c); \
104 if(replace_trigraph(this)) \
106 put_back(this, '?'); \
107 put_back(this, this->c); \
/* EAT_NEWLINE(newline_code): line-ending handling. A '\r' (with a check for
 * a following '\n') and a lone '\n' each increment
 * this->source_position.linenr.
 * NOTE(review): where newline_code is expanded is not visible here --
 * presumably after a line ending has been consumed. */
111 #define EAT_NEWLINE(newline_code) \
112 if(this->c == '\r') { \
114 if(this->c == '\n') \
116 this->source_position.linenr++; \
118 } else if(this->c == '\n') { \
120 this->source_position.linenr++; \
/**
 * Lexes a symbol into token. The characters are collected on symbol_obstack,
 * the finished string is entered into the symbol table with
 * symbol_table_insert(), and token->v.symbol is set to the resulting symbol.
 * token->type becomes either the symbol's own ID or T_IDENTIFIER.
 */
125 void parse_symbol(lexer_t *this, token_t *token)
130 obstack_1grow(&symbol_obstack, this->c);
143 obstack_1grow(&symbol_obstack, this->c);
150 put_back(this, this->c);
/* trigraph handling while collecting the symbol */
155 if(replace_trigraph(this))
158 put_back(this, this->c);
/* terminate the collected string */
167 obstack_1grow(&symbol_obstack, '\0');
169 string = obstack_finish(&symbol_obstack);
170 symbol = symbol_table_insert(string);
/* NOTE(review): the condition choosing between the next two assignments is
 * not visible here -- presumably symbols with a predefined ID are keywords. */
173 token->type = symbol->ID;
175 token->type = T_IDENTIFIER;
177 token->v.symbol = symbol;
/* the table returned a symbol backed by a different string, so the copy
 * built above is not needed and is released again */
179 if(symbol->string != string) {
180 obstack_free(&symbol_obstack, string);
/**
 * Lexes a preprocessor symbol: collects characters on symbol_obstack for as
 * long as is_ident_char() accepts them, then looks the string up with
 * preprocessor_symbol_table_find().
 * NOTE(review): the return statements are not visible here -- presumably the
 * result is derived from the symbol that was found.
 */
186 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
189 obstack_1grow(&symbol_obstack, this->c);
191 } while(is_ident_char(this->c));
/* terminate the collected string */
192 obstack_1grow(&symbol_obstack, '\0');
194 char *string = obstack_finish(&symbol_obstack);
195 symbol_t *symbol = preprocessor_symbol_table_find(string);
/* the string was only needed for the lookup, so it is released right away */
196 obstack_free(&symbol_obstack, string);
/**
 * Parses the digits of a hexadecimal integer literal into token.
 * On entry this->c must be the 'x' or 'X' of the prefix (asserted).
 * On success token->type is T_INTEGER and token->v.intvalue holds the value;
 * if no hex digit follows, an error is reported and token->type is T_ERROR.
 * The accumulation below has no overflow check.
 */
206 void parse_number_hex(lexer_t *this, token_t *token)
208 assert(this->c == 'x' || this->c == 'X');
/* at least one hex digit is required */
211 if (!isdigit(this->c) &&
212 !('A' <= this->c && this->c <= 'F') &&
213 !('a' <= this->c && this->c <= 'f')) {
214 parse_error(this, "premature end of hex number literal");
215 token->type = T_ERROR;
/* accumulate digit by digit; letters map to the values 10..15 */
221 if (isdigit(this->c)) {
222 value = 16 * value + this->c - '0';
223 } else if ('A' <= this->c && this->c <= 'F') {
224 value = 16 * value + this->c - 'A' + 10;
225 } else if ('a' <= this->c && this->c <= 'f') {
226 value = 16 * value + this->c - 'a' + 10;
/* first character that is not a hex digit ends the literal */
228 token->type = T_INTEGER;
229 token->v.intvalue = value;
/**
 * Parses the digits of an octal integer literal into token.
 * On entry this->c must be the 'o' or 'O' of the prefix (asserted).
 * The result is stored as T_INTEGER in token->v.intvalue.
 */
237 void parse_number_oct(lexer_t *this, token_t *token)
239 assert(this->c == 'o' || this->c == 'O');
/* accumulate while the character is an octal digit */
244 if ('0' <= this->c && this->c <= '7') {
245 value = 8 * value + this->c - '0';
247 token->type = T_INTEGER;
248 token->v.intvalue = value;
/**
 * Parses a decimal integer literal into token.
 *
 * @param first_char  a digit that was already consumed by the caller; it
 *                    seeds the value.
 *
 * NOTE(review): parse_number() also calls this with first_char == 0. The
 * guard that keeps the assert below from firing in that case is not visible
 * here -- confirm it exists.
 */
256 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
260 assert(first_char >= '0' && first_char <= '9');
261 value = first_char - '0';
/* accumulate while the character is a decimal digit */
265 if (isdigit(this->c)) {
266 value = 10 * value + this->c - '0';
268 token->type = T_INTEGER;
269 token->v.intvalue = value;
/**
 * Entry point for integer literals. After a leading '0' the following
 * character selects the radix-specific parser (hexadecimal, octal, or
 * decimal with the '0' already consumed); any other start goes straight to
 * the decimal parser.
 */
277 void parse_number(lexer_t *this, token_t *token)
279 // TODO check for overflow
280 // TODO check for various invalid input sequences
282 if (this->c == '0') {
286 case 'x': parse_number_hex(this, token); break;
288 case 'O': parse_number_oct(this, token); break;
/* no radix prefix: the '0' just read is the first decimal digit */
289 default: parse_number_dec(this, token, '0');
/* literal does not start with '0': no digit has been consumed yet */
292 parse_number_dec(this, token, 0);
/**
 * Translates the character of an escape sequence into the character it
 * denotes and returns that value. Hexadecimal and octal escapes are
 * recognised but not implemented; they, an end of file, and unknown escape
 * characters are reported through parse_error().
 */
297 int parse_escape_sequence(lexer_t *this)
/* simple escapes: each maps to exactly one character */
304 case '"': return '"';
305 case '\'': return'\'';
309 case 'a': return '\a';
310 case 'b': return '\b';
311 case 'f': return '\f';
312 case 'n': return '\n';
313 case 'r': return '\r';
314 case 't': return '\t';
315 case 'v': return '\v';
316 case 'x': /* TODO parse hex number ... */
317 parse_error(this, "hex escape sequences not implemented yet");
/* An octal escape starts with one of the digit characters '0'..'7'. The
 * range has to be written with character constants; the integers 0..8 would
 * match control characters instead of digits. */
319 case '0' ... '7': /* TODO parse octal number ... */
320 parse_error(this, "octal escape sequences not implemented yet");
326 /* might be a trigraph */
328 if(replace_trigraph(this)) {
331 put_back(this, this->c);
336 parse_error(this, "reached end of file while parsing escape sequence");
339 parse_error(this, "unknown escape sequence");
/**
 * Lexes a string literal into token. On entry this->c must be the opening
 * double quote (asserted). The characters are collected on symbol_obstack,
 * escape sequences are translated with parse_escape_sequence(), and the
 * finished string is entered into this->stringset. On success token->type is
 * T_STRING_LITERAL and token->v.string points at the stored string; an
 * unterminated string is reported and yields T_ERROR.
 */
346 void parse_string_literal(lexer_t *this, token_t *token)
/* remembered so that the "string has no end" error can name the line where
 * the literal began (presumably passed to error_prefix_at below) */
348 unsigned start_linenr = this->source_position.linenr;
352 assert(this->c == '"');
358 obstack_1grow(&symbol_obstack, '?');
/* translate the escape sequence and store the resulting character */
366 int c = parse_escape_sequence(this);
367 obstack_1grow(&symbol_obstack, c);
371 error_prefix_at(this, this->source_position.input_name,
373 fprintf(stderr, "string has no end\n");
374 token->type = T_ERROR;
/* ordinary character: copied as is */
382 obstack_1grow(&symbol_obstack, this->c);
390 /* TODO: concatenate multiple strings separated by whitespace... */
392 /* add finishing 0 to the string */
393 obstack_1grow(&symbol_obstack, '\0');
394 string = obstack_finish(&symbol_obstack);
396 /* check if there is already a copy of the string */
397 result = strset_insert(&this->stringset, string);
/* the set handed back a different pointer, so our copy is released */
398 if(result != string) {
399 obstack_free(&symbol_obstack, string);
402 token->type = T_STRING_LITERAL;
403 token->v.string = result;
/* MATCH_NEWLINE(code): line-ending handling used by the lexing loops, which
 * invoke it as MATCH_NEWLINE(break;). The visible parts increment
 * this->source_position.linenr for the line endings it recognises.
 * NOTE(review): where `code` is expanded is not visible here -- presumably
 * after a line ending has been consumed. */
406 #define MATCH_NEWLINE(code) \
409 if(this->c == '\n') { \
412 this->source_position.linenr++; \
416 this->source_position.linenr++; \
/**
 * Lexes a character constant into token. On entry this->c must be the
 * opening single quote (asserted). The constant is delivered as T_INTEGER
 * with the character's value in token->v.intvalue. A newline, an end of
 * file, or a second character inside the constant is reported through
 * parse_error().
 */
420 void parse_character_constant(lexer_t *this, token_t *token)
422 assert(this->c == '\'');
440 parse_error(this, "newline while parsing character constant");
446 goto end_of_char_constant;
449 parse_error(this, "EOF while parsing character constant");
450 token->type = T_ERROR;
/* found_char != 0 means a character has already been stored */
454 if(found_char != 0) {
455 parse_error(this, "more than 1 characters in character "
457 goto end_of_char_constant;
459 found_char = this->c;
466 end_of_char_constant:
467 token->type = T_INTEGER;
468 token->v.intvalue = found_char;
/**
 * Skips the text of a multi-line comment. Trigraphs are run through
 * replace_trigraph() and line endings are counted via MATCH_NEWLINE. If the
 * input ends before the comment does, an error is written to stderr; the
 * line on which the comment started is remembered in start_linenr
 * (presumably for that error message -- the argument is not visible here).
 */
472 void skip_multiline_comment(lexer_t *this)
474 unsigned start_linenr = this->source_position.linenr;
490 if(replace_trigraph(this))
493 /* we don't put back the 2nd ? as the comment text is discarded
497 MATCH_NEWLINE(break;)
500 error_prefix_at(this, this->source_position.input_name,
502 fprintf(stderr, "at end of file while looking for comment end\n");
/**
 * Skips the text of a line comment. Trigraphs are run through
 * replace_trigraph(), and a '\n' increments the line counter.
 * NOTE(review): the loop structure and exit conditions are not visible here.
 */
512 void skip_line_comment(lexer_t *this)
521 if(replace_trigraph(this))
524 /* we don't put back the 2nd ? as the comment text is discarded
530 if(this->c == '\n') {
532 this->source_position.linenr++;
/**
 * Handles a preprocessor directive; called from lexer_next_token() with the
 * token to fill in. Only the skipping of leading blanks is visible here.
 */
549 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
552 /* skip whitespaces */
553 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
/**
 * Produces the next token while inside a preprocessor directive: skips
 * blanks, tabs and carriage returns, then dispatches on the current
 * character. Only the hand-off to parse_symbol() is visible here.
 */
558 void preprocessor_next_token(lexer_t *this, token_t *token)
560 /* skip whitespaces */
561 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
569 parse_symbol(this, token);
/**
 * Produces the next token of the input in *token.
 *
 * Dispatches on the current character: symbols, numbers, string literals and
 * character constants go to their parse_* helpers; operators are recognised
 * with the MAYBE/ELSE macro family defined inside the function; comments are
 * skipped, after which the function calls itself to deliver the token that
 * follows; '#' leads to parse_preprocessor_directive(). A character that
 * matches nothing is reported and yields T_ERROR.
 */
573 void lexer_next_token(lexer_t *this, token_t *token)
584 MATCH_NEWLINE(break;)
589 parse_symbol(this, token);
593 parse_number(this, token);
597 parse_string_literal(this, token);
601 parse_character_constant(this, token);
/* backslash handling: a following '\n' is counted as a line ending; a stray
 * backslash is reported as an error */
606 if(this->c == '\n') {
608 this->source_position.linenr++;
611 parse_error(this, "unexpected '\\' found");
612 token->type = T_ERROR;
/* Operator helpers. MAYBE(ch, set_type) and ELSE(set_type) both assign
 * set_type to token->type; presumably MAYBE does so only when the next
 * character equals ch and ELSE is the fallback -- the tests themselves are
 * not visible here. ELSE_CODE also counts '\n' line endings. */
616 #define MAYBE_PROLOG \
621 #define MAYBE(ch, set_type) \
624 token->type = set_type; \
627 #define ELSE_CODE(code) \
634 if(this->c == '\n') { \
636 this->source_position.linenr++; \
643 } /* end of while(1) */ \
646 #define ELSE(set_type) \
648 token->type = set_type; \
656 MAYBE('.', T_DOTDOTDOT)
658 put_back(this, this->c);
667 MAYBE('=', T_ANDEQUAL)
671 MAYBE('=', T_ASTERISKEQUAL)
675 MAYBE('+', T_PLUSPLUS)
676 MAYBE('=', T_PLUSEQUAL)
680 MAYBE('-', T_MINUSMINUS)
681 MAYBE('=', T_MINUSEQUAL)
685 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
689 MAYBE('=', T_SLASHEQUAL)
/* comments produce no token: skip the comment, then lex what follows */
692 skip_multiline_comment(this);
693 lexer_next_token(this, token);
697 skip_line_comment(this);
698 lexer_next_token(this, token);
703 MAYBE('>', T_PERCENTGREATER)
704 MAYBE('=', T_PERCENTEQUAL)
709 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
711 put_back(this, this->c);
713 token->type = T_PERCENTCOLON;
720 MAYBE(':', T_LESSCOLON)
721 MAYBE('%', T_LESSPERCENT)
724 MAYBE('=', T_LESSLESSEQUAL)
731 MAYBE('=', T_GREATERGREATEREQUAL)
732 ELSE(T_GREATERGREATER)
736 MAYBE('=', T_CARETEQUAL)
740 MAYBE('=', T_PIPEEQUAL)
741 MAYBE('|', T_PIPEPIPE)
745 MAYBE('>', T_COLONGREATER)
749 MAYBE('=', T_EQUALEQUAL)
753 MAYBE('#', T_HASHHASH)
756 parse_preprocessor_directive(this, token);
766 /* just a simple ? */
771 /* might be a trigraph */
773 if(replace_trigraph(this)) {
776 put_back(this, this->c);
/* the character value itself is used as the token type */
790 token->type = this->c;
/* no rule matched the current character */
801 fprintf(stderr, "unknown character '%c' found\n", this->c);
802 token->type = T_ERROR;
/**
 * Initialises a lexer for reading from stream.
 *
 * @param this        lexer to initialise; it is zeroed completely first
 * @param stream      input the lexer reads from
 * @param input_name  name stored in the source position (it is what
 *                    error_prefix() passes on as the input name)
 */
808 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
/* start from an all-zero state; the fields below are then set explicitly */
810 memset(this, 0, sizeof(this[0]));
812 this->input = stream;
814 this->source_position.linenr = 0;
815 this->source_position.input_name = input_name;
816 strset_init(&this->stringset);
818 /* we place a virtual '\n' at the beginning so the lexer knows we're at the
819 * beginning of a line */
/**
 * Counterpart to lexer_init().
 * NOTE(review): the body is not visible here -- presumably it releases what
 * lexer_init() set up (e.g. this->stringset); confirm.
 */
823 void lexer_destroy(lexer_t *this)
/**
 * Debugging aid: prints a source position as "<input_name>:<linenr>" plus a
 * newline to stdout. Marked unused so that the compiler does not warn when
 * nothing calls it.
 * NOTE(review): linenr is handled as unsigned elsewhere in this file
 * (error_prefix_at takes it as unsigned); if the field is unsigned, the
 * conversion below should be %u -- confirm against the struct declaration.
 */
828 static __attribute__((unused))
829 void dbg_pos(const source_position_t source_position)
831 fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);