5 #include "symbol_table_t.h"
/* Print the diagnostic prefix "<input_name>:<linenr>: Error: " to stderr
 * for an explicitly given position.
 *
 * input_name  name of the input being lexed, printed verbatim
 * linenr      line number to report; it is unsigned, hence the %u conversion
 *
 * The lexer argument is not used by the visible statement. */
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
20 fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
/* Print the error prefix for the lexer's current position by forwarding
 * this->source_position.input_name and this->source_position.linenr to
 * error_prefix_at(). */
24 void error_prefix(lexer_t *this)
26 error_prefix_at(this, this->source_position.input_name,
27 this->source_position.linenr);
/* Report a lexing error: writes msg followed by a newline to stderr.
 * NOTE(review): the position prefix is presumably printed by a statement
 * between the signature and the fprintf that is not visible here - confirm. */
31 void parse_error(lexer_t *this, const char *msg)
34 fprintf(stderr, "%s\n", msg);
/* Load the next input character into this->c.
 *
 * When bufpos has reached bufend the buffer is refilled: fread() reads into
 * the area starting MAX_PUTBACK bytes into this->buf (leaving that many
 * bytes free in front of the data), and bufpos/bufend are reset to delimit
 * the s bytes that were read.
 *
 * NOTE(review): how a short or zero-length read (end of file) is handled and
 * where bufpos is advanced are not visible in this excerpt - confirm.
 * NOTE(review): the printf is a debug trace; presumably it is compiled
 * conditionally - confirm. */
38 void next_char(lexer_t *this)
41 if(this->bufpos >= this->bufend) {
42 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43 sizeof(this->buf) - MAX_PUTBACK, this->input);
48 this->bufpos = this->buf + MAX_PUTBACK;
49 this->bufend = this->buf + MAX_PUTBACK + s;
51 this->c = *(this->bufpos);
53 printf("nchar '%c'\n", this->c);
/* Push character c back into the input buffer.
 *
 * The target slot is the byte immediately before bufpos; the assert guards
 * against stepping in front of the start of this->buf.
 *
 * NOTE(review): the store of c through p and the update of bufpos are on
 * lines not visible in this excerpt - confirm.
 * NOTE(review): the printf is a debug trace; presumably it is compiled
 * conditionally - confirm. */
58 void put_back(lexer_t *this, int c)
60 char *p = (char*) this->bufpos - 1;
62 assert(p >= this->buf);
66 printf("putback '%c'\n", c);
/* Try to interpret the current character as the final character of a
 * trigraph and, if it is one, replace this->c by the character it denotes.
 *
 * MATCH_TRIGRAPH(ch, replacement) is a local helper that assigns replacement
 * to this->c for trigraph character ch. The nine uses below map
 * '=' to '#', '(' to '[', '/' to a backslash, ')' to ']', the apostrophe
 * to '^', '<' to '{', '!' to '|', '>' to '}' and '-' to '~'.
 *
 * Callers test the int result for truth to decide whether a replacement
 * took place.
 * NOTE(review): the return statements are not visible here - confirm the
 * exact return values. */
71 int replace_trigraph(lexer_t *this)
73 #define MATCH_TRIGRAPH(ch,replacement) \
75 this->c = replacement; \
79 MATCH_TRIGRAPH('=', '#')
80 MATCH_TRIGRAPH('(', '[')
81 MATCH_TRIGRAPH('/', '\\')
82 MATCH_TRIGRAPH(')', ']')
83 MATCH_TRIGRAPH('\'', '^')
84 MATCH_TRIGRAPH('<', '{')
85 MATCH_TRIGRAPH('!', '|')
86 MATCH_TRIGRAPH('>', '}')
87 MATCH_TRIGRAPH('-', '~')
/* SKIP_TRIGRAPHS(no_trigraph_code): helper for code that has just seen a
 * question mark.
 *
 * If the current character is not another question mark it is put back.
 * Otherwise replace_trigraph() is tried; the visible lines also put back a
 * question mark and the current character.
 * NOTE(review): presumably that second put-back is the path where no
 * trigraph matched, and no_trigraph_code runs on the non-trigraph paths;
 * the lines that would show this are not visible here - confirm. */
95 #define SKIP_TRIGRAPHS(no_trigraph_code) \
98 if(this->c != '?') { \
99 put_back(this, this->c); \
104 if(replace_trigraph(this)) \
106 put_back(this, '?'); \
107 put_back(this, this->c); \
/* EAT_NEWLINE(newline_code): recognise a line ending at the current
 * character.
 *
 * A carriage return (with a check for a following line feed) and a bare
 * line feed are both handled, and each visible branch increments
 * this->source_position.linenr.
 * NOTE(review): where newline_code is expanded is on lines not visible
 * here - confirm. */
111 #define EAT_NEWLINE(newline_code) \
112 if(this->c == '\r') { \
114 if(this->c == '\n') \
116 this->source_position.linenr++; \
118 } else if(this->c == '\n') { \
120 this->source_position.linenr++; \
/* Lex an identifier or keyword and store it in *token.
 *
 * Characters are appended to symbol_obstack (replace_trigraph() is consulted
 * while scanning), the string is NUL-terminated and finished, and the result
 * is interned with symbol_table_insert(). token->type becomes either
 * symbol->ID or T_IDENTIFIER and token->v.symbol receives the symbol.
 * If the table returned a symbol whose string is not the one just built,
 * the freshly built copy is released from the obstack again.
 *
 * NOTE(review): the condition choosing between symbol->ID and T_IDENTIFIER
 * is on a line not visible here; presumably a non-zero ID marks a keyword -
 * confirm. */
125 void parse_symbol(lexer_t *this, token_t *token)
130 obstack_1grow(&symbol_obstack, this->c);
143 obstack_1grow(&symbol_obstack, this->c);
150 put_back(this, this->c);
155 if(replace_trigraph(this))
158 put_back(this, this->c);
167 obstack_1grow(&symbol_obstack, '\0');
169 string = obstack_finish(&symbol_obstack);
170 symbol = symbol_table_insert(string);
173 token->type = symbol->ID;
175 token->type = T_IDENTIFIER;
177 token->v.symbol = symbol;
179 if(symbol->string != string) {
180 obstack_free(&symbol_obstack, string);
/* Read a preprocessor symbol and classify it.
 *
 * Characters are collected on symbol_obstack for as long as is_ident_char()
 * accepts the current character, the string is NUL-terminated, and it is
 * looked up with preprocessor_symbol_table_find(). The temporary string is
 * freed from the obstack right after the lookup.
 *
 * NOTE(review): how the lookup result is turned into the returned
 * preprocessor_token_type_t is on lines not visible here - confirm. */
186 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
189 obstack_1grow(&symbol_obstack, this->c);
191 } while(is_ident_char(this->c));
192 obstack_1grow(&symbol_obstack, '\0');
194 char *string = obstack_finish(&symbol_obstack);
195 symbol_t *symbol = preprocessor_symbol_table_find(string);
196 obstack_free(&symbol_obstack, string);
/* Parse a hexadecimal integer literal into *token.
 *
 * On entry this->c must be the 'x' or 'X' of the prefix (asserted). If the
 * character examined next is not a hex digit, the error
 * "premature end of hex number literal" is reported and the token type is
 * set to T_ERROR. Otherwise digits 0-9, A-F and a-f are accumulated in
 * base 16 and the token becomes T_INTEGER with v.intvalue holding the value.
 *
 * NOTE(review): there is no overflow check on value (see the TODO in
 * parse_number).
 * NOTE(review): isdigit() receives this->c directly; if c can hold a
 * negative value other than EOF this is undefined - confirm the type and
 * range of c. */
206 void parse_number_hex(lexer_t *this, token_t *token)
208 assert(this->c == 'x' || this->c == 'X');
211 if (!isdigit(this->c) &&
212 !('A' <= this->c && this->c <= 'F') &&
213 !('a' <= this->c && this->c <= 'f')) {
214 parse_error(this, "premature end of hex number literal");
215 token->type = T_ERROR;
221 if (isdigit(this->c)) {
222 value = 16 * value + this->c - '0';
223 } else if ('A' <= this->c && this->c <= 'F') {
224 value = 16 * value + this->c - 'A' + 10;
225 } else if ('a' <= this->c && this->c <= 'f') {
226 value = 16 * value + this->c - 'a' + 10;
228 token->type = T_INTEGER;
229 token->v.intvalue = value;
/* Parse an octal integer literal into *token.
 *
 * On entry this->c must be 'o' or 'O' (asserted), i.e. this lexer expects
 * an explicit letter prefix for octal numbers. Digits 0-7 are accumulated
 * in base 8 and the token becomes T_INTEGER with v.intvalue holding the
 * value.
 *
 * NOTE(review): there is no overflow check on value (see the TODO in
 * parse_number). */
237 void parse_number_oct(lexer_t *this, token_t *token)
239 assert(this->c == 'o' || this->c == 'O');
244 if ('0' <= this->c && this->c <= '7') {
245 value = 8 * value + this->c - '0';
247 token->type = T_INTEGER;
248 token->v.intvalue = value;
/* Parse a decimal integer literal into *token.
 *
 * first_char is a digit that the caller has already consumed; it seeds the
 * value. Further digits are accumulated in base 10 and the token becomes
 * T_INTEGER with v.intvalue holding the value.
 *
 * NOTE(review): parse_number also calls this function with first_char == 0,
 * which would trip the assert unless the assert and the seeding are guarded
 * by a condition on a line not visible here - confirm.
 * NOTE(review): there is no overflow check on value (see the TODO in
 * parse_number). */
256 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
260 assert(first_char >= '0' && first_char <= '9');
261 value = first_char - '0';
265 if (isdigit(this->c)) {
266 value = 10 * value + this->c - '0';
/* Parse an integer literal into *token, choosing the radix from its prefix.
 *
 * When the literal starts with '0' the following character selects the
 * parser: 'x' goes to parse_number_hex(), 'O' goes to parse_number_oct(),
 * and anything else is parsed as decimal with '0' as the already consumed
 * first digit. A literal that does not start with '0' is parsed as decimal
 * with first_char 0, meaning no digit has been consumed yet.
 *
 * NOTE(review): the case labels for 'X' and 'o' are presumably on lines not
 * visible here - confirm. */
277 void parse_number(lexer_t *this, token_t *token)
279 // TODO check for overflow
280 // TODO check for various invalid input sequences
282 if (this->c == '0') {
286 case 'x': parse_number_hex(this, token); break;
288 case 'O': parse_number_oct(this, token); break;
289 default: parse_number_dec(this, token, '0');
292 parse_number_dec(this, token, 0);
/* Translate the escape sequence that follows a backslash and return the
 * character it denotes.
 *
 * The simple escapes (double quote, apostrophe, a, b, f, n, r, t, v) map
 * directly to their character. Hex and octal escapes are not implemented
 * yet and only report an error. The octal case matches the digit characters
 * '0' through '7' using the GNU case-range extension; these are character
 * constants, not the integer values 0..8, which would match control
 * characters instead of digits. A question mark may start a trigraph and is
 * passed through replace_trigraph(). End of file and any other character
 * report an error.
 *
 * NOTE(review): the values returned on the error paths are on lines not
 * visible here - confirm. */
297 int parse_escape_sequence(lexer_t *this)
304 case '"': return '"';
305 case '\'': return'\'';
309 case 'a': return '\a';
310 case 'b': return '\b';
311 case 'f': return '\f';
312 case 'n': return '\n';
313 case 'r': return '\r';
314 case 't': return '\t';
315 case 'v': return '\v';
316 case 'x': /* TODO parse hex number ... */
317 parse_error(this, "hex escape sequences not implemented yet");
319 case '0' ... '7': /* TODO parse octal number ... */
320 parse_error(this, "octal escape sequences not implemented yet");
326 /* might be a trigraph */
328 if(replace_trigraph(this)) {
331 put_back(this, this->c);
336 parse_error(this, "reached end of file while parsing escape sequence");
339 parse_error(this, "unknown escape sequence");
/* Lex a string literal into *token.
 *
 * On entry this->c must be the opening double quote (asserted). The line
 * number at the start of the literal is remembered in start_linenr.
 * Characters are collected on symbol_obstack; escape sequences are
 * translated by parse_escape_sequence(). If the input ends first, the error
 * "string has no end" is printed and the token type becomes T_ERROR.
 *
 * The finished, NUL-terminated string is inserted into this->stringset. If
 * the set returns a pointer other than the new string, the new copy is
 * freed from the obstack. The token becomes T_STRING_LITERAL with v.string
 * pointing at the pointer returned by the set.
 *
 * NOTE(review): the line-number argument of error_prefix_at() is on a line
 * not visible here; presumably it is start_linenr - confirm. */
346 void parse_string_literal(lexer_t *this, token_t *token)
348 unsigned start_linenr = this->source_position.linenr;
352 assert(this->c == '"');
358 obstack_1grow(&symbol_obstack, '?');
366 int c = parse_escape_sequence(this);
367 obstack_1grow(&symbol_obstack, c);
371 error_prefix_at(this, this->source_position.input_name,
373 fprintf(stderr, "string has no end\n");
374 token->type = T_ERROR;
382 obstack_1grow(&symbol_obstack, this->c);
390 /* TODO: concatenate multiple strings separated by whitespace... */
392 /* add finishing 0 to the string */
393 obstack_1grow(&symbol_obstack, '\0');
394 string = obstack_finish(&symbol_obstack);
396 /* check if there is already a copy of the string */
397 result = strset_insert(&this->stringset, string);
398 if(result != string) {
399 obstack_free(&symbol_obstack, string);
402 token->type = T_STRING_LITERAL;
403 token->v.string = result;
/* MATCH_NEWLINE(code): helper for switch statements that recognises a line
 * ending and counts it in this->source_position.linenr. The visible lines
 * show one increment guarded by a test for a line feed and a second
 * increment on another path.
 * NOTE(review): the case labels and the place where code is expanded are on
 * lines not visible here - confirm. */
406 #define MATCH_NEWLINE(code) \
409 if(this->c == '\n') { \
412 this->source_position.linenr++; \
416 this->source_position.linenr++; \
/* Lex a character constant into *token.
 *
 * On entry this->c must be the opening apostrophe (asserted). The constant's
 * character is kept in found_char. A newline inside the constant and a
 * second character are reported as errors and then jump to the common exit,
 * which still produces a T_INTEGER token whose v.intvalue is found_char.
 * End of file is reported as an error and sets the token type to T_ERROR.
 *
 * NOTE(review): whether the end-of-file path returns before reaching the
 * common exit is on a line not visible here - confirm. */
420 void parse_character_constant(lexer_t *this, token_t *token)
422 assert(this->c == '\'');
440 parse_error(this, "newline while parsing character constant");
446 goto end_of_char_constant;
449 parse_error(this, "EOF while parsing character constant");
450 token->type = T_ERROR;
454 if(found_char != 0) {
455 parse_error(this, "more than 1 characters in character "
457 goto end_of_char_constant;
459 found_char = this->c;
466 end_of_char_constant:
467 token->type = T_INTEGER;
468 token->v.intvalue = found_char;
/* Skip the remainder of a block comment.
 *
 * The line number where the comment started is remembered in start_linenr.
 * Trigraphs inside the comment are still run through replace_trigraph(),
 * newlines are counted through MATCH_NEWLINE (which also clears had_star),
 * and reaching the end of the input prints
 * "at end of file while looking for comment end".
 *
 * NOTE(review): the line-number argument of error_prefix_at() is on a line
 * not visible here; presumably it is start_linenr - confirm. */
472 void skip_multiline_comment(lexer_t *this)
474 unsigned start_linenr = this->source_position.linenr;
505 if(replace_trigraph(this))
507 put_back(this, this->c);
510 /* we don't put back the 2nd ? as the comment text is discarded
514 MATCH_NEWLINE(had_star = 0; break;)
517 error_prefix_at(this, this->source_position.input_name,
519 fprintf(stderr, "at end of file while looking for comment end\n");
/* Skip the remainder of a line comment.
 *
 * Trigraphs are still run through replace_trigraph() while skipping, and a
 * line feed increments this->source_position.linenr.
 * NOTE(review): the loop structure and the termination condition are on
 * lines not visible here - confirm. */
530 void skip_line_comment(lexer_t *this)
539 if(replace_trigraph(this))
542 /* we don't put back the 2nd ? as the comment text is discarded
548 if(this->c == '\n') {
550 this->source_position.linenr++;
/* Handle a preprocessor directive. The visible part first skips blanks,
 * tabs and carriage returns.
 * NOTE(review): the rest of the body, including how result_token is filled,
 * is not visible here - confirm. */
567 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
570 /* skip whitespaces */
571 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
/* Fetch the next token while inside a preprocessor directive: blanks, tabs
 * and carriage returns are skipped first, and symbols are lexed with
 * parse_symbol().
 * NOTE(review): the dispatch that leads to parse_symbol() and any other
 * token kinds are on lines not visible here - confirm. */
576 void preprocessor_next_token(lexer_t *this, token_t *token)
578 /* skip whitespaces */
579 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
587 parse_symbol(this, token);
/* Produce the next token of the input in *token.
 *
 * The function dispatches on the current character:
 *  - newlines are counted through MATCH_NEWLINE;
 *  - identifiers, numbers, string literals and character constants are
 *    handed to parse_symbol(), parse_number(), parse_string_literal() and
 *    parse_character_constant();
 *  - a backslash followed by a line feed counts a line; any other backslash
 *    reports "unexpected '\\' found" and yields T_ERROR;
 *  - operators are recognised with the local helper macros: MAYBE(ch,
 *    set_type) selects set_type for a particular next character, ELSE(
 *    set_type) supplies the fallback type, and ELSE_CODE(code) runs
 *    arbitrary fallback code;
 *  - block and line comments are skipped, after which the function calls
 *    itself to obtain the token that follows the comment;
 *  - a '#' not followed by another '#' starts a preprocessor directive;
 *  - a question mark may start a trigraph and goes through
 *    replace_trigraph();
 *  - some single-character tokens use the character value itself as the
 *    token type;
 *  - an unknown character is reported and yields T_ERROR.
 *
 * NOTE(review): the case labels selecting these branches are on lines not
 * visible here - confirm the exact characters for each branch.
 * NOTE(review): the self-calls after a comment recurse once per consecutive
 * comment; a loop would avoid unbounded stack growth - confirm whether that
 * matters for expected inputs. */
591 void lexer_next_token(lexer_t *this, token_t *token)
602 MATCH_NEWLINE(break;)
607 parse_symbol(this, token);
611 parse_number(this, token);
615 parse_string_literal(this, token);
619 parse_character_constant(this, token);
624 if(this->c == '\n') {
626 this->source_position.linenr++;
629 parse_error(this, "unexpected '\\' found");
630 token->type = T_ERROR;
634 #define MAYBE_PROLOG \
639 #define MAYBE(ch, set_type) \
642 token->type = set_type; \
645 #define ELSE_CODE(code) \
652 if(this->c == '\n') { \
654 this->source_position.linenr++; \
661 } /* end of while(1) */ \
664 #define ELSE(set_type) \
666 token->type = set_type; \
674 MAYBE('.', T_DOTDOTDOT)
676 put_back(this, this->c);
685 MAYBE('=', T_ANDEQUAL)
689 MAYBE('=', T_ASTERISKEQUAL)
693 MAYBE('+', T_PLUSPLUS)
694 MAYBE('=', T_PLUSEQUAL)
698 MAYBE('-', T_MINUSMINUS)
699 MAYBE('=', T_MINUSEQUAL)
703 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
707 MAYBE('=', T_SLASHEQUAL)
710 skip_multiline_comment(this);
711 lexer_next_token(this, token);
715 skip_line_comment(this);
716 lexer_next_token(this, token);
721 MAYBE('>', T_PERCENTGREATER)
722 MAYBE('=', T_PERCENTEQUAL)
727 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
729 put_back(this, this->c);
731 token->type = T_PERCENTCOLON;
738 MAYBE(':', T_LESSCOLON)
739 MAYBE('%', T_LESSPERCENT)
742 MAYBE('=', T_LESSLESSEQUAL)
749 MAYBE('=', T_GREATERGREATEREQUAL)
750 ELSE(T_GREATERGREATER)
754 MAYBE('=', T_CARETEQUAL)
758 MAYBE('=', T_PIPEEQUAL)
759 MAYBE('|', T_PIPEPIPE)
763 MAYBE('>', T_COLONGREATER)
767 MAYBE('=', T_EQUALEQUAL)
771 MAYBE('#', T_HASHHASH)
774 parse_preprocessor_directive(this, token);
784 /* just a simple ? */
789 /* might be a trigraph */
791 if(replace_trigraph(this)) {
794 put_back(this, this->c);
808 token->type = this->c;
819 fprintf(stderr, "unknown character '%c' found\n", this->c);
820 token->type = T_ERROR;
/* Initialise a lexer for reading from stream.
 *
 * The whole lexer_t is zeroed first, then the input stream and input name
 * are stored, the line counter is set to 0 and the string set used for
 * string literals is initialised.
 *
 * stream      input to read from; stored, not opened or closed here
 * input_name  name used in diagnostics; the pointer is stored as is, so it
 *             must stay valid for as long as the lexer is used */
826 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
828 memset(this, 0, sizeof(this[0]));
830 this->input = stream;
832 this->source_position.linenr = 0;
833 this->source_position.input_name = input_name;
834 strset_init(&this->stringset);
836 /* we place a virtual '\n' at the beginning so the lexer knows we're at the
837 * beginning of a line */
/* Tear down a lexer previously set up with lexer_init().
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * releases the string set initialised by lexer_init() - confirm. */
841 void lexer_destroy(lexer_t *this)
/* Debug helper: print a source position as "input_name:linenr" followed by
 * a newline to stdout. Marked unused so that builds which never call it do
 * not warn.
 * NOTE(review): linenr is handled as unsigned elsewhere in this file; if the
 * field is unsigned the conversion should be %u rather than %d - confirm
 * the field type in source_position_t. */
846 static __attribute__((unused))
847 void dbg_pos(const source_position_t source_position)
849 fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);