5 #include "symbol_table_t.h"
/*
 * Write the diagnostic prefix `<input_name>:<linenr>: Error: ` to stderr.
 * No newline is written, so the actual message is expected to follow on the
 * same line.
 *
 * NOTE(review): linenr is declared unsigned but is formatted with %d; %u
 * would match the argument type.
 */
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
20 fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
/*
 * Write the diagnostic prefix for the current source position of the lexer:
 * forwards this->source_position.input_name and .linenr to error_prefix_at().
 */
24 void error_prefix(lexer_t *this)
26 error_prefix_at(this, this->source_position.input_name,
27 this->source_position.linenr);
/*
 * Report a lexing error: msg is written to stderr followed by a newline.
 * Because the newline is appended here, msg itself should not end in one.
 */
31 void parse_error(lexer_t *this, const char *msg)
34 fprintf(stderr, "%s\n", msg);
/*
 * Load the next input character into this->c.
 *
 * Once the read position (bufpos) has reached the end of the buffered data
 * (bufend) the buffer is refilled with fread() from this->input.  The data is
 * read to this->buf + MAX_PUTBACK, so the first MAX_PUTBACK bytes of the
 * buffer stay free in front of it -- presumably the room put_back() needs to
 * step backwards; confirm against put_back().
 */
38 void next_char(lexer_t *this)
41 if(this->bufpos >= this->bufend) {
42 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43 sizeof(this->buf) - MAX_PUTBACK, this->input);
/* restart reading at the beginning of the freshly read data */
48 this->bufpos = this->buf + MAX_PUTBACK;
49 this->bufend = this->buf + MAX_PUTBACK + s;
51 this->c = *(this->bufpos);
/* trace line on stdout showing the character just read */
53 printf("nchar '%c'\n", this->c);
/*
 * Put-back helper for character c.  It works on the byte directly in front
 * of the current read position (p = bufpos - 1) and asserts that p does not
 * fall in front of the start of this->buf.  A trace line naming the
 * character is written to stdout.
 */
58 void put_back(lexer_t *this, int c)
60 char *p = (char*) this->bufpos - 1;
62 assert(p >= this->buf);
66 printf("putback '%c'\n", c);
/*
 * Trigraph replacement.
 *
 * MATCH_TRIGRAPH(ch, replacement) stores replacement in this->c.  The table
 * below maps the nine trigraph endings to the characters they stand for:
 * equals to hash, open paren to open bracket, slash to backslash, close paren
 * to close bracket, apostrophe to caret, less-than to open brace, exclamation
 * mark to pipe, greater-than to close brace and minus to tilde.
 *
 * Call sites test the int result as a boolean; presumably non-zero means a
 * trigraph was replaced -- confirm.
 */
71 int replace_trigraph(lexer_t *this)
73 #define MATCH_TRIGRAPH(ch,replacement) \
75 this->c = replacement; \
79 MATCH_TRIGRAPH('=', '#')
80 MATCH_TRIGRAPH('(', '[')
81 MATCH_TRIGRAPH('/', '\\')
82 MATCH_TRIGRAPH(')', ']')
83 MATCH_TRIGRAPH('\'', '^')
84 MATCH_TRIGRAPH('<', '{')
85 MATCH_TRIGRAPH('!', '|')
86 MATCH_TRIGRAPH('>', '}')
87 MATCH_TRIGRAPH('-', '~')
/*
 * Lex a symbol into *token.
 *
 * The characters of the symbol are accumulated on symbol_obstack; a newline
 * met while scanning advances the line counter, and replace_trigraph() and
 * put_back() are used along the way.  The NUL-terminated string is then
 * handed to symbol_table_insert().  token->type becomes either symbol->ID or
 * T_IDENTIFIER and token->v.symbol is set to the returned table entry.
 */
96 void parse_symbol(lexer_t *this, token_t *token)
101 obstack_1grow(&symbol_obstack, this->c);
108 if(this->c == '\n') {
110 this->source_position.linenr++;
117 obstack_1grow(&symbol_obstack, this->c);
124 put_back(this, this->c);
129 if(replace_trigraph(this))
132 put_back(this, this->c);
141 obstack_1grow(&symbol_obstack, '\0');
143 string = obstack_finish(&symbol_obstack);
144 symbol = symbol_table_insert(string);
147 token->type = symbol->ID;
149 token->type = T_IDENTIFIER;
151 token->v.symbol = symbol;
/* the returned symbol does not use our freshly built string: release it */
153 if(symbol->string != string) {
154 obstack_free(&symbol_obstack, string);
/*
 * Lex a preprocessor symbol and return its preprocessor_token_type_t.
 *
 * Characters are collected on symbol_obstack for as long as is_ident_char()
 * accepts this->c.  The NUL-terminated string is looked up with
 * preprocessor_symbol_table_find() and freed again right after the lookup,
 * so it is only a temporary.
 */
160 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
163 obstack_1grow(&symbol_obstack, this->c);
165 } while(is_ident_char(this->c));
166 obstack_1grow(&symbol_obstack, '\0');
168 char *string = obstack_finish(&symbol_obstack);
169 symbol_t *symbol = preprocessor_symbol_table_find(string);
170 obstack_free(&symbol_obstack, string);
/*
 * Lex a hexadecimal integer literal into *token.
 *
 * On entry this->c must be the x or X of the prefix (asserted).  If the
 * character checked next is not a hex digit, the error `premature end of hex
 * number literal` is reported and token->type is set to T_ERROR.  Hex digits
 * (0-9, A-F, a-f) are folded in as value = 16 * value + digit, and the result
 * is delivered as a T_INTEGER token in token->v.intvalue.
 * No overflow check is visible in the accumulation.
 */
180 void parse_number_hex(lexer_t *this, token_t *token)
182 assert(this->c == 'x' || this->c == 'X');
185 if (!isdigit(this->c) &&
186 !('A' <= this->c && this->c <= 'F') &&
187 !('a' <= this->c && this->c <= 'f')) {
188 parse_error(this, "premature end of hex number literal");
189 token->type = T_ERROR;
195 if (isdigit(this->c)) {
196 value = 16 * value + this->c - '0';
197 } else if ('A' <= this->c && this->c <= 'F') {
198 value = 16 * value + this->c - 'A' + 10;
199 } else if ('a' <= this->c && this->c <= 'f') {
200 value = 16 * value + this->c - 'a' + 10;
202 token->type = T_INTEGER;
203 token->v.intvalue = value;
/*
 * Lex an octal integer literal into *token.
 *
 * On entry this->c must be the o or O of the prefix (asserted).  Digits in
 * the range 0-7 are folded in as value = 8 * value + digit and the result is
 * delivered as a T_INTEGER token in token->v.intvalue.
 */
211 void parse_number_oct(lexer_t *this, token_t *token)
213 assert(this->c == 'o' || this->c == 'O');
218 if ('0' <= this->c && this->c <= '7') {
219 value = 8 * value + this->c - '0';
221 token->type = T_INTEGER;
222 token->v.intvalue = value;
/*
 * Lex a decimal integer literal into *token.
 *
 * first_char is a digit that was already consumed by the caller; it is
 * asserted to be a decimal digit character and seeds the value.  Further
 * digits accepted by isdigit() are folded in as value = 10 * value + digit
 * and the result is delivered as a T_INTEGER token in token->v.intvalue.
 *
 * NOTE(review): parse_number() also calls this function with first_char == 0,
 * which the assert below would reject -- presumably the assert and the
 * seeding are guarded for that case; confirm.
 */
230 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
234 assert(first_char >= '0' && first_char <= '9');
235 value = first_char - '0';
239 if (isdigit(this->c)) {
240 value = 10 * value + this->c - '0';
242 token->type = T_INTEGER;
243 token->v.intvalue = value;
/*
 * Lex a numeric literal into *token by dispatching on its prefix.
 *
 * When the literal starts with the digit zero, the visible cases route a
 * lower-case x to parse_number_hex() and an upper-case O to
 * parse_number_oct(); the default hands the already consumed zero digit to
 * parse_number_dec().  Literals that do not start with zero go to
 * parse_number_dec() with first_char == 0.
 */
251 void parse_number(lexer_t *this, token_t *token)
253 // TODO check for overflow
254 // TODO check for various invalid input sequences
256 if (this->c == '0') {
260 case 'x': parse_number_hex(this, token); break;
262 case 'O': parse_number_oct(this, token); break;
263 default: parse_number_dec(this, token, '0');
266 parse_number_dec(this, token, 0);
/*
 * Translate an escape sequence and return the character it denotes.
 *
 * Handled directly: double quote, apostrophe and the letters a b f n r t v.
 * When a newline is seen the line counter is advanced and the function calls
 * itself again; the same restart happens after replace_trigraph() reports a
 * replacement, otherwise the character is put back.
 * Hex and octal escapes are not implemented and only produce an error
 * message; end of file and unknown escapes are reported as errors as well.
 *
 * NOTE(review): the range `case 0 ... 8` matches the character codes 0 to 8,
 * not the digit characters that start an octal escape (zero through seven);
 * it also relies on the GNU case-range extension.
 * NOTE(review): the unknown-escape message ends in its own newline although
 * parse_error() already appends one.
 */
271 int parse_escape_sequence(lexer_t *this)
277 case '"': return '"';
278 case '\'': return'\'';
280 if(this->c == '\n') {
281 this->source_position.linenr++;
283 return parse_escape_sequence(this);
286 case 'a': return '\a';
287 case 'b': return '\b';
288 case 'f': return '\f';
289 case 'n': return '\n';
290 case 'r': return '\r';
291 case 't': return '\t';
292 case 'v': return '\v';
293 case 'x': /* TODO parse hex number ... */
294 parse_error(this, "hex escape sequences not implemented yet");
296 case 0 ... 8: /* TODO parse octal number ... */
297 parse_error(this, "octal escape sequences not implemented yet");
303 /* might be a trigraph */
305 if(replace_trigraph(this)) {
306 return parse_escape_sequence(this);
308 put_back(this, this->c);
313 parse_error(this, "reached end of file while parsing escape sequence");
316 parse_error(this, "unknown escape sequence\n");
/*
 * Helper macro for scanning past a possible trigraph; it operates on a
 * variable named this that must be in scope at the expansion site.
 * Visible steps: a character that is not a question mark is put back; after
 * the replace_trigraph() test a question mark and this->c are put back.
 * no_trigraph_code is presumably the code to run when no trigraph was
 * found -- confirm against the expansion sites.
 */
321 #define SKIP_TRIGRAPHS(no_trigraph_code) \
324 if(this->c != '?') { \
325 put_back(this, this->c); \
330 if(replace_trigraph(this)) \
332 put_back(this, '?'); \
333 put_back(this, this->c); \
/*
 * Lex a string literal into *token.
 *
 * On entry this->c must be the opening double quote (asserted).  The line
 * number at the start is remembered in start_linenr.  The characters of the
 * literal, including the results of parse_escape_sequence(), are collected
 * on symbol_obstack, and newlines advance the line counter.  A literal
 * without an end is reported as `string has no end` and yields T_ERROR.
 *
 * The finished, NUL-terminated string is interned with strset_insert() in
 * this->stringset.  The token is delivered as T_STRING_LITERAL with
 * token->v.string pointing at the interned string.
 */
338 void parse_string_literal(lexer_t *this, token_t *token)
340 unsigned start_linenr = this->source_position.linenr;
344 assert(this->c == '"');
350 obstack_1grow(&symbol_obstack, '?');
356 if(this->c == '\n') {
358 this->source_position.linenr++;
361 int c = parse_escape_sequence(this);
362 obstack_1grow(&symbol_obstack, c);
366 error_prefix_at(this, this->source_position.input_name,
368 fprintf(stderr, "string has no end\n");
369 token->type = T_ERROR;
377 obstack_1grow(&symbol_obstack, this->c);
385 /* TODO: concatenate multiple strings separated by whitespace... */
387 /* add finishing 0 to the string */
388 obstack_1grow(&symbol_obstack, '\0');
389 string = obstack_finish(&symbol_obstack);
391 /* check if there is already a copy of the string */
392 result = strset_insert(&this->stringset, string);
/* the set returned a different pointer: drop our freshly built copy */
393 if(result != string) {
394 obstack_free(&symbol_obstack, string);
397 token->type = T_STRING_LITERAL;
398 token->v.string = result;
/*
 * Lex a character constant into *token.
 *
 * On entry this->c must be the opening apostrophe (asserted).  The constant
 * is delivered as a T_INTEGER token whose value is a question mark, the
 * result of parse_escape_sequence() or the plain character in this->c,
 * depending on the branch taken.  A newline inside the constant is reported
 * as an error and still advances the line counter; end of file is reported
 * and yields T_ERROR.
 *
 * All branches meet at end_of_char_constant, where anything other than a
 * closing apostrophe is reported as `multibyte character constant`.
 */
402 void parse_character_constant(lexer_t *this, token_t *token)
404 assert(this->c == '\'');
410 token->type = T_INTEGER;
411 token->v.intvalue = '?';
412 goto end_of_char_constant;
417 if(this->c == '\n') {
418 this->source_position.linenr++;
421 token->type = T_INTEGER;
422 token->v.intvalue = parse_escape_sequence(this);
423 goto end_of_char_constant;
427 parse_error(this, "newline while parsing character constant");
428 this->source_position.linenr++;
429 goto end_of_char_constant;
432 parse_error(this, "EOF while parsing character constant");
433 token->type = T_ERROR;
437 token->type = T_INTEGER;
438 token->v.intvalue = this->c;
440 goto end_of_char_constant;
444 end_of_char_constant:
445 if(this->c != '\'') {
446 parse_error(this, "multibyte character constant");
/*
 * Skip over a multi-line comment.
 *
 * The line number at the start is remembered in start_linenr.  While
 * skipping, trigraphs are run through replace_trigraph() and the line counter
 * is advanced.  Reaching the end of the file first is reported on stderr as
 * `at end of file while looking for comment end`, prefixed via
 * error_prefix_at().
 */
453 void skip_multiline_comment(lexer_t *this)
455 unsigned start_linenr = this->source_position.linenr;
471 if(replace_trigraph(this))
474 /* we don't put back the 2nd ? as the comment text is discarded
479 this->source_position.linenr++;
483 error_prefix_at(this, this->source_position.input_name,
485 fprintf(stderr, "at end of file while looking for comment end\n");
// Skip over a line comment.
// While skipping, trigraphs are run through replace_trigraph(), and the line
// counter is advanced when a newline is seen.
495 void skip_line_comment(lexer_t *this)
504 if(replace_trigraph(this))
507 /* we don't put back the 2nd ? as the comment text is discarded
513 if(this->c == '\n') {
515 this->source_position.linenr++;
// Loop for as long as this->c is not a newline character.
// NOTE(review): the loop condition only tests for a newline; confirm that
// the loop body also terminates at end of input.
531 void eat_until_newline(lexer_t *this)
533 while(this->c != '\n') {
// Handle a preprocessor directive; result_token is the token slot passed in
// by the caller.  The visible part first skips blanks, tabs and carriage
// returns.
541 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
544 /* skip whitespace */
545 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
/*
 * Produce the next token while in preprocessor context.
 * Blanks, tabs and carriage returns are skipped first (a newline is not part
 * of the skipped set); symbols are lexed with parse_symbol().
 */
551 void preprocessor_next_token(lexer_t *this, token_t *token)
553 /* skip whitespace */
554 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
562 parse_symbol(this, token);
/*
 * Produce the next token of the input in *token.
 *
 * Dispatch as far as visible here:
 *  - symbols, numbers, string literals and character constants are handed to
 *    parse_symbol(), parse_number(), parse_string_literal() and
 *    parse_character_constant();
 *  - newlines advance this->source_position.linenr;
 *  - a backslash followed by a newline only advances the line counter, any
 *    other backslash is reported as unexpected and yields T_ERROR;
 *  - operators are recognised with the local helper macros: MAYBE(ch,
 *    set_type) and ELSE(set_type) assign set_type to token->type, and
 *    ELSE_CODE(code) contains the newline bookkeeping;
 *  - comments are skipped with skip_multiline_comment() or
 *    skip_line_comment(), after which the function calls itself to deliver
 *    the token that follows the comment;
 *  - a hash is either T_HASHHASH or goes to parse_preprocessor_directive();
 *  - a question mark may start a trigraph, which is tried with
 *    replace_trigraph(), otherwise the character is put back;
 *  - in one branch the character code itself becomes the token type
 *    (token->type = this->c);
 *  - anything else is reported as an unknown character and yields T_ERROR.
 */
566 void lexer_next_token(lexer_t *this, token_t *token)
577 this->source_position.linenr++;
584 parse_symbol(this, token);
588 parse_number(this, token);
592 parse_string_literal(this, token);
596 parse_character_constant(this, token);
601 if(this->c == '\n') {
603 this->source_position.linenr++;
606 parse_error(this, "unexpected '\\' found");
607 token->type = T_ERROR;
611 #define MAYBE_PROLOG \
616 #define MAYBE(ch, set_type) \
619 token->type = set_type; \
622 #define ELSE_CODE(code) \
629 if(this->c == '\n') { \
631 this->source_position.linenr++; \
638 } /* end of while(1) */ \
641 #define ELSE(set_type) \
643 token->type = set_type; \
651 MAYBE('.', T_DOTDOTDOT)
653 put_back(this, this->c);
662 MAYBE('=', T_ANDEQUAL)
666 MAYBE('=', T_ASTERISKEQUAL)
670 MAYBE('+', T_PLUSPLUS)
671 MAYBE('=', T_PLUSEQUAL)
675 MAYBE('-', T_MINUSMINUS)
676 MAYBE('=', T_MINUSEQUAL)
680 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
684 MAYBE('=', T_SLASHEQUAL)
687 skip_multiline_comment(this);
688 lexer_next_token(this, token);
692 skip_line_comment(this);
693 lexer_next_token(this, token);
698 MAYBE('>', T_PERCENTGREATER)
699 MAYBE('=', T_PERCENTEQUAL)
704 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
706 put_back(this, this->c);
708 token->type = T_PERCENTCOLON;
715 MAYBE(':', T_LESSCOLON)
716 MAYBE('%', T_LESSPERCENT)
719 MAYBE('=', T_LESSLESSEQUAL)
726 MAYBE('=', T_GREATERGREATEREQUAL)
727 ELSE(T_GREATERGREATER)
731 MAYBE('=', T_CARETEQUAL)
735 MAYBE('=', T_PIPEEQUAL)
736 MAYBE('|', T_PIPEPIPE)
740 MAYBE('>', T_COLONGREATER)
744 MAYBE('=', T_EQUALEQUAL)
748 MAYBE('#', T_HASHHASH)
752 parse_preprocessor_directive(this, token);
763 /* just a simple ? */
768 /* might be a trigraph */
770 if(replace_trigraph(this)) {
773 put_back(this, this->c);
787 token->type = this->c;
798 fprintf(stderr, "unknown character '%c' found\n", this->c);
799 token->type = T_ERROR;
/*
 * Initialise *this for lexing stream.
 *
 * The whole structure is zeroed first, then the input stream, the starting
 * line number (0) and input_name are recorded and the string set used for
 * string literals is initialised.  input_name is stored as a pointer, not
 * copied, so it has to stay valid for as long as the lexer reports positions.
 */
805 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
807 memset(this, 0, sizeof(this[0]));
809 this->input = stream;
811 this->source_position.linenr = 0;
812 this->source_position.input_name = input_name;
813 strset_init(&this->stringset);
815 /* we place a virtual '\n' at the beginning so the lexer knows we're at the
816 * beginning of a line */
/*
 * Tear-down entry point for a lexer_t.
 * NOTE(review): presumably the counterpart of lexer_init() that releases
 * this->stringset -- confirm against the body.
 */
820 void lexer_destroy(lexer_t *this)
/*
 * Debugging aid: print source_position as `<input_name>:<linenr>` followed by
 * a newline on stdout.  Marked unused so that the compiler does not warn
 * when nothing calls it.
 *
 * NOTE(review): linenr is printed with %d; other code in this file copies it
 * into unsigned variables, so %u is presumably the matching conversion --
 * confirm against the declaration of source_position_t.
 */
825 static __attribute__((unused))
826 void dbg_pos(const source_position_t source_position)
828 fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);