5 #include "symbol_table_t.h"
/*
 * Writes the diagnostic prefix "<input_name>:<linenr>: Error: " to stderr.
 * No newline is emitted; the message text is printed separately by the
 * caller (see the fprintf calls that follow error_prefix_at in this file).
 *
 * input_name  name of the input shown in the prefix
 * linenr      line number shown in the prefix
 *
 * NOTE(review): linenr is declared unsigned but is formatted with "%d";
 * "%u" is the matching conversion specifier.
 */
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
20 fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
/*
 * Writes the error prefix for the lexer's current position by forwarding
 * this->source_position.input_name and this->source_position.linenr to
 * error_prefix_at().
 */
24 void error_prefix(lexer_t *this)
26 error_prefix_at(this, this->source_position.input_name,
27 this->source_position.linenr);
/*
 * Reports a lexer error: prints msg followed by a newline to stderr.
 * Presumably the error prefix for the current position is printed first;
 * that call is not visible in this excerpt.
 *
 * msg  message text; it should not end in '\n' because one is appended here.
 */
31 void parse_error(lexer_t *this, const char *msg)
34 fprintf(stderr, "%s\n", msg);
/*
 * Makes the next input character available in this->c.
 *
 * When the read position has reached the end of the buffered data, the
 * buffer is refilled with fread(). The first MAX_PUTBACK bytes of this->buf
 * are excluded from the refill, which leaves room in front of the data for
 * put_back().
 *
 * NOTE(review): how a short or zero-length read (end of file, read error)
 * is handled, and where bufpos is advanced, is not visible in this excerpt.
 */
38 void next_char(lexer_t *this)
/* buffered data exhausted: read the next chunk behind the put-back area */
41 if(this->bufpos >= this->bufend) {
42 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43 sizeof(this->buf) - MAX_PUTBACK, this->input);
/* s bytes were read, so the valid data is [buf + MAX_PUTBACK, bufend) */
48 this->bufpos = this->buf + MAX_PUTBACK;
49 this->bufend = this->buf + MAX_PUTBACK + s;
51 this->c = *(this->bufpos);
/* trace output of the character just loaded */
53 printf("nchar '%c'\n", this->c);
/*
 * Pushes character c back in front of the current read position.
 *
 * The target slot is the byte directly before this->bufpos; the assert
 * checks that this slot still lies inside this->buf, i.e. that the put-back
 * area has not been exhausted.
 *
 * NOTE(review): the store of c and the update of bufpos are not visible in
 * this excerpt.
 */
58 void put_back(lexer_t *this, int c)
60 char *p = (char*) this->bufpos - 1;
62 assert(p >= this->buf);
/* trace output of the character that is put back */
66 printf("putback '%c'\n", c);
/*
 * Replaces the final character of a trigraph sequence by the character it
 * stands for. MATCH_TRIGRAPH(ch, replacement) stores the replacement in
 * this->c; the table below lists the nine mappings:
 *   '=' -> '#'   '(' -> '['   '/' -> '\'   ')' -> ']'   '\'' -> '^'
 *   '<' -> '{'   '!' -> '|'   '>' -> '}'   '-' -> '~'
 *
 * Callers in this file use the int result as a condition; presumably it is
 * non-zero when a replacement took place (the return statements are not
 * visible in this excerpt).
 */
71 int replace_trigraph(lexer_t *this)
73 #define MATCH_TRIGRAPH(ch,replacement) \
75 this->c = replacement; \
79 MATCH_TRIGRAPH('=', '#')
80 MATCH_TRIGRAPH('(', '[')
81 MATCH_TRIGRAPH('/', '\\')
82 MATCH_TRIGRAPH(')', ']')
83 MATCH_TRIGRAPH('\'', '^')
84 MATCH_TRIGRAPH('<', '{')
85 MATCH_TRIGRAPH('!', '|')
86 MATCH_TRIGRAPH('>', '}')
87 MATCH_TRIGRAPH('-', '~')
/*
 * SKIP_TRIGRAPHS(no_trigraph_code): helper macro for the handling of a '?'
 * that may start a trigraph.
 *
 * Visible behaviour: if this->c is not '?', it is put back; otherwise
 * replace_trigraph() is tried, and the characters are restored with
 * put_back() ('?' first, then this->c). Presumably no_trigraph_code is the
 * code run when no trigraph was found - its use is not visible in this
 * excerpt.
 */
95 #define SKIP_TRIGRAPHS(no_trigraph_code) \
98 if(this->c != '?') { \
99 put_back(this, this->c); \
104 if(replace_trigraph(this)) \
106 put_back(this, '?'); \
107 put_back(this, this->c); \
/*
 * Lexes an identifier or keyword into token.
 *
 * The characters are collected on symbol_obstack, NUL-terminated and
 * interned with symbol_table_insert(). token->type becomes either
 * symbol->ID or T_IDENTIFIER (the deciding condition is not visible in this
 * excerpt; presumably a non-zero ID marks a keyword), and token->v.symbol
 * is set to the interned symbol.
 */
112 void parse_symbol(lexer_t *this, token_t *token)
117 obstack_1grow(&symbol_obstack, this->c);
/* a newline met while collecting the symbol still counts as a new line */
124 if(this->c == '\n') {
126 this->source_position.linenr++;
133 obstack_1grow(&symbol_obstack, this->c);
140 put_back(this, this->c);
145 if(replace_trigraph(this))
148 put_back(this, this->c);
/* terminate the collected text and intern it */
157 obstack_1grow(&symbol_obstack, '\0');
159 string = obstack_finish(&symbol_obstack);
160 symbol = symbol_table_insert(string);
163 token->type = symbol->ID;
165 token->type = T_IDENTIFIER;
167 token->v.symbol = symbol;
/* the table already held this name under another string: drop our copy */
169 if(symbol->string != string) {
170 obstack_free(&symbol_obstack, string);
/*
 * Reads a preprocessor identifier and looks it up in the preprocessor
 * symbol table.
 *
 * Characters are appended to symbol_obstack for as long as
 * is_ident_char(this->c) holds, the text is NUL-terminated and passed to
 * preprocessor_symbol_table_find(). The temporary string is released again
 * right after the lookup.
 *
 * NOTE(review): how the lookup result is mapped to the returned
 * preprocessor_token_type_t is not visible in this excerpt.
 */
176 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
179 obstack_1grow(&symbol_obstack, this->c);
181 } while(is_ident_char(this->c));
182 obstack_1grow(&symbol_obstack, '\0');
184 char *string = obstack_finish(&symbol_obstack);
185 symbol_t *symbol = preprocessor_symbol_table_find(string);
/* only needed for the lookup, so give the memory back to the obstack */
186 obstack_free(&symbol_obstack, string);
/*
 * Lexes the digits of a hexadecimal literal; on entry this->c must be the
 * 'x' or 'X' of the prefix.
 *
 * If the first character after the prefix is not a hex digit, the error
 * "premature end of hex number literal" is reported and token->type is set
 * to T_ERROR. Otherwise the digits are accumulated base 16 and the token
 * becomes T_INTEGER with the value in token->v.intvalue.
 *
 * NOTE(review): no overflow check is visible (see the TODO in
 * parse_number).
 */
196 void parse_number_hex(lexer_t *this, token_t *token)
198 assert(this->c == 'x' || this->c == 'X');
/* at least one hex digit has to follow the prefix */
201 if (!isdigit(this->c) &&
202 !('A' <= this->c && this->c <= 'F') &&
203 !('a' <= this->c && this->c <= 'f')) {
204 parse_error(this, "premature end of hex number literal");
205 token->type = T_ERROR;
/* accumulate one digit: 0-9, A-F and a-f map to 0..15 */
211 if (isdigit(this->c)) {
212 value = 16 * value + this->c - '0';
213 } else if ('A' <= this->c && this->c <= 'F') {
214 value = 16 * value + this->c - 'A' + 10;
215 } else if ('a' <= this->c && this->c <= 'f') {
216 value = 16 * value + this->c - 'a' + 10;
218 token->type = T_INTEGER;
219 token->v.intvalue = value;
/*
 * Lexes the digits of an octal literal; on entry this->c must be the 'o' or
 * 'O' of the prefix (octal literals are introduced by a letter here).
 *
 * Digits '0'..'7' are accumulated base 8; the token becomes T_INTEGER with
 * the value in token->v.intvalue.
 *
 * NOTE(review): no overflow check and no "missing digits" check are visible
 * in this excerpt.
 */
227 void parse_number_oct(lexer_t *this, token_t *token)
229 assert(this->c == 'o' || this->c == 'O');
234 if ('0' <= this->c && this->c <= '7') {
235 value = 8 * value + this->c - '0';
237 token->type = T_INTEGER;
238 token->v.intvalue = value;
/*
 * Lexes a decimal literal into token (T_INTEGER, value in
 * token->v.intvalue).
 *
 * first_char  a digit that the caller has already consumed; it seeds the
 *             value with first_char - '0'.
 *
 * NOTE(review): parse_number also calls this function with first_char == 0.
 * That would trip the assert below unless the assert and the seeding are
 * guarded by a condition that is not visible in this excerpt - confirm.
 */
246 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
250 assert(first_char >= '0' && first_char <= '9');
251 value = first_char - '0';
/* accumulate the remaining digits base 10 */
255 if (isdigit(this->c)) {
256 value = 10 * value + this->c - '0';
258 token->type = T_INTEGER;
259 token->v.intvalue = value;
/*
 * Lexes an integer literal and dispatches on its prefix.
 *
 * A leading '0' is inspected further: an 'x' selects parse_number_hex(), an
 * 'O' selects parse_number_oct(), anything else is handed to
 * parse_number_dec() with '0' as the already consumed first digit. Without
 * a leading '0' the literal is parsed by parse_number_dec() with
 * first_char 0.
 *
 * NOTE(review): only the labels 'x' and 'O' are visible here; the helpers
 * also accept 'X' and 'o', so those labels are presumably on lines missing
 * from this excerpt.
 */
267 void parse_number(lexer_t *this, token_t *token)
269 // TODO check for overflow
270 // TODO check for various invalid input sequences
272 if (this->c == '0') {
276 case 'x': parse_number_hex(this, token); break;
278 case 'O': parse_number_oct(this, token); break;
279 default: parse_number_dec(this, token, '0');
282 parse_number_dec(this, token, 0);
/*
 * Translates the character(s) following a backslash into the character the
 * escape sequence denotes and returns it.
 *
 * Supported: the quote characters, \a \b \f \n \r \t \v. A newline inside
 * the sequence bumps the line counter and parsing restarts recursively; a
 * trigraph that gets replaced also restarts parsing. Hex and octal escapes
 * only report "not implemented yet". End of file and unknown escapes are
 * reported through parse_error().
 *
 * NOTE(review): the values returned on the error paths are not visible in
 * this excerpt.
 */
287 int parse_escape_sequence(lexer_t *this)
293 case '"': return '"';
294 case '\'': return'\'';
/* newline in the middle of the sequence: count the line and start over */
296 if(this->c == '\n') {
297 this->source_position.linenr++;
299 return parse_escape_sequence(this);
302 case 'a': return '\a';
303 case 'b': return '\b';
304 case 'f': return '\f';
305 case 'n': return '\n';
306 case 'r': return '\r';
307 case 't': return '\t';
308 case 'v': return '\v';
309 case 'x': /* TODO parse hex number ... */
310 parse_error(this, "hex escape sequences not implemented yet");
/*
 * NOTE(review): this range covers the character codes 0..8, not the digit
 * characters; octal digits would be '0' ... '7'. Case ranges are a GNU C
 * extension.
 */
312 case 0 ... 8: /* TODO parse octal number ... */
313 parse_error(this, "octal escape sequences not implemented yet");
319 /* might be a trigraph */
321 if(replace_trigraph(this)) {
322 return parse_escape_sequence(this);
324 put_back(this, this->c);
329 parse_error(this, "reached end of file while parsing escape sequence");
/*
 * NOTE(review): parse_error() already appends '\n', so this message is
 * followed by an empty line.
 */
332 parse_error(this, "unknown escape sequence\n");
/*
 * Lexes a string literal; on entry this->c must be the opening '"'.
 *
 * The contents are collected on symbol_obstack, with escape sequences
 * translated by parse_escape_sequence(). The finished string is
 * NUL-terminated and interned in this->stringset, so equal literals share
 * one copy. On success the token is T_STRING_LITERAL with the interned
 * string in token->v.string; if the literal is not terminated, "string has
 * no end" is reported and the token is T_ERROR.
 */
338 void parse_string_literal(lexer_t *this, token_t *token)
/* remembered at the start; presumably used for the "string has no end" report */
340 unsigned start_linenr = this->source_position.linenr;
344 assert(this->c == '"');
350 obstack_1grow(&symbol_obstack, '?');
357 if(this->c == '\n') {
359 this->source_position.linenr++;
362 int c = parse_escape_sequence(this);
363 obstack_1grow(&symbol_obstack, c);
367 error_prefix_at(this, this->source_position.input_name,
369 fprintf(stderr, "string has no end\n");
370 token->type = T_ERROR;
378 obstack_1grow(&symbol_obstack, this->c);
386 /* TODO: concatenate multiple strings separated by whitespace... */
388 /* add finishing 0 to the string */
389 obstack_1grow(&symbol_obstack, '\0');
390 string = obstack_finish(&symbol_obstack);
392 /* check if there is already a copy of the string */
393 result = strset_insert(&this->stringset, string);
/* an equal string was already stored: release the duplicate */
394 if(result != string) {
395 obstack_free(&symbol_obstack, string);
398 token->type = T_STRING_LITERAL;
399 token->v.string = result;
/*
 * Lexes a character constant; on entry this->c must be the opening '\''.
 *
 * The constant is delivered as a T_INTEGER token whose value is the
 * character found. A raw newline inside the constant and more than one
 * character are reported as errors and parsing ends at the
 * end_of_char_constant label; end of file is reported and sets T_ERROR.
 *
 * NOTE(review): whether the end-of-file path returns before reaching
 * end_of_char_constant (which would overwrite T_ERROR with T_INTEGER) is
 * not visible in this excerpt - confirm.
 */
403 void parse_character_constant(lexer_t *this, token_t *token)
405 assert(this->c == '\'');
418 if(this->c == '\n') {
420 this->source_position.linenr++;
428 parse_error(this, "newline while parsing character constant");
429 this->source_position.linenr++;
434 goto end_of_char_constant;
437 parse_error(this, "EOF while parsing character constant");
438 token->type = T_ERROR;
/* found_char != 0 means a character has already been stored */
442 if(found_char != 0) {
443 parse_error(this, "more than 1 characters in character "
445 goto end_of_char_constant;
447 found_char = this->c;
454 end_of_char_constant:
455 token->type = T_INTEGER;
456 token->v.intvalue = found_char;
/*
 * Skips a block comment.
 *
 * The line on which the comment starts is remembered in start_linenr.
 * Trigraph replacement is consulted while skipping, the line counter is
 * incremented inside the comment, and hitting the end of the file before
 * the comment is closed is reported as
 * "at end of file while looking for comment end".
 *
 * NOTE(review): the loop structure and the detection of the comment
 * terminator are not visible in this excerpt.
 */
460 void skip_multiline_comment(lexer_t *this)
462 unsigned start_linenr = this->source_position.linenr;
478 if(replace_trigraph(this))
481 /* we don't put back the 2nd ? as the comment text is discarded
486 this->source_position.linenr++;
490 error_prefix_at(this, this->source_position.input_name,
492 fprintf(stderr, "at end of file while looking for comment end\n");
// Skips the remainder of a line comment.
// replace_trigraph() is consulted while skipping, and in the branch shown a
// '\n' in this->c increments source_position.linenr.
// NOTE(review): the loop structure and the condition that ends the comment
// are not visible in this excerpt.
502 void skip_line_comment(lexer_t *this)
511 if(replace_trigraph(this))
514 /* we don't put back the 2nd ? as the comment text is discarded
520 if(this->c == '\n') {
522 this->source_position.linenr++;
// Discards input until this->c is '\n'.
// NOTE(review): the loop body is not visible in this excerpt; if it does not
// also stop at end of file, input without a final newline would loop forever
// - confirm.
538 void eat_until_newline(lexer_t *this)
540 while(this->c != '\n') {
// Handles a preprocessor directive; the outcome is delivered through
// result_token. Blanks, tabs and carriage returns in front of the directive
// name are skipped first.
// NOTE(review): the directive handling itself is not visible in this excerpt.
548 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
551 /* skip whitespaces */
552 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
/*
 * Reads the next token in preprocessor context.
 *
 * Blanks, tabs and carriage returns are skipped (newlines are not part of
 * the skipped set), and identifiers are lexed with parse_symbol().
 *
 * NOTE(review): the dispatch on the first character is not visible in this
 * excerpt.
 */
558 void preprocessor_next_token(lexer_t *this, token_t *token)
560 /* skip whitespace */
561 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
569 parse_symbol(this, token);
/*
 * Main entry point of the lexer: reads the next token from the input and
 * stores it in token.
 *
 * Identifiers, numbers, string literals and character constants are handed
 * to their parse_* helpers. Punctuators are recognised with the
 * MAYBE_PROLOG / MAYBE / ELSE_CODE / ELSE macro family, where
 * MAYBE(ch, set_type) sets token->type to set_type and ELSE(set_type)
 * supplies the type used when none of the MAYBE alternatives applied.
 * Comments are skipped and lexing then continues with a recursive call.
 * A '?' may start a trigraph, which is resolved through replace_trigraph().
 * Characters that fit no rule are reported as
 * "unknown character '%c' found" and yield T_ERROR.
 *
 * NOTE(review): the switch structure and the case labels are not visible in
 * this excerpt; the comments below describe only the statements shown.
 */
573 void lexer_next_token(lexer_t *this, token_t *token)
584 this->source_position.linenr++;
591 parse_symbol(this, token);
595 parse_number(this, token);
599 parse_string_literal(this, token);
603 parse_character_constant(this, token);
/* a '\n' in this->c counts as a new line; otherwise the '\\' is an error */
608 if(this->c == '\n') {
610 this->source_position.linenr++;
613 parse_error(this, "unexpected '\\' found");
614 token->type = T_ERROR;
/* helper macros for multi-character punctuators */
618 #define MAYBE_PROLOG \
623 #define MAYBE(ch, set_type) \
626 token->type = set_type; \
629 #define ELSE_CODE(code) \
636 if(this->c == '\n') { \
638 this->source_position.linenr++; \
645 } /* end of while(1) */ \
648 #define ELSE(set_type) \
650 token->type = set_type; \
658 MAYBE('.', T_DOTDOTDOT)
660 put_back(this, this->c);
669 MAYBE('=', T_ANDEQUAL)
673 MAYBE('=', T_ASTERISKEQUAL)
677 MAYBE('+', T_PLUSPLUS)
678 MAYBE('=', T_PLUSEQUAL)
682 MAYBE('-', T_MINUSMINUS)
683 MAYBE('=', T_MINUSEQUAL)
687 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
691 MAYBE('=', T_SLASHEQUAL)
/* comments produce no token: skip them and lex whatever follows */
694 skip_multiline_comment(this);
695 lexer_next_token(this, token);
699 skip_line_comment(this);
700 lexer_next_token(this, token);
705 MAYBE('>', T_PERCENTGREATER)
706 MAYBE('=', T_PERCENTEQUAL)
711 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
713 put_back(this, this->c);
715 token->type = T_PERCENTCOLON;
722 MAYBE(':', T_LESSCOLON)
723 MAYBE('%', T_LESSPERCENT)
726 MAYBE('=', T_LESSLESSEQUAL)
733 MAYBE('=', T_GREATERGREATEREQUAL)
734 ELSE(T_GREATERGREATER)
738 MAYBE('=', T_CARETEQUAL)
742 MAYBE('=', T_PIPEEQUAL)
743 MAYBE('|', T_PIPEPIPE)
747 MAYBE('>', T_COLONGREATER)
751 MAYBE('=', T_EQUALEQUAL)
755 MAYBE('#', T_HASHHASH)
759 parse_preprocessor_directive(this, token);
770 /* just a simple ? */
775 /* might be a trigraph */
777 if(replace_trigraph(this)) {
780 put_back(this, this->c);
/* the character code itself is used as the token type here */
794 token->type = this->c;
805 fprintf(stderr, "unknown character '%c' found\n", this->c);
806 token->type = T_ERROR;
/*
 * Initialises a lexer that reads from stream.
 *
 * The whole structure is zeroed first, then the input stream is stored, the
 * source position is set to line 0 of input_name and the string set used
 * for interning string literals is initialised.
 *
 * stream      input to read from; the pointer is stored, not copied
 * input_name  name used in diagnostics; the pointer is stored, not copied
 */
812 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
814 memset(this, 0, sizeof(this[0]));
816 this->input = stream;
818 this->source_position.linenr = 0;
819 this->source_position.input_name = input_name;
820 strset_init(&this->stringset);
822 /* we place a virtual '\n' at the beginning so the lexer knows we're at the
823 * beginning of a line */
/*
 * Counterpart of lexer_init().
 *
 * NOTE(review): the body is not visible in this excerpt. lexer_init() calls
 * strset_init() on this->stringset, so a matching release is presumably
 * expected here - confirm.
 */
827 void lexer_destroy(lexer_t *this)
/*
 * Debugging helper: prints "<input_name>:<linenr>" followed by a newline to
 * stdout. It is marked unused so that the compiler does not warn when
 * nothing calls it.
 *
 * NOTE(review): linenr appears to be unsigned (it is assigned to unsigned
 * locals elsewhere in this file) but is formatted with "%d"; "%u" would be
 * the matching conversion specifier - confirm against source_position_t.
 */
832 static __attribute__((unused))
833 void dbg_pos(const source_position_t source_position)
835 fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);