5 #include "symbol_table_t.h"
// Write the diagnostic prefix "<input_name>:<linenr>: Error: " to stderr.
// No newline is printed, so the message text is expected to follow directly.
// NOTE(review): linenr is unsigned but is formatted with %d; %u would match
// the argument type.
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
20 fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
// Emit the diagnostic prefix for the lexer's current position by forwarding
// the input name and line number stored in this->source_position to
// error_prefix_at().
24 void error_prefix(lexer_t *this)
26 error_prefix_at(this, this->source_position.input_name,
27 this->source_position.linenr);
// Report a parse error: msg is printed to stderr followed by a newline, so
// msg itself should not end with one.
// NOTE(review): presumably the position prefix is emitted first through
// error_prefix() - confirm against the full body.
31 void parse_error(lexer_t *this, const char *msg)
34 fprintf(stderr, "%s\n", msg);
// Advance the lexer and store the current character in this->c.
// When the read position has reached the end of the buffered data, the buffer
// is refilled from this->input. fread never writes into the first MAX_PUTBACK
// bytes of this->buf, so there is always room in front of bufpos after a
// refill.
38 void next_char(lexer_t *this)
41 if(this->bufpos >= this->bufend) {
42 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43 sizeof(this->buf) - MAX_PUTBACK, this->input);
// s is the number of bytes actually read; valid data is [bufpos, bufend).
// NOTE(review): confirm how s == 0 (end of input or read error) is handled.
48 this->bufpos = this->buf + MAX_PUTBACK;
49 this->bufend = this->buf + MAX_PUTBACK + s;
51 this->c = *(this->bufpos);
// Character trace output. NOTE(review): presumably compiled only in a debug
// configuration - confirm.
53 printf("nchar '%c'\n", this->c);
// Push character c back in front of the current read position.
// p addresses the slot directly before this->bufpos; the assert guards
// against moving before the start of this->buf.
// NOTE(review): the store of c through p and the bufpos update are expected
// between the assert and the trace output - confirm.
58 void put_back(lexer_t *this, int c)
60 char *p = (char*) this->bufpos - 1;
62 assert(p >= this->buf);
// Character trace output. NOTE(review): presumably debug-only - confirm.
66 printf("putback '%c'\n", c);
// Try to complete a trigraph (two question marks plus a third character).
// For the MATCH_TRIGRAPH(ch, replacement) entry that applies, this->c is set
// to the replacement character. Callers test the int result as a boolean
// meaning "a trigraph was replaced".
// Third character -> replacement:
//   '=' -> '#'    '(' -> '['    '/' -> backslash    ')' -> ']'
//   apostrophe -> '^'    '<' -> '{'    '!' -> '|'    '>' -> '}'    '-' -> '~'
71 int replace_trigraph(lexer_t *this)
73 #define MATCH_TRIGRAPH(ch,replacement) \
75 this->c = replacement; \
79 MATCH_TRIGRAPH('=', '#')
80 MATCH_TRIGRAPH('(', '[')
81 MATCH_TRIGRAPH('/', '\\')
82 MATCH_TRIGRAPH(')', ']')
83 MATCH_TRIGRAPH('\'', '^')
84 MATCH_TRIGRAPH('<', '{')
85 MATCH_TRIGRAPH('!', '|')
86 MATCH_TRIGRAPH('>', '}')
87 MATCH_TRIGRAPH('-', '~')
// Lex an identifier or keyword into *token.
// Characters are accumulated on symbol_obstack, NUL-terminated and interned
// with symbol_table_insert(). token->v.symbol always receives the interned
// symbol; token->type is either the symbol's own ID or T_IDENTIFIER.
// Newlines met while scanning bump the line counter, and a possible trigraph
// is resolved through replace_trigraph() with put_back() undoing lookahead.
96 void parse_symbol(lexer_t *this, token_t *token)
101 obstack_1grow(&symbol_obstack, this->c);
108 if(this->c == '\n') {
110 this->source_position.linenr++;
117 obstack_1grow(&symbol_obstack, this->c);
124 put_back(this, this->c);
129 if(replace_trigraph(this))
132 put_back(this, this->c);
141 obstack_1grow(&symbol_obstack, '\0');
143 string = obstack_finish(&symbol_obstack);
144 symbol = symbol_table_insert(string);
// NOTE(review): the condition choosing between the two assignments below is
// presumably whether the symbol carries a keyword ID - confirm.
147 token->type = symbol->ID;
149 token->type = T_IDENTIFIER;
151 token->v.symbol = symbol;
// The table returned a symbol that does not use our freshly built string, so
// the temporary copy is released from the obstack again.
153 if(symbol->string != string) {
154 obstack_free(&symbol_obstack, string);
// Lex a preprocessor identifier and classify it.
// Characters are appended to symbol_obstack for as long as
// is_ident_char(this->c) holds, the string is NUL-terminated and looked up
// with preprocessor_symbol_table_find(). The temporary string is always
// released again because only the lookup result is needed.
// NOTE(review): how a failed lookup maps to the returned
// preprocessor_token_type_t should be confirmed against the full body.
160 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
163 obstack_1grow(&symbol_obstack, this->c);
165 } while(is_ident_char(this->c));
166 obstack_1grow(&symbol_obstack, '\0');
168 char *string = obstack_finish(&symbol_obstack);
169 symbol_t *symbol = preprocessor_symbol_table_find(string);
170 obstack_free(&symbol_obstack, string);
// Lex a hexadecimal integer literal into *token.
// Precondition (asserted): this->c is the 'x' or 'X' of the prefix.
// If no hex digit follows, the error "premature end of hex number literal" is
// reported and token->type becomes T_ERROR. Otherwise the digits 0-9, A-F and
// a-f are accumulated base 16 and the token becomes T_INTEGER with the value
// in token->v.intvalue.
// NOTE(review): no overflow check is visible on the accumulation.
180 void parse_number_hex(lexer_t *this, token_t *token)
182 assert(this->c == 'x' || this->c == 'X');
// At least one hex digit is required after the prefix.
185 if (!isdigit(this->c) &&
186 !('A' <= this->c && this->c <= 'F') &&
187 !('a' <= this->c && this->c <= 'f')) {
188 parse_error(this, "premature end of hex number literal");
189 token->type = T_ERROR;
195 if (isdigit(this->c)) {
196 value = 16 * value + this->c - '0';
197 } else if ('A' <= this->c && this->c <= 'F') {
198 value = 16 * value + this->c - 'A' + 10;
199 } else if ('a' <= this->c && this->c <= 'f') {
200 value = 16 * value + this->c - 'a' + 10;
202 token->type = T_INTEGER;
203 token->v.intvalue = value;
// Lex an octal integer literal into *token.
// Precondition (asserted): this->c is the 'o' or 'O' of the prefix.
// Digits '0'..'7' are accumulated base 8; the token becomes T_INTEGER with the
// value in token->v.intvalue.
// NOTE(review): no overflow check is visible on the accumulation.
211 void parse_number_oct(lexer_t *this, token_t *token)
213 assert(this->c == 'o' || this->c == 'O');
218 if ('0' <= this->c && this->c <= '7') {
219 value = 8 * value + this->c - '0';
221 token->type = T_INTEGER;
222 token->v.intvalue = value;
// Lex a decimal integer literal into *token.
// first_char is a digit that the caller has already consumed; it seeds the
// value. Further decimal digits are accumulated base 10 and the token becomes
// T_INTEGER with the value in token->v.intvalue.
// NOTE(review): parse_number() also calls this with first_char == 0, which
// would trip the assert below unless it is guarded by a check on first_char -
// confirm that such a guard exists.
230 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
234 assert(first_char >= '0' && first_char <= '9');
235 value = first_char - '0';
239 if (isdigit(this->c)) {
240 value = 10 * value + this->c - '0';
242 token->type = T_INTEGER;
243 token->v.intvalue = value;
// Lex an integer literal, dispatching on its prefix.
// A leading '0' selects hex (x), octal (O) or, failing that, decimal with the
// already consumed '0' handed over as first_char. Any other start goes to the
// decimal parser with first_char == 0, i.e. nothing consumed yet.
251 void parse_number(lexer_t *this, token_t *token)
253 // TODO check for overflow
254 // TODO check for various invalid input sequences
256 if (this->c == '0') {
260 case 'x': parse_number_hex(this, token); break;
262 case 'O': parse_number_oct(this, token); break;
263 default: parse_number_dec(this, token, '0');
266 parse_number_dec(this, token, 0);
// Parse the part of an escape sequence that follows the backslash and return
// the character value it denotes.
// Simple escapes map directly to their C character. Hex and octal escapes are
// not implemented and only report an error. A newline bumps the line counter
// and a completed trigraph restarts the parse on the replaced character; both
// are handled by recursion.
// Reaching end of file or an unknown escape character reports an error.
271 int parse_escape_sequence(lexer_t *this)
277 case '"': return '"';
278 case '\'': return'\'';
280 if(this->c == '\n') {
281 this->source_position.linenr++;
283 return parse_escape_sequence(this);
286 case 'a': return '\a';
287 case 'b': return '\b';
288 case 'f': return '\f';
289 case 'n': return '\n';
290 case 'r': return '\r';
291 case 't': return '\t';
292 case 'v': return '\v';
293 case 'x': /* TODO parse hex number ... */
294 parse_error(this, "hex escape sequences not implemented yet");
// NOTE(review): "0 ... 8" is a GNU case range over the integer values 0..8
// (control codes). If the switch is on the input character, octal digits
// would be '0' ... '7' - confirm the intent.
296 case 0 ... 8: /* TODO parse octal number ... */
297 parse_error(this, "octal escape sequences not implemented yet");
303 /* might be a trigraph */
305 if(replace_trigraph(this)) {
306 return parse_escape_sequence(this);
312 parse_error(this, "reached end of file while parsing escape sequence");
// NOTE(review): parse_error() already appends a newline, so the "\n" in this
// message yields an extra blank line.
315 parse_error(this, "unknown escape sequence\n");
// Lex a string literal into *token.
// Precondition (asserted): this->c is the opening double quote.
// The literal's characters are collected on symbol_obstack. Question marks
// are checked for trigraphs, escape sequences are decoded through
// parse_escape_sequence(), and newlines bump the line counter.
// If the literal is never terminated, "string has no end" is reported and the
// token becomes T_ERROR.
// On success the string is NUL-terminated and deduplicated through
// this->stringset; the token becomes T_STRING_LITERAL with the shared copy in
// token->v.string.
321 void parse_string_literal(lexer_t *this, token_t *token)
// The starting line is remembered so it is still available after the line
// counter has advanced. NOTE(review): presumably used as the line number of
// the "string has no end" report - confirm.
323 unsigned start_linenr = this->source_position.linenr;
327 assert(this->c == '"');
335 obstack_1grow(&symbol_obstack, '?');
339 if(replace_trigraph(this))
341 obstack_1grow(&symbol_obstack, '?');
342 put_back(this, this->c);
348 if(this->c == '\n') {
350 this->source_position.linenr++;
353 int c = parse_escape_sequence(this);
354 obstack_1grow(&symbol_obstack, c);
358 error_prefix_at(this, this->source_position.input_name,
360 fprintf(stderr, "string has no end\n");
361 token->type = T_ERROR;
369 obstack_1grow(&symbol_obstack, this->c);
377 /* TODO: concatenate multiple strings separated by whitespace... */
379 /* add finishing 0 to the string */
380 obstack_1grow(&symbol_obstack, '\0');
381 string = obstack_finish(&symbol_obstack);
383 /* check if there is already a copy of the string */
384 result = strset_insert(&this->stringset, string);
// An equal string was already in the set, so our new copy is released.
385 if(result != string) {
386 obstack_free(&symbol_obstack, string);
389 token->type = T_STRING_LITERAL;
390 token->v.string = result;
// Skip the body of a block comment.
// Newlines inside the comment bump the line counter and question marks are
// checked for trigraphs. If the input ends before the comment is closed,
// "at end of file while looking for comment end" is reported.
// NOTE(review): start_linenr is presumably the line number passed to that
// report - confirm.
394 void skip_multiline_comment(lexer_t *this)
396 unsigned start_linenr = this->source_position.linenr;
412 if(replace_trigraph(this))
415 /* we don't put back the 2nd ? as the comment text is discarded
420 this->source_position.linenr++;
424 error_prefix_at(this, this->source_position.input_name,
426 fprintf(stderr, "at end of file while looking for comment end\n");
// Skip the remainder of a line comment.
// Question marks are checked for trigraphs, and a newline bumps the line
// counter.
// NOTE(review): presumably the trigraph check exists so that a trigraph
// backslash before the newline continues the comment - confirm.
436 void skip_line_comment(lexer_t *this)
445 if(replace_trigraph(this))
448 /* we don't put back the 2nd ? as the comment text is discarded
454 if(this->c == '\n') {
456 this->source_position.linenr++;
// Discard input until this->c is a newline; the newline itself is left as the
// current character.
// NOTE(review): confirm that the loop also terminates at end of input.
472 void eat_until_newline(lexer_t *this)
474 while(this->c != '\n') {
// Handle a preprocessor directive. Leading blanks, tabs and carriage returns
// are skipped first; newlines are not skipped here.
// NOTE(review): how the directive is dispatched and what is stored in
// result_token should be confirmed against the full body.
482 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
485 /* skip whitespace */
486 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
// Produce the next token while inside a preprocessor line.
// Blanks, tabs and carriage returns are skipped, but newlines are not, and
// identifiers are handed to parse_symbol().
// NOTE(review): the set of other token kinds handled here should be confirmed
// against the full body.
492 void preprocessor_next_token(lexer_t *this, token_t *token)
494 /* skip whitespace */
495 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
503 parse_symbol(this, token);
// Produce the next token of the input in *token.
// Whitespace is skipped, with newlines bumping the line counter. The lexer
// then dispatches to parse_symbol(), parse_number() and
// parse_string_literal(), lexes character constants inline, and recognises
// operators of one to three characters with the MAYBE1 / MAYBE / ELSE helper
// macros.
// Comments and backslash-newline produce no token of their own: after they are
// consumed the function calls itself to return whatever follows.
// Errors are reported on stderr and leave token->type == T_ERROR.
507 void lexer_next_token(lexer_t *this, token_t *token)
511 /* skip whitespace */
512 while(this->c == ' ' || this->c == '\t' || this->c == '\n'
513 || this->c == '\r') {
514 if(this->c == '\n') {
516 this->source_position.linenr++;
525 parse_symbol(this, token);
529 parse_number(this, token);
533 parse_string_literal(this, token);
// Character constant: either an escape sequence or one plain character. The
// result is an integer token holding the character value.
538 if(this->c == '\\') {
540 token->type = T_INTEGER;
541 token->v.intvalue = parse_escape_sequence(this);
543 if(this->c == '\n') {
544 parse_error(this, "newline while parsing character constant");
545 this->source_position.linenr++;
547 token->type = T_INTEGER;
548 token->v.intvalue = this->c;
// Anything but the closing apostrophe here means more than one character.
551 if(this->c != '\'') {
552 parse_error(this, "multibyte character constant");
553 token->type = T_ERROR;
// Backslash followed by newline: count the line and lex the next token.
561 if(this->c == '\n') {
563 this->source_position.linenr++;
564 lexer_next_token(this, token);
567 parse_error(this, "unexpected '\\' found");
568 token->type = T_ERROR;
// Operator helpers: MAYBE1 / MAYBE select set_type when the lookahead is ch,
// ELSE supplies the fallback type and also copes with trigraphs and newlines
// in the lookahead.
572 #define MAYBE1(ch, set_type) \
578 token->type = set_type; \
581 #define MAYBE(ch, set_type) \
584 token->type = set_type; \
587 #define ELSE(set_type) \
590 if(this->c != '?') { \
591 put_back(this, this->c); \
593 token->type = set_type; \
597 if(replace_trigraph(this)) \
599 put_back(this, '?'); \
600 put_back(this, this->c); \
602 token->type = set_type; \
607 if(this->c == '\n') { \
609 this->source_position.linenr++; \
614 token->type = set_type; \
617 } /* end of while(1) */ \
626 token->type = T_DOTDOTDOT;
636 MAYBE1('&', T_ANDAND)
637 MAYBE('=', T_ANDEQUAL)
640 MAYBE1('=', T_ASTERISKEQUAL)
643 MAYBE1('+', T_PLUSPLUS)
644 MAYBE('=', T_PLUSEQUAL)
647 MAYBE1('-', T_MINUSMINUS)
648 MAYBE('=', T_MINUSEQUAL)
651 MAYBE1('=', T_EXCLAMATIONMARKEQUAL)
654 MAYBE1('=', T_SLASHEQUAL)
// Comments yield no token: skip them and lex whatever follows.
657 skip_multiline_comment(this);
658 lexer_next_token(this, token);
662 skip_line_comment(this);
663 lexer_next_token(this, token);
667 MAYBE1('=', T_PERCENTEQUAL)
669 /* TODO find trigraphs... */
675 token->type = T_PERCENTCOLONPERCENTCOLON;
678 token->type = T_PERCENTCOLON;
682 token->type = T_PERCENTCOLON;
684 MAYBE('>', T_PERCENTGREATER)
687 MAYBE1(':', T_LESSCOLON)
688 MAYBE('%', T_LESSPERCENT)
690 /* TODO trigraphs... */
696 token->type = T_LESSLESSEQUAL;
698 token->type = T_LESSLESS;
701 token->type = T_LESS;
714 token->type = T_GREATERGREATEREQUAL;
716 token->type = T_GREATERGREATER;
721 MAYBE1('=', T_CARETEQUAL)
724 MAYBE1('=', T_PIPEEQUAL)
725 MAYBE('|', T_PIPEPIPE)
728 MAYBE1('>', T_COLONGREATER)
731 MAYBE1('=', T_EQUALEQUAL)
734 MAYBE1('#', T_HASHHASH)
738 parse_preprocessor_directive(this, token);
749 /* just a simple ? */
754 /* might be a trigraph */
// A completed trigraph replaced this->c, so lexing restarts on that character.
756 if(replace_trigraph(this)) {
757 lexer_next_token(this, token);
760 put_back(this, this->c);
// Single-character tokens use the character code itself as the token type.
774 token->type = this->c;
784 fprintf(stderr, "unknown character '%c' found\n", this->c);
785 token->type = T_ERROR;
// Initialise *this to read from stream.
// The whole structure is zeroed first, then the input stream, the starting
// line number (0), the input name used in diagnostics and the string set for
// literal deduplication are set up.
// input_name is stored by pointer, not copied, so it must outlive the lexer.
791 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
793 memset(this, 0, sizeof(this[0]));
795 this->input = stream;
797 this->source_position.linenr = 0;
798 this->source_position.input_name = input_name;
799 strset_init(&this->stringset);
801 /* we place a virtual '\n' at the beginning so the lexer knows we're at the
802 * beginning of a line */
// Counterpart of lexer_init().
// NOTE(review): lexer_init() sets up this->stringset with strset_init(), so
// this function presumably has to release it - confirm against the body.
806 void lexer_destroy(lexer_t *this)
// Debug helper: print "<input_name>:<linenr>" and a newline to stdout.
// Marked unused so the compiler does not warn when nothing calls it.
// NOTE(review): linenr is handled as unsigned elsewhere in this file, so %u
// would be the matching conversion - confirm the field type.
811 static __attribute__((unused))
812 void dbg_pos(const source_position_t source_position)
814 fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);