5 #include "symbol_table_t.h"
/*
 * Writes the diagnostic prefix "<input_name>:<linenr>: Error: " to stderr.
 * No newline is written after the prefix.
 *
 * this       - lexer instance (not used by the fprintf call below)
 * input_name - name printed before the first colon
 * linenr     - line number printed after the first colon
 *
 * NOTE(review): linenr is declared unsigned but is formatted with %d;
 * %u would match the argument type.
 */
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
20 fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
/*
 * Convenience wrapper around error_prefix_at(): emits the error prefix for
 * the lexer's current source position (input name and line number taken
 * from this->source_position).
 */
24 void error_prefix(lexer_t *this)
26 error_prefix_at(this, this->source_position.input_name,
27 this->source_position.linenr);
/*
 * Reports a lexer error: writes msg followed by a newline to stderr.
 *
 * this - lexer instance the error belongs to
 * msg  - human-readable message; printed through "%s", so it is never
 *        interpreted as a format string
 */
31 void parse_error(lexer_t *this, const char *msg)
34 fprintf(stderr, "%s\n", msg);
/*
 * Advances the lexer to the next input character and stores it in this->c.
 *
 * When the read position has reached the end of the buffered data, the
 * buffer is refilled with fread() from this->input. Data is read to
 * this->buf + MAX_PUTBACK, i.e. the first MAX_PUTBACK bytes of the buffer
 * are left free (put_back() steps bufpos backwards, so this is presumably
 * the room it uses - confirm). bufpos/bufend are then reset to delimit the
 * freshly read bytes.
 *
 * The printf("nchar ...") call is debug tracing written to stdout.
 */
38 void next_char(lexer_t *this)
41 if(this->bufpos >= this->bufend) {
42 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43 sizeof(this->buf) - MAX_PUTBACK, this->input);
48 this->bufpos = this->buf + MAX_PUTBACK;
49 this->bufend = this->buf + MAX_PUTBACK + s;
51 this->c = *(this->bufpos);
53 printf("nchar '%c'\n", this->c);
/*
 * Pushes character c back onto the input so it is seen again later.
 *
 * The target slot is the byte just before the current read position
 * (bufpos - 1). The assert guards against stepping in front of the start
 * of this->buf, which would happen if more characters are put back than
 * there is room for.
 *
 * The printf("putback ...") call is debug tracing written to stdout.
 */
58 void put_back(lexer_t *this, int c)
60 char *p = (char*) this->bufpos - 1;
62 assert(p >= this->buf);
66 printf("putback '%c'\n", c);
/*
 * Tries to interpret the current character as the third character of a
 * trigraph and, on a match, replaces this->c with the character the
 * trigraph stands for. Callers use the int result as a boolean
 * ("a trigraph was replaced").
 *
 * The local MATCH_TRIGRAPH(ch, replacement) macro handles one table entry.
 * Mapping of the third trigraph character to its replacement:
 *   '='  -> '#'        '('  -> '['        '/'  -> backslash
 *   ')'  -> ']'        '\'' -> '^'        '<'  -> '{'
 *   '!'  -> '|'        '>'  -> '}'        '-'  -> '~'
 */
72 int replace_trigraph(lexer_t *this)
74 #define MATCH_TRIGRAPH(ch,replacement) \
76 this->c = replacement; \
80 MATCH_TRIGRAPH('=', '#')
81 MATCH_TRIGRAPH('(', '[')
82 MATCH_TRIGRAPH('/', '\\')
83 MATCH_TRIGRAPH(')', ']')
84 MATCH_TRIGRAPH('\'', '^')
85 MATCH_TRIGRAPH('<', '{')
86 MATCH_TRIGRAPH('!', '|')
87 MATCH_TRIGRAPH('>', '}')
88 MATCH_TRIGRAPH('-', '~')
/*
 * SKIP_TRIGRAPHS(custom_putback, no_trigraph_code)
 *
 * Helper for code that has just seen a '?' and must decide whether a
 * trigraph follows. It expects a lexer_t pointer named `this` in scope.
 *  - If the next character is not '?', it is put back.
 *  - Otherwise replace_trigraph() is tried; if that fails, a '?' and the
 *    current character are put back.
 * NOTE(review): custom_putback and no_trigraph_code are presumably code
 * fragments spliced into the respective branches - confirm against the
 * full macro body.
 */
96 #define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
99 if(this->c != '?') { \
101 put_back(this, this->c); \
106 if(replace_trigraph(this)) { \
110 put_back(this, '?'); \
111 put_back(this, this->c); \
/*
 * EAT_NEWLINE(newline_code)
 *
 * Recognises a line ending at the current character and bumps
 * this->source_position.linenr for it. Two forms are handled: one starting
 * with '\r' (with a check for a following '\n', so "\r\n" counts as a
 * single line ending) and a bare '\n'. It expects a lexer_t pointer named
 * `this` in scope.
 * NOTE(review): newline_code is presumably executed once a line ending was
 * consumed - confirm against the full macro body.
 */
115 #define EAT_NEWLINE(newline_code) \
116 if(this->c == '\r') { \
118 if(this->c == '\n') \
120 this->source_position.linenr++; \
122 } else if(this->c == '\n') { \
124 this->source_position.linenr++; \
/*
 * Lexes an identifier or keyword starting at the current character.
 *
 * The characters are collected on symbol_obstack (with trigraph handling
 * via replace_trigraph()), NUL-terminated and looked up/entered in the
 * symbol table with symbol_table_insert().
 *
 * Result in *token:
 *   - token->type is either the symbol's own ID or T_IDENTIFIER
 *   - token->v.symbol points to the symbol table entry
 *
 * If the table entry does not reference the string just built
 * (symbol->string != string), the entry was not created from this copy,
 * so the temporary string is released from the obstack again.
 */
129 void parse_symbol(lexer_t *this, token_t *token)
134 obstack_1grow(&symbol_obstack, this->c);
147 obstack_1grow(&symbol_obstack, this->c);
154 put_back(this, this->c);
159 if(replace_trigraph(this))
162 put_back(this, this->c);
171 obstack_1grow(&symbol_obstack, '\0');
173 string = obstack_finish(&symbol_obstack);
174 symbol = symbol_table_insert(string);
177 token->type = symbol->ID;
179 token->type = T_IDENTIFIER;
181 token->v.symbol = symbol;
183 if(symbol->string != string) {
184 obstack_free(&symbol_obstack, string);
/*
 * Lexes a preprocessor symbol: collects characters on symbol_obstack for
 * as long as is_ident_char() accepts them, NUL-terminates the result and
 * looks it up with preprocessor_symbol_table_find().
 *
 * The temporary string is only needed for the lookup and is freed from the
 * obstack right afterwards.
 *
 * Returns a preprocessor_token_type_t.
 * NOTE(review): presumably derived from the symbol that was found -
 * confirm how a failed lookup is reported.
 */
190 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
193 obstack_1grow(&symbol_obstack, this->c);
195 } while(is_ident_char(this->c));
196 obstack_1grow(&symbol_obstack, '\0');
198 char *string = obstack_finish(&symbol_obstack);
199 symbol_t *symbol = preprocessor_symbol_table_find(string);
200 obstack_free(&symbol_obstack, string);
/*
 * Parses the digits of a hexadecimal integer literal.
 *
 * Precondition (asserted): the current character is the 'x' or 'X' of the
 * prefix.
 *
 * If no hexadecimal digit (0-9, A-F, a-f) is found where one is required,
 * the error "premature end of hex number literal" is reported and
 * token->type is set to T_ERROR. Otherwise the digits are accumulated as
 * value = 16 * value + digit, and the token becomes T_INTEGER with
 * token->v.intvalue = value.
 *
 * NOTE(review): no overflow check is visible in the accumulation (see the
 * TODO in parse_number).
 */
210 void parse_number_hex(lexer_t *this, token_t *token)
212 assert(this->c == 'x' || this->c == 'X');
215 if (!isdigit(this->c) &&
216 !('A' <= this->c && this->c <= 'F') &&
217 !('a' <= this->c && this->c <= 'f')) {
218 parse_error(this, "premature end of hex number literal");
219 token->type = T_ERROR;
225 if (isdigit(this->c)) {
226 value = 16 * value + this->c - '0';
227 } else if ('A' <= this->c && this->c <= 'F') {
228 value = 16 * value + this->c - 'A' + 10;
229 } else if ('a' <= this->c && this->c <= 'f') {
230 value = 16 * value + this->c - 'a' + 10;
232 token->type = T_INTEGER;
233 token->v.intvalue = value;
/*
 * Parses the digits of an octal integer literal.
 *
 * Precondition (asserted): the current character is 'o' or 'O'.
 * Digits '0'..'7' are accumulated as value = 8 * value + digit; the token
 * becomes T_INTEGER with token->v.intvalue = value.
 *
 * NOTE(review): standard C octal literals have no 'o' marker after the
 * leading 0 - confirm that a "0o..." spelling is what this lexer intends
 * to accept.
 */
241 void parse_number_oct(lexer_t *this, token_t *token)
243 assert(this->c == 'o' || this->c == 'O');
248 if ('0' <= this->c && this->c <= '7') {
249 value = 8 * value + this->c - '0';
251 token->type = T_INTEGER;
252 token->v.intvalue = value;
/*
 * Parses a decimal integer literal.
 *
 * first_char - a digit that the caller has already consumed; it seeds the
 *              value (first_char - '0'). The assert requires it to be in
 *              '0'..'9' on the path that uses it.
 *
 * Following digits are accumulated as value = 10 * value + digit; the
 * token becomes T_INTEGER with token->v.intvalue = value.
 */
260 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
264 assert(first_char >= '0' && first_char <= '9');
265 value = first_char - '0';
269 if (isdigit(this->c)) {
270 value = 10 * value + this->c - '0';
272 token->type = T_INTEGER;
273 token->v.intvalue = value;
/*
 * Entry point for numeric literals; dispatches on the literal's prefix.
 *
 * If the literal starts with '0', the following character selects the
 * radix: 'x' -> parse_number_hex(), 'O' -> parse_number_oct(), anything
 * else -> parse_number_dec() seeded with the already consumed '0'.
 * Otherwise parse_number_dec() is called with first_char == 0.
 *
 * NOTE(review): parse_number_dec() asserts that first_char is a digit
 * character; make sure the first_char == 0 call below cannot reach that
 * assert.
 * NOTE(review): parse_number_hex()/parse_number_oct() accept both letter
 * cases in their asserts - confirm that 'X' and 'o' are dispatched too.
 */
281 void parse_number(lexer_t *this, token_t *token)
283 // TODO check for overflow
284 // TODO check for various invalid input sequences
286 if (this->c == '0') {
290 case 'x': parse_number_hex(this, token); break;
292 case 'O': parse_number_oct(this, token); break;
293 default: parse_number_dec(this, token, '0');
296 parse_number_dec(this, token, 0);
/*
 * Parses the part of an escape sequence that follows the backslash and
 * returns the character it denotes.
 *
 * Supported: the quote characters and the simple escapes a, b, f, n, r, t
 * and v. Hex ('x') and octal escapes are not implemented and only report
 * an error. A '?' may start a trigraph and is run through
 * replace_trigraph(). End of input and unrecognised escapes are reported
 * via parse_error().
 *
 * NOTE(review): "case 0 ... 8" is a GNU case-range extension and matches
 * the integer values 0..8, not the digit characters '0'..'7' - confirm
 * whether the character range was intended.
 */
301 int parse_escape_sequence(lexer_t *this)
308 case '"': return '"';
309 case '\'': return'\'';
313 case 'a': return '\a';
314 case 'b': return '\b';
315 case 'f': return '\f';
316 case 'n': return '\n';
317 case 'r': return '\r';
318 case 't': return '\t';
319 case 'v': return '\v';
320 case 'x': /* TODO parse hex number ... */
321 parse_error(this, "hex escape sequences not implemented yet");
323 case 0 ... 8: /* TODO parse octal number ... */
324 parse_error(this, "octal escape sequences not implemented yet");
330 /* might be a trigraph */
332 if(replace_trigraph(this)) {
335 put_back(this, this->c);
340 parse_error(this, "reached end of file while parsing escape sequence");
343 parse_error(this, "unknown escape sequence");
/*
 * Lexes a string literal. Precondition (asserted): the current character
 * is the opening '"'.
 *
 * The contents are built on symbol_obstack; escape sequences are decoded
 * with parse_escape_sequence(). If the input ends before the literal is
 * closed, "string has no end" is reported and token->type becomes T_ERROR.
 * The line on which the literal started is remembered in start_linenr.
 *
 * On success the string is NUL-terminated and interned in
 * this->stringset: strset_insert() returns the canonical copy, and if that
 * is not the string just built, the new copy is freed from the obstack.
 * The token becomes T_STRING_LITERAL with token->v.string pointing to the
 * canonical copy.
 */
350 void parse_string_literal(lexer_t *this, token_t *token)
352 unsigned start_linenr = this->source_position.linenr;
356 assert(this->c == '"');
362 obstack_1grow(&symbol_obstack, '?');
370 int c = parse_escape_sequence(this);
371 obstack_1grow(&symbol_obstack, c);
375 error_prefix_at(this, this->source_position.input_name,
377 fprintf(stderr, "string has no end\n");
378 token->type = T_ERROR;
386 obstack_1grow(&symbol_obstack, this->c);
394 /* TODO: concatenate multiple strings separated by whitespace... */
396 /* add finishing 0 to the string */
397 obstack_1grow(&symbol_obstack, '\0');
398 string = obstack_finish(&symbol_obstack);
400 /* check if there is already a copy of the string */
401 result = strset_insert(&this->stringset, string);
402 if(result != string) {
403 obstack_free(&symbol_obstack, string);
406 token->type = T_STRING_LITERAL;
407 token->v.string = result;
/*
 * MATCH_NEWLINE(code)
 *
 * Line-ending handling for use inside a character dispatch: each
 * recognised line-ending form increments this->source_position.linenr.
 * It expects a lexer_t pointer named `this` in scope.
 * NOTE(review): `code` is presumably run after a line ending was matched
 * (skip_multiline_comment passes "had_star = 0; break;") - confirm against
 * the full macro body.
 */
410 #define MATCH_NEWLINE(code) \
413 if(this->c == '\n') { \
416 this->source_position.linenr++; \
420 this->source_position.linenr++; \
/*
 * Lexes a character constant. Precondition (asserted): the current
 * character is the opening single quote.
 *
 * Error cases reported through parse_error():
 *   - a newline inside the constant
 *   - end of input inside the constant (token->type becomes T_ERROR)
 *   - more than one character between the quotes
 *
 * At the end_of_char_constant label the token becomes T_INTEGER with
 * token->v.intvalue set to the character found. The newline and
 * multi-character error paths jump to that label.
 */
424 void parse_character_constant(lexer_t *this, token_t *token)
426 assert(this->c == '\'');
444 parse_error(this, "newline while parsing character constant");
450 goto end_of_char_constant;
453 parse_error(this, "EOF while parsing character constant");
454 token->type = T_ERROR;
458 if(found_char != 0) {
459 parse_error(this, "more than 1 characters in character "
461 goto end_of_char_constant;
463 found_char = this->c;
470 end_of_char_constant:
471 token->type = T_INTEGER;
472 token->v.intvalue = found_char;
/*
 * Skips the body of a block comment. The line on which the comment started
 * is remembered in start_linenr.
 *
 * While scanning, trigraphs are resolved with replace_trigraph() and line
 * endings are counted via MATCH_NEWLINE (which also clears had_star).
 * Reaching the end of input before the comment is closed is reported as
 * "at end of file while looking for comment end".
 */
476 void skip_multiline_comment(lexer_t *this)
478 unsigned start_linenr = this->source_position.linenr;
509 if(replace_trigraph(this))
511 put_back(this, this->c);
514 /* we don't put back the 2nd ? as the comment text is discarded
518 MATCH_NEWLINE(had_star = 0; break;)
521 error_prefix_at(this, this->source_position.input_name,
523 fprintf(stderr, "at end of file while looking for comment end\n");
/*
 * Skips the rest of a line comment. Trigraphs are resolved with
 * replace_trigraph() while scanning, and the line counter is incremented
 * when the terminating '\n' is seen.
 */
534 void skip_line_comment(lexer_t *this)
543 if(replace_trigraph(this))
546 /* we don't put back the 2nd ? as the comment text is discarded
552 if(this->c == '\n') {
554 this->source_position.linenr++;
/*
 * Handles a preprocessor directive line. The characters of the line up to
 * the next '\n' are echoed to stdout, then lexing continues by fetching
 * the next regular token into *result_token via lexer_next_token().
 *
 * NOTE(review): the loop condition only tests for '\n'; confirm that a
 * directive on the last line of a file without a trailing newline cannot
 * loop forever.
 */
571 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
574 while(this->c != '\n') {
575 printf("%c", this->c);
580 lexer_next_token(this, result_token);
/*
 * Fetches the next token in preprocessor context: skips blanks (' ',
 * '\t', '\r' - but not '\n') and lexes a symbol into *token with
 * parse_symbol().
 */
583 void preprocessor_next_token(lexer_t *this, token_t *token)
585 /* skip whitespace */
586 while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
594 parse_symbol(this, token);
/*
 * Helper macros for lexing multi-character operators in
 * lexer_next_token(). They expect `this` (lexer_t *) and `token`
 * (token_t *) in scope.
 *
 *   MAYBE_PROLOG        - opens a lookahead sequence
 *   MAYBE(ch, set_type) - sets token->type = set_type for the alternative
 *                         selected by ch
 *   ELSE_CODE(code)     - fallback alternative; contains a while(1) loop
 *                         and line-ending handling via EAT_NEWLINE(break;)
 *   ELSE(set_type)      - fallback that sets token->type = set_type
 *
 * NOTE(review): the exact lookahead/put-back protocol is defined by the
 * full macro bodies - confirm before relying on this summary.
 */
598 #define MAYBE_PROLOG \
603 #define MAYBE(ch, set_type) \
606 token->type = set_type; \
609 #define ELSE_CODE(code) \
616 EAT_NEWLINE(break;) \
621 } /* end of while(1) */ \
624 #define ELSE(set_type) \
626 token->type = set_type; \
/*
 * Skips whitespace and comments in front of the next token.
 *
 * A '\n' increments the line counter. After a block or line comment has
 * been skipped (skip_multiline_comment() / skip_line_comment()) the
 * function calls itself again to consume whatever whitespace follows.
 * A lookahead character is returned to the input with put_back().
 */
631 void eat_whitespace(lexer_t *this)
646 if(this->c == '\n') {
648 this->source_position.linenr++;
652 put_back(this, this->c);
666 skip_multiline_comment(this);
667 eat_whitespace(this);
671 skip_line_comment(this);
672 eat_whitespace(this);
/*
 * Main lexer entry point: reads the next token from the input into *token.
 *
 * Overview of the dispatch:
 *   - leading whitespace is consumed with eat_whitespace()
 *   - preprocessor directives go to parse_preprocessor_directive()
 *   - identifiers/keywords, numbers, string literals and character
 *     constants go to parse_symbol(), parse_number(),
 *     parse_string_literal() and parse_character_constant()
 *   - a backslash followed by '\n' only bumps the line counter; any other
 *     backslash yields "unexpected '\\' found" and T_ERROR
 *   - operators are recognised with the MAYBE/ELSE macros, including
 *     compound assignments, "..." and the digraph-style tokens
 *     (T_LESSCOLON, T_LESSPERCENT, T_PERCENTGREATER, T_COLONGREATER,
 *     T_PERCENTCOLON, T_PERCENTCOLONPERCENTCOLON)
 *   - after a comment has been skipped the function calls itself to
 *     deliver the token that follows
 *   - '?' may introduce a trigraph, resolved through replace_trigraph()
 *   - some single characters are their own token: token->type = this->c
 *   - anything else is reported as "unknown character" with T_ERROR
 */
698 void lexer_next_token(lexer_t *this, token_t *token)
708 eat_whitespace(this);
711 parse_preprocessor_directive(this, token);
720 parse_symbol(this, token);
724 parse_number(this, token);
728 parse_string_literal(this, token);
732 parse_character_constant(this, token);
737 if(this->c == '\n') {
739 this->source_position.linenr++;
742 parse_error(this, "unexpected '\\' found");
743 token->type = T_ERROR;
751 MAYBE('.', T_DOTDOTDOT)
753 put_back(this, this->c);
762 MAYBE('=', T_ANDEQUAL)
766 MAYBE('=', T_ASTERISKEQUAL)
770 MAYBE('+', T_PLUSPLUS)
771 MAYBE('=', T_PLUSEQUAL)
775 MAYBE('-', T_MINUSMINUS)
776 MAYBE('=', T_MINUSEQUAL)
780 MAYBE('=', T_EXCLAMATIONMARKEQUAL)
784 MAYBE('=', T_SLASHEQUAL)
787 skip_multiline_comment(this);
788 lexer_next_token(this, token);
792 skip_line_comment(this);
793 lexer_next_token(this, token);
798 MAYBE('>', T_PERCENTGREATER)
799 MAYBE('=', T_PERCENTEQUAL)
804 MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
806 put_back(this, this->c);
808 token->type = T_PERCENTCOLON;
815 MAYBE(':', T_LESSCOLON)
816 MAYBE('%', T_LESSPERCENT)
819 MAYBE('=', T_LESSLESSEQUAL)
826 MAYBE('=', T_GREATERGREATEREQUAL)
827 ELSE(T_GREATERGREATER)
831 MAYBE('=', T_CARETEQUAL)
835 MAYBE('=', T_PIPEEQUAL)
836 MAYBE('|', T_PIPEPIPE)
840 MAYBE('>', T_COLONGREATER)
844 MAYBE('=', T_EQUALEQUAL)
848 MAYBE('#', T_HASHHASH)
853 /* just a simple ? */
858 /* might be a trigraph */
860 if(replace_trigraph(this)) {
863 put_back(this, this->c);
877 token->type = this->c;
888 fprintf(stderr, "unknown character '%c' found\n", this->c);
889 token->type = T_ERROR;
/*
 * Initialises a lexer for reading from an already opened stream.
 *
 * this       - lexer to initialise; the whole struct is zeroed first
 * stream     - input stream the lexer reads from (stored, not opened here)
 * input_name - name used for source positions; only the pointer is stored,
 *              so it must stay valid while the lexer is in use
 *
 * The line counter starts at 0 and the string set used for interning
 * string literals is initialised.
 */
895 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
897 memset(this, 0, sizeof(this[0]));
899 this->input = stream;
901 this->source_position.linenr = 0;
902 this->source_position.input_name = input_name;
903 strset_init(&this->stringset);
905 /* we place a virtual '\n' at the beginning so the lexer knows we're at the
906 * beginning of a line */
/*
 * Tears down a lexer.
 * NOTE(review): presumably the counterpart of lexer_init() (e.g. for the
 * string set initialised there) - confirm against the function body.
 */
910 void lexer_destroy(lexer_t *this)
/*
 * Debug helper: prints a source position as "<input_name>:<linenr>"
 * followed by a newline to stdout. Marked unused so that the compiler does
 * not warn when nothing calls it.
 *
 * NOTE(review): linenr is formatted with %d; other code in this file
 * copies source_position.linenr into an unsigned variable, so %u may be
 * the matching specifier - confirm the field's type.
 */
915 static __attribute__((unused))
916 void dbg_pos(const source_position_t source_position)
918 fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);