7 #include "lang_features.h"
8 #include "diagnostic.h"
9 #include "string_rep.h"
/*
 * One preprocessor macro definition.
 *
 * NOTE(review): only part of the struct is visible in this listing (there is
 * no closing brace). Code further down also accesses expand_pos, list_len
 * and is_variadic through this type, so those members presumably sit on
 * lines that are not shown - confirm against the complete file.
 */
20 struct pp_definition_t {
/* position stored by parse_define_directive() (copied from input_position) */
22 source_position_t source_position;
/* definition that was active when expansion of this one started;
 * read and cleared by expand_next() once the replacement list is used up */
23 pp_definition_t *parent_expansion;
/* set while the macro is being expanded; expand_next() does not start a
 * nested expansion for a definition that already has this flag set */
26 bool is_expanding : 1;
27 size_t argument_count;
/* tokens the macro expands to; built on pp_obstack by parse_define_directive() */
30 token_t *replacement_list;
/* Input buffer. next_real_char() reads to buf + MAX_PUTBACK, which leaves
 * room in front of the data for put_back() to step bufpos backwards. */
36 static char buf[1024 + MAX_PUTBACK];
/* one past the last byte read into buf */
37 static const char *bufend;
/* current read position inside buf */
38 static const char *bufpos;
/* when true, parse_string_literal() decodes escape sequences via
 * parse_escape_sequence() instead of storing them as written */
39 static bool resolve_escape_sequences = false;
/* switched off while a directive is parsed and back on afterwards
 * (see parse_preprocessing_directive) */
40 static bool print_spaces = true;
/* holds macro definitions and their replacement lists */
42 static struct obstack pp_obstack;
/* pending newlines / spaces that emit_pp_token() prints before a token */
43 static unsigned counted_newlines;
44 static unsigned counted_spaces;
/* current input position; copied into pp_token.source_position */
45 static source_position_t input_position;
/* input name written by the last print_line_directive() call */
46 static const char *printed_input_name = NULL;
/* macro currently being expanded; NULL when tokens are read from the input */
47 static pp_definition_t *current_expansion = NULL;
/* gates macro expansion in parse_symbol(); switched off while a directive
 * is parsed */
48 static bool do_expansions;
/* Forward declaration: expand_next() calls this before its definition. */
50 static void next_preprocessing_token(void);
53 * Prints a parse error message at the current token.
55 * @param msg the error message
57 static void parse_error(const char *msg)
59 errorf(&pp_token.source_position, "%s", msg);
/*
 * Refills the read buffer once bufpos has reached bufend: fread() stores up
 * to sizeof(buf) - MAX_PUTBACK bytes at buf + MAX_PUTBACK, then bufpos and
 * bufend are reset so that they delimit the s bytes just read.
 *
 * NOTE(review): the rest of the fread() call, whatever happens between the
 * read and the pointer reset, and the code after the refill are not part of
 * this listing.
 */
62 static inline void next_real_char(void)
64 assert(bufpos <= bufend);
65 if (bufpos >= bufend) {
66 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
72 bufpos = buf + MAX_PUTBACK;
73 bufend = buf + MAX_PUTBACK + s;
/*
 * Stores pc in the byte just before the current read position and moves
 * bufpos back onto it, so the character is read again next.
 *
 * NOTE(review): the printf below looks like debug output; any conditional
 * compilation around it is not visible in this listing.
 */
79 * Put a character back into the buffer.
81 * @param pc the character to put back
83 static inline void put_back(int pc)
/* bufpos is declared const char *; "- buf + buf" re-derives the same address
 * from the non-const array buf so the byte can be written without a cast */
86 *(--bufpos - buf + buf) = (char) pc;
89 printf("putback '%c'\n", pc);
/* Forward declaration: the eat() macro below refers to next_char() before
 * the function is defined. */
93 static inline void next_char(void);
/*
 * Helper macro taking a statement fragment `code` (maybe_concat_lines()
 * passes `return;`). Both visible body lines increment
 * input_position.linenr.
 *
 * NOTE(review): the remaining body lines, including whatever selects between
 * the two increments and where `code` is placed, are not part of this
 * listing.
 */
95 #define MATCH_NEWLINE(code) \
101 ++input_position.linenr; \
105 ++input_position.linenr; \
/**
 * Consumes the current input character: asserts that the lookahead `c`
 * equals c_type and then advances with next_char().
 *
 * The argument is parenthesized so that an expression argument (for example
 * a conditional expression) is compared as a whole.
 */
#define eat(c_type) do { assert(c == (c_type)); next_char(); } while(0)
/*
 * Called by next_char() after a backslash (also one produced by a trigraph).
 * The visible body only shows MATCH_NEWLINE with `return;` as its code
 * argument.
 *
 * NOTE(review): the surrounding statements are not part of this listing.
 */
110 static void maybe_concat_lines(void)
115 MATCH_NEWLINE(return;)
/*
 * Advances the lookahead character `c`, handling backslashes and '?'
 * specially. The case table below replaces the third character of a
 * trigraph with the character it stands for.
 *
 * NOTE(review): the reads of the next raw character, the enclosing switch
 * header and the end_of_next_char label are not part of this listing; the
 * printf near the end looks like debug output.
 */
126 * Set c to the next input character, ie.
127 * after expanding trigraphs.
129 static inline void next_char(void)
133 /* filter trigraphs and concatenated lines */
134 if(UNLIKELY(c == '\\')) {
135 maybe_concat_lines();
136 goto end_of_next_char;
140 goto end_of_next_char;
/* common case: not a '?', so no trigraph can start here */
143 if(LIKELY(c != '?')) {
146 goto end_of_next_char;
151 case '=': c = '#'; break;
152 case '(': c = '['; break;
/* a backslash produced by a trigraph gets the same treatment as a literal one */
153 case '/': c = '\\'; maybe_concat_lines(); break;
154 case ')': c = ']'; break;
155 case '\'': c = '^'; break;
156 case '<': c = '{'; break;
157 case '!': c = '|'; break;
158 case '>': c = '}'; break;
159 case '-': c = '~'; break;
169 printf("nchar '%c'\n", c);
/**
 * Returns true if the given char is an octal digit.
 *
 * @param chr   the character to check
 * @return      true for '0' through '7', false for everything else
 *              (including EOF)
 */
static inline bool is_octal_digit(int chr)
{
	/* the C standard guarantees that '0'..'9' have consecutive values */
	return chr >= '0' && chr <= '7';
}
/**
 * Returns the value of a digit.
 * The only portable way to do it ...
 *
 * Letters are not guaranteed to be consecutive in the execution character
 * set, hence the explicit table instead of arithmetic on the character.
 *
 * @param digit   a decimal digit or a hexadecimal letter in either case
 * @return        the numeric value (0-15); any other input ends in panic()
 */
static int digit_value(int digit) {
	switch (digit) {
	case '0': return 0;
	case '1': return 1;
	case '2': return 2;
	case '3': return 3;
	case '4': return 4;
	case '5': return 5;
	case '6': return 6;
	case '7': return 7;
	case '8': return 8;
	case '9': return 9;
	case 'a':
	case 'A': return 10;
	case 'b':
	case 'B': return 11;
	case 'c':
	case 'C': return 12;
	case 'd':
	case 'D': return 13;
	case 'e':
	case 'E': return 14;
	case 'f':
	case 'F': return 15;
	default:
		panic("wrong character given");
	}
}
/*
 * Builds the value of an octal escape from first_digit plus up to two more
 * octal digits taken from the lookahead `c`, and returns it narrowed to a
 * char-sized type.
 *
 * NOTE(review): the calls that advance `c` between the digit checks and the
 * condition choosing between the signed and unsigned return are not part of
 * this listing.
 */
231 * Parses an octal character sequence.
233 * @param first_digit the already read first digit
235 static int parse_octal_sequence(const int first_digit)
237 assert(is_octal_digit(first_digit));
238 int value = digit_value(first_digit);
/* a non-octal lookahead ends the sequence early */
239 if (!is_octal_digit(c)) return value;
240 value = 8 * value + digit_value(c);
242 if (!is_octal_digit(c)) return value;
243 value = 8 * value + digit_value(c);
/* only reached after three digits: the result is truncated to char width */
247 return (signed char) value;
249 return (unsigned char) value;
/*
 * Accumulates hexadecimal digits from the lookahead `c` (base 16, via
 * digit_value) and returns the result narrowed to a char-sized type.
 *
 * NOTE(review): the loop around the accumulation, the declaration of
 * `value` and the condition choosing between the two returns are not part
 * of this listing.
 */
254 * Parses a hex character sequence.
256 static int parse_hex_sequence(void)
260 value = 16 * value + digit_value(c);
265 return (signed char) value;
267 return (unsigned char) value;
/*
 * Translates the escape sequence at the current input position into the
 * character it denotes. Simple escapes map through the table below; hex and
 * octal forms are delegated to parse_hex_sequence() and
 * parse_octal_sequence(ec). End of file and unrecognized escapes are
 * reported through parse_error().
 *
 * NOTE(review): the switch header, the case labels for the hex/octal/EOF/
 * default arms and the values returned after the error reports are not part
 * of this listing.
 */
272 * Parse an escape sequence.
274 static int parse_escape_sequence(void)
282 case '"': return '"';
283 case '\'': return '\'';
284 case '\\': return '\\';
285 case '?': return '\?';
286 case 'a': return '\a';
287 case 'b': return '\b';
288 case 'f': return '\f';
289 case 'n': return '\n';
290 case 'r': return '\r';
291 case 't': return '\t';
292 case 'v': return '\v';
294 return parse_hex_sequence();
303 return parse_octal_sequence(ec);
305 parse_error("reached end of file while parsing escape sequence");
308 parse_error("unknown escape sequence");
/*
 * Reads a string literal into symbol_obstack and stores it in pp_token as a
 * TP_STRING_LITERAL. An unterminated string is reported at the line where
 * the literal started and yields TP_ERROR.
 *
 * NOTE(review): the loop, its case labels and the early-return paths are not
 * part of this listing.
 */
313 static void parse_string_literal(void)
/* remembered so an unterminated string is reported where it began */
315 const unsigned start_linenr = input_position.linenr;
/* with resolve_escape_sequences the decoded character is stored ... */
323 if(resolve_escape_sequences) {
324 tc = parse_escape_sequence();
325 obstack_1grow(&symbol_obstack, (char) tc);
/* ... otherwise the characters are stored as they appear in the input */
327 obstack_1grow(&symbol_obstack, (char) c);
329 obstack_1grow(&symbol_obstack, (char) c);
335 source_position_t source_position;
336 source_position.input_name = pp_token.source_position.input_name;
337 source_position.linenr = start_linenr;
338 errorf(&source_position, "string has no end");
339 pp_token.type = TP_ERROR;
348 obstack_1grow(&symbol_obstack, (char) c);
355 /* add finishing 0 to the string */
356 obstack_1grow(&symbol_obstack, '\0');
/* measured after the terminator was appended, so size includes the '\0' */
357 const size_t size = (size_t)obstack_object_size(&symbol_obstack);
358 const char *const string = obstack_finish(&symbol_obstack);
/* disabled: de-duplication of identical strings through a string set */
360 #if 0 /* TODO hash */
361 /* check if there is already a copy of the string */
362 result = strset_insert(&stringset, string);
363 if(result != string) {
364 obstack_free(&symbol_obstack, string);
367 const char *const result = string;
370 pp_token.type = TP_STRING_LITERAL;
371 pp_token.v.string.begin = result;
372 pp_token.v.string.size = size;
/*
 * Reads a wide character constant and sets pp_token.type to
 * TP_WIDE_CHARACTER_CONSTANT, or to TP_ERROR when the input ends inside the
 * constant. A newline inside the constant and more than one character are
 * reported through parse_error().
 *
 * NOTE(review): the loop, its case labels and the handling of ordinary
 * characters are not part of this listing; the visible lines do not show
 * found_char being stored into pp_token.
 */
375 static void parse_wide_character_constant(void)
383 found_char = parse_escape_sequence();
387 parse_error("newline while parsing character constant");
393 goto end_of_wide_char_constant;
396 parse_error("EOF while parsing character constant");
397 pp_token.type = TP_ERROR;
/* a second character after one has already been recorded is an error */
401 if(found_char != 0) {
402 parse_error("more than 1 characters in character "
404 goto end_of_wide_char_constant;
413 end_of_wide_char_constant:
414 pp_token.type = TP_WIDE_CHARACTER_CONSTANT;
/*
 * Reads a wide string literal into symbol_obstack as a sequence of
 * wchar_rep_t values and stores it in pp_token as a TP_WIDE_STRING_LITERAL.
 * An unterminated string is reported at the line where the literal started
 * and yields TP_ERROR.
 *
 * NOTE(review): the loop, its case labels, the declaration of `size` and
 * the early-return paths are not part of this listing.
 */
418 static void parse_wide_string_literal(void)
/* remembered so an unterminated string is reported where it began */
420 const unsigned start_linenr = input_position.linenr;
/* escape sequences are always decoded here */
428 wchar_rep_t tc = parse_escape_sequence();
429 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
434 source_position_t source_position;
435 source_position.input_name = pp_token.source_position.input_name;
436 source_position.linenr = start_linenr;
437 errorf(&source_position, "string has no end");
438 pp_token.type = TP_ERROR;
448 obstack_grow(&symbol_obstack, &tc, sizeof(tc));
456 /* add finishing 0 to the string */
457 static const wchar_rep_t nul = L'\0';
458 obstack_grow(&symbol_obstack, &nul, sizeof(nul));
/* element count rather than byte count; taken after the terminator was added */
461 = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
462 const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
/* disabled: de-duplication of identical strings through a string set */
464 #if 0 /* TODO hash */
465 /* check if there is already a copy of the string */
466 const wchar_rep_t *const result = strset_insert(&stringset, string);
467 if(result != string) {
468 obstack_free(&symbol_obstack, string);
471 const wchar_rep_t *const result = string;
474 pp_token.type = TP_WIDE_STRING_LITERAL;
475 pp_token.v.wide_string.begin = result;
476 pp_token.v.wide_string.size = size;
/*
 * Reads a character constant into symbol_obstack and stores it in pp_token
 * as a TP_CHARACTER_CONSTANT whose string value holds the collected
 * characters. End of input inside the constant is reported at the line
 * where the constant started and yields TP_ERROR.
 *
 * NOTE(review): the loop and its case labels are not part of this listing.
 * No terminating '\0' is appended in the visible lines, unlike
 * parse_string_literal().
 */
479 static void parse_character_constant(void)
481 const unsigned start_linenr = input_position.linenr;
/* escape sequences are decoded before being stored */
489 tc = parse_escape_sequence();
490 obstack_1grow(&symbol_obstack, (char) tc);
494 parse_error("newline while parsing character constant");
499 source_position_t source_position;
500 source_position.input_name = pp_token.source_position.input_name;
501 source_position.linenr = start_linenr;
502 errorf(&source_position, "EOF while parsing character constant");
503 pp_token.type = TP_ERROR;
509 goto end_of_char_constant;
512 obstack_1grow(&symbol_obstack, (char) c);
/* the empty statement after the label allows a declaration to follow it */
519 end_of_char_constant:;
520 const size_t size = (size_t)obstack_object_size(&symbol_obstack);
521 const char *const string = obstack_finish(&symbol_obstack);
523 pp_token.type = TP_CHARACTER_CONSTANT;
524 pp_token.v.string.begin = string;
525 pp_token.v.string.size = size;
/*
 * Two macros, SYMBOL_CHARS_WITHOUT_E_P and SYMBOL_CHARS; the visible line
 * of SYMBOL_CHARS starts with SYMBOL_CHARS_WITHOUT_E_P. parse_number() uses
 * SYMBOL_CHARS_WITHOUT_E_P in its character dispatch.
 *
 * NOTE(review): the bodies of both macros are not part of this listing.
 */
528 #define SYMBOL_CHARS_WITHOUT_E_P \
579 #define SYMBOL_CHARS \
580 SYMBOL_CHARS_WITHOUT_E_P \
/*
 * Delivers the next token of the macro expansion in progress
 * (current_expansion must be non-NULL) by copying it from the definition's
 * replacement list into pp_token.
 *
 * NOTE(review): several lines, including what happens with `parent`, the
 * returns and the loop structure, are not part of this listing.
 */
599 * returns next final token from a preprocessor macro expansion
601 static void expand_next(void)
603 assert(current_expansion != NULL);
605 pp_definition_t *definition = current_expansion;
/* an empty list, or a position at or past its end, means this macro is done */
608 if(definition->list_len == 0
609 || definition->expand_pos >= definition->list_len) {
610 /* we're finished with the current macro, move up 1 level in the
612 pp_definition_t *parent = definition->parent_expansion;
613 definition->parent_expansion = NULL;
614 definition->is_expanding = false;
616 current_expansion = NULL;
617 next_preprocessing_token();
621 current_expansion = definition;
624 pp_token = definition->replacement_list[definition->expand_pos];
625 ++definition->expand_pos;
/* only identifiers can name a further macro */
627 if(pp_token.type != TP_IDENTIFIER)
630 pp_definition_t *symbol_definition = pp_token.v.symbol->pp_definition;
/* a definition that is already being expanded is not entered again */
631 if(symbol_definition != NULL && !symbol_definition->is_expanding) {
632 symbol_definition->parent_expansion = definition;
633 symbol_definition->expand_pos = 0;
634 symbol_definition->is_expanding = true;
635 definition = symbol_definition;
636 current_expansion = definition;
/*
 * Reads an identifier into symbol_obstack, enters it into the symbol table
 * and stores it in pp_token. A lone "L" directly followed by a quote is
 * handed to the wide string / wide character parsers instead. When
 * do_expansions is set and the symbol has a macro definition, expansion of
 * that definition is started.
 *
 * NOTE(review): the character-collecting loop and the returns after the
 * wide-literal branches are not part of this listing.
 */
641 static void parse_symbol(void)
643 obstack_1grow(&symbol_obstack, (char) c);
650 obstack_1grow(&symbol_obstack, (char) c);
660 obstack_1grow(&symbol_obstack, '\0');
661 char *string = obstack_finish(&symbol_obstack);
663 /* might be a wide string or character constant ( L"string"/L'c' ) */
664 if(c == '"' && string[0] == 'L' && string[1] == '\0') {
665 obstack_free(&symbol_obstack, string);
666 parse_wide_string_literal();
668 } else if(c == '\'' && string[0] == 'L' && string[1] == '\0') {
669 obstack_free(&symbol_obstack, string);
670 parse_wide_character_constant();
674 symbol_t *symbol = symbol_table_insert(string);
676 pp_token.type = symbol->pp_ID;
677 pp_token.v.symbol = symbol;
679 /* we can free the memory from symbol obstack if we already had an entry in
680 * the symbol table */
681 if(symbol->string != string) {
682 obstack_free(&symbol_obstack, string);
685 pp_definition_t *pp_definition = symbol->pp_definition;
686 if(do_expansions && pp_definition != NULL) {
687 pp_definition->expand_pos = 0;
/* NOTE(review): the next line ends in ',' rather than ';'. The comma
 * operator joins it with the assignment that follows, so both still run in
 * order, but a ';' was presumably intended. */
688 pp_definition->is_expanding = true,
689 current_expansion = pp_definition;
/*
 * Collects a preprocessing number into symbol_obstack and stores it in
 * pp_token as a TP_NUMBER with the text as its string value. A '+' or '-'
 * is accepted at one point in the scan (presumably right after an exponent
 * character - the enclosing case labels are not visible).
 *
 * NOTE(review): the loop and its case labels are not part of this listing.
 */
694 static void parse_number(void)
696 obstack_1grow(&symbol_obstack, (char) c);
703 SYMBOL_CHARS_WITHOUT_E_P
704 obstack_1grow(&symbol_obstack, (char) c);
712 obstack_1grow(&symbol_obstack, (char) c);
714 if(c == '+' || c == '-') {
715 obstack_1grow(&symbol_obstack, (char) c);
726 obstack_1grow(&symbol_obstack, '\0');
/* measured after the terminator was appended, so size includes the '\0' */
727 size_t size = obstack_object_size(&symbol_obstack);
728 char *string = obstack_finish(&symbol_obstack);
730 pp_token.type = TP_NUMBER;
731 pp_token.v.string.begin = string;
732 pp_token.v.string.size = size;
/*
 * Skips a multi-line comment. If the input ends before the comment is
 * closed, an error is reported at the line where the comment started.
 *
 * NOTE(review): the scanning loop itself is not part of this listing.
 */
735 static void skip_multiline_comment(void)
737 unsigned start_linenr = input_position.linenr;
744 /* TODO: nested comment, warn here */
764 source_position_t source_position;
765 source_position.input_name = pp_token.source_position.input_name;
766 source_position.linenr = start_linenr;
767 errorf(&source_position, "at end of file while looking for comment end");
/* NOTE(review): only the signature is visible in this listing; judging by
 * the name it skips a line comment - confirm against the full file. */
778 static void skip_line_comment(void)
/*
 * Helper macros for next_preprocessing_token(): MAYBE_PROLOG,
 * MAYBE(ch, set_type), ELSE_CODE(code) and ELSE(set_type). The visible
 * lines show MAYBE and ELSE assigning set_type to pp_token.type, and
 * ELSE_CODE closing a while(1) loop.
 *
 * NOTE(review): most of each macro body is not part of this listing.
 */
798 #define MAYBE_PROLOG \
803 #define MAYBE(ch, set_type) \
806 pp_token.type = set_type; \
809 #define ELSE_CODE(code) \
813 } /* end of while(1) */ \
816 #define ELSE(set_type) \
818 pp_token.type = set_type; \
/*
 * Produces the next preprocessing token in pp_token. A macro expansion in
 * progress (current_expansion != NULL) is checked first; otherwise the
 * token position is taken from input_position and the token is lexed from
 * the input. Multi-character punctuators are recognized with the MAYBE /
 * ELSE macros; end of input yields TP_EOF and an unrecognized character is
 * reported and yields TP_ERROR.
 *
 * NOTE(review): the switch header, all case labels, the MAYBE_PROLOG /
 * ELSE lines for most operators and the whitespace handling are not part of
 * this listing.
 */
822 static void next_preprocessing_token(void)
824 if(current_expansion != NULL) {
829 pp_token.source_position = input_position;
843 pp_token.type = '\n';
856 parse_string_literal();
860 parse_character_constant();
882 MAYBE('.', TP_DOTDOTDOT)
892 MAYBE('&', TP_ANDAND)
893 MAYBE('=', TP_ANDEQUAL)
897 MAYBE('=', TP_ASTERISKEQUAL)
901 MAYBE('+', TP_PLUSPLUS)
902 MAYBE('=', TP_PLUSEQUAL)
906 MAYBE('>', TP_MINUSGREATER)
907 MAYBE('-', TP_MINUSMINUS)
908 MAYBE('=', TP_MINUSEQUAL)
912 MAYBE('=', TP_EXCLAMATIONMARKEQUAL)
916 MAYBE('=', TP_SLASHEQUAL)
919 skip_multiline_comment();
933 MAYBE('=', TP_PERCENTEQUAL)
/* both this ':' alternative and the '#' one further down yield TP_HASHHASH */
938 MAYBE(':', TP_HASHHASH)
951 MAYBE('=', TP_LESSEQUAL)
954 MAYBE('=', TP_LESSLESSEQUAL)
959 MAYBE('=', TP_GREATEREQUAL)
962 MAYBE('=', TP_GREATERGREATEREQUAL)
963 ELSE(TP_GREATERGREATER)
967 MAYBE('=', TP_CARETEQUAL)
971 MAYBE('=', TP_PIPEEQUAL)
972 MAYBE('|', TP_PIPEPIPE)
980 MAYBE('=', TP_EQUALEQUAL)
984 MAYBE('#', TP_HASHHASH)
1003 pp_token.type = TP_EOF;
1008 errorf(&pp_token.source_position, "unknown character '%c' found\n", c);
1009 pp_token.type = TP_ERROR;
/*
 * Writes `string` to `out` with special characters replaced by their C
 * escape sequences; the fprintf at the end prints a character as a
 * three-digit octal escape.
 *
 * NOTE(review): the switch header, the condition selecting the octal
 * fallback, the plain-character path and any surrounding quote output are
 * not part of this listing.
 */
1014 static void print_quoted_string(const char *const string)
1017 for (const char *c = string; *c != 0; ++c) {
1019 case '"': fputs("\\\"", out); break;
1020 case '\\': fputs("\\\\", out); break;
1021 case '\a': fputs("\\a", out); break;
1022 case '\b': fputs("\\b", out); break;
1023 case '\f': fputs("\\f", out); break;
1024 case '\n': fputs("\\n", out); break;
1025 case '\r': fputs("\\r", out); break;
1026 case '\t': fputs("\\t", out); break;
1027 case '\v': fputs("\\v", out); break;
1028 case '\?': fputs("\\?", out); break;
/* NOTE(review): *c is a plain char. If char is signed and the byte is
 * >= 0x80, the promoted value is negative and %o prints far more than three
 * digits; a cast to unsigned char would keep the escape at three digits. */
1031 fprintf(out, "\\%03o", *c);
/*
 * Prints a "# <line> <quoted input name>" line directive for `pos` to `out`
 * and records pos->input_name in printed_input_name, which emit_pp_token()
 * compares against to decide whether a new directive is needed.
 *
 * NOTE(review): at least one line between the quoted name and the
 * assignment is not part of this listing. linenr is read into `unsigned`
 * variables elsewhere in this file; if the member is unsigned, "%u" would
 * be the matching conversion - confirm the member type.
 */
1041 static void print_line_directive(const source_position_t *pos)
1043 fprintf(out, "# %d ", pos->linenr);
1044 print_quoted_string(pos->input_name);
1047 printed_input_name = pos->input_name;
/* Set to true by emit_pp_token(); checked there before a line directive is
 * printed after a long run of newlines. */
1050 static bool had_non_space = false;
/*
 * Prints the current pp_token to `out`. A change of input name triggers a
 * line directive. Otherwise pending newlines are either replayed one by
 * one or, from nine upwards, replaced by a fresh line directive; pending
 * spaces are replayed before the token text.
 *
 * NOTE(review): the loop bodies, several branch boundaries and the case
 * labels of the final switch are not part of this listing.
 */
1052 static void emit_pp_token(void)
1054 if (printed_input_name != pp_token.source_position.input_name) {
1055 print_line_directive(&pp_token.source_position);
1056 } else if (pp_token.type != '\n') {
/* long gaps are summarized by a line directive instead of blank lines */
1057 if (counted_newlines >= 9) {
1058 if (had_non_space) {
1061 print_line_directive(&pp_token.source_position);
1062 counted_newlines = 0;
1064 for (unsigned i = 0; i < counted_newlines; ++i)
1066 counted_newlines = 0;
1068 for (unsigned i = 0; i < counted_spaces; ++i)
1071 had_non_space = true;
1074 switch(pp_token.type) {
1076 fputs(pp_token.v.symbol->string, out);
1079 fputs(pp_token.v.string.begin, out);
1081 case TP_STRING_LITERAL:
1083 fputs(pp_token.v.string.begin, out);
/* every other token type is printed through its type */
1089 print_pp_token_type(out, pp_token.type);
1094 static void eat_pp(preprocessor_token_type_t type)
1097 assert(pp_token.type == type);
1098 next_preprocessing_token();
1101 static void eat_pp_directive(void)
1103 while(pp_token.type != '\n' && pp_token.type != TP_EOF) {
1104 next_preprocessing_token();
1108 static bool strings_equal(const string_t *string1, const string_t *string2)
1110 size_t size = string1->size;
1111 if(size != string2->size)
1114 const char *c1 = string1->begin;
1115 const char *c2 = string2->begin;
1116 for(size_t i = 0; i < size; ++i, ++c1, ++c2) {
1123 static bool wide_strings_equal(const wide_string_t *string1,
1124 const wide_string_t *string2)
1126 size_t size = string1->size;
1127 if(size != string2->size)
1130 const wchar_rep_t *c1 = string1->begin;
1131 const wchar_rep_t *c2 = string2->begin;
1132 for(size_t i = 0; i < size; ++i, ++c1, ++c2) {
/*
 * Compares two preprocessing tokens. Tokens of different types are never
 * equal; for equal types the comparison depends on the payload: symbol
 * tokens compare the symbol pointers, character constants and string
 * literals compare their strings, and the wide variants compare their wide
 * strings.
 *
 * NOTE(review): the returns, the case labels in front of the symbol
 * comparison and the default arm are not part of this listing.
 */
1139 static bool pp_tokens_equal(const token_t *token1, const token_t *token2)
1141 if(token1->type != token2->type)
1144 switch(token1->type) {
1149 return token1->v.symbol == token2->v.symbol;
1151 case TP_CHARACTER_CONSTANT:
1152 case TP_STRING_LITERAL:
1153 return strings_equal(&token1->v.string, &token2->v.string);
1155 case TP_WIDE_CHARACTER_CONSTANT:
1156 case TP_WIDE_STRING_LITERAL:
1157 return wide_strings_equal(&token1->v.wide_string,
1158 &token2->v.wide_string);
1164 static bool pp_definitions_equal(const pp_definition_t *definition1,
1165 const pp_definition_t *definition2)
1167 if(definition1->list_len != definition2->list_len)
1170 size_t len = definition1->list_len;
1171 const token_t *t1 = definition1->replacement_list;
1172 const token_t *t2 = definition2->replacement_list;
1173 for(size_t i = 0; i < len; ++i, ++t1, ++t2) {
1174 if(!pp_tokens_equal(t1, t2))
/*
 * Handles "#define": expects an identifier, allocates a zeroed
 * pp_definition_t on pp_obstack, scans an optional parameter list up to
 * ')', then copies every token up to the end of the line (or input) into
 * the definition's replacement list. An existing definition of the same
 * symbol is compared with the new one and a warning is issued when they
 * differ.
 *
 * NOTE(review): the test that decides whether a parameter list follows, the
 * error returns, the list_len increment and the branch structure around
 * "reuse the old definition" are not part of this listing.
 */
1180 static void parse_define_directive(void)
1184 if(pp_token.type != TP_IDENTIFIER) {
1185 errorf(&pp_token.source_position,
1186 "expected identifier after #define, got '%T'", &pp_token);
1190 symbol_t *symbol = pp_token.v.symbol;
1192 pp_definition_t *new_definition
1193 = obstack_alloc(&pp_obstack, sizeof(new_definition[0]));
1194 memset(new_definition, 0, sizeof(new_definition[0]));
1195 new_definition->source_position = input_position;
1197 /* this is probably the only place where spaces are significant in the
1198 * lexer (except for the fact that they separate tokens). #define b(x)
1199 * is something else than #define b (x) */
1200 //token_t *arguments = NULL;
1202 next_preprocessing_token();
1203 while(pp_token.type != ')') {
1204 if(pp_token.type == TP_DOTDOTDOT) {
1205 new_definition->is_variadic = true;
1206 next_preprocessing_token();
/* '...' must be the last entry of the parameter list */
1207 if(pp_token.type != ')') {
1208 errorf(&input_position,
1209 "'...' not at end of macro argument list");
1212 } else if(pp_token.type != TP_IDENTIFIER) {
1213 next_preprocessing_token();
1217 next_preprocessing_token();
1220 /* construct a new pp_definition on the obstack */
/* no object may be under construction on pp_obstack when the list is grown */
1221 assert(obstack_object_size(&pp_obstack) == 0);
1222 size_t list_len = 0;
1223 while(pp_token.type != '\n' && pp_token.type != TP_EOF) {
1224 obstack_grow(&pp_obstack, &pp_token, sizeof(pp_token));
1226 next_preprocessing_token();
1229 new_definition->list_len = list_len;
1230 new_definition->replacement_list = obstack_finish(&pp_obstack);
1232 pp_definition_t *old_definition = symbol->pp_definition;
1233 if(old_definition != NULL) {
1234 if(!pp_definitions_equal(old_definition, new_definition)) {
1235 warningf(&input_position, "multiple definition of macro '%Y' (first defined %P)",
1236 symbol, &old_definition->source_position);
1238 /* reuse the old definition */
/* new_definition was allocated before its replacement list, so freeing it
 * also releases the list that was allocated after it on pp_obstack */
1239 obstack_free(&pp_obstack, new_definition);
1240 new_definition = old_definition;
1244 symbol->pp_definition = new_definition;
/*
 * Handles "#undef": expects an identifier and clears that symbol's
 * pp_definition pointer. Anything else before the end of the line draws a
 * warning.
 *
 * NOTE(review): the error return and the code that skips the extra tokens
 * are not part of this listing. The extra-token check only compares against
 * '\n', so a directive ended by end of input (TP_EOF) would presumably also
 * warn - confirm.
 */
1247 static void parse_undef_directive(void)
1251 if(pp_token.type != TP_IDENTIFIER) {
1252 errorf(&input_position,
1253 "expected identifier after #undef, got '%T'", &pp_token);
1258 symbol_t *symbol = pp_token.v.symbol;
/* only the reference is dropped; nothing is freed here */
1259 symbol->pp_definition = NULL;
1260 next_preprocessing_token();
1262 if(pp_token.type != '\n') {
1263 warningf(&input_position, "extra tokens at end of #undef directive");
1265 /* eat until '\n' */
/*
 * Parses one preprocessing directive. Space printing and macro expansion
 * are switched off while the directive is processed and switched back on
 * afterwards. The directive is dispatched to the #define and #undef
 * handlers; anything else is reported as an invalid directive. On exit the
 * terminating newline (or end of input) has been consumed.
 *
 * NOTE(review): the case labels and the statement that consumes the '#' are
 * not part of this listing.
 */
1269 static void parse_preprocessing_directive(void)
1271 print_spaces = false;
1272 do_expansions = false;
1275 switch(pp_token.type) {
1277 parse_define_directive();
1280 parse_undef_directive();
1283 errorf(&pp_token.source_position,
1284 "invalid preprocessing directive #%T", &pp_token);
1289 print_spaces = true;
1290 do_expansions = true;
/* every handler must leave the line terminator as the current token */
1293 assert(pp_token.type == '\n' || pp_token.type == TP_EOF);
1294 next_preprocessing_token();
/* Prototype for the entry point defined directly below. */
1297 int pptest_main(int argc, char **argv);
/* Enables the extra "# 1 ..." header lines that pptest_main() prints so its
 * output can be compared with "gcc -E". */
1299 #define GCC_COMPAT_MODE
/*
 * Test driver for the preprocessor: initializes the symbol table and
 * pp_obstack, opens the input file, then processes the input line by line.
 * A line starting with '#' is handled as a directive; all other lines are
 * processed token by token up to the newline. Resources are released at the
 * end.
 *
 * NOTE(review): the handling of argc/argv (the default input name is
 * "t.c"), the setup of `out`, the emit call inside the token loop and the
 * return statement are not part of this listing.
 */
1301 int pptest_main(int argc, char **argv)
1303 init_symbol_table();
1306 obstack_init(&pp_obstack);
1308 const char *infname = "t.c";
1312 input = fopen(infname, "r");
/* NOTE(review): a failed fopen() is only caught by this assert, which
 * compiles away under NDEBUG */
1313 assert(input != NULL);
1314 input_position.input_name = infname;
1315 input_position.linenr = 1;
1319 counted_newlines = 0;
1324 #ifdef GCC_COMPAT_MODE
1325 /* this is here so we can directly compare "gcc -E" output and our output */
1326 fprintf(out, "# 1 \"%s\"\n", input_position.input_name);
1327 fputs("# 1 \"<built-in>\"\n", out);
1328 fputs("# 1 \"<command-line>\"\n", out);
1333 next_preprocessing_token();
1336 /* we're at a line begin */
1337 if(pp_token.type == '#') {
1338 parse_preprocessing_directive();
1340 /* parse+emit a line */
1341 while(pp_token.type != '\n') {
1342 if(pp_token.type == TP_EOF)
1343 goto end_of_main_loop;
1345 next_preprocessing_token();
1348 next_preprocessing_token();
1353 if (counted_newlines > 0) {
/* a NULL object frees everything allocated on the obstack */
1357 obstack_free(&pp_obstack, NULL);
1360 exit_symbol_table();