X-Git-Url: http://nsz.repo.hu/git/?a=blobdiff_plain;f=lexer.c;h=daeda8a492c11c85ceb2652ed833cffe71fc3205;hb=f619e0f8119ae30ecd0e4a56505b99fdf0de1ee4;hp=cc528eb6ffcc3e16225fd7877201bff4ae4d101f;hpb=5e80d9852edea8472cd478655d70f5c4a1eddbdc;p=cparser diff --git a/lexer.c b/lexer.c index cc528eb..daeda8a 100644 --- a/lexer.c +++ b/lexer.c @@ -19,6 +19,7 @@ */ #include +#include "adt/strutil.h" #include "input.h" #include "diagnostic.h" #include "lexer.h" @@ -57,7 +58,6 @@ static source_position_t lexer_pos; token_t lexer_token; static symbol_t *symbol_L; static strset_t stringset; -static char *encoding; bool allow_dollar_in_symbol = true; /** @@ -277,14 +277,28 @@ end_symbol: char *string = obstack_finish(&symbol_obstack); symbol_t *symbol = symbol_table_insert(string); - lexer_token.type = symbol->ID; - lexer_token.symbol = symbol; + lexer_token.kind = symbol->ID; + lexer_token.identifier.symbol = symbol; if (symbol->string != string) { obstack_free(&symbol_obstack, string); } } +static string_t identify_string(char *string, size_t len) +{ + /* TODO hash */ +#if 0 + const char *result = strset_insert(&stringset, concat); + if (result != concat) { + obstack_free(&symbol_obstack, concat); + } +#else + const char *result = string; +#endif + return (string_t) {result, len}; +} + /** * parse suffixes like 'LU' or 'f' after numbers */ @@ -304,32 +318,16 @@ static void parse_number_suffix(void) } finish_suffix: if (obstack_object_size(&symbol_obstack) == 0) { - lexer_token.symbol = NULL; + lexer_token.number.suffix.begin = NULL; + lexer_token.number.suffix.size = 0; return; } obstack_1grow(&symbol_obstack, '\0'); + size_t size = obstack_object_size(&symbol_obstack); char *string = obstack_finish(&symbol_obstack); - symbol_t *symbol = symbol_table_insert(string); - if (symbol->string != string) { - obstack_free(&symbol_obstack, string); - } - lexer_token.symbol = symbol; -} - -static string_t identify_string(char *string, size_t len) -{ - /* TODO hash */ -#if 0 - const char *result = strset_insert(&stringset, concat); - if (result != concat) { - obstack_free(&symbol_obstack, concat); - } -#else - const char *result = string; -#endif - return (string_t) {result, len}; + lexer_token.number.suffix = identify_string(string, size); } /** @@ -374,23 +372,23 @@ static void parse_number_hex(void) next_char(); } } else if (is_float) { - errorf(&lexer_token.source_position, + errorf(&lexer_token.base.source_position, "hexadecimal floatingpoint constant requires an exponent"); } obstack_1grow(&symbol_obstack, '\0'); size_t size = obstack_object_size(&symbol_obstack) - 1; char *string = obstack_finish(&symbol_obstack); - lexer_token.literal = identify_string(string, size); + lexer_token.number.number = identify_string(string, size); - lexer_token.type = + lexer_token.kind = is_float ? T_FLOATINGPOINT_HEXADECIMAL : T_INTEGER_HEXADECIMAL; if (!has_digits) { - errorf(&lexer_token.source_position, "invalid number literal '0x%S'", - &lexer_token.literal); - lexer_token.literal.begin = "0"; - lexer_token.literal.size = 1; + errorf(&lexer_token.base.source_position, + "invalid number literal '0x%S'", &lexer_token.number.number); + lexer_token.number.number.begin = "0"; + lexer_token.number.number.size = 1; } parse_number_suffix(); @@ -463,28 +461,28 @@ static void parse_number(void) obstack_1grow(&symbol_obstack, '\0'); size_t size = obstack_object_size(&symbol_obstack) - 1; char *string = obstack_finish(&symbol_obstack); - lexer_token.literal = identify_string(string, size); + lexer_token.number.number = identify_string(string, size); /* is it an octal number? */ if (is_float) { - lexer_token.type = T_FLOATINGPOINT; + lexer_token.kind = T_FLOATINGPOINT; } else if (string[0] == '0') { - lexer_token.type = T_INTEGER_OCTAL; + lexer_token.kind = T_INTEGER_OCTAL; /* check for invalid octal digits */ for (size_t i= 0; i < size; ++i) { char t = string[i]; if (t >= '8') - errorf(&lexer_token.source_position, + errorf(&lexer_token.base.source_position, "invalid digit '%c' in octal number", t); } } else { - lexer_token.type = T_INTEGER; + lexer_token.kind = T_INTEGER; } if (!has_digits) { - errorf(&lexer_token.source_position, "invalid number literal '%S'", - &lexer_token.literal); + errorf(&lexer_token.base.source_position, "invalid number literal '%S'", + &lexer_token.number.number); } parse_number_suffix(); @@ -634,26 +632,6 @@ string_t make_string(const char *string) return identify_string(space, len); } -static void grow_symbol(utf32 const tc) -{ - struct obstack *const o = &symbol_obstack; - if (tc < 0x80U) { - obstack_1grow(o, tc); - } else if (tc < 0x800) { - obstack_1grow(o, 0xC0 | (tc >> 6)); - obstack_1grow(o, 0x80 | (tc & 0x3F)); - } else if (tc < 0x10000) { - obstack_1grow(o, 0xE0 | ( tc >> 12)); - obstack_1grow(o, 0x80 | ((tc >> 6) & 0x3F)); - obstack_1grow(o, 0x80 | ( tc & 0x3F)); - } else { - obstack_1grow(o, 0xF0 | ( tc >> 18)); - obstack_1grow(o, 0x80 | ((tc >> 12) & 0x3F)); - obstack_1grow(o, 0x80 | ((tc >> 6) & 0x3F)); - obstack_1grow(o, 0x80 | ( tc & 0x3F)); - } -} - /** * Parse a string literal and set lexer_token. */ @@ -673,8 +651,8 @@ static void parse_string_literal(void) } case EOF: { - errorf(&lexer_token.source_position, "string has no end"); - lexer_token.type = T_ERROR; + errorf(&lexer_token.base.source_position, "string has no end"); + lexer_token.kind = T_ERROR; return; } @@ -683,7 +661,7 @@ static void parse_string_literal(void) goto end_of_string; default: - grow_symbol(c); + obstack_grow_symbol(&symbol_obstack, c); next_char(); break; } @@ -698,8 +676,8 @@ end_of_string: const size_t size = (size_t)obstack_object_size(&symbol_obstack); char *string = obstack_finish(&symbol_obstack); - lexer_token.type = T_STRING_LITERAL; - lexer_token.literal = identify_string(string, size); + lexer_token.kind = T_STRING_LITERAL; + lexer_token.string.string = identify_string(string, size); } /** @@ -713,7 +691,7 @@ static void parse_wide_character_constant(void) switch (c) { case '\\': { const utf32 tc = parse_escape_sequence(); - grow_symbol(tc); + obstack_grow_symbol(&symbol_obstack, tc); break; } @@ -727,13 +705,14 @@ static void parse_wide_character_constant(void) goto end_of_wide_char_constant; case EOF: { - errorf(&lexer_token.source_position, "EOF while parsing character constant"); - lexer_token.type = T_ERROR; + errorf(&lexer_token.base.source_position, + "EOF while parsing character constant"); + lexer_token.kind = T_ERROR; return; } default: - grow_symbol(c); + obstack_grow_symbol(&symbol_obstack, c); next_char(); break; } @@ -744,11 +723,11 @@ end_of_wide_char_constant:; size_t size = (size_t) obstack_object_size(&symbol_obstack) - 1; char *string = obstack_finish(&symbol_obstack); - lexer_token.type = T_WIDE_CHARACTER_CONSTANT; - lexer_token.literal = identify_string(string, size); + lexer_token.kind = T_WIDE_CHARACTER_CONSTANT; + lexer_token.string.string = identify_string(string, size); if (size == 0) { - errorf(&lexer_token.source_position, "empty character constant"); + errorf(&lexer_token.base.source_position, "empty character constant"); } } @@ -758,8 +737,8 @@ end_of_wide_char_constant:; static void parse_wide_string_literal(void) { parse_string_literal(); - if (lexer_token.type == T_STRING_LITERAL) - lexer_token.type = T_WIDE_STRING_LITERAL; + if (lexer_token.kind == T_STRING_LITERAL) + lexer_token.kind = T_WIDE_STRING_LITERAL; } /** @@ -790,13 +769,14 @@ static void parse_character_constant(void) goto end_of_char_constant; case EOF: { - errorf(&lexer_token.source_position, "EOF while parsing character constant"); - lexer_token.type = T_ERROR; + errorf(&lexer_token.base.source_position, + "EOF while parsing character constant"); + lexer_token.kind = T_ERROR; return; } default: - grow_symbol(c); + obstack_grow_symbol(&symbol_obstack, c); next_char(); break; @@ -808,11 +788,11 @@ end_of_char_constant:; const size_t size = (size_t)obstack_object_size(&symbol_obstack)-1; char *const string = obstack_finish(&symbol_obstack); - lexer_token.type = T_CHARACTER_CONSTANT; - lexer_token.literal = identify_string(string, size); + lexer_token.kind = T_CHARACTER_CONSTANT; + lexer_token.string.string = identify_string(string, size); if (size == 0) { - errorf(&lexer_token.source_position, "empty character constant"); + errorf(&lexer_token.base.source_position, "empty character constant"); } } @@ -841,7 +821,8 @@ static void skip_multiline_comment(void) MATCH_NEWLINE(break;) case EOF: { - errorf(&lexer_token.source_position, "at end of file while looking for comment end"); + errorf(&lexer_token.base.source_position, + "at end of file while looking for comment end"); return; } @@ -898,57 +879,43 @@ static inline void next_pp_token(void) */ static void eat_until_newline(void) { - while (pp_token.type != '\n' && pp_token.type != T_EOF) { + while (pp_token.kind != '\n' && pp_token.kind != T_EOF) { next_pp_token(); } } -/** - * Handle the define directive. - */ -static void define_directive(void) -{ - lexer_next_preprocessing_token(); - if (lexer_token.type != T_IDENTIFIER) { - parse_error("expected identifier after #define\n"); - eat_until_newline(); - } -} - -/** - * Handle the ifdef directive. - */ -static void ifdef_directive(int is_ifndef) -{ - (void) is_ifndef; - lexer_next_preprocessing_token(); - //expect_identifier(); - //extect_newline(); -} - -/** - * Handle the endif directive. - */ -static void endif_directive(void) -{ - //expect_newline(); -} - /** * Parse the line directive. */ static void parse_line_directive(void) { - if (pp_token.type != T_INTEGER) { + if (pp_token.kind != T_INTEGER) { parse_error("expected integer"); } else { /* use offset -1 as this is about the next line */ - lexer_pos.lineno = atoi(pp_token.literal.begin) - 1; + lexer_pos.lineno = atoi(pp_token.number.number.begin) - 1; next_pp_token(); } - if (pp_token.type == T_STRING_LITERAL) { - lexer_pos.input_name = pp_token.literal.begin; + if (pp_token.kind == T_STRING_LITERAL) { + lexer_pos.input_name = pp_token.string.string.begin; + lexer_pos.is_system_header = false; next_pp_token(); + + /* attempt to parse numeric flags as outputted by gcc preprocessor */ + while (pp_token.kind == T_INTEGER) { + /* flags: + * 1 - indicates start of a new file + * 2 - indicates return from a file + * 3 - indicates system header + * 4 - indicates implicit extern "C" in C++ mode + * + * currently we're only interested in "3" + */ + if (streq(pp_token.number.number.begin, "3")) { + lexer_pos.is_system_header = true; + } + next_pp_token(); + } } eat_until_newline(); @@ -982,13 +949,21 @@ static void parse_pragma(void) bool unknown_pragma = true; next_pp_token(); - if (pp_token.symbol->pp_ID == TP_STDC) { + if (pp_token.kind != T_IDENTIFIER) { + warningf(WARN_UNKNOWN_PRAGMAS, &pp_token.base.source_position, + "expected identifier after #pragma"); + eat_until_newline(); + return; + } + + symbol_t *symbol = pp_token.identifier.symbol; + if (symbol->pp_ID == TP_STDC) { stdc_pragma_kind_t kind = STDC_UNKNOWN; /* a STDC pragma */ if (c_mode & _C99) { next_pp_token(); - switch (pp_token.symbol->pp_ID) { + switch (pp_token.identifier.symbol->pp_ID) { case TP_FP_CONTRACT: kind = STDC_FP_CONTRACT; break; @@ -1004,7 +979,7 @@ static void parse_pragma(void) if (kind != STDC_UNKNOWN) { stdc_pragma_value_kind_t value = STDC_VALUE_UNKNOWN; next_pp_token(); - switch (pp_token.symbol->pp_ID) { + switch (pp_token.identifier.symbol->pp_ID) { case TP_ON: value = STDC_VALUE_ON; break; @@ -1020,7 +995,8 @@ static void parse_pragma(void) if (value != STDC_VALUE_UNKNOWN) { unknown_pragma = false; } else { - errorf(&pp_token.source_position, "bad STDC pragma argument"); + errorf(&pp_token.base.source_position, + "bad STDC pragma argument"); } } } @@ -1029,7 +1005,8 @@ static void parse_pragma(void) } eat_until_newline(); if (unknown_pragma) { - warningf(WARN_UNKNOWN_PRAGMAS, &pp_token.source_position, "encountered unknown #pragma"); + warningf(WARN_UNKNOWN_PRAGMAS, &pp_token.base.source_position, + "encountered unknown #pragma"); } } @@ -1038,40 +1015,21 @@ static void parse_pragma(void) */ static void parse_preprocessor_identifier(void) { - assert(pp_token.type == T_IDENTIFIER); - symbol_t *symbol = pp_token.symbol; + assert(pp_token.kind == T_IDENTIFIER); + symbol_t *symbol = pp_token.identifier.symbol; switch (symbol->pp_ID) { - case TP_include: - printf("include - enable header name parsing!\n"); - break; - case TP_define: - define_directive(); - break; - case TP_ifdef: - ifdef_directive(0); - break; - case TP_ifndef: - ifdef_directive(1); - break; - case TP_endif: - endif_directive(); - break; case TP_line: next_pp_token(); parse_line_directive(); break; - case TP_if: - case TP_else: - case TP_elif: - case TP_undef: - case TP_error: - /* TODO; output the rest of the line */ - parse_error("#error directive: "); - break; case TP_pragma: parse_pragma(); break; + case TP_error: + /* TODO; output the rest of the line */ + parse_error("#error directive"); + break; } } @@ -1082,7 +1040,7 @@ static void parse_preprocessor_directive(void) { next_pp_token(); - switch (pp_token.type) { + switch (pp_token.kind) { case T_IDENTIFIER: parse_preprocessor_identifier(); break; @@ -1107,7 +1065,7 @@ static void parse_preprocessor_directive(void) #define MAYBE(ch, set_type) \ case ch: \ next_char(); \ - lexer_token.type = set_type; \ + lexer_token.kind = set_type; \ return; /* must use this as last thing */ @@ -1115,7 +1073,7 @@ static void parse_preprocessor_directive(void) case ch: \ if (c_mode & mode) { \ next_char(); \ - lexer_token.type = set_type; \ + lexer_token.kind = set_type; \ return; \ } \ /* fallthrough */ @@ -1129,13 +1087,13 @@ static void parse_preprocessor_directive(void) #define ELSE(set_type) \ ELSE_CODE( \ - lexer_token.type = set_type; \ + lexer_token.kind = set_type; \ ) void lexer_next_preprocessing_token(void) { while (true) { - lexer_token.source_position = lexer_pos; + lexer_token.base.source_position = lexer_pos; switch (c) { case ' ': @@ -1144,14 +1102,14 @@ void lexer_next_preprocessing_token(void) break; MATCH_NEWLINE( - lexer_token.type = '\n'; + lexer_token.kind = '\n'; return; ) SYMBOL_CHARS parse_symbol(); /* might be a wide string ( L"string" ) */ - if (lexer_token.symbol == symbol_L) { + if (lexer_token.identifier.symbol == symbol_L) { switch (c) { case '"': parse_wide_string_literal(); break; case '\'': parse_wide_character_constant(); break; @@ -1185,7 +1143,7 @@ void lexer_next_preprocessing_token(void) ELSE_CODE( put_back(c); c = '.'; - lexer_token.type = '.'; + lexer_token.kind = '.'; ) ELSE('.') case '&': @@ -1238,7 +1196,7 @@ void lexer_next_preprocessing_token(void) ELSE_CODE( put_back(c); c = '%'; - lexer_token.type = '#'; + lexer_token.kind = '#'; ) ELSE('#') ELSE('%') @@ -1294,19 +1252,19 @@ void lexer_next_preprocessing_token(void) case ';': case ',': case '\\': - lexer_token.type = c; + lexer_token.kind = c; next_char(); return; case EOF: - lexer_token.type = T_EOF; + lexer_token.kind = T_EOF; return; default: dollar_sign: errorf(&lexer_pos, "unknown character '%c' found", c); next_char(); - lexer_token.type = T_ERROR; + lexer_token.kind = T_ERROR; return; } } @@ -1316,12 +1274,12 @@ void lexer_next_token(void) { lexer_next_preprocessing_token(); - while (lexer_token.type == '\n') { + while (lexer_token.kind == '\n') { newline_found: lexer_next_preprocessing_token(); } - if (lexer_token.type == '#') { + if (lexer_token.kind == '#') { parse_preprocessor_directive(); goto newline_found; } @@ -1341,26 +1299,14 @@ static void input_error(unsigned delta_lines, unsigned delta_cols, errorf(&lexer_pos, "%s", message); } -void select_input_encoding(char const* new_encoding) -{ - if (encoding != NULL) - xfree(encoding); - encoding = xstrdup(new_encoding); -} - -void lexer_open_stream(FILE *stream, const char *input_name) +void lexer_switch_input(input_t *new_input, const char *input_name) { - if (input != NULL) { - input_free(input); - input = NULL; - } - lexer_pos.lineno = 0; lexer_pos.colno = 0; lexer_pos.input_name = input_name; set_input_error_callback(input_error); - input = input_from_stream(stream, encoding); + input = new_input; bufpos = NULL; bufend = NULL; @@ -1371,10 +1317,6 @@ void lexer_open_stream(FILE *stream, const char *input_name) void exit_lexer(void) { - if (input != NULL) { - input_free(input); - input = NULL; - } strset_destroy(&stringset); } @@ -1382,6 +1324,6 @@ static __attribute__((unused)) void dbg_pos(const source_position_t source_position) { fprintf(stdout, "%s:%u:%u\n", source_position.input_name, - source_position.lineno, source_position.colno); + source_position.lineno, (unsigned)source_position.colno); fflush(stdout); }