From 7a2d15bb1778cedc00dcb12f0249cf5fbac26e46 Mon Sep 17 00:00:00 2001
From: Christoph Mallon
Date: Thu, 29 Nov 2007 16:36:36 +0000
Subject: [PATCH] First iteration in adding wide string literal support. No
 input encoding handling or string concatenation, yet.

[r18569]
---
 Makefile   |  4 ++--
 ast.c      | 48 +++++++++++++++++++++++++++++++++++++++++
 ast.h      | 47 ++++++++++++++++++++--------------------
 ast2firm.c | 61 +++++++++++++++++++++++++++++++++++++++++++---------
 ast_t.h    | 41 ++++++++++++++++++++---------------
 lexer.c    | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 parser.c   | 54 ++++++++++++++++++++++++++++------------------
 token_t.h  | 10 +++++----
 tokens.inc |  9 ++++----
 9 files changed, 254 insertions(+), 83 deletions(-)

diff --git a/Makefile b/Makefile
index 7178571..e078b3a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 GOAL = cparser

-FIRM_HOME = $(HOME)/projects/firm
+FIRM_HOME = $(HOME)/jambuild/
 FIRM_BUILD = $(FIRM_HOME)/build/i686-pc-linux-gnu/debug/
 FIRM_CFLAGS = -I$(FIRM_HOME)/libfirm/include -I$(FIRM_HOME)/obstack -I$(FIRM_HOME)/libcore -I$(FIRM_HOME)/libcore/libcore -I$(FIRM_HOME)
 FIRM_LIBS = -L$(FIRM_BUILD) -lfirm -llpp -lcore -lm -lz -ldl
@@ -65,7 +65,7 @@ build/adt:

 build/%.o: %.c
 	@echo '===> CC $<'
-	$(Q)icc $(CPPFLAGS) $(ICC_CFLAGS) -c $< -o $@
+#	$(Q)icc $(CPPFLAGS) $(ICC_CFLAGS) -c $< -o $@
 	$(Q)$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@

 clean:
diff --git a/ast.c b/ast.c
index 0ba0492..87d490e 100644
--- a/ast.c
+++ b/ast.c
@@ -74,6 +74,51 @@ static void print_string_literal(
 	print_quoted_string(string_literal->value);
 }

+static void print_wide_string_literal(
+	const wide_string_literal_expression_t *const wstr)
+{ /* Writes the literal to 'out' as L"...": C escapes for specials, raw UTF-8 bytes for values >= 0x80. */
+	fputs("L\"", out);
+	for (const wchar_rep_t *c   = wstr->value.begin, /* size counts the NUL the lexer appends; do not print it */
+	                       *end = c + wstr->value.size - (wstr->value.size != 0);
+	     c != end; ++c) {
+		switch (*c) {
+			case L'\"':  fputs("\\\"", out); break;
+			case L'\\':  fputs("\\\\", out); break;
+			case L'\a':  fputs("\\a",  out); break;
+			case L'\b':  fputs("\\b",  out); break;
+			case L'\f':  fputs("\\f",  out); break;
+			case L'\n':  fputs("\\n",  out); break;
+			case L'\r':  fputs("\\r",  out); break;
+			case L'\t':  fputs("\\t",  out); break;
+			case L'\v':  fputs("\\v",  out); break;
+			case L'\?':  fputs("\\?",  out); break;
+			default: {
+				const unsigned tc = *c;
+				if (tc < 0x80U) { /* 7-bit value: print as-is, or as a 3-digit octal escape if unprintable */
+					if (!isprint(*c))  {
+						fprintf(out, "\\%03o", tc); /* %o takes an unsigned argument */
+					} else {
+						fputc(*c, out);
+					}
+				} else if (tc < 0x800) { /* two-byte UTF-8 sequence */
+					fputc(0xC0 | (tc >> 6),   out);
+					fputc(0x80 | (tc & 0x3F), out);
+				} else if (tc < 0x10000) { /* three-byte UTF-8 sequence */
+					fputc(0xE0 | ( tc >> 12),         out);
+					fputc(0x80 | ((tc >>  6) & 0x3F), out);
+					fputc(0x80 | ( tc        & 0x3F), out);
+				} else { /* four-byte UTF-8 sequence; no upper range check is done */
+					fputc(0xF0 | ( tc >> 18),         out);
+					fputc(0x80 | ((tc >> 12) & 0x3F), out);
+					fputc(0x80 | ((tc >>  6) & 0x3F), out);
+					fputc(0x80 | ( tc        & 0x3F), out);
+				}
+			}
+		}
+	}
+	fputc('"', out);
+}
+
 static void print_call_expression(const call_expression_t *call)
 {
 	print_expression(call->function);
@@ -274,6 +319,9 @@ void print_expression(const expression_t *expression)
 	case EXPR_STRING_LITERAL:
 		print_string_literal(&expression->string);
 		break;
+	case EXPR_WIDE_STRING_LITERAL:
+		print_wide_string_literal(&expression->wide_string);
+		break;
 	case EXPR_CALL:
 		print_call_expression(&expression->call);
 		break;
diff --git a/ast.h b/ast.h
index ad83e5f..fdb02b3 100644
--- a/ast.h
+++ b/ast.h
@@ -5,29 +5,30 @@

 typedef struct context_t context_t;

-typedef struct expression_base_t expression_base_t;
-typedef struct const_expression_t const_expression_t;
-typedef struct string_literal_expression_t string_literal_expression_t;
-typedef struct reference_expression_t reference_expression_t;
-typedef struct cast_expression_t cast_expression_t;
-typedef struct call_argument_t call_argument_t;
-typedef struct type_argument_t type_argument_t;
-typedef struct call_expression_t call_expression_t;
-typedef struct binary_expression_t binary_expression_t;
-typedef struct unary_expression_t unary_expression_t;
-typedef struct select_expression_t select_expression_t;
-typedef struct
array_access_expression_t array_access_expression_t;
-typedef struct sizeof_expression_t sizeof_expression_t;
-typedef struct conditional_expression_t conditional_expression_t;
-typedef struct expression_list_element_t expression_list_element_t;
-typedef struct comma_expression_t comma_expression_t;
-typedef struct statement_expression_t statement_expression_t;
-typedef struct designator_t designator_t;
-typedef struct offsetof_expression_t offsetof_expression_t;
-typedef struct va_arg_expression_t va_arg_expression_t;
-typedef struct builtin_symbol_expression_t builtin_symbol_expression_t;
-typedef struct classify_type_expression_t classify_type_expression_t;
-typedef union expression_t expression_t;
+typedef struct expression_base_t expression_base_t;
+typedef struct const_expression_t const_expression_t;
+typedef struct string_literal_expression_t string_literal_expression_t;
+typedef struct wide_string_literal_expression_t wide_string_literal_expression_t;
+typedef struct reference_expression_t reference_expression_t;
+typedef struct cast_expression_t cast_expression_t;
+typedef struct call_argument_t call_argument_t;
+typedef struct type_argument_t type_argument_t;
+typedef struct call_expression_t call_expression_t;
+typedef struct binary_expression_t binary_expression_t;
+typedef struct unary_expression_t unary_expression_t;
+typedef struct select_expression_t select_expression_t;
+typedef struct array_access_expression_t array_access_expression_t;
+typedef struct sizeof_expression_t sizeof_expression_t;
+typedef struct conditional_expression_t conditional_expression_t;
+typedef struct expression_list_element_t expression_list_element_t;
+typedef struct comma_expression_t comma_expression_t;
+typedef struct statement_expression_t statement_expression_t;
+typedef struct designator_t designator_t;
+typedef struct offsetof_expression_t offsetof_expression_t;
+typedef struct va_arg_expression_t va_arg_expression_t;
+typedef struct builtin_symbol_expression_t builtin_symbol_expression_t;
+typedef struct classify_type_expression_t classify_type_expression_t;
+typedef union expression_t expression_t;

 typedef struct initializer_base_t initializer_base_t;
 typedef struct initializer_list_t initializer_list_t;
diff --git a/ast2firm.c b/ast2firm.c
index f5b8d91..315da62 100644
--- a/ast2firm.c
+++ b/ast2firm.c
@@ -21,6 +21,7 @@
 #define MAGIC_DEFAULT_PN_NUMBER (long) -314159265

 static ir_type *ir_type_const_char;
+static ir_type *ir_type_wchar_t;
 static ir_type *ir_type_void;
 static ir_type *ir_type_int;

@@ -86,16 +87,6 @@ const char *retrieve_dbg(const dbg_info *dbg, unsigned *line)

 void init_ast2firm(void)
 {
-	type_const_char = make_atomic_type(ATOMIC_TYPE_CHAR, TYPE_QUALIFIER_CONST);
-	type_void       = make_atomic_type(ATOMIC_TYPE_VOID, TYPE_QUALIFIER_NONE);
-	type_int        = make_atomic_type(ATOMIC_TYPE_INT, TYPE_QUALIFIER_NONE);
-
-	ir_type_int        = get_ir_type(type_int);
-	ir_type_const_char = get_ir_type(type_const_char);
-	ir_type_void       = get_ir_type(type_int); /* we don't have a real void
-	                                               type in firm */
-
-	type_void->base.firm_type = ir_type_void;
 }

 void exit_ast2firm(void)
@@ -652,6 +643,42 @@ static ir_node *string_literal_to_firm(
 	                                 literal->value);
 }

+static ir_node *wide_string_literal_to_firm(
+	const wide_string_literal_expression_t* const literal)
+{ /* Emits the literal as a constant global array of ir_type_wchar_t and returns a SymConst for its entity. */
+	ir_type *const global_type = get_glob_type();
+	ir_type *const elem_type   = ir_type_wchar_t;
+	ir_type *const type        = new_type_array(unique_ident("strtype"), 1,
+	                                            elem_type);
+
+	ident     *const id     = unique_ident("Lstr");
+	ir_entity *const entity = new_entity(global_type, id, type);
+	set_entity_ld_ident(entity, id);
+	set_entity_variability(entity, variability_constant);
+
+	ir_mode *const mode = get_type_mode(elem_type);
+
+	const wchar_rep_t *const string = literal->value.begin;
+	const size_t             slen   = literal->value.size; /* element count; the lexer includes the terminating NUL */
+
+	set_array_lower_bound_int(type, 0, 0);
+	set_array_upper_bound_int(type, 0, slen);
+	set_type_size_bytes(type, slen * get_mode_size_bytes(mode)); /* size is in bytes, not elements */
+	set_type_state(type, layout_fixed);
+
+	tarval **const tvs = xmalloc(slen * sizeof(tvs[0])); /* temporary: one tarval per element, freed below */
+	for(size_t i = 0; i < slen; ++i) {
+		tvs[i] = new_tarval_from_long(string[i], mode);
+	}
+
+	set_array_entity_values(entity, tvs, slen);
+	free(tvs);
+
+	dbg_info *const dbgi = get_dbg_info(&literal->expression.source_position);
+
+	return create_symconst(dbgi, entity);
+}
+
 static ir_node *deref_address(ir_type *const irtype, ir_node *const addr,
                               dbg_info *const dbgi)
 {
@@ -1707,6 +1734,8 @@ static ir_node *_expression_to_firm(const expression_t *expression)
 		return const_to_firm(&expression->conste);
 	case EXPR_STRING_LITERAL:
 		return string_literal_to_firm(&expression->string);
+	case EXPR_WIDE_STRING_LITERAL:
+		return wide_string_literal_to_firm(&expression->wide_string);
 	case EXPR_REFERENCE:
 		return reference_expression_to_firm(&expression->reference);
 	case EXPR_CALL:
@@ -3055,6 +3084,18 @@ static void context_to_firm(context_t *context)

 void translation_unit_to_firm(translation_unit_t *unit)
 {
+	type_const_char = make_atomic_type(ATOMIC_TYPE_CHAR, TYPE_QUALIFIER_CONST);
+	type_void       = make_atomic_type(ATOMIC_TYPE_VOID, TYPE_QUALIFIER_NONE);
+	type_int        = make_atomic_type(ATOMIC_TYPE_INT, TYPE_QUALIFIER_NONE);
+
+	ir_type_int        = get_ir_type(type_int);
+	ir_type_const_char = get_ir_type(type_const_char);
+	ir_type_wchar_t    = get_ir_type(type_wchar_t);
+	ir_type_void       = get_ir_type(type_int); /* we don't have a real void
+	                                               type in firm */
+
+	type_void->base.firm_type = ir_type_void;
+
 	/* just to be sure */
 	continue_label = NULL;
 	break_label = NULL;
diff --git a/ast_t.h b/ast_t.h
index 9216030..dbcc4f4 100644
--- a/ast_t.h
+++ b/ast_t.h
@@ -18,6 +18,7 @@ typedef enum {
 	EXPR_REFERENCE,
 	EXPR_CONST,
 	EXPR_STRING_LITERAL,
+	EXPR_WIDE_STRING_LITERAL,
 	EXPR_CALL,
 	EXPR_UNARY,
 	EXPR_BINARY,
@@ -58,6 +59,11 @@ struct string_literal_expression_t {
 	const char *value;
 };

+struct wide_string_literal_expression_t {
+	expression_base_t expression;
+	wide_string_t value;
+};
+
 struct builtin_symbol_expression_t {
expression_base_t expression;
 	symbol_t *symbol;
@@ -200,23 +206,24 @@ struct classify_type_expression_t {
 };

 union expression_t {
-	expression_type_t type;
-	expression_base_t base;
-	const_expression_t conste;
-	string_literal_expression_t string;
-	builtin_symbol_expression_t builtin_symbol;
-	reference_expression_t reference;
-	call_expression_t call;
-	unary_expression_t unary;
-	binary_expression_t binary;
-	select_expression_t select;
-	array_access_expression_t array_access;
-	sizeof_expression_t sizeofe;
-	offsetof_expression_t offsetofe;
-	va_arg_expression_t va_arge;
-	conditional_expression_t conditional;
-	statement_expression_t statement;
-	classify_type_expression_t classify_type;
+	expression_type_t type;
+	expression_base_t base;
+	const_expression_t conste;
+	string_literal_expression_t string;
+	wide_string_literal_expression_t wide_string;
+	builtin_symbol_expression_t builtin_symbol;
+	reference_expression_t reference;
+	call_expression_t call;
+	unary_expression_t unary;
+	binary_expression_t binary;
+	select_expression_t select;
+	array_access_expression_t array_access;
+	sizeof_expression_t sizeofe;
+	offsetof_expression_t offsetofe;
+	va_arg_expression_t va_arge;
+	conditional_expression_t conditional;
+	statement_expression_t statement;
+	classify_type_expression_t classify_type;
 };

 typedef enum {
diff --git a/lexer.c b/lexer.c
index 38473a2..82bccd0 100644
--- a/lexer.c
+++ b/lexer.c
@@ -754,6 +754,66 @@ end_of_string:
 	lexer_token.v.string = result;
 }

+static void parse_wide_string_literal(void)
+{ /* Lexes a wide string body (c must be the opening quote) and fills lexer_token with a T_WIDE_STRING_LITERAL. */
+	const unsigned start_linenr = lexer_token.source_position.linenr;
+
+	assert(c == '"');
+	next_char();
+
+	while(1) {
+		switch(c) {
+			case '\\': { /* escape sequence: its value is stored as one wide character */
+				wchar_rep_t tc = parse_escape_sequence();
+				obstack_grow(&symbol_obstack, &tc, sizeof(tc));
+				break;
+			}
+
+			case EOF: /* unterminated literal: report it at the line where the literal started */
+				error_prefix_at(lexer_token.source_position.input_name,
+				                start_linenr);
+				fprintf(stderr, "string has no end\n");
+				lexer_token.type = T_ERROR; /* NOTE(review): characters already grown on symbol_obstack are not discarded here - confirm this is harmless */
+				return;
+
+			case '"':
+				next_char();
+				goto end_of_string;
+
+			default: { /* any other input character is widened unchanged (no input decoding yet) */
+				wchar_rep_t tc = c;
+				obstack_grow(&symbol_obstack, &tc, sizeof(tc));
+				next_char();
+				break;
+			}
+		}
+	}
+
+end_of_string:; /* empty statement: a label cannot be followed directly by a declaration */
+
+	/* TODO: concatenate multiple strings separated by whitespace... */
+
+	/* add finishing 0 to the string */
+	wchar_rep_t nul = L'\0';
+	obstack_grow(&symbol_obstack, &nul, sizeof(nul));
+	const size_t size = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t); /* element count, including the NUL just added */
+	const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
+
+#if 0 /* TODO hash */
+	/* check if there is already a copy of the string */
+	const wchar_rep_t *const result = strset_insert(&stringset, string);
+	if(result != string) {
+		obstack_free(&symbol_obstack, string);
+	}
+#else
+	const wchar_rep_t *const result = string;
+#endif
+
+	lexer_token.type = T_WIDE_STRING_LITERAL;
+	lexer_token.v.wide_string.begin = result;
+	lexer_token.v.wide_string.size = size;
+}
+
 static void parse_character_constant(void)
 {
 	eat('\'');
@@ -1003,8 +1063,7 @@ void lexer_next_preprocessing_token(void)
 			/* might be a wide string ( L"string" ) */
 			if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
 			   lexer_token.v.symbol == symbol_L)) {
-				parse_string_literal();
-				return;
+				parse_wide_string_literal(); /* falls through to the return below */
 			}
 			return;

diff --git a/parser.c b/parser.c
index 26c9fc0..a7b334b 100644
--- a/parser.c
+++ b/parser.c
@@ -59,7 +59,7 @@ static type_t *type_void_ptr = NULL;
 type_t *type_size_t = NULL;
 type_t *type_ptrdiff_t = NULL;
 type_t *type_wchar_t = NULL;
-type_t *type_wchar_ptr_t = NULL;
+type_t *type_wchar_t_ptr = NULL;

 static statement_t *parse_compound_statement(void);
 static statement_t *parse_statement(void);
@@ -168,24 +168,25 @@ static statement_t *allocate_statement_zero(statement_type_t type)
 static size_t get_expression_struct_size(expression_type_t type)
 {
 	static const size_t sizes[] = {
-		[EXPR_INVALID] = sizeof(expression_base_t),
-		[EXPR_REFERENCE] = sizeof(reference_expression_t),
-		[EXPR_CONST] =
sizeof(const_expression_t),
-		[EXPR_STRING_LITERAL] = sizeof(string_literal_expression_t),
-		[EXPR_CALL] = sizeof(call_expression_t),
-		[EXPR_UNARY] = sizeof(unary_expression_t),
-		[EXPR_BINARY] = sizeof(binary_expression_t),
-		[EXPR_CONDITIONAL] = sizeof(conditional_expression_t),
-		[EXPR_SELECT] = sizeof(select_expression_t),
-		[EXPR_ARRAY_ACCESS] = sizeof(array_access_expression_t),
-		[EXPR_SIZEOF] = sizeof(sizeof_expression_t),
-		[EXPR_CLASSIFY_TYPE] = sizeof(classify_type_expression_t),
-		[EXPR_FUNCTION] = sizeof(string_literal_expression_t),
-		[EXPR_PRETTY_FUNCTION] = sizeof(string_literal_expression_t),
-		[EXPR_BUILTIN_SYMBOL] = sizeof(builtin_symbol_expression_t),
-		[EXPR_OFFSETOF] = sizeof(offsetof_expression_t),
-		[EXPR_VA_ARG] = sizeof(va_arg_expression_t),
-		[EXPR_STATEMENT] = sizeof(statement_expression_t)
+		[EXPR_INVALID] = sizeof(expression_base_t),
+		[EXPR_REFERENCE] = sizeof(reference_expression_t),
+		[EXPR_CONST] = sizeof(const_expression_t),
+		[EXPR_STRING_LITERAL] = sizeof(string_literal_expression_t),
+		[EXPR_WIDE_STRING_LITERAL] = sizeof(wide_string_literal_expression_t),
+		[EXPR_CALL] = sizeof(call_expression_t),
+		[EXPR_UNARY] = sizeof(unary_expression_t),
+		[EXPR_BINARY] = sizeof(binary_expression_t),
+		[EXPR_CONDITIONAL] = sizeof(conditional_expression_t),
+		[EXPR_SELECT] = sizeof(select_expression_t),
+		[EXPR_ARRAY_ACCESS] = sizeof(array_access_expression_t),
+		[EXPR_SIZEOF] = sizeof(sizeof_expression_t),
+		[EXPR_CLASSIFY_TYPE] = sizeof(classify_type_expression_t),
+		[EXPR_FUNCTION] = sizeof(string_literal_expression_t),
+		[EXPR_PRETTY_FUNCTION] = sizeof(string_literal_expression_t),
+		[EXPR_BUILTIN_SYMBOL] = sizeof(builtin_symbol_expression_t),
+		[EXPR_OFFSETOF] = sizeof(offsetof_expression_t),
+		[EXPR_VA_ARG] = sizeof(va_arg_expression_t),
+		[EXPR_STATEMENT] = sizeof(statement_expression_t)
 	};
 	assert(sizeof(sizes) / sizeof(sizes[0]) == EXPR_STATEMENT + 1);
 	assert(type <= EXPR_STATEMENT);
@@ -2803,6 +2804,15 @@ static expression_t *parse_string_const(void)
 	return cnst;
 }

+static expression_t *parse_wide_string_const(void)
+{ /* Builds an EXPR_WIDE_STRING_LITERAL node from the current token and advances past it. */
+	expression_t *const cnst = allocate_expression_zero(EXPR_WIDE_STRING_LITERAL);
+	cnst->base.datatype = type_wchar_t_ptr; /* pointer to type_wchar_t, created in initialize_builtins() */
+	cnst->wide_string.value = token.v.wide_string; /* TODO concatenate */
+	next_token(); /* consume the literal token */
+	return cnst;
+}
+
 static expression_t *parse_int_const(void)
 {
 	expression_t *cnst = allocate_expression_zero(EXPR_CONST);
@@ -3237,8 +3247,10 @@ static expression_t *parse_primary_expression(void)
 		return parse_int_const();
 	case T_FLOATINGPOINT:
 		return parse_float_const();
-	case T_STRING_LITERAL:
+	case T_STRING_LITERAL: /* TODO merge */
 		return parse_string_const();
+	case T_WIDE_STRING_LITERAL:
+		return parse_wide_string_const();
 	case T_IDENTIFIER:
 		return parse_reference();
 	case T___FUNCTION__:
@@ -4959,7 +4971,7 @@ static statement_t *parse_compound_statement(void)
 static void initialize_builtins(void)
 {
 	type_wchar_t = make_global_typedef("__WCHAR_TYPE__", type_int);
-	type_wchar_ptr_t = make_pointer_type(type_wchar_t, TYPE_QUALIFIER_NONE);
+	type_wchar_t_ptr = make_pointer_type(type_wchar_t, TYPE_QUALIFIER_NONE);
 	type_size_t = make_global_typedef("__SIZE_TYPE__",
 			make_atomic_type(ATOMIC_TYPE_ULONG, TYPE_QUALIFIER_NONE));
 	type_ptrdiff_t = make_global_typedef("__PTRDIFF_TYPE__",
diff --git a/token_t.h b/token_t.h
index da4bf3d..822d666 100644
--- a/token_t.h
+++ b/token_t.h
@@ -2,6 +2,7 @@
 #define TOKEN_T_H

 #include
+#include "string_rep.h"
 #include "symbol.h"
 #include "symbol_table.h"
 #include "type.h"
@@ -37,10 +38,11 @@ extern source_position_t builtin_source_position;
 typedef struct {
 	int type;
 	union {
-		symbol_t *symbol;
-		long long intvalue;
-		long double floatvalue;
-		const char *string;
+		symbol_t *symbol;
+		long long intvalue;
+		long double floatvalue;
+		const char *string;
+		wide_string_t wide_string;
 	} v;
 	type_t *datatype;
 	source_position_t source_position;
diff --git a/tokens.inc b/tokens.inc
index 6af3859..6a72d84 100644
--- a/tokens.inc
+++ b/tokens.inc
@@ -2,10 +2,11 @@
 #define TS(x,str,val)
 #endif

-TS(IDENTIFIER, "identifier", = 256)
-TS(INTEGER, "integer number",)
-TS(FLOATINGPOINT, "floatingpoint number",)
-TS(STRING_LITERAL, "string literal",)
+TS(IDENTIFIER, "identifier", = 256)
+TS(INTEGER, "integer number",)
+TS(FLOATINGPOINT, "floatingpoint number",)
+TS(STRING_LITERAL, "string literal",)
+TS(WIDE_STRING_LITERAL, "wide string literal",)

 #define S(x) T(x,#x,)
 S(auto)
-- 
2.20.1