First iteration in adding wide string literal support. No input encoding handling...
author Christoph Mallon <christoph.mallon@gmx.de>
Thu, 29 Nov 2007 16:36:36 +0000 (16:36 +0000)
committer Christoph Mallon <christoph.mallon@gmx.de>
Thu, 29 Nov 2007 16:36:36 +0000 (16:36 +0000)
[r18569]

Makefile
ast.c
ast.h
ast2firm.c
ast_t.h
lexer.c
parser.c
token_t.h
tokens.inc

diff --git a/Makefile b/Makefile
index 7178571..e078b3a 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 GOAL = cparser
 
-FIRM_HOME = $(HOME)/projects/firm
+FIRM_HOME = $(HOME)/jambuild/
 FIRM_BUILD = $(FIRM_HOME)/build/i686-pc-linux-gnu/debug/
 FIRM_CFLAGS = -I$(FIRM_HOME)/libfirm/include -I$(FIRM_HOME)/obstack -I$(FIRM_HOME)/libcore -I$(FIRM_HOME)/libcore/libcore -I$(FIRM_HOME)
 FIRM_LIBS = -L$(FIRM_BUILD) -lfirm -llpp -lcore -lm -lz -ldl
@@ -65,7 +65,7 @@ build/adt:
 
 build/%.o: %.c
        @echo '===> CC $<'
-       $(Q)icc $(CPPFLAGS) $(ICC_CFLAGS) -c $< -o $@
+#      $(Q)icc $(CPPFLAGS) $(ICC_CFLAGS) -c $< -o $@
        $(Q)$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@
 
 clean:
diff --git a/ast.c b/ast.c
index 0ba0492..87d490e 100644 (file)
--- a/ast.c
+++ b/ast.c
@@ -74,6 +74,51 @@ static void print_string_literal(
        print_quoted_string(string_literal->value);
 }
 
+/**
+ * Print a wide string literal in C source form (L"...") to `out`.
+ *
+ * Quote, backslash, '?' and the standard control characters are written as
+ * escape sequences, other non-printable values below 0x80 as three-digit
+ * octal escapes, and values from 0x80 upwards as a UTF-8 byte sequence.
+ *
+ * The lexer counts the 0 terminator it appends in value.size; that terminator
+ * is not part of the spelled literal, so it is not printed.
+ */
+static void print_wide_string_literal(
+       const wide_string_literal_expression_t *const wstr)
+{
+       const wchar_rep_t *const begin = wstr->value.begin;
+       const wchar_rep_t       *end   = begin + wstr->value.size;
+
+       /* drop the implicit terminator: L"abc" must not come out as L"abc\000" */
+       if (end != begin && end[-1] == 0) {
+               --end;
+       }
+
+       fputs("L\"", out);
+       for (const wchar_rep_t *c = begin; c != end; ++c) {
+               switch (*c) {
+                       case L'\"':  fputs("\\\"", out); break;
+                       case L'\\':  fputs("\\\\", out); break;
+                       case L'\a':  fputs("\\a",  out); break;
+                       case L'\b':  fputs("\\b",  out); break;
+                       case L'\f':  fputs("\\f",  out); break;
+                       case L'\n':  fputs("\\n",  out); break;
+                       case L'\r':  fputs("\\r",  out); break;
+                       case L'\t':  fputs("\\t",  out); break;
+                       case L'\v':  fputs("\\v",  out); break;
+                       case L'\?':  fputs("\\?",  out); break;
+                       default: {
+                               const unsigned tc = *c;
+                               if (tc < 0x80U) {
+                                       if (!isprint((int)tc)) {
+                                               /* %o expects an unsigned argument */
+                                               fprintf(out, "\\%03o", tc);
+                                       } else {
+                                               fputc((int)tc, out);
+                                       }
+                               } else if (tc < 0x800) {
+                                       /* two-byte UTF-8 sequence */
+                                       fputc(0xC0 | (tc >> 6),   out);
+                                       fputc(0x80 | (tc & 0x3F), out);
+                               } else if (tc < 0x10000) {
+                                       /* three-byte UTF-8 sequence */
+                                       fputc(0xE0 | ( tc >> 12),         out);
+                                       fputc(0x80 | ((tc >>  6) & 0x3F), out);
+                                       fputc(0x80 | ( tc        & 0x3F), out);
+                               } else {
+                                       /* four-byte UTF-8 sequence; no upper range check is done */
+                                       fputc(0xF0 | ( tc >> 18),         out);
+                                       fputc(0x80 | ((tc >> 12) & 0x3F), out);
+                                       fputc(0x80 | ((tc >>  6) & 0x3F), out);
+                                       fputc(0x80 | ( tc        & 0x3F), out);
+                               }
+                       }
+               }
+       }
+       fputc('"', out);
+}
+
 static void print_call_expression(const call_expression_t *call)
 {
        print_expression(call->function);
@@ -274,6 +319,9 @@ void print_expression(const expression_t *expression)
        case EXPR_STRING_LITERAL:
                print_string_literal(&expression->string);
                break;
+       case EXPR_WIDE_STRING_LITERAL:
+               print_wide_string_literal(&expression->wide_string);
+               break;
        case EXPR_CALL:
                print_call_expression(&expression->call);
                break;
diff --git a/ast.h b/ast.h
index ad83e5f..fdb02b3 100644 (file)
--- a/ast.h
+++ b/ast.h
@@ -5,29 +5,30 @@
 
 typedef struct context_t                    context_t;
 
-typedef struct expression_base_t            expression_base_t;
-typedef struct const_expression_t           const_expression_t;
-typedef struct string_literal_expression_t  string_literal_expression_t;
-typedef struct reference_expression_t       reference_expression_t;
-typedef struct cast_expression_t            cast_expression_t;
-typedef struct call_argument_t              call_argument_t;
-typedef struct type_argument_t              type_argument_t;
-typedef struct call_expression_t            call_expression_t;
-typedef struct binary_expression_t          binary_expression_t;
-typedef struct unary_expression_t           unary_expression_t;
-typedef struct select_expression_t          select_expression_t;
-typedef struct array_access_expression_t    array_access_expression_t;
-typedef struct sizeof_expression_t          sizeof_expression_t;
-typedef struct conditional_expression_t     conditional_expression_t;
-typedef struct expression_list_element_t    expression_list_element_t;
-typedef struct comma_expression_t           comma_expression_t;
-typedef struct statement_expression_t       statement_expression_t;
-typedef struct designator_t                 designator_t;
-typedef struct offsetof_expression_t        offsetof_expression_t;
-typedef struct va_arg_expression_t          va_arg_expression_t;
-typedef struct builtin_symbol_expression_t  builtin_symbol_expression_t;
-typedef struct classify_type_expression_t   classify_type_expression_t;
-typedef union  expression_t                 expression_t;
+typedef struct expression_base_t                expression_base_t;
+typedef struct const_expression_t               const_expression_t;
+typedef struct string_literal_expression_t      string_literal_expression_t;
+typedef struct wide_string_literal_expression_t wide_string_literal_expression_t;
+typedef struct reference_expression_t           reference_expression_t;
+typedef struct cast_expression_t                cast_expression_t;
+typedef struct call_argument_t                  call_argument_t;
+typedef struct type_argument_t                  type_argument_t;
+typedef struct call_expression_t                call_expression_t;
+typedef struct binary_expression_t              binary_expression_t;
+typedef struct unary_expression_t               unary_expression_t;
+typedef struct select_expression_t              select_expression_t;
+typedef struct array_access_expression_t        array_access_expression_t;
+typedef struct sizeof_expression_t              sizeof_expression_t;
+typedef struct conditional_expression_t         conditional_expression_t;
+typedef struct expression_list_element_t        expression_list_element_t;
+typedef struct comma_expression_t               comma_expression_t;
+typedef struct statement_expression_t           statement_expression_t;
+typedef struct designator_t                     designator_t;
+typedef struct offsetof_expression_t            offsetof_expression_t;
+typedef struct va_arg_expression_t              va_arg_expression_t;
+typedef struct builtin_symbol_expression_t      builtin_symbol_expression_t;
+typedef struct classify_type_expression_t       classify_type_expression_t;
+typedef union  expression_t                     expression_t;
 
 typedef struct initializer_base_t           initializer_base_t;
 typedef struct initializer_list_t           initializer_list_t;
diff --git a/ast2firm.c b/ast2firm.c
index f5b8d91..315da62 100644 (file)
--- a/ast2firm.c
+++ b/ast2firm.c
@@ -21,6 +21,7 @@
 #define MAGIC_DEFAULT_PN_NUMBER            (long) -314159265
 
 static ir_type *ir_type_const_char;
+static ir_type *ir_type_wchar_t;
 static ir_type *ir_type_void;
 static ir_type *ir_type_int;
 
@@ -86,16 +87,6 @@ const char *retrieve_dbg(const dbg_info *dbg, unsigned *line)
 
 void init_ast2firm(void)
 {
-       type_const_char = make_atomic_type(ATOMIC_TYPE_CHAR, TYPE_QUALIFIER_CONST);
-       type_void       = make_atomic_type(ATOMIC_TYPE_VOID, TYPE_QUALIFIER_NONE);
-       type_int        = make_atomic_type(ATOMIC_TYPE_INT,  TYPE_QUALIFIER_NONE);
-
-       ir_type_int        = get_ir_type(type_int);
-       ir_type_const_char = get_ir_type(type_const_char);
-       ir_type_void       = get_ir_type(type_int); /* we don't have a real void
-                                                      type in firm */
-
-       type_void->base.firm_type = ir_type_void;
 }
 
 void exit_ast2firm(void)
@@ -652,6 +643,42 @@ static ir_node *string_literal_to_firm(
                              literal->value);
 }
 
+/**
+ * Create a constant global array entity holding the characters of a wide
+ * string literal and return a symconst node referring to it.
+ *
+ * The array has one element of the wchar_t firm type per entry of
+ * literal->value (literal->value.size entries in total).
+ */
+static ir_node *wide_string_literal_to_firm(
+       const wide_string_literal_expression_t* const literal)
+{
+       ir_type *const global_type = get_glob_type();
+       ir_type *const elem_type   = ir_type_wchar_t;
+       ir_type *const type        = new_type_array(unique_ident("strtype"), 1,
+                                                   elem_type);
+
+       ident     *const id     = unique_ident("Lstr");
+       ir_entity *const entity = new_entity(global_type, id, type);
+       set_entity_ld_ident(entity, id);
+       set_entity_variability(entity, variability_constant);
+
+       ir_mode *const mode      = get_type_mode(elem_type);
+
+       const wchar_rep_t *const string = literal->value.begin;
+       const size_t             slen   = literal->value.size;
+
+       set_array_lower_bound_int(type, 0, 0);
+       set_array_upper_bound_int(type, 0, slen);
+       /* the byte size is the element count times the element size; unlike
+        * for char strings the two are not the same number */
+       set_type_size_bytes(type, slen * get_type_size_bytes(elem_type));
+       set_type_state(type, layout_fixed);
+
+       /* temporary array of initializer values, one tarval per character */
+       tarval **const tvs = xmalloc(slen * sizeof(tvs[0]));
+       for(size_t i = 0; i < slen; ++i) {
+               tvs[i] = new_tarval_from_long(string[i], mode);
+       }
+
+       set_array_entity_values(entity, tvs, slen);
+       free(tvs);
+
+       dbg_info *const dbgi = get_dbg_info(&literal->expression.source_position);
+
+       return create_symconst(dbgi, entity);
+}
+
 static ir_node *deref_address(ir_type *const irtype, ir_node *const addr,
                               dbg_info *const dbgi)
 {
@@ -1707,6 +1734,8 @@ static ir_node *_expression_to_firm(const expression_t *expression)
                return const_to_firm(&expression->conste);
        case EXPR_STRING_LITERAL:
                return string_literal_to_firm(&expression->string);
+       case EXPR_WIDE_STRING_LITERAL:
+               return wide_string_literal_to_firm(&expression->wide_string);
        case EXPR_REFERENCE:
                return reference_expression_to_firm(&expression->reference);
        case EXPR_CALL:
@@ -3055,6 +3084,18 @@ static void context_to_firm(context_t *context)
 
 void translation_unit_to_firm(translation_unit_t *unit)
 {
+       type_const_char = make_atomic_type(ATOMIC_TYPE_CHAR, TYPE_QUALIFIER_CONST);
+       type_void       = make_atomic_type(ATOMIC_TYPE_VOID, TYPE_QUALIFIER_NONE);
+       type_int        = make_atomic_type(ATOMIC_TYPE_INT,  TYPE_QUALIFIER_NONE);
+
+       ir_type_int        = get_ir_type(type_int);
+       ir_type_const_char = get_ir_type(type_const_char);
+       ir_type_wchar_t    = get_ir_type(type_wchar_t);
+       ir_type_void       = get_ir_type(type_int); /* we don't have a real void
+                                                      type in firm */
+
+       type_void->base.firm_type = ir_type_void;
+
        /* just to be sure */
        continue_label      = NULL;
        break_label         = NULL;
diff --git a/ast_t.h b/ast_t.h
index 9216030..dbcc4f4 100644 (file)
--- a/ast_t.h
+++ b/ast_t.h
@@ -18,6 +18,7 @@ typedef enum {
        EXPR_REFERENCE,
        EXPR_CONST,
        EXPR_STRING_LITERAL,
+       EXPR_WIDE_STRING_LITERAL,
        EXPR_CALL,
        EXPR_UNARY,
        EXPR_BINARY,
@@ -58,6 +59,11 @@ struct string_literal_expression_t {
        const char        *value;
 };
 
+struct wide_string_literal_expression_t {
+       expression_base_t  expression; /* common expression header (source position is read through it in ast2firm.c) */
+       wide_string_t      value;      /* literal contents: .begin points at the characters, .size is the element count including the 0 terminator appended by the lexer */
+};
+
 struct builtin_symbol_expression_t {
        expression_base_t  expression;
        symbol_t          *symbol;
@@ -200,23 +206,24 @@ struct classify_type_expression_t {
 };
 
 union expression_t {
-       expression_type_t            type;
-       expression_base_t            base;
-       const_expression_t           conste;
-       string_literal_expression_t  string;
-       builtin_symbol_expression_t  builtin_symbol;
-       reference_expression_t       reference;
-       call_expression_t            call;
-       unary_expression_t           unary;
-       binary_expression_t          binary;
-       select_expression_t          select;
-       array_access_expression_t    array_access;
-       sizeof_expression_t          sizeofe;
-       offsetof_expression_t        offsetofe;
-       va_arg_expression_t          va_arge;
-       conditional_expression_t     conditional;
-       statement_expression_t       statement;
-       classify_type_expression_t   classify_type;
+       expression_type_t                type;
+       expression_base_t                base;
+       const_expression_t               conste;
+       string_literal_expression_t      string;
+       wide_string_literal_expression_t wide_string;
+       builtin_symbol_expression_t      builtin_symbol;
+       reference_expression_t           reference;
+       call_expression_t                call;
+       unary_expression_t               unary;
+       binary_expression_t              binary;
+       select_expression_t              select;
+       array_access_expression_t        array_access;
+       sizeof_expression_t              sizeofe;
+       offsetof_expression_t            offsetofe;
+       va_arg_expression_t              va_arge;
+       conditional_expression_t         conditional;
+       statement_expression_t           statement;
+       classify_type_expression_t       classify_type;
 };
 
 typedef enum {
diff --git a/lexer.c b/lexer.c
index 38473a2..82bccd0 100644 (file)
--- a/lexer.c
+++ b/lexer.c
@@ -754,6 +754,66 @@ end_of_string:
        lexer_token.v.string = result;
 }
 
+/**
+ * Lex the body of a wide string literal; the current character must be the
+ * opening '"' (the L prefix has already been consumed by the caller).
+ *
+ * Characters are collected as wchar_rep_t values on symbol_obstack and a 0
+ * terminator is appended. On success lexer_token becomes a
+ * T_WIDE_STRING_LITERAL whose size counts the terminator. If the input ends
+ * before the closing quote an error is reported and the token type is set to
+ * T_ERROR.
+ *
+ * Input characters are copied without any decoding (no input encoding
+ * handling yet).
+ */
+static void parse_wide_string_literal(void)
+{
+       const unsigned start_linenr = lexer_token.source_position.linenr;
+
+       assert(c == '"');
+       next_char();
+
+       while(1) {
+               switch(c) {
+                       case '\\': {
+                               wchar_rep_t tc = parse_escape_sequence();
+                               obstack_grow(&symbol_obstack, &tc, sizeof(tc));
+                               break;
+                       }
+
+                       case EOF: {
+                               error_prefix_at(lexer_token.source_position.input_name,
+                                               start_linenr);
+                               fprintf(stderr, "string has no end\n");
+                               /* discard the characters collected so far, otherwise
+                                * they would become the prefix of the next object
+                                * grown on symbol_obstack */
+                               void *const partial = obstack_finish(&symbol_obstack);
+                               obstack_free(&symbol_obstack, partial);
+                               lexer_token.type = T_ERROR;
+                               return;
+                       }
+
+                       case '"':
+                               next_char();
+                               goto end_of_string;
+
+                       default: {
+                               wchar_rep_t tc = c;
+                               obstack_grow(&symbol_obstack, &tc, sizeof(tc));
+                               next_char();
+                               break;
+                       }
+               }
+       }
+
+end_of_string:;
+
+       /* TODO: concatenate multiple strings separated by whitespace... */
+
+       /* add finishing 0 to the string */
+       wchar_rep_t nul = L'\0';
+       obstack_grow(&symbol_obstack, &nul, sizeof(nul));
+       /* element count (terminator included); must be read before
+        * obstack_finish() closes the object */
+       const size_t             size   = (size_t)obstack_object_size(&symbol_obstack) / sizeof(wchar_rep_t);
+       const wchar_rep_t *const string = obstack_finish(&symbol_obstack);
+
+#if 0 /* TODO hash */
+       /* check if there is already a copy of the string */
+       const wchar_rep_t *const result = strset_insert(&stringset, string);
+       if(result != string) {
+               obstack_free(&symbol_obstack, string);
+       }
+#else
+       const wchar_rep_t *const result = string;
+#endif
+
+       lexer_token.type                = T_WIDE_STRING_LITERAL;
+       lexer_token.v.wide_string.begin = result;
+       lexer_token.v.wide_string.size  = size;
+}
+
 static void parse_character_constant(void)
 {
        eat('\'');
@@ -1003,8 +1063,7 @@ void lexer_next_preprocessing_token(void)
                        /* might be a wide string ( L"string" ) */
                        if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
                           lexer_token.v.symbol == symbol_L)) {
-                               parse_string_literal();
-                               return;
+                               parse_wide_string_literal();
                        }
                        return;
 
diff --git a/parser.c b/parser.c
index 26c9fc0..a7b334b 100644 (file)
--- a/parser.c
+++ b/parser.c
@@ -59,7 +59,7 @@ static type_t         *type_void_ptr    = NULL;
 type_t *type_size_t      = NULL;
 type_t *type_ptrdiff_t   = NULL;
 type_t *type_wchar_t     = NULL;
-type_t *type_wchar_ptr_t = NULL;
+type_t *type_wchar_t_ptr = NULL;
 
 static statement_t *parse_compound_statement(void);
 static statement_t *parse_statement(void);
@@ -168,24 +168,25 @@ static statement_t *allocate_statement_zero(statement_type_t type)
 static size_t get_expression_struct_size(expression_type_t type)
 {
        static const size_t sizes[] = {
-               [EXPR_INVALID]         = sizeof(expression_base_t),
-               [EXPR_REFERENCE]       = sizeof(reference_expression_t),
-               [EXPR_CONST]           = sizeof(const_expression_t),
-               [EXPR_STRING_LITERAL]  = sizeof(string_literal_expression_t),
-               [EXPR_CALL]            = sizeof(call_expression_t),
-               [EXPR_UNARY]           = sizeof(unary_expression_t),
-               [EXPR_BINARY]          = sizeof(binary_expression_t),
-               [EXPR_CONDITIONAL]     = sizeof(conditional_expression_t),
-               [EXPR_SELECT]          = sizeof(select_expression_t),
-               [EXPR_ARRAY_ACCESS]    = sizeof(array_access_expression_t),
-               [EXPR_SIZEOF]          = sizeof(sizeof_expression_t),
-               [EXPR_CLASSIFY_TYPE]   = sizeof(classify_type_expression_t),
-               [EXPR_FUNCTION]        = sizeof(string_literal_expression_t),
-               [EXPR_PRETTY_FUNCTION] = sizeof(string_literal_expression_t),
-               [EXPR_BUILTIN_SYMBOL]  = sizeof(builtin_symbol_expression_t),
-               [EXPR_OFFSETOF]        = sizeof(offsetof_expression_t),
-               [EXPR_VA_ARG]          = sizeof(va_arg_expression_t),
-               [EXPR_STATEMENT]       = sizeof(statement_expression_t)
+               [EXPR_INVALID]             = sizeof(expression_base_t),
+               [EXPR_REFERENCE]           = sizeof(reference_expression_t),
+               [EXPR_CONST]               = sizeof(const_expression_t),
+               [EXPR_STRING_LITERAL]      = sizeof(string_literal_expression_t),
+               [EXPR_WIDE_STRING_LITERAL] = sizeof(wide_string_literal_expression_t),
+               [EXPR_CALL]                = sizeof(call_expression_t),
+               [EXPR_UNARY]               = sizeof(unary_expression_t),
+               [EXPR_BINARY]              = sizeof(binary_expression_t),
+               [EXPR_CONDITIONAL]         = sizeof(conditional_expression_t),
+               [EXPR_SELECT]              = sizeof(select_expression_t),
+               [EXPR_ARRAY_ACCESS]        = sizeof(array_access_expression_t),
+               [EXPR_SIZEOF]              = sizeof(sizeof_expression_t),
+               [EXPR_CLASSIFY_TYPE]       = sizeof(classify_type_expression_t),
+               [EXPR_FUNCTION]            = sizeof(string_literal_expression_t),
+               [EXPR_PRETTY_FUNCTION]     = sizeof(string_literal_expression_t),
+               [EXPR_BUILTIN_SYMBOL]      = sizeof(builtin_symbol_expression_t),
+               [EXPR_OFFSETOF]            = sizeof(offsetof_expression_t),
+               [EXPR_VA_ARG]              = sizeof(va_arg_expression_t),
+               [EXPR_STATEMENT]           = sizeof(statement_expression_t)
        };
        assert(sizeof(sizes) / sizeof(sizes[0]) == EXPR_STATEMENT + 1);
        assert(type <= EXPR_STATEMENT);
@@ -2803,6 +2804,15 @@ static expression_t *parse_string_const(void)
        return cnst;
 }
 
+/**
+ * Turn the current wide string token into an EXPR_WIDE_STRING_LITERAL
+ * expression and advance to the next token.
+ */
+static expression_t *parse_wide_string_const(void)
+{
+       expression_t *const literal = allocate_expression_zero(EXPR_WIDE_STRING_LITERAL);
+
+       /* TODO concatenate */
+       literal->wide_string.value = token.v.wide_string;
+       literal->base.datatype     = type_wchar_t_ptr;
+
+       next_token();
+       return literal;
+}
+
 static expression_t *parse_int_const(void)
 {
        expression_t *cnst       = allocate_expression_zero(EXPR_CONST);
@@ -3237,8 +3247,10 @@ static expression_t *parse_primary_expression(void)
                return parse_int_const();
        case T_FLOATINGPOINT:
                return parse_float_const();
-       case T_STRING_LITERAL:
+       case T_STRING_LITERAL: /* TODO merge */
                return parse_string_const();
+       case T_WIDE_STRING_LITERAL:
+               return parse_wide_string_const();
        case T_IDENTIFIER:
                return parse_reference();
        case T___FUNCTION__:
@@ -4959,7 +4971,7 @@ static statement_t *parse_compound_statement(void)
 static void initialize_builtins(void)
 {
        type_wchar_t     = make_global_typedef("__WCHAR_TYPE__", type_int);
-       type_wchar_ptr_t = make_pointer_type(type_wchar_t, TYPE_QUALIFIER_NONE);
+       type_wchar_t_ptr = make_pointer_type(type_wchar_t, TYPE_QUALIFIER_NONE);
        type_size_t      = make_global_typedef("__SIZE_TYPE__",
                        make_atomic_type(ATOMIC_TYPE_ULONG, TYPE_QUALIFIER_NONE));
        type_ptrdiff_t   = make_global_typedef("__PTRDIFF_TYPE__",
diff --git a/token_t.h b/token_t.h
index da4bf3d..822d666 100644 (file)
--- a/token_t.h
+++ b/token_t.h
@@ -2,6 +2,7 @@
 #define TOKEN_T_H
 
 #include <stdio.h>
+#include "string_rep.h"
 #include "symbol.h"
 #include "symbol_table.h"
 #include "type.h"
@@ -37,10 +38,11 @@ extern source_position_t builtin_source_position;
 typedef struct {
        int type;
        union {
-               symbol_t   *symbol;
-               long long   intvalue;
-               long double floatvalue;
-               const char *string;
+               symbol_t      *symbol;
+               long long      intvalue;
+               long double    floatvalue;
+               const char    *string;
+               wide_string_t  wide_string;
        } v;
        type_t            *datatype;
        source_position_t  source_position;
diff --git a/tokens.inc b/tokens.inc
index 6af3859..6a72d84 100644 (file)
--- a/tokens.inc
+++ b/tokens.inc
@@ -2,10 +2,11 @@
 #define TS(x,str,val)
 #endif
 
-TS(IDENTIFIER,     "identifier", = 256)
-TS(INTEGER,        "integer number",)
-TS(FLOATINGPOINT,  "floatingpoint number",)
-TS(STRING_LITERAL, "string literal",)
+TS(IDENTIFIER,          "identifier", = 256)
+TS(INTEGER,             "integer number",)
+TS(FLOATINGPOINT,       "floatingpoint number",)
+TS(STRING_LITERAL,      "string literal",)
+TS(WIDE_STRING_LITERAL, "wide string literal",)
 
 #define S(x)   T(x,#x,)
 S(auto)