Implement U, u and u8 strings.
author Christoph Mallon <christoph.mallon@gmx.de>
Sat, 27 Oct 2012 18:58:19 +0000 (20:58 +0200)
committer Christoph Mallon <christoph.mallon@gmx.de>
Tue, 30 Oct 2012 09:39:50 +0000 (10:39 +0100)
ast2firm.c
parser.c
preprocessor.c
string_rep.c
string_rep.h
token.c
types.c
types.h

index 11a4844..31aa71d 100644 (file)
@@ -64,7 +64,6 @@ fp_model_t firm_fp_model = fp_model_precise;
 static const backend_params *be_params;
 
 static ir_type *ir_type_char;
-static ir_type *ir_type_wchar_t;
 
 /* architecture specific floating point arithmetic mode (if any) */
 static ir_mode *mode_float_arithmetic;
@@ -1134,7 +1133,8 @@ static ir_node *string_to_firm(source_position_t const *const src_pos, char cons
        ir_initializer_t *const initializer = create_initializer_compound(slen);
        ir_type          *      elem_type;
        switch (value->encoding) {
-       case STRING_ENCODING_CHAR: {
+       case STRING_ENCODING_CHAR:
+       case STRING_ENCODING_UTF8: {
                elem_type = ir_type_char;
 
                ir_mode *const mode = get_type_mode(elem_type);
@@ -1147,8 +1147,13 @@ static ir_node *string_to_firm(source_position_t const *const src_pos, char cons
                goto finish;
        }
 
-       case STRING_ENCODING_WIDE: {
-               elem_type = ir_type_wchar_t;
+       {
+               type_t *type;
+       case STRING_ENCODING_CHAR16: type = type_char16_t; goto init_wide;
+       case STRING_ENCODING_CHAR32: type = type_char32_t; goto init_wide;
+       case STRING_ENCODING_WIDE:   type = type_wchar_t;  goto init_wide;
+init_wide:;
+               elem_type = get_ir_type(type);
 
                ir_mode *const mode = get_type_mode(elem_type);
                char const    *p    = value->begin;
@@ -3826,6 +3831,7 @@ static ir_initializer_t *create_ir_initializer_string(initializer_t const *const
        char const       *      p       = str->value.begin;
        switch (str->value.encoding) {
        case STRING_ENCODING_CHAR:
+       case STRING_ENCODING_UTF8:
                for (size_t i = 0; i != arr_len; ++i) {
                        char              const c      = i < str_len ? *p++ : 0;
                        ir_tarval        *const tv     = new_tarval_from_long(c, mode);
@@ -3834,6 +3840,8 @@ static ir_initializer_t *create_ir_initializer_string(initializer_t const *const
                }
                break;
 
+       case STRING_ENCODING_CHAR16:
+       case STRING_ENCODING_CHAR32:
        case STRING_ENCODING_WIDE:
                for (size_t i = 0; i != arr_len; ++i) {
                        utf32             const c      = i < str_len ? read_utf8_char(&p) : 0;
@@ -5348,8 +5356,7 @@ static void init_ir_types(void)
                return;
        ir_types_initialized = 1;
 
-       ir_type_char    = get_ir_type(type_char);
-       ir_type_wchar_t = get_ir_type(type_wchar_t);
+       ir_type_char = get_ir_type(type_char);
 
        be_params             = be_get_backend_param();
        mode_float_arithmetic = be_params->mode_float_arithmetic;
index 869cc25..8b0aadb 100644 (file)
--- a/parser.c
+++ b/parser.c
@@ -1060,8 +1060,13 @@ static string_t concat_string_literals(void)
                warningf(WARN_TRADITIONAL, HERE, "traditional C rejects string constant concatenation");
                string_encoding_t enc = token.literal.string.encoding;
                do {
-                       if (token.literal.string.encoding != STRING_ENCODING_CHAR) {
-                               enc = token.literal.string.encoding;
+                       string_encoding_t const new_enc = token.literal.string.encoding;
+                       if (new_enc != enc && new_enc != STRING_ENCODING_CHAR) {
+                               if (enc == STRING_ENCODING_CHAR) {
+                                       enc = new_enc;
+                               } else {
+                                       errorf(HERE, "concatenating string literals with encodings %s and %s", get_string_encoding_prefix(enc), get_string_encoding_prefix(new_enc));
+                               }
                        }
                        append_string(&token.literal.string);
                        eat(T_STRING_LITERAL);
@@ -1084,7 +1089,7 @@ static string_t parse_string_literals(char const *const context)
        string_t          const res = concat_string_literals();
 
        if (res.encoding != STRING_ENCODING_CHAR) {
-               errorf(&pos, "expected plain string literal, got wide string literal");
+               errorf(&pos, "expected plain string literal, got %s string literal", get_string_encoding_prefix(res.encoding));
        }
 
        return res;
@@ -1565,7 +1570,8 @@ static initializer_t *initializer_from_expression(type_t *orig_type,
                array_type_t *const array_type   = &type->array;
                type_t       *const element_type = skip_typeref(array_type->element_type);
                switch (expression->string_literal.value.encoding) {
-               case STRING_ENCODING_CHAR: {
+               case STRING_ENCODING_CHAR:
+               case STRING_ENCODING_UTF8: {
                        if (is_type_atomic(element_type, ATOMIC_TYPE_CHAR)  ||
                            is_type_atomic(element_type, ATOMIC_TYPE_SCHAR) ||
                            is_type_atomic(element_type, ATOMIC_TYPE_UCHAR)) {
@@ -1574,9 +1580,12 @@ static initializer_t *initializer_from_expression(type_t *orig_type,
                        break;
                }
 
+               case STRING_ENCODING_CHAR16:
+               case STRING_ENCODING_CHAR32:
                case STRING_ENCODING_WIDE: {
-                       type_t *bare_wchar_type = skip_typeref(type_wchar_t);
-                       if (get_unqualified_type(element_type) == bare_wchar_type) {
+                       assert(is_type_pointer(expression->base.type));
+                       type_t *const init_type = get_unqualified_type(expression->base.type->pointer.points_to);
+                       if (types_compatible(get_unqualified_type(element_type), init_type)) {
 make_string_init:;
                                initializer_t *const init = allocate_initializer_zero(INITIALIZER_STRING);
                                init->value.value = expression;
@@ -5633,8 +5642,11 @@ static type_t *get_string_type(string_encoding_t const enc)
 {
        bool const warn = is_warn_on(WARN_WRITE_STRINGS);
        switch (enc) {
-       case STRING_ENCODING_CHAR: return warn ? type_const_char_ptr    : type_char_ptr;
-       case STRING_ENCODING_WIDE: return warn ? type_const_wchar_t_ptr : type_wchar_t_ptr;
+       case STRING_ENCODING_CHAR:
+       case STRING_ENCODING_UTF8:   return warn ? type_const_char_ptr     : type_char_ptr;
+       case STRING_ENCODING_CHAR16: return warn ? type_char16_t_const_ptr : type_char16_t_ptr;
+       case STRING_ENCODING_CHAR32: return warn ? type_char32_t_const_ptr : type_char32_t_ptr;
+       case STRING_ENCODING_WIDE:   return warn ? type_const_wchar_t_ptr  : type_wchar_t_ptr;
        }
        panic("invalid string encoding");
 }
@@ -5875,6 +5887,7 @@ static expression_t *parse_character_constant(void)
        size_t const size = get_string_len(&token.literal.string);
        switch (token.literal.string.encoding) {
        case STRING_ENCODING_CHAR:
+       case STRING_ENCODING_UTF8:
                literal->base.type = c_mode & _CXX ? type_char : type_int;
                if (size > 1) {
                        if (!GNU_MODE && !(c_mode & _C99)) {
@@ -5886,8 +5899,10 @@ static expression_t *parse_character_constant(void)
                }
                break;
 
-       case STRING_ENCODING_WIDE:
-               literal->base.type = type_int;
+       case STRING_ENCODING_CHAR16: literal->base.type = type_char16_t; goto warn_multi;
+       case STRING_ENCODING_CHAR32: literal->base.type = type_char32_t; goto warn_multi;
+       case STRING_ENCODING_WIDE:   literal->base.type = type_wchar_t;  goto warn_multi;
+warn_multi:
                if (size > 1) {
                        warningf(WARN_MULTICHAR, HERE, "multi-character character constant");
                }
index 6e3daf3..55c1e3e 100644 (file)
@@ -136,6 +136,11 @@ static symbol_t *symbol_percentcolon;
 static symbol_t *symbol_percentcolonpercentcolon;
 static symbol_t *symbol_percentgreater;
 
+static symbol_t *symbol_L;
+static symbol_t *symbol_U;
+static symbol_t *symbol_u;
+static symbol_t *symbol_u8;
+
 static void init_symbols(void)
 {
        symbol_colongreater             = symbol_table_insert(":>");
@@ -144,6 +149,11 @@ static void init_symbols(void)
        symbol_percentcolon             = symbol_table_insert("%:");
        symbol_percentcolonpercentcolon = symbol_table_insert("%:%:");
        symbol_percentgreater           = symbol_table_insert("%>");
+
+       symbol_L  = symbol_table_insert("L");
+       symbol_U  = symbol_table_insert("U");
+       symbol_u  = symbol_table_insert("u");
+       symbol_u8 = symbol_table_insert("u8");
 }
 
 void switch_pp_input(FILE *const file, char const *const filename, searchpath_entry_t *const path, bool const is_system_header)
@@ -684,6 +694,18 @@ string_t make_string(char const *const string)
        return sym_make_string(STRING_ENCODING_CHAR);
 }
 
+static utf32 get_string_encoding_limit(string_encoding_t const enc)
+{
+       switch (enc) {
+       case STRING_ENCODING_CHAR:   return 0xFF;
+       case STRING_ENCODING_CHAR16: return 0xFFFF;
+       case STRING_ENCODING_CHAR32: return 0xFFFFFFFF;
+       case STRING_ENCODING_UTF8:   return 0xFFFFFFFF;
+       case STRING_ENCODING_WIDE:   return 0xFFFFFFFF; // FIXME depends on settings
+       }
+       panic("invalid string encoding");
+}
+
 static void parse_string(utf32 const delimiter, token_kind_t const kind,
                          string_encoding_t const enc,
                          char const *const context)
@@ -692,15 +714,16 @@ static void parse_string(utf32 const delimiter, token_kind_t const kind,
 
        eat(delimiter);
 
+       utf32 const limit = get_string_encoding_limit(enc);
        while (true) {
                switch (input.c) {
                case '\\': {
                        if (resolve_escape_sequences) {
                                utf32 const tc = parse_escape_sequence();
+                               if (tc > limit) {
+                                       warningf(WARN_OTHER, &pp_token.base.source_position, "escape sequence out of range");
+                               }
                                if (enc == STRING_ENCODING_CHAR) {
-                                       if (tc >= 0x100) {
-                                               warningf(WARN_OTHER, &pp_token.base.source_position, "escape sequence out of range");
-                                       }
                                        obstack_1grow(&symbol_obstack, tc);
                                } else {
                                        obstack_grow_utf8(&symbol_obstack, tc);
@@ -1133,6 +1156,17 @@ static inline void eat_token(token_kind_t const kind)
        next_input_token();
 }
 
+static string_encoding_t identify_encoding_prefix(symbol_t *const sym)
+{
+       if (sym == symbol_L) return STRING_ENCODING_WIDE;
+       if (c_mode & _C11) {
+               if (sym == symbol_U)  return STRING_ENCODING_CHAR32;
+               if (sym == symbol_u)  return STRING_ENCODING_CHAR16;
+               if (sym == symbol_u8) return STRING_ENCODING_UTF8;
+       }
+       return STRING_ENCODING_CHAR;
+}
+
 static void parse_symbol(void)
 {
        assert(obstack_object_size(&symbol_obstack) == 0);
@@ -1190,19 +1224,26 @@ end_symbol:
        obstack_1grow(&symbol_obstack, '\0');
        char *string = obstack_finish(&symbol_obstack);
 
-       /* might be a wide string or character constant ( L"string"/L'c' ) */
-       if (input.c == '"' && string[0] == 'L' && string[1] == '\0') {
-               obstack_free(&symbol_obstack, string);
-               parse_string_literal(STRING_ENCODING_WIDE);
-               return;
-       } else if (input.c == '\'' && string[0] == 'L' && string[1] == '\0') {
-               obstack_free(&symbol_obstack, string);
-               parse_character_constant(STRING_ENCODING_WIDE);
-               return;
-       }
-
        symbol_t *symbol = symbol_table_insert(string);
 
+       /* Might be a prefixed string or character constant: L/U/u/u8"string" or L/U/u'c'. */
+       if (input.c == '"') {
+               string_encoding_t const enc = identify_encoding_prefix(symbol);
+               if (enc != STRING_ENCODING_CHAR) {
+                       parse_string_literal(enc);
+                       return;
+               }
+       } else if (input.c == '\'') {
+               string_encoding_t const enc = identify_encoding_prefix(symbol);
+               if (enc != STRING_ENCODING_CHAR) {
+                       if (enc == STRING_ENCODING_UTF8) {
+                               errorf(&pp_token.base.source_position, "'u8' is not a valid encoding for a character constant");
+                       }
+                       parse_character_constant(enc);
+                       return;
+               }
+       }
+
        pp_token.kind        = symbol->ID;
        pp_token.base.symbol = symbol;
 
index ff58aad..28b0746 100644 (file)
@@ -16,8 +16,11 @@ static inline size_t wstrlen(const string_t *string)
 size_t get_string_len(string_t const *const str)
 {
        switch (str->encoding) {
-       case STRING_ENCODING_CHAR: return str->size;
-       case STRING_ENCODING_WIDE: return wstrlen(str);
+       case STRING_ENCODING_CHAR:
+       case STRING_ENCODING_UTF8:   return str->size;
+       case STRING_ENCODING_CHAR16:
+       case STRING_ENCODING_CHAR32:
+       case STRING_ENCODING_WIDE:   return wstrlen(str);
        }
        panic("invalid string encoding");
 }
index c0868eb..ce4ca3e 100644 (file)
@@ -25,6 +25,9 @@
 
 enum string_encoding_t {
        STRING_ENCODING_CHAR,
+       STRING_ENCODING_CHAR16,
+       STRING_ENCODING_CHAR32,
+       STRING_ENCODING_UTF8,
        STRING_ENCODING_WIDE
 };
 typedef enum string_encoding_t string_encoding_t;
diff --git a/token.c b/token.c
index 7d2104c..50de656 100644 (file)
--- a/token.c
+++ b/token.c
@@ -95,8 +95,11 @@ void print_token_kind(FILE *f, token_kind_t token_kind)
 char const *get_string_encoding_prefix(string_encoding_t const enc)
 {
        switch (enc) {
-       case STRING_ENCODING_CHAR: return "";
-       case STRING_ENCODING_WIDE: return "L";
+       case STRING_ENCODING_CHAR:   return "";
+       case STRING_ENCODING_CHAR16: return "u";
+       case STRING_ENCODING_CHAR32: return "U";
+       case STRING_ENCODING_UTF8:   return "u8";
+       case STRING_ENCODING_WIDE:   return "L";
        }
        panic("invalid string encoding");
 }
diff --git a/types.c b/types.c
index 24d38d3..d6ff22e 100644 (file)
--- a/types.c
+++ b/types.c
@@ -61,6 +61,10 @@ type_t *type_const_void_ptr_restrict;
 
 type_t *type_char_ptr_ptr;
 
+type_t *type_char16_t;
+type_t *type_char32_t;
+type_t *type_char16_t_const;
+type_t *type_char32_t_const;
 type_t *type_intmax_t;
 type_t *type_ptrdiff_t;
 type_t *type_size_t;
@@ -73,6 +77,10 @@ type_t *type_wint_t;
 type_t *type_int32_t;
 type_t *type_int64_t;
 
+type_t *type_char16_t_ptr;
+type_t *type_char32_t_ptr;
+type_t *type_char16_t_const_ptr;
+type_t *type_char32_t_const_ptr;
 type_t *type_intmax_t_ptr;
 type_t *type_ptrdiff_t_ptr;
 type_t *type_ssize_t_ptr;
@@ -198,4 +206,16 @@ void init_wchar_types(atomic_type_kind_t akind)
        type_wchar_t_ptr   = make_pointer_type(type_wchar_t, TYPE_QUALIFIER_NONE);
        type_const_wchar_t_ptr
                = make_pointer_type(type_const_wchar_t, TYPE_QUALIFIER_NONE);
+
+       atomic_type_kind_t const u2 = find_unsigned_int_atomic_type_kind_for_size(2);
+       type_char16_t           = make_atomic_type(u2, TYPE_QUALIFIER_NONE);
+       type_char16_t_const     = make_atomic_type(u2, TYPE_QUALIFIER_CONST);
+       type_char16_t_ptr       = make_pointer_type(type_char16_t,       TYPE_QUALIFIER_NONE);
+       type_char16_t_const_ptr = make_pointer_type(type_char16_t_const, TYPE_QUALIFIER_NONE);
+
+       atomic_type_kind_t const u4 = find_unsigned_int_atomic_type_kind_for_size(4);
+       type_char32_t           = make_atomic_type(u4, TYPE_QUALIFIER_NONE);
+       type_char32_t_const     = make_atomic_type(u4, TYPE_QUALIFIER_CONST);
+       type_char32_t_ptr       = make_pointer_type(type_char32_t,       TYPE_QUALIFIER_NONE);
+       type_char32_t_const_ptr = make_pointer_type(type_char32_t_const, TYPE_QUALIFIER_NONE);
 }
diff --git a/types.h b/types.h
index b173a00..e2cc949 100644 (file)
--- a/types.h
+++ b/types.h
@@ -60,6 +60,10 @@ extern type_t *type_const_void_ptr_restrict;
 
 extern type_t *type_char_ptr_ptr;
 
+extern type_t *type_char16_t;
+extern type_t *type_char32_t;
+extern type_t *type_char16_t_const;
+extern type_t *type_char32_t_const;
 extern type_t *type_intmax_t;
 extern type_t *type_ptrdiff_t;
 extern type_t *type_size_t;
@@ -73,6 +77,10 @@ extern type_t *type_wint_t;
 extern type_t *type_int32_t;
 extern type_t *type_int64_t;
 
+extern type_t *type_char16_t_ptr;
+extern type_t *type_char32_t_ptr;
+extern type_t *type_char16_t_const_ptr;
+extern type_t *type_char32_t_const_ptr;
 extern type_t *type_intmax_t_ptr;
 extern type_t *type_ptrdiff_t_ptr;
 extern type_t *type_ssize_t_ptr;