Implement U, u and u8 strings.

author Christoph Mallon <christoph.mallon@gmx.de>

Sat, 27 Oct 2012 18:58:19 +0000 (20:58 +0200)

committer Christoph Mallon <christoph.mallon@gmx.de>

Tue, 30 Oct 2012 09:39:50 +0000 (10:39 +0100)
author Christoph Mallon <christoph.mallon@gmx.de>
Sat, 27 Oct 2012 18:58:19 +0000 (20:58 +0200)
committer Christoph Mallon <christoph.mallon@gmx.de>
Tue, 30 Oct 2012 09:39:50 +0000 (10:39 +0100)
diff --git a/ast2firm.c b/ast2firm.c

index 11a4844..31aa71d 100644 (file)
--- a/ast2firm.c
+++ b/ast2firm.c
@@ -64,7 +64,6 @@ fp_model_t firm_fp_model = fp_model_precise;
  static const backend_params *be_params;
  
  static ir_type *ir_type_char;
-static ir_type *ir_type_wchar_t;
  
  /* architecture specific floating point arithmetic mode (if any) */
  static ir_mode *mode_float_arithmetic;
@@ -1134,7 +1133,8 @@ static ir_node *string_to_firm(source_position_t const *const src_pos, char cons
         ir_initializer_t *const initializer = create_initializer_compound(slen);
         ir_type          *      elem_type;
         switch (value->encoding) {
-       case STRING_ENCODING_CHAR: {
+       case STRING_ENCODING_CHAR:
+       case STRING_ENCODING_UTF8: {
                 elem_type = ir_type_char;
  
                 ir_mode *const mode = get_type_mode(elem_type);
@@ -1147,8 +1147,13 @@ static ir_node *string_to_firm(source_position_t const *const src_pos, char cons
                 goto finish;
         }
  
-       case STRING_ENCODING_WIDE: {
-               elem_type = ir_type_wchar_t;
+       {
+               type_t *type;
+       case STRING_ENCODING_CHAR16: type = type_char16_t; goto init_wide;
+       case STRING_ENCODING_CHAR32: type = type_char32_t; goto init_wide;
+       case STRING_ENCODING_WIDE:   type = type_wchar_t;  goto init_wide;
+init_wide:;
+               elem_type = get_ir_type(type);
  
                 ir_mode *const mode = get_type_mode(elem_type);
                 char const    *p    = value->begin;
@@ -3826,6 +3831,7 @@ static ir_initializer_t *create_ir_initializer_string(initializer_t const *const
         char const       *      p       = str->value.begin;
         switch (str->value.encoding) {
         case STRING_ENCODING_CHAR:
+       case STRING_ENCODING_UTF8:
                 for (size_t i = 0; i != arr_len; ++i) {
                         char              const c      = i < str_len ? *p++ : 0;
                         ir_tarval        *const tv     = new_tarval_from_long(c, mode);
@@ -3834,6 +3840,8 @@ static ir_initializer_t *create_ir_initializer_string(initializer_t const *const
                 }
                 break;
  
+       case STRING_ENCODING_CHAR16:
+       case STRING_ENCODING_CHAR32:
         case STRING_ENCODING_WIDE:
                 for (size_t i = 0; i != arr_len; ++i) {
                         utf32             const c      = i < str_len ? read_utf8_char(&p) : 0;
@@ -5348,8 +5356,7 @@ static void init_ir_types(void)
                 return;
         ir_types_initialized = 1;
  
-       ir_type_char    = get_ir_type(type_char);
-       ir_type_wchar_t = get_ir_type(type_wchar_t);
+       ir_type_char = get_ir_type(type_char);
  
         be_params             = be_get_backend_param();
         mode_float_arithmetic = be_params->mode_float_arithmetic;
diff --git a/parser.c b/parser.c

index 869cc25..8b0aadb 100644 (file)
--- a/parser.c
+++ b/parser.c
@@ -1060,8 +1060,13 @@ static string_t concat_string_literals(void)
                 warningf(WARN_TRADITIONAL, HERE, "traditional C rejects string constant concatenation");
                 string_encoding_t enc = token.literal.string.encoding;
                 do {
-                       if (token.literal.string.encoding != STRING_ENCODING_CHAR) {
-                               enc = token.literal.string.encoding;
+                       string_encoding_t const new_enc = token.literal.string.encoding;
+                       if (new_enc != enc && new_enc != STRING_ENCODING_CHAR) {
+                               if (enc == STRING_ENCODING_CHAR) {
+                                       enc = new_enc;
+                               } else {
+                                       errorf(HERE, "concatenating string literals with encodings %s and %s", get_string_encoding_prefix(enc), get_string_encoding_prefix(new_enc));
+                               }
                         }
                         append_string(&token.literal.string);
                         eat(T_STRING_LITERAL);
@@ -1084,7 +1089,7 @@ static string_t parse_string_literals(char const *const context)
         string_t          const res = concat_string_literals();
  
         if (res.encoding != STRING_ENCODING_CHAR) {
-               errorf(&pos, "expected plain string literal, got wide string literal");
+               errorf(&pos, "expected plain string literal, got %s string literal", get_string_encoding_prefix(res.encoding));
         }
  
         return res;
@@ -1565,7 +1570,8 @@ static initializer_t *initializer_from_expression(type_t *orig_type,
                 array_type_t *const array_type   = &type->array;
                 type_t       *const element_type = skip_typeref(array_type->element_type);
                 switch (expression->string_literal.value.encoding) {
-               case STRING_ENCODING_CHAR: {
+               case STRING_ENCODING_CHAR:
+               case STRING_ENCODING_UTF8: {
                         if (is_type_atomic(element_type, ATOMIC_TYPE_CHAR)  ||
                             is_type_atomic(element_type, ATOMIC_TYPE_SCHAR) ||
                             is_type_atomic(element_type, ATOMIC_TYPE_UCHAR)) {
@@ -1574,9 +1580,12 @@ static initializer_t *initializer_from_expression(type_t *orig_type,
                         break;
                 }
  
+               case STRING_ENCODING_CHAR16:
+               case STRING_ENCODING_CHAR32:
                 case STRING_ENCODING_WIDE: {
-                       type_t *bare_wchar_type = skip_typeref(type_wchar_t);
-                       if (get_unqualified_type(element_type) == bare_wchar_type) {
+                       assert(is_type_pointer(expression->base.type));
+                       type_t *const init_type = get_unqualified_type(expression->base.type->pointer.points_to);
+                       if (types_compatible(get_unqualified_type(element_type), init_type)) {
  make_string_init:;
                                 initializer_t *const init = allocate_initializer_zero(INITIALIZER_STRING);
                                 init->value.value = expression;
@@ -5633,8 +5642,11 @@ static type_t *get_string_type(string_encoding_t const enc)
  {
         bool const warn = is_warn_on(WARN_WRITE_STRINGS);
         switch (enc) {
-       case STRING_ENCODING_CHAR: return warn ? type_const_char_ptr    : type_char_ptr;
-       case STRING_ENCODING_WIDE: return warn ? type_const_wchar_t_ptr : type_wchar_t_ptr;
+       case STRING_ENCODING_CHAR:
+       case STRING_ENCODING_UTF8:   return warn ? type_const_char_ptr     : type_char_ptr;
+       case STRING_ENCODING_CHAR16: return warn ? type_char16_t_const_ptr : type_char16_t_ptr;
+       case STRING_ENCODING_CHAR32: return warn ? type_char32_t_const_ptr : type_char32_t_ptr;
+       case STRING_ENCODING_WIDE:   return warn ? type_const_wchar_t_ptr  : type_wchar_t_ptr;
         }
         panic("invalid string encoding");
  }
@@ -5875,6 +5887,7 @@ static expression_t *parse_character_constant(void)
         size_t const size = get_string_len(&token.literal.string);
         switch (token.literal.string.encoding) {
         case STRING_ENCODING_CHAR:
+       case STRING_ENCODING_UTF8:
                 literal->base.type = c_mode & _CXX ? type_char : type_int;
                 if (size > 1) {
                         if (!GNU_MODE && !(c_mode & _C99)) {
@@ -5886,8 +5899,10 @@ static expression_t *parse_character_constant(void)
                 }
                 break;
  
-       case STRING_ENCODING_WIDE:
-               literal->base.type = type_int;
+       case STRING_ENCODING_CHAR16: literal->base.type = type_char16_t; goto warn_multi;
+       case STRING_ENCODING_CHAR32: literal->base.type = type_char32_t; goto warn_multi;
+       case STRING_ENCODING_WIDE:   literal->base.type = type_wchar_t;  goto warn_multi;
+warn_multi:
                 if (size > 1) {
                         warningf(WARN_MULTICHAR, HERE, "multi-character character constant");
                 }
diff --git a/preprocessor.c b/preprocessor.c

index 6e3daf3..55c1e3e 100644 (file)
--- a/preprocessor.c
+++ b/preprocessor.c
@@ -136,6 +136,11 @@ static symbol_t *symbol_percentcolon;
  static symbol_t *symbol_percentcolonpercentcolon;
  static symbol_t *symbol_percentgreater;
  
+static symbol_t *symbol_L;
+static symbol_t *symbol_U;
+static symbol_t *symbol_u;
+static symbol_t *symbol_u8;
+
  static void init_symbols(void)
  {
         symbol_colongreater             = symbol_table_insert(":>");
@@ -144,6 +149,11 @@ static void init_symbols(void)
         symbol_percentcolon             = symbol_table_insert("%:");
         symbol_percentcolonpercentcolon = symbol_table_insert("%:%:");
         symbol_percentgreater           = symbol_table_insert("%>");
+
+       symbol_L  = symbol_table_insert("L");
+       symbol_U  = symbol_table_insert("U");
+       symbol_u  = symbol_table_insert("u");
+       symbol_u8 = symbol_table_insert("u8");
  }
  
  void switch_pp_input(FILE *const file, char const *const filename, searchpath_entry_t *const path, bool const is_system_header)
@@ -684,6 +694,18 @@ string_t make_string(char const *const string)
         return sym_make_string(STRING_ENCODING_CHAR);
  }
  
+static utf32 get_string_encoding_limit(string_encoding_t const enc) /* Largest escape-sequence value that parse_string() accepts for 'enc' without an "escape sequence out of range" warning. */
+{
+       switch (enc) {
+       case STRING_ENCODING_CHAR:   return 0xFF;
+       case STRING_ENCODING_CHAR16: return 0xFFFF;
+       case STRING_ENCODING_CHAR32: return 0xFFFFFFFF;
+       case STRING_ENCODING_UTF8:   return 0xFFFFFFFF; /* NOTE(review): same bound as CHAR32, although u8 literals are typed as char pointers in parser.c's get_string_type() -- confirm this is intended. */
+       case STRING_ENCODING_WIDE:   return 0xFFFFFFFF; // FIXME depends on settings
+       }
+       panic("invalid string encoding"); /* Reached only when 'enc' matches none of the enumerators handled above. */
+}
+
  static void parse_string(utf32 const delimiter, token_kind_t const kind,
                           string_encoding_t const enc,
                           char const *const context)
@@ -692,15 +714,16 @@ static void parse_string(utf32 const delimiter, token_kind_t const kind,
  
         eat(delimiter);
  
+       utf32 const limit = get_string_encoding_limit(enc);
         while (true) {
                 switch (input.c) {
                 case '\\': {
                         if (resolve_escape_sequences) {
                                 utf32 const tc = parse_escape_sequence();
+                               if (tc > limit) {
+                                       warningf(WARN_OTHER, &pp_token.base.source_position, "escape sequence out of range");
+                               }
                                 if (enc == STRING_ENCODING_CHAR) {
-                                       if (tc >= 0x100) {
-                                               warningf(WARN_OTHER, &pp_token.base.source_position, "escape sequence out of range");
-                                       }
                                         obstack_1grow(&symbol_obstack, tc);
                                 } else {
                                         obstack_grow_utf8(&symbol_obstack, tc);
@@ -1133,6 +1156,17 @@ static inline void eat_token(token_kind_t const kind)
         next_input_token();
  }
  
+static string_encoding_t identify_encoding_prefix(symbol_t *const sym) /* Map an identifier symbol to the literal encoding it selects when used as a prefix; compares against the symbols interned in init_symbols(). */
+{
+       if (sym == symbol_L) return STRING_ENCODING_WIDE; /* L is recognized regardless of c_mode. */
+       if (c_mode & _C11) { /* U, u and u8 count as prefixes only when the _C11 bit is set in c_mode. */
+               if (sym == symbol_U)  return STRING_ENCODING_CHAR32;
+               if (sym == symbol_u)  return STRING_ENCODING_CHAR16;
+               if (sym == symbol_u8) return STRING_ENCODING_UTF8;
+       }
+       return STRING_ENCODING_CHAR; /* Not a prefix: parse_symbol() then treats the symbol as an ordinary identifier. */
+}
+
  static void parse_symbol(void)
  {
         assert(obstack_object_size(&symbol_obstack) == 0);
@@ -1190,19 +1224,26 @@ end_symbol:
         obstack_1grow(&symbol_obstack, '\0');
         char *string = obstack_finish(&symbol_obstack);
  
-       /* might be a wide string or character constant ( L"string"/L'c' ) */
-       if (input.c == '"' && string[0] == 'L' && string[1] == '\0') {
-               obstack_free(&symbol_obstack, string);
-               parse_string_literal(STRING_ENCODING_WIDE);
-               return;
-       } else if (input.c == '\'' && string[0] == 'L' && string[1] == '\0') {
-               obstack_free(&symbol_obstack, string);
-               parse_character_constant(STRING_ENCODING_WIDE);
-               return;
-       }
-
         symbol_t *symbol = symbol_table_insert(string);
  
+       /* Might be a prefixed string or character constant: L/U/u/u8"string". */
+       if (input.c == '"') {
+               string_encoding_t const enc = identify_encoding_prefix(symbol);
+               if (enc != STRING_ENCODING_CHAR) {
+                       parse_string_literal(enc);
+                       return;
+               }
+       } else if (input.c == '\'') {
+               string_encoding_t const enc = identify_encoding_prefix(symbol);
+               if (enc != STRING_ENCODING_CHAR) {
+                       if (enc == STRING_ENCODING_UTF8) {
+                               errorf(&pp_token.base.source_position, "'u8' is not a valid encoding for a character constant");
+                       }
+                       parse_character_constant(enc);
+                       return;
+               }
+       }
+
         pp_token.kind        = symbol->ID;
         pp_token.base.symbol = symbol;
  
diff --git a/string_rep.c b/string_rep.c

index ff58aad..28b0746 100644 (file)
--- a/string_rep.c
+++ b/string_rep.c
@@ -16,8 +16,11 @@ static inline size_t wstrlen(const string_t *string)
  size_t get_string_len(string_t const *const str)
  {
         switch (str->encoding) {
-       case STRING_ENCODING_CHAR: return str->size;
-       case STRING_ENCODING_WIDE: return wstrlen(str);
+       case STRING_ENCODING_CHAR:
+       case STRING_ENCODING_UTF8:   return str->size;
+       case STRING_ENCODING_CHAR16:
+       case STRING_ENCODING_CHAR32:
+       case STRING_ENCODING_WIDE:   return wstrlen(str);
         }
         panic("invalid string encoding");
  }
diff --git a/string_rep.h b/string_rep.h

index c0868eb..ce4ca3e 100644 (file)
--- a/string_rep.h
+++ b/string_rep.h
@@ -25,6 +25,9 @@
  
  enum string_encoding_t {
         STRING_ENCODING_CHAR,
+       STRING_ENCODING_CHAR16,
+       STRING_ENCODING_CHAR32,
+       STRING_ENCODING_UTF8,
         STRING_ENCODING_WIDE
  };
  typedef enum string_encoding_t string_encoding_t;
diff --git a/token.c b/token.c

index 7d2104c..50de656 100644 (file)
--- a/token.c
+++ b/token.c
@@ -95,8 +95,11 @@ void print_token_kind(FILE *f, token_kind_t token_kind)
  char const *get_string_encoding_prefix(string_encoding_t const enc)
  {
         switch (enc) {
-       case STRING_ENCODING_CHAR: return "";
-       case STRING_ENCODING_WIDE: return "L";
+       case STRING_ENCODING_CHAR:   return "";
+       case STRING_ENCODING_CHAR16: return "u";
+       case STRING_ENCODING_CHAR32: return "U";
+       case STRING_ENCODING_UTF8:   return "u8";
+       case STRING_ENCODING_WIDE:   return "L";
         }
         panic("invalid string encoding");
  }
diff --git a/types.c b/types.c

index 24d38d3..d6ff22e 100644 (file)
--- a/types.c
+++ b/types.c
@@ -61,6 +61,10 @@ type_t *type_const_void_ptr_restrict;
  
  type_t *type_char_ptr_ptr;
  
+type_t *type_char16_t;
+type_t *type_char32_t;
+type_t *type_char16_t_const;
+type_t *type_char32_t_const;
  type_t *type_intmax_t;
  type_t *type_ptrdiff_t;
  type_t *type_size_t;
@@ -73,6 +77,10 @@ type_t *type_wint_t;
  type_t *type_int32_t;
  type_t *type_int64_t;
  
+type_t *type_char16_t_ptr;
+type_t *type_char32_t_ptr;
+type_t *type_char16_t_const_ptr;
+type_t *type_char32_t_const_ptr;
  type_t *type_intmax_t_ptr;
  type_t *type_ptrdiff_t_ptr;
  type_t *type_ssize_t_ptr;
@@ -198,4 +206,16 @@ void init_wchar_types(atomic_type_kind_t akind)
         type_wchar_t_ptr   = make_pointer_type(type_wchar_t, TYPE_QUALIFIER_NONE);
         type_const_wchar_t_ptr
                 = make_pointer_type(type_const_wchar_t, TYPE_QUALIFIER_NONE);
+
+       atomic_type_kind_t const u2 = find_unsigned_int_atomic_type_kind_for_size(2);
+       type_char16_t           = make_atomic_type(u2, TYPE_QUALIFIER_NONE);
+       type_char16_t_const     = make_atomic_type(u2, TYPE_QUALIFIER_CONST);
+       type_char16_t_ptr       = make_pointer_type(type_char16_t,       TYPE_QUALIFIER_NONE);
+       type_char16_t_const_ptr = make_pointer_type(type_char16_t_const, TYPE_QUALIFIER_NONE);
+
+       atomic_type_kind_t const u4 = find_unsigned_int_atomic_type_kind_for_size(4);
+       type_char32_t           = make_atomic_type(u4, TYPE_QUALIFIER_NONE);
+       type_char32_t_const     = make_atomic_type(u4, TYPE_QUALIFIER_CONST);
+       type_char32_t_ptr       = make_pointer_type(type_char32_t,       TYPE_QUALIFIER_NONE);
+       type_char32_t_const_ptr = make_pointer_type(type_char32_t_const, TYPE_QUALIFIER_NONE);
  }
diff --git a/types.h b/types.h

index b173a00..e2cc949 100644 (file)
--- a/types.h
+++ b/types.h
@@ -60,6 +60,10 @@ extern type_t *type_const_void_ptr_restrict;
  
  extern type_t *type_char_ptr_ptr;
  
+extern type_t *type_char16_t;
+extern type_t *type_char32_t;
+extern type_t *type_char16_t_const;
+extern type_t *type_char32_t_const;
  extern type_t *type_intmax_t;
  extern type_t *type_ptrdiff_t;
  extern type_t *type_size_t;
@@ -73,6 +77,10 @@ extern type_t *type_wint_t;
  extern type_t *type_int32_t;
  extern type_t *type_int64_t;
  
+extern type_t *type_char16_t_ptr;
+extern type_t *type_char32_t_ptr;
+extern type_t *type_char16_t_const_ptr;
+extern type_t *type_char32_t_const_ptr;
  extern type_t *type_intmax_t_ptr;
  extern type_t *type_ptrdiff_t_ptr;
  extern type_t *type_ssize_t_ptr;
author	Christoph Mallon <christoph.mallon@gmx.de>
	Sat, 27 Oct 2012 18:58:19 +0000 (20:58 +0200)
committer	Christoph Mallon <christoph.mallon@gmx.de>
	Tue, 30 Oct 2012 09:39:50 +0000 (10:39 +0100)
ast2firm.c		patch \| blob \| history
parser.c		patch \| blob \| history
preprocessor.c		patch \| blob \| history
string_rep.c		patch \| blob \| history
string_rep.h		patch \| blob \| history
token.c		patch \| blob \| history
types.c		patch \| blob \| history
types.h		patch \| blob \| history