From 6d30fa5c7d03437d992a80234c88d8eb6431b35e Mon Sep 17 00:00:00 2001
From: Christoph Mallon <christoph.mallon@gmx.de>
Date: Wed, 13 Jun 2012 12:37:01 +0200
Subject: [PATCH] Consistently use get_string_len() to correct the size
 calculation of wide string literals.

---
 Makefile     |  1 +
 ast2firm.c   | 14 +++++---------
 parser.c     |  9 +++++----
 string_rep.c | 23 +++++++++++++++++++++++
 string_rep.h | 18 +++++++-----------
 token_t.h    |  6 ------
 6 files changed, 41 insertions(+), 30 deletions(-)
 create mode 100644 string_rep.c

diff --git a/Makefile b/Makefile
index 2299308..88333e1 100644
--- a/Makefile
+++ b/Makefile
@@ -52,6 +52,7 @@ SOURCES := \
 	mangle.c \
 	preprocessor.c \
 	printer.c \
+	string_rep.c \
 	symbol_table.c \
 	token.c \
 	type.c \
diff --git a/ast2firm.c b/ast2firm.c
index 9c150b9..c85a8a1 100644
--- a/ast2firm.c
+++ b/ast2firm.c
@@ -1103,14 +1103,12 @@ static ir_node *create_conv(dbg_info *dbgi, ir_node *value, ir_mode *dest_mode)
  */
 static ir_node *string_to_firm(source_position_t const *const src_pos, char const *const id_prefix, string_encoding_t const enc, string_t const *const value)
 {
-	size_t            slen;
-	ir_type          *elem_type;
-	ir_initializer_t *initializer;
+	size_t            const slen        = get_string_len(enc, value) + 1;
+	ir_initializer_t *const initializer = create_initializer_compound(slen);
+	ir_type          *      elem_type;
 	switch (enc) {
 	case STRING_ENCODING_CHAR: {
-		slen        = value->size + 1;
-		elem_type   = ir_type_char;
-		initializer = create_initializer_compound(slen);
+		elem_type = ir_type_char;
 
 		ir_mode *const mode = get_type_mode(elem_type);
 		char const    *p    = value->begin;
@@ -1123,9 +1121,7 @@ static ir_node *string_to_firm(source_position_t const *const src_pos, char cons
 	}
 
 	case STRING_ENCODING_WIDE: {
-		slen        = wstrlen(value) + 1;
-		elem_type   = ir_type_wchar_t;
-		initializer = create_initializer_compound(slen);
+		elem_type = ir_type_wchar_t;
 
 		ir_mode *const mode = get_type_mode(elem_type);
 		char const    *p    = value->begin;
diff --git a/parser.c b/parser.c
index 1dc9abb..4f3cddc 100644
--- a/parser.c
+++ b/parser.c
@@ -2222,7 +2222,7 @@ static initializer_t *parse_initializer(parse_initializer_env_t *env)
 			break;
 
 		case INITIALIZER_STRING:
-			size = result->string.string.size + 1;
+			size = get_string_len(result->string.encoding, &result->string.string) + 1;
 			break;
 
 		case INITIALIZER_DESIGNATOR:
@@ -5839,10 +5839,11 @@ static expression_t *parse_character_constant(void)
 	literal->string_literal.encoding = token.string.encoding;
 	literal->string_literal.value    = token.string.string;
 
+	size_t const size = get_string_len(token.string.encoding, &token.string.string);
 	switch (token.string.encoding) {
 	case STRING_ENCODING_CHAR:
 		literal->base.type = c_mode & _CXX ? type_char : type_int;
-		if (literal->string_literal.value.size > 1) {
+		if (size > 1) {
 			if (!GNU_MODE && !(c_mode & _C99)) {
 				errorf(HERE, "more than 1 character in character constant");
 			} else {
@@ -5854,7 +5855,7 @@ static expression_t *parse_character_constant(void)
 
 	case STRING_ENCODING_WIDE:
 		literal->base.type = type_int;
-		if (wstrlen(&literal->string_literal.value) > 1) {
+		if (size > 1) {
 			warningf(WARN_MULTICHAR, HERE, "multi-character character constant");
 		}
 		break;
@@ -5949,7 +5950,7 @@ type_t *revert_automatic_type_conversion(const expression_t *expression)
 	}
 
 	case EXPR_STRING_LITERAL: {
-		size_t  const size = expression->string_literal.value.size + 1;
+		size_t  const size = get_string_len(expression->string_literal.encoding, &expression->string_literal.value) + 1;
 		type_t *const elem = get_unqualified_type(expression->base.type->pointer.points_to);
 		return make_array_type(elem, size, TYPE_QUALIFIER_NONE);
 	}
diff --git a/string_rep.c b/string_rep.c
new file mode 100644
index 0000000..19c59da
--- /dev/null
+++ b/string_rep.c
@@ -0,0 +1,23 @@
+#include "adt/error.h"
+#include "string_rep.h"
+
+static inline size_t wstrlen(const string_t *string) /* counts the read_utf8_char() steps needed to walk the whole string */
+{
+	size_t      result = 0;
+	const char *p      = string->begin;
+	const char *end    = p + string->size; /* one past the last byte; string->size is a byte count */
+	while (p < end) {
+		read_utf8_char(&p); /* takes &p so it can move the cursor; NOTE(review): presumably consumes one encoded character - confirm in unicode.h */
+		++result;
+	}
+	return result;
+}
+
+size_t get_string_len(string_encoding_t const enc, string_t const *const str) /* length of str, measured according to enc */
+{
+	switch (enc) {
+	case STRING_ENCODING_CHAR: return str->size; /* byte count is the length */
+	case STRING_ENCODING_WIDE: return wstrlen(str); /* count decoded characters instead of bytes */
+	}
+	panic("invalid string encoding"); /* reached only when enc matches neither case above */
+}
diff --git a/string_rep.h b/string_rep.h
index f3a1e6b..fd6c00b 100644
--- a/string_rep.h
+++ b/string_rep.h
@@ -23,22 +23,18 @@
 #include <stdlib.h>
 #include "unicode.h"
 
+enum string_encoding_t { /* selects how get_string_len() measures a string_t */
+	STRING_ENCODING_CHAR, /**< length is the byte count (string_t::size) */
+	STRING_ENCODING_WIDE /**< length is the number of characters read via read_utf8_char() */
+};
+typedef enum string_encoding_t string_encoding_t;
+
 typedef struct string_t {
 	const char *begin; /**< UTF-8 encoded string, the last character is
 						* guaranteed to be 0 */
 	size_t      size;  /**< size of string in bytes (not characters) */
 } string_t;
 
-static inline size_t wstrlen(const string_t *string)
-{
-	size_t      result = 0;
-	const char *p      = string->begin;
-	const char *end    = p + string->size;
-	while (p < end) {
-		read_utf8_char(&p);
-		++result;
-	}
-	return result;
-}
+size_t get_string_len(string_encoding_t enc, string_t const *str);
 
 #endif
diff --git a/token_t.h b/token_t.h
index 9a68f81..5d66af4 100644
--- a/token_t.h
+++ b/token_t.h
@@ -70,12 +70,6 @@ struct token_base_t {
 	symbol_t         *symbol;
 };
 
-enum string_encoding_t {
-	STRING_ENCODING_CHAR,
-	STRING_ENCODING_WIDE
-};
-typedef enum string_encoding_t string_encoding_t;
-
 struct string_literal_t {
 	token_base_t      base;
 	string_encoding_t encoding;
-- 
2.20.1