Parse __extension__ like GCC: External declarations may start with it.

[cparser] / string_rep.h
diff --git a/string_rep.h b/string_rep.h

index e24b40c..13a1674 100644 (file)
--- a/string_rep.h
+++ b/string_rep.h
@@ -20,18 +20,66 @@
  #ifndef STRING_REP_H
  #define STRING_REP_H
  
-#include <wchar.h>
-
-typedef wchar_t wchar_rep_t;
+#include <assert.h>
+#include <stdlib.h>
  
  typedef struct string_t {
-       const char *begin;
-       size_t      size;
+       const char *begin; /**< UTF-8 encoded string; its last character is
+                            *  guaranteed to be 0 */
+       size_t      size;  /**< size of the string in bytes (not characters) */
  } string_t;
  
-typedef struct wide_string_t {
-       const wchar_rep_t *begin;
-       size_t             size;
-} wide_string_t;
+typedef unsigned int utf32; /**< value type returned by read_utf8_char() */
+#define UTF32_PRINTF_FORMAT "%u" /* printf conversion for utf32 (unsigned int) */
+
+/**
+ * Decode one UTF-8 encoded character from a string and step past it.
+ * Warning: This function only works for valid UTF-8 input. The behaviour
+ * is undefined for invalid input (continuation bytes are read unchecked).
+ * @param p    A pointer to a pointer into the string. The pointer is
+ *             advanced past every byte consumed (1 to 4 bytes).
+ * @return     The decoded character value.
+ */
+static inline utf32 read_utf8_char(const char **p)
+{
+       const unsigned char *c      = (const unsigned char *) *p;
+       utf32                result;
+
+       if ((*c & 0x80) == 0) {
+               /* 1 byte encoding: 0b0??????? */
+               result = *c++;
+       } else if ((*c & 0xE0) == 0xC0) {
+               /* 2 byte encoding: 0b110?????, 0b10?????? */
+               result = *c++ & 0x1F; /* 5 payload bits of the lead byte */
+               result = (result << 6) | (*c++ & 0x3F); /* 6 bits per follow-up byte */
+       } else if ((*c & 0xF0) == 0xE0) {
+               /* 3 byte encoding: 0b1110????, 0b10??????, 0b10?????? */
+               result = *c++ & 0x0F; /* 4 payload bits of the lead byte */
+               result = (result << 6) | (*c++ & 0x3F);
+               result = (result << 6) | (*c++ & 0x3F);
+       } else {
+               /* 4 byte encoding: 0b11110???, 0b10??????, 0b10??????, 0b10?????? */
+               assert((*c & 0xF8) == 0xF0); /* the only lead byte check; gone with NDEBUG */
+               result = *c++ & 0x07; /* 3 payload bits of the lead byte */
+               result = (result << 6) | (*c++ & 0x3F);
+               result = (result << 6) | (*c++ & 0x3F);
+               result = (result << 6) | (*c++ & 0x3F);
+       }
+
+       *p = (const char*) c; /* hand the advanced position back to the caller */
+       return result;
+}
+
+static inline size_t wstrlen(const string_t *string) /* length in characters, not bytes */
+{
+       size_t      result = 0;
+       const char *p      = string->begin;
+       const char *end    = p + string->size; /* size counts bytes */
+       while (p < end) { /* relies on valid UTF-8: read_utf8_char() does not know about end */
+               read_utf8_char(&p); /* value is discarded, only the advance of p matters */
+               ++result;
+       }
+       return result;
+}
  
  #endif