X-Git-Url: http://nsz.repo.hu/git/?a=blobdiff_plain;f=string_rep.h;h=13a167462aec90712a94e5b832de380b917f1cc8;hb=103d024a3572a6607bd7d18a129b93d2dbfeff82;hp=1df0b5ba9057999748731add70a92c62fcc629c2;hpb=975933a9f57835385489ecdeda04a65894705eca;p=cparser diff --git a/string_rep.h b/string_rep.h index 1df0b5b..13a1674 100644 --- a/string_rep.h +++ b/string_rep.h @@ -1,20 +1,85 @@ +/* + * This file is part of cparser. + * Copyright (C) 2007-2009 Matthias Braun + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ #ifndef STRING_REP_H #define STRING_REP_H -#include +#include +#include -typedef wchar_t wchar_rep_t; - -#if 0 /* TODO */ typedef struct string_t { - const char *begin; - const char *end; + const char *begin; /**< UTF-8 encoded string, the last character is + * guaranteed to be 0 */ + size_t size; /**< size of string in bytes (not characters) */ } string_t; -#endif -typedef struct wide_string_t { - const wchar_rep_t *begin; - size_t size; -} wide_string_t; +typedef unsigned int utf32; +#define UTF32_PRINTF_FORMAT "%u" + +/** + * "parse" an utf8 character from a string. + * Warning: This function only works for valid utf-8 inputs. The behaviour + * is undefined for invalid utf-8 input. + * + * @param p A pointer to a pointer into the string. The pointer + * is incremented for each consumed char + */ +static inline utf32 read_utf8_char(const char **p) +{ + const unsigned char *c = (const unsigned char *) *p; + utf32 result; + + if ((*c & 0x80) == 0) { + /* 1 character encoding: 0b0??????? */ + result = *c++; + } else if ((*c & 0xE0) == 0xC0) { + /* 2 character encoding: 0b110?????, 0b10?????? */ + result = *c++ & 0x1F; + result = (result << 6) | (*c++ & 0x3F); + } else if ((*c & 0xF0) == 0xE0) { + /* 3 character encoding: 0b1110????, 0b10??????, 0b10?????? */ + result = *c++ & 0x0F; + result = (result << 6) | (*c++ & 0x3F); + result = (result << 6) | (*c++ & 0x3F); + } else { + /* 4 character enc.: 0b11110???, 0b10??????, 0b10??????, 0b10?????? */ + assert((*c & 0xF8) == 0xF0); + result = *c++ & 0x07; + result = (result << 6) | (*c++ & 0x3F); + result = (result << 6) | (*c++ & 0x3F); + result = (result << 6) | (*c++ & 0x3F); + } + + *p = (const char*) c; + return result; +} + +static inline size_t wstrlen(const string_t *string) +{ + size_t result = 0; + const char *p = string->begin; + const char *end = p + string->size; + while (p < end) { + read_utf8_char(&p); + ++result; + } + return result; +} #endif