}
}
-#define SYMBOL_CHARS \
- case '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
+#define SYMBOL_CHARS_WITHOUT_E_P \
+ '$': if (!allow_dollar_in_symbol) goto dollar_sign; \
case 'a': \
case 'b': \
case 'c': \
case 'd': \
- case 'e': \
case 'f': \
case 'g': \
case 'h': \
case 'm': \
case 'n': \
case 'o': \
- case 'p': \
case 'q': \
case 'r': \
case 's': \
case 'B': \
case 'C': \
case 'D': \
- case 'E': \
case 'F': \
case 'G': \
case 'H': \
case 'M': \
case 'N': \
case 'O': \
- case 'P': \
case 'Q': \
case 'R': \
case 'S': \
case 'X': \
case 'Y': \
case 'Z': \
- case '_':
+ case '_'
+
+#define SYMBOL_CHARS_E_P \
+ 'E': \
+ case 'P': \
+ case 'e': \
+ case 'p'
+
+#define SYMBOL_CHARS \
+ SYMBOL_CHARS_WITHOUT_E_P: \
+ case SYMBOL_CHARS_E_P
#define DIGITS \
- case '0': \
+ '0': \
case '1': \
case '2': \
case '3': \
case '6': \
case '7': \
case '8': \
- case '9':
+ case '9'
+
+/**
+ * Checks whether a code point may be designated by a universal character
+ * name (\uXXXX or \UXXXXXXXX).
+ *
+ * @param v  the code point spelled by the hexadecimal digits
+ * @return   true if the code point is permitted, false otherwise
+ */
+static bool is_universal_char_valid(utf32 const v)
+{
+	/* C11 6.4.3p2: nothing below 0xA0 may be named, except
+	 * '$' (0x24), '@' (0x40) and '`' (0x60). */
+	if (v < 0xA0U && v != 0x24 && v != 0x40 && v != 0x60) {
+		return false;
+	}
+	/* C11 6.4.3p2: the surrogate range is excluded. */
+	if (0xD800 <= v && v <= 0xDFFF) {
+		return false;
+	}
+	/* The ISO/IEC 10646 code space ends at 0x10FFFF, but eight hex digits
+	 * after \U can spell larger values; those do not name any character. */
+	if (v > 0x10FFFF) {
+		return false;
+	}
+	return true;
+}
+
+static int digit_value(utf32 digit);
+
+/**
+ * Reads the hexadecimal digits of a universal character name, starting at
+ * the current input character, and combines them into one value.
+ *
+ * An error is reported if fewer than @p n_digits hexadecimal digits are
+ * present, and another if the resulting value is rejected by
+ * is_universal_char_valid().
+ *
+ * @param n_digits  number of digits expected; the error message prints 'u'
+ *                  for 4 and 'U' for any other count
+ * @return          the value accumulated from the digits that were read
+ *                  (also returned after an error has been reported)
+ */
+static utf32 parse_universal_char(unsigned const n_digits)
+{
+	utf32 v = 0;
+	for (unsigned k = n_digits; k != 0; --k) {
+		/* The <ctype.h> classifiers are only defined for unsigned char
+		 * values and EOF, while the current character can be an arbitrary
+		 * code point (it is fed to obstack_grow_utf8() elsewhere), so
+		 * restrict to ASCII before classifying. */
+		if (c < 0x80 && isxdigit((int)c)) {
+			v = 16 * v + digit_value(c);
+			next_char();
+		} else {
+			errorf(&lexer_pos, "short universal character name, expected %u more digits", k);
+			break;
+		}
+	}
+	if (!is_universal_char_valid(v)) {
+		errorf(&lexer_pos, "\\%c%0*X is not a valid universal character name", n_digits == 4 ? 'u' : 'U', (int)n_digits, v);
+	}
+	return v;
+}
+
+/**
+ * Tells whether a code point belongs to the ranges that C11 Annex D.1
+ * allows for universal character names in identifiers.
+ *
+ * @param v  the code point to classify
+ * @return   true if @p v lies in one of the listed ranges
+ */
+static bool is_universal_char_valid_identifier(utf32 const v)
+{
+	/* C11 Annex D.1 */
+
+	/* Planes 1 to 14 each allow everything except their last two code
+	 * points (xFFFE and xFFFF). */
+	if (0x10000 <= v && v <= 0xEFFFD) {
+		return (v & 0xFFFF) <= 0xFFFD;
+	}
+
+	/* Inclusive [first, last] ranges below 0x10000. */
+	static struct {
+		utf32 first;
+		utf32 last;
+	} const bmp_ranges[] = {
+		{ 0x00A8, 0x00A8 },
+		{ 0x00AA, 0x00AA },
+		{ 0x00AD, 0x00AD },
+		{ 0x00AF, 0x00AF },
+		{ 0x00B2, 0x00B5 },
+		{ 0x00B7, 0x00BA },
+		{ 0x00BC, 0x00BE },
+		{ 0x00C0, 0x00D6 },
+		{ 0x00D8, 0x00F6 },
+		{ 0x00F8, 0x00FF },
+		{ 0x0100, 0x167F },
+		{ 0x1681, 0x180D },
+		{ 0x180F, 0x1FFF },
+		{ 0x200B, 0x200D },
+		{ 0x202A, 0x202E },
+		{ 0x203F, 0x2040 },
+		{ 0x2054, 0x2054 },
+		{ 0x2060, 0x206F },
+		{ 0x2070, 0x218F },
+		{ 0x2460, 0x24FF },
+		{ 0x2776, 0x2793 },
+		{ 0x2C00, 0x2DFF },
+		{ 0x2E80, 0x2FFF },
+		{ 0x3004, 0x3007 },
+		{ 0x3021, 0x302F },
+		{ 0x3031, 0x303F },
+		{ 0x3040, 0xD7FF },
+		{ 0xF900, 0xFD3D },
+		{ 0xFD40, 0xFDCF },
+		{ 0xFDF0, 0xFE44 },
+		{ 0xFE47, 0xFFFD },
+	};
+	unsigned const n_ranges = sizeof(bmp_ranges) / sizeof(bmp_ranges[0]);
+	for (unsigned i = 0; i != n_ranges; ++i) {
+		if (bmp_ranges[i].first <= v && v <= bmp_ranges[i].last) {
+			return true;
+		}
+	}
+	return false;
+}
+
+/**
+ * Tells whether a code point may appear as the first character of an
+ * identifier. Only the ranges that C11 Annex D.2 lists as disallowed
+ * initially are rejected; everything else is accepted.
+ *
+ * @param v  the code point to classify
+ * @return   false if @p v lies in one of the Annex D.2 ranges, true otherwise
+ */
+static bool is_universal_char_valid_identifier_start(utf32 const v)
+{
+	/* C11 Annex D.2 */
+	bool const disallowed_initially =
+		(0x0300 <= v && v <= 0x036F) ||
+		(0x1DC0 <= v && v <= 0x1DFF) ||
+		(0x20D0 <= v && v <= 0x20FF) ||
+		(0xFE20 <= v && v <= 0xFE2F);
+	return !disallowed_initially;
+}
/**
* Read a symbol from the input and build
*/
static void parse_symbol(void)
{
- obstack_1grow(&symbol_obstack, (char) c);
- next_char();
-
while (true) {
switch (c) {
- DIGITS
- SYMBOL_CHARS
+ case DIGITS:
+ case SYMBOL_CHARS:
obstack_1grow(&symbol_obstack, (char) c);
next_char();
break;
+ case '\\':
+ next_char();
+ switch (c) {
+ {
+ unsigned n;
+ case 'U': n = 8; goto universal;
+ case 'u': n = 4; goto universal;
+universal:
+ next_char();
+ utf32 const v = parse_universal_char(n);
+ if (!is_universal_char_valid_identifier(v)) {
+ if (is_universal_char_valid(v)) {
+ errorf(&lexer_pos, "universal character \\%c%0*X is not valid in an identifier", n == 4 ? 'u' : 'U', (int)n, v);
+ }
+ } else if (obstack_object_size(&symbol_obstack) == 0 && !is_universal_char_valid_identifier_start(v)) {
+ errorf(&lexer_pos, "universal character \\%c%0*X is not valid as start of an identifier", n == 4 ? 'u' : 'U', (int)n, v);
+ } else {
+ obstack_grow_utf8(&symbol_obstack, v);
+ }
+ break;
+ }
+
+ default:
+ put_back(c);
+ c = '\\';
+ goto end_symbol;
+ }
+
default:
dollar_sign:
goto end_symbol;
}
}
-static string_t identify_string(char *string, size_t len)
+static string_t sym_make_string(string_encoding_t const enc) /* turns the text accumulated on symbol_obstack into a string_t tagged with enc */
{
+ obstack_1grow(&symbol_obstack, '\0'); /* terminate the accumulated text */
+ size_t const len = obstack_object_size(&symbol_obstack) - 1; /* length without the NUL appended above */
+ char *const string = obstack_finish(&symbol_obstack); /* close the growing object and take its address */
+
/* TODO hash */
#if 0
const char *result = strset_insert(&stringset, concat);
#else
const char *result = string;
#endif
- return (string_t) {result, len};
+ return (string_t){ result, len, enc };
}
-/**
- * parse suffixes like 'LU' or 'f' after numbers
- */
-static void parse_number_suffix(void)
+static void parse_pp_number(void) /* collects one number token as raw text and stores it in lexer_token as T_NUMBER */
{
- assert(obstack_object_size(&symbol_obstack) == 0);
- while (true) {
+ for (;;) { /* one iteration per accepted character (two when a sign follows e/E/p/P) */
switch (c) {
- SYMBOL_CHARS
- obstack_1grow(&symbol_obstack, (char) c);
- next_char();
- break;
- default:
- dollar_sign:
- goto finish_suffix;
- }
- }
-finish_suffix:
- if (obstack_object_size(&symbol_obstack) == 0) {
- lexer_token.number.suffix.begin = NULL;
- lexer_token.number.suffix.size = 0;
- return;
- }
-
- obstack_1grow(&symbol_obstack, '\0');
- size_t size = obstack_object_size(&symbol_obstack) - 1;
- char *string = obstack_finish(&symbol_obstack);
-
- lexer_token.number.suffix = identify_string(string, size);
-}
-
-static void parse_exponent(void)
-{
- if (c == '-' || c == '+') {
- obstack_1grow(&symbol_obstack, (char)c);
- next_char();
- }
-
- if (isdigit(c)) {
- do {
+ case SYMBOL_CHARS_E_P: /* e, E, p, P: append it, then look for a sign right behind it */
obstack_1grow(&symbol_obstack, (char)c);
next_char();
- } while (isdigit(c));
- } else {
- errorf(&lexer_token.base.source_position, "exponent has no digits");
- }
-}
-
-/**
- * Parses a hex number including hex floats and set the
- * lexer_token.
- */
-static void parse_number_hex(void)
-{
- bool is_float = false;
- bool has_digits = false;
-
- while (isxdigit(c)) {
- has_digits = true;
- obstack_1grow(&symbol_obstack, (char) c);
- next_char();
- }
-
- if (c == '.') {
- is_float = true;
- obstack_1grow(&symbol_obstack, (char) c);
- next_char();
+ if (c == '+' || c == '-') { /* a sign is only accepted directly after e/E/p/P */
+ case '.': /* the labels below sit inside the if body on purpose: these characters jump straight to the shared append-and-advance code */
+ case DIGITS:
+ case SYMBOL_CHARS_WITHOUT_E_P: /* its '$' entry jumps to dollar_sign when allow_dollar_in_symbol is false */
+ obstack_1grow(&symbol_obstack, (char)c);
+ next_char();
+ }
+ break;
- while (isxdigit(c)) {
- has_digits = true;
- obstack_1grow(&symbol_obstack, (char) c);
- next_char();
+ default: /* any other character ends the number and is left unconsumed */
+dollar_sign:
+ lexer_token.kind = T_NUMBER;
+ lexer_token.literal.string = sym_make_string(STRING_ENCODING_CHAR); /* the text is stored as spelled; no numeric conversion happens here */
+ return;
}
}
- if (c == 'p' || c == 'P') {
- is_float = true;
- obstack_1grow(&symbol_obstack, (char) c);
- next_char();
- parse_exponent();
- } else if (is_float) {
- errorf(&lexer_token.base.source_position,
- "hexadecimal floatingpoint constant requires an exponent");
- }
- obstack_1grow(&symbol_obstack, '\0');
-
- size_t size = obstack_object_size(&symbol_obstack) - 1;
- char *string = obstack_finish(&symbol_obstack);
- lexer_token.number.number = identify_string(string, size);
-
- lexer_token.kind = is_float ? T_FLOATINGPOINT : T_INTEGER;
-
- if (!has_digits) {
- errorf(&lexer_token.base.source_position, "invalid number literal '%S'", &lexer_token.number.number);
- lexer_token.number.number.begin = "0";
- lexer_token.number.number.size = 1;
- }
-
- parse_number_suffix();
-}
-
-static void parse_number_bin(void)
-{
- bool has_digits = false;
-
- while (c == '0' || c == '1') {
- has_digits = true;
- obstack_1grow(&symbol_obstack, (char)c);
- next_char();
- }
- obstack_1grow(&symbol_obstack, '\0');
-
- size_t const size = obstack_object_size(&symbol_obstack) - 1;
- char *const string = obstack_finish(&symbol_obstack);
- lexer_token.number.number = identify_string(string, size);
- lexer_token.kind = T_INTEGER;
-
- if (!has_digits) {
- errorf(&lexer_token.base.source_position, "invalid number literal '%S'", &lexer_token.number.number);
- lexer_token.number.number.begin = "0";
- lexer_token.number.number.size = 1;
- }
-
- parse_number_suffix();
}
/**
return '0' <= chr && chr <= '7';
}
-/**
- * Parses a number and sets the lexer_token.
- */
-static void parse_number(void)
-{
- bool is_float = false;
- bool has_digits = false;
-
- assert(obstack_object_size(&symbol_obstack) == 0);
- if (c == '0') {
- obstack_1grow(&symbol_obstack, (char)c);
- next_char();
- if (c == 'x' || c == 'X') {
- obstack_1grow(&symbol_obstack, (char)c);
- next_char();
- parse_number_hex();
- return;
- } else if (c == 'b' || c == 'B') {
- /* GCC extension: binary constant 0x[bB][01]+. */
- obstack_1grow(&symbol_obstack, (char)c);
- next_char();
- parse_number_bin();
- return;
- }
- has_digits = true;
- }
-
- while (isdigit(c)) {
- has_digits = true;
- obstack_1grow(&symbol_obstack, (char) c);
- next_char();
- }
-
- if (c == '.') {
- is_float = true;
- obstack_1grow(&symbol_obstack, '.');
- next_char();
-
- while (isdigit(c)) {
- has_digits = true;
- obstack_1grow(&symbol_obstack, (char) c);
- next_char();
- }
- }
- if (c == 'e' || c == 'E') {
- is_float = true;
- obstack_1grow(&symbol_obstack, 'e');
- next_char();
- parse_exponent();
- }
-
- obstack_1grow(&symbol_obstack, '\0');
- size_t size = obstack_object_size(&symbol_obstack) - 1;
- char *string = obstack_finish(&symbol_obstack);
- lexer_token.number.number = identify_string(string, size);
-
- if (is_float) {
- lexer_token.kind = T_FLOATINGPOINT;
- } else {
- lexer_token.kind = T_INTEGER;
-
- if (string[0] == '0') {
- /* check for invalid octal digits */
- for (size_t i= 0; i < size; ++i) {
- char t = string[i];
- if (t >= '8')
- errorf(&lexer_token.base.source_position, "invalid digit '%c' in octal number", t);
- }
- }
- }
-
- if (!has_digits) {
- errorf(&lexer_token.base.source_position, "invalid number literal '%S'",
- &lexer_token.number.number);
- }
-
- parse_number_suffix();
-}
-
/**
* Returns the value of a digit.
* The only portable way to do it ...
if (c_mode & _GNUC)
return 27; /* hopefully 27 is ALWAYS the code for ESCAPE */
break;
- case 'u':
- case 'U':
- parse_error("universal character parsing not implemented yet");
- return EOF;
+
+ case 'U': return parse_universal_char(8);
+ case 'u': return parse_universal_char(4);
+
default:
break;
}
string_t make_string(const char *string)
{
- size_t len = strlen(string) + 1;
- char *const space = obstack_alloc(&symbol_obstack, len);
- memcpy(space, string, len);
-
- return identify_string(space, len);
+ obstack_grow(&symbol_obstack, string, strlen(string)); /* copy the text without its NUL; sym_make_string() appends the terminator */
+ return sym_make_string(STRING_ENCODING_CHAR); /* resulting size does not count the terminator */
}
static void parse_string(utf32 const delim, token_kind_t const kind, string_encoding_t const enc, char const *const context)
}
obstack_1grow(&symbol_obstack, tc);
} else {
- obstack_grow_symbol(&symbol_obstack, tc);
+ obstack_grow_utf8(&symbol_obstack, tc);
}
break;
}
next_char();
goto end_of_string;
} else {
- obstack_grow_symbol(&symbol_obstack, c);
+ obstack_grow_utf8(&symbol_obstack, c);
next_char();
break;
}
}
end_of_string:
- obstack_1grow(&symbol_obstack, '\0');
- size_t const size = obstack_object_size(&symbol_obstack) - 1;
- char *const string = obstack_finish(&symbol_obstack);
-
- lexer_token.kind = kind;
- lexer_token.string.encoding = enc;
- lexer_token.string.string = identify_string(string, size);
+ lexer_token.kind = kind;
+ lexer_token.literal.string = sym_make_string(enc);
}
/**
static void parse_character_constant(string_encoding_t const enc)
{
parse_string('\'', T_CHARACTER_CONSTANT, enc, "character constant");
- if (lexer_token.string.string.size == 0) {
+ if (lexer_token.literal.string.size == 0) {
errorf(&lexer_token.base.source_position, "empty character constant");
}
}
*/
static void parse_line_directive(void)
{
- if (pp_token.kind != T_INTEGER) {
+ if (pp_token.kind != T_NUMBER) {
parse_error("expected integer");
} else {
/* use offset -1 as this is about the next line */
- lexer_pos.lineno = atoi(pp_token.number.number.begin) - 1;
+ char *end;
+ long const line = strtol(pp_token.literal.string.begin, &end, 0);
+ if (*end == '\0') {
+ lexer_pos.lineno = line - 1;
+ } else {
+ errorf(&lexer_pos, "'%S' is not a valid line number", &pp_token.literal.string);
+ }
next_pp_token();
}
- if (pp_token.kind == T_STRING_LITERAL && pp_token.string.encoding == STRING_ENCODING_CHAR) {
- lexer_pos.input_name = pp_token.string.string.begin;
+ if (pp_token.kind == T_STRING_LITERAL && pp_token.literal.string.encoding == STRING_ENCODING_CHAR) {
+ lexer_pos.input_name = pp_token.literal.string.begin;
lexer_pos.is_system_header = false;
next_pp_token();
/* attempt to parse numeric flags as outputted by gcc preprocessor */
- while (pp_token.kind == T_INTEGER) {
+ while (pp_token.kind == T_NUMBER) {
/* flags:
* 1 - indicates start of a new file
* 2 - indicates return from a file
*
* currently we're only interested in "3"
*/
- if (streq(pp_token.number.number.begin, "3")) {
+ if (streq(pp_token.literal.string.begin, "3")) {
lexer_pos.is_system_header = true;
}
next_pp_token();
case T_IDENTIFIER:
parse_preprocessor_identifier();
break;
- case T_INTEGER:
+ case T_NUMBER:
parse_line_directive();
break;
case '\n':
lexer_token.kind = '\n';
return;
- SYMBOL_CHARS {
+ case SYMBOL_CHARS: {
parse_symbol();
/* might be a wide string ( L"string" ) */
string_encoding_t const enc = STRING_ENCODING_WIDE;
return;
}
- DIGITS
- parse_number();
+ case DIGITS:
+ parse_pp_number();
return;
case '"':
case '.':
MAYBE_PROLOG
- DIGITS
+ case DIGITS:
put_back(c);
c = '.';
- parse_number();
+ parse_pp_number();
return;
case '.':
MAYBE('#', T_HASHHASH)
ELSE('#')
+ case '\\':
+ next_char();
+ if (c == 'U' || c == 'u') {
+ put_back(c);
+ c = '\\';
+ parse_symbol();
+ } else {
+ lexer_token.kind = '\\';
+ }
+ return;
+
case '?':
case '[':
case ']':
case '~':
case ';':
case ',':
- case '\\':
lexer_token.kind = c;
next_char();
return;
default:
dollar_sign:
- errorf(&lexer_pos, "unknown character '%c' found", c);
+ errorf(&lexer_pos, "unknown character '%lc' found", c);
next_char();
break;
}