#include <config.h>
-#include "lexer_t.h"
+#include "lexer.h"
#include "token_t.h"
#include "symbol_table_t.h"
#include "adt/error.h"
+#include "adt/strset.h"
#include <assert.h>
#include <errno.h>
//#define DEBUG_CHARS
#define MAX_PUTBACK 3
-static int c;
-source_position_t source_position;
-static FILE *input;
-static char buf[1027];
-static const char *bufend;
-static const char *bufpos;
-static strset_t stringset;
-//static FILE **input_stack;
-//static char **buf_stack;
+/* Character currently being examined by the lexer. */
+static int c;
+/* Token most recently produced by the lexer; its source_position member also
+ * holds the current input name and line number. */
+token_t lexer_token;
+/* Stream the lexer reads from; assigned in lexer_open_stream(). */
+static FILE *input;
+/* Read buffer. The MAX_PUTBACK extra bytes are presumably reserved for
+ * put_back() -- TODO confirm against the buffer refill code. */
+static char buf[1024 + MAX_PUTBACK];
+/* bufpos is the read cursor advanced by next_char(); bufend is the limit it
+ * is compared against. */
+static const char *bufend;
+static const char *bufpos;
+/* String set that concat_strings() inserts its results into. */
+static strset_t stringset;
+//static FILE **input_stack;
+//static char **buf_stack;
static
void error_prefix_at(const char *input_name, unsigned linenr)
}
+/*
+ * Calls error_prefix_at() with the input name and line number currently
+ * stored in lexer_token.source_position.
+ */
static
-void error_prefix()
+void error_prefix(void)
{
-	error_prefix_at(source_position.input_name, source_position.linenr);
+	const source_position_t *pos = &lexer_token.source_position;
+
+	error_prefix_at(pos->input_name, pos->linenr);
}
static
}
static inline
-void next_char()
+void next_char(void)
{
bufpos++;
if(bufpos >= bufend) {
next_char(); \
if(c == '\n') \
next_char(); \
- source_position.linenr++; \
+ lexer_token.source_position.linenr++; \
newline_code; \
} else if(c == '\n') { \
next_char(); \
- source_position.linenr++; \
+ lexer_token.source_position.linenr++; \
newline_code; \
}
+/*
+ * Case labels for the characters that may start a symbol: the ASCII letters
+ * and '_'. Every label is written out individually, so the macro does not
+ * depend on the case-range extension. Use it inside a switch, directly
+ * followed by the statements to execute.
+ */
+#define SYMBOL_CHARS \
+	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': \
+	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': \
+	case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': \
+	case 'v': case 'w': case 'x': case 'y': case 'z': \
+	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': \
+	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': \
+	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': \
+	case 'V': case 'W': case 'X': case 'Y': case 'Z': \
+	case '_':
+
+/*
+ * Case labels for the decimal digits '0' through '9', written out one by one.
+ * Use it inside a switch, directly followed by the statements to execute.
+ */
+#define DIGITS \
+	case '0': case '1': case '2': case '3': case '4': \
+	case '5': case '6': case '7': case '8': case '9':
+
static
-void parse_symbol(token_t *token)
+void parse_symbol(void)
{
symbol_t *symbol;
char *string;
EAT_NEWLINE(break;)
goto end_symbol;
- case 'A' ... 'Z':
- case 'a' ... 'z':
- case '_':
+ DIGITS
+ SYMBOL_CHARS
obstack_1grow(&symbol_obstack, c);
next_char();
break;
string = obstack_finish(&symbol_obstack);
symbol = symbol_table_insert(string);
- if(symbol->ID > 0) {
- token->type = symbol->ID;
- } else {
- token->type = T_IDENTIFIER;
- }
- token->v.symbol = symbol;
+ lexer_token.type = symbol->ID;
+ lexer_token.v.symbol = symbol;
if(symbol->string != string) {
obstack_free(&symbol_obstack, string);
}
static
-void parse_number_hex(token_t *token)
+void parse_number_hex(void)
{
assert(c == 'x' || c == 'X');
next_char();
!('A' <= c && c <= 'F') &&
!('a' <= c && c <= 'f')) {
parse_error("premature end of hex number literal");
- token->type = T_ERROR;
+ lexer_token.type = T_ERROR;
return;
}
} else if ('a' <= c && c <= 'f') {
value = 16 * value + c - 'a' + 10;
} else {
- token->type = T_INTEGER;
- token->v.intvalue = value;
+ lexer_token.type = T_INTEGER;
+ lexer_token.v.intvalue = value;
return;
}
next_char();
}
static
-void parse_number_oct(token_t *token)
+void parse_number_oct(void)
{
assert(c == 'o' || c == 'O');
next_char();
if ('0' <= c && c <= '7') {
value = 8 * value + c - '0';
} else {
- token->type = T_INTEGER;
- token->v.intvalue = value;
+ lexer_token.type = T_INTEGER;
+ lexer_token.v.intvalue = value;
return;
}
next_char();
}
static
-void parse_number_dec(token_t *token, int first_char)
+void parse_number_dec(int first_char)
{
int value = 0;
if(first_char > 0) {
if (isdigit(c)) {
value = 10 * value + c - '0';
} else {
- token->type = T_INTEGER;
- token->v.intvalue = value;
+ lexer_token.type = T_INTEGER;
+ lexer_token.v.intvalue = value;
return;
}
next_char();
}
static
-void parse_number(token_t *token)
+void parse_number(void)
{
// TODO check for overflow
// TODO check for various invalid input sequences
next_char();
switch (c) {
case 'X':
- case 'x': parse_number_hex(token); break;
+ case 'x': parse_number_hex(); break;
case 'o':
- case 'O': parse_number_oct(token); break;
- default: parse_number_dec(token, '0');
+ case 'O': parse_number_oct(); break;
+ default: parse_number_dec('0');
}
} else {
- parse_number_dec(token, 0);
+ parse_number_dec(0);
}
}
static
-int parse_escape_sequence()
+int parse_escape_sequence(void)
{
while(1) {
int ec = c;
case 'x': /* TODO parse hex number ... */
parse_error("hex escape sequences not implemented yet");
return EOF;
- case 0 ... 8: /* TODO parse octal number ... */
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ /* TODO parse octal number ... */
parse_error("octal escape sequences not implemented yet");
return EOF;
case '?':
}
}
+/*
+ * Builds the concatenation of s1 and s2 on symbol_obstack and inserts it into
+ * stringset.
+ *
+ * Returns the pointer handed back by strset_insert(). When that is not the
+ * freshly built buffer (presumably because an equal string was already in the
+ * set), the buffer is released from the obstack again.
+ */
+const char *concat_strings(const char *s1, const char *s2)
+{
+	const size_t head_len = strlen(s1);
+	const size_t tail_len = strlen(s2);
+	char *joined = obstack_alloc(&symbol_obstack, head_len + tail_len + 1);
+
+	memcpy(joined, s1, head_len);
+	/* the +1 copies the terminating NUL of s2 as well */
+	memcpy(joined + head_len, s2, tail_len + 1);
+
+	const char *stored = strset_insert(&stringset, joined);
+	if(stored == joined)
+		return stored;
+
+	/* the set returned another pointer, our copy is not needed */
+	obstack_free(&symbol_obstack, joined);
+	return stored;
+}
+
static
-void parse_string_literal(token_t *token)
+void parse_string_literal(void)
{
- unsigned start_linenr = source_position.linenr;
+ unsigned start_linenr = lexer_token.source_position.linenr;
char *string;
const char *result;
break;
case EOF:
- error_prefix_at(source_position.input_name, start_linenr);
+ error_prefix_at(lexer_token.source_position.input_name,
+ start_linenr);
fprintf(stderr, "string has no end\n");
- token->type = T_ERROR;
+ lexer_token.type = T_ERROR;
return;
case '"':
obstack_free(&symbol_obstack, string);
}
- token->type = T_STRING_LITERAL;
- token->v.string = result;
-}
-
-#define MATCH_NEWLINE(code) \
- case '\r': \
- next_char(); \
- if(c == '\n') { \
- next_char(); \
- } \
- source_position.linenr++; \
- code; \
- case '\n': \
- next_char(); \
- source_position.linenr++; \
+ lexer_token.type = T_STRING_LITERAL;
+ lexer_token.v.string = result;
+}
+
+#define MATCH_NEWLINE(code) \
+ case '\r': \
+ next_char(); \
+ if(c == '\n') { \
+ next_char(); \
+ } \
+ lexer_token.source_position.linenr++; \
+ code; \
+ case '\n': \
+ next_char(); \
+ lexer_token.source_position.linenr++; \
code;
static
-void parse_character_constant(token_t *token)
+void parse_character_constant(void)
{
assert(c == '\'');
next_char();
case EOF:
parse_error("EOF while parsing character constant");
- token->type = T_ERROR;
+ lexer_token.type = T_ERROR;
return;
default:
}
end_of_char_constant:
- token->type = T_INTEGER;
- token->v.intvalue = found_char;
+ lexer_token.type = T_INTEGER;
+ lexer_token.v.intvalue = found_char;
}
static
void skip_multiline_comment(void)
{
- unsigned start_linenr = source_position.linenr;
+ unsigned start_linenr = lexer_token.source_position.linenr;
int had_star = 0;
while(1) {
MATCH_NEWLINE(had_star = 0; break;)
case EOF:
- error_prefix_at(source_position.input_name, start_linenr);
+ error_prefix_at(lexer_token.source_position.input_name,
+ start_linenr);
fprintf(stderr, "at end of file while looking for comment end\n");
return;
default:
next_char();
if(c == '\n') {
next_char();
- source_position.linenr++;
+ lexer_token.source_position.linenr++;
}
break;
}
}
-static
-void lexer_next_preprocessing_token(token_t *token);
+/* Copy of the token most recently fetched through next_pp_token(). */
+static token_t pp_token;
+
+/*
+ * Reads the next preprocessing token into lexer_token and copies the result
+ * to pp_token.
+ */
+static inline
+void next_pp_token(void)
+{
+ lexer_next_preprocessing_token();
+ pp_token = lexer_token;
+}
+/*
+ * Discards preprocessing tokens until pp_token is a newline or T_EOF.
+ * pp_token is tested before anything is read, so it has to hold the current
+ * token (as set by next_pp_token()) when this function is called.
+ */
static
void eat_until_newline(void)
{
-	/* TODO */
+	for(;;) {
+		if(pp_token.type == '\n' || pp_token.type == T_EOF)
+			return;
+		next_pp_token();
+	}
}
static
static
void define_directive(void)
{
- token_t temptoken;
-
- lexer_next_preprocessing_token(&temptoken);
- if(temptoken.type != T_IDENTIFIER) {
+ lexer_next_preprocessing_token();
+ if(lexer_token.type != T_IDENTIFIER) {
parse_error("expected identifier after #define\n");
eat_until_newline();
}
void ifdef_directive(int is_ifndef)
{
(void) is_ifndef;
- token_t temptoken;
- lexer_next_preprocessing_token(&temptoken);
+ lexer_next_preprocessing_token();
//expect_identifier();
//extect_newline();
}
}
+/*
+ * Handles the operands of a line directive. On entry pp_token is the first
+ * token after "#line" (or the integer directly following '#').
+ *
+ * An integer sets the line number, an optional string literal after it sets
+ * the input name; whatever else is left on the line is discarded.
+ */
static
-void found_preprocessor_identifier(symbol_t *symbol)
+void parse_line_directive(void)
+{
+	if(pp_token.type == T_INTEGER) {
+		/* one less than the given number; presumably this compensates for
+		 * the increment done when the newline ending the directive is read */
+		lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
+		next_pp_token();
+	} else {
+		parse_error("expected integer");
+	}
+
+	if(pp_token.type == T_STRING_LITERAL) {
+		lexer_token.source_position.input_name = pp_token.v.string;
+		next_pp_token();
+	}
+
+	eat_until_newline();
+}
+
+static
+void parse_preprocessor_identifier(void)
{
+ assert(pp_token.type == T_IDENTIFIER);
+ symbol_t *symbol = pp_token.v.symbol;
+
switch(symbol->pp_ID) {
case TP_include:
printf("include - enable header name parsing!\n");
case TP_endif:
endif_directive();
break;
+ case TP_line:
+ next_pp_token();
+ parse_line_directive();
+ break;
case TP_if:
case TP_else:
case TP_elif:
case TP_undef:
- case TP_line:
case TP_error:
error_directive();
break;
}
+/*
+ * Processes one preprocessor directive; called after the introducing '#' has
+ * been read. Fetches the following token into pp_token and dispatches on it:
+ *  - identifier: handled by parse_preprocessor_identifier()
+ *  - integer:    handled by parse_line_directive()
+ *  - newline:    null directive, nothing to do
+ *  - otherwise:  an error is reported and the rest of the line is skipped
+ */
static
-void parse_preprocessor_directive(token_t *result_token)
+void parse_preprocessor_directive(void)
{
-	token_t temptoken;
+	next_pp_token();
-	(void) result_token;
-	lexer_next_preprocessing_token(&temptoken);
-	switch(temptoken.type) {
+	switch(pp_token.type) {
	case T_IDENTIFIER:
-		found_preprocessor_identifier(temptoken.v.symbol);
+		parse_preprocessor_identifier();
+		break;
+	case T_INTEGER:
+		/* '#' directly followed by a number goes through the same code
+		 * as a line directive */
+		parse_line_directive();
+		break;
+	case '\n':
+		/* null directive: a '#' alone on its line has no effect and is
+		 * not an error */
+		break;
+	default:
+		parse_error("invalid preprocessor directive");
+		eat_until_newline();
		break;
	}
}
#define MAYBE(ch, set_type) \
case ch: \
next_char(); \
- token->type = set_type; \
+ lexer_token.type = set_type; \
return;
#define ELSE_CODE(code) \
#define ELSE(set_type) \
ELSE_CODE( \
- token->type = set_type; \
+ lexer_token.type = set_type; \
return; \
)
-static
-void eat_whitespace()
-{
- while(1) {
- switch(c) {
- case ' ':
- case '\t':
- next_char();
- break;
-
- case '\r':
- case '\n':
- return;
-
- case '\\':
- next_char();
- if(c == '\n') {
- next_char();
- source_position.linenr++;
- break;
- }
-
- put_back(c);
- c = '\\';
- return;
-
- SKIP_TRIGRAPHS(,
- return;
- )
-
- case '/':
- next_char();
- while(1) {
- switch(c) {
- case '*':
- next_char();
- skip_multiline_comment();
- eat_whitespace();
- return;
- case '/':
- next_char();
- skip_line_comment();
- eat_whitespace();
- return;
-
- SKIP_TRIGRAPHS(
- put_back('?');
- ,
- c = '/';
- return;
- )
-
- case '\\':
- next_char();
- EAT_NEWLINE(break;)
- /* fallthrough */
- default:
- return;
- }
- }
- break;
-
- default:
- return;
- }
- }
-}
-
-static
-void lexer_next_preprocessing_token(token_t *token)
+void lexer_next_preprocessing_token(void)
{
while(1) {
switch(c) {
break;
MATCH_NEWLINE(
- eat_whitespace();
- if(c == '#') {
- next_char();
- parse_preprocessor_directive(token);
- return;
- }
- token->type = '\n';
+ lexer_token.type = '\n';
return;
)
- case 'A' ... 'Z':
- case 'a' ... 'z':
- case '_':
- parse_symbol(token);
+ SYMBOL_CHARS
+ parse_symbol();
return;
- case '0' ... '9':
- parse_number(token);
+ DIGITS
+ parse_number();
return;
case '"':
- parse_string_literal(token);
+ parse_string_literal();
return;
case '\'':
- parse_character_constant(token);
+ parse_character_constant();
return;
case '\\':
next_char();
if(c == '\n') {
next_char();
- source_position.linenr++;
+ lexer_token.source_position.linenr++;
break;
} else {
parse_error("unexpected '\\' found");
- token->type = T_ERROR;
+ lexer_token.type = T_ERROR;
}
return;
ELSE_CODE(
put_back(c);
c = '.';
- token->type = '.';
+ lexer_token.type = '.';
return;
)
ELSE('.')
case '*':
next_char();
skip_multiline_comment();
- lexer_next_preprocessing_token(token);
+ lexer_next_preprocessing_token();
return;
case '/':
next_char();
skip_line_comment();
- lexer_next_preprocessing_token(token);
+ lexer_next_preprocessing_token();
return;
ELSE('/')
case '%':
ELSE_CODE(
put_back(c);
c = '%';
- token->type = T_PERCENTCOLON;
+ lexer_token.type = T_PERCENTCOLON;
return;
)
ELSE(T_PERCENTCOLON)
next_char();
/* just a simple ? */
if(c != '?') {
- token->type = '?';
+ lexer_token.type = '?';
return;
}
/* might be a trigraph */
}
put_back(c);
c = '?';
- token->type = '?';
+ lexer_token.type = '?';
return;
case '[':
case '~':
case ';':
case ',':
- token->type = c;
+ lexer_token.type = c;
next_char();
return;
case EOF:
- token->type = T_EOF;
+ lexer_token.type = T_EOF;
return;
default:
next_char();
error_prefix();
fprintf(stderr, "unknown character '%c' found\n", c);
- token->type = T_ERROR;
+ lexer_token.type = T_ERROR;
return;
}
}
}
+/*
+ * Stores the next token in lexer_token.
+ *
+ * Newline tokens are never handed to the caller. When a '#' is the first
+ * token found after one or more newlines, it starts a preprocessor directive
+ * that is processed here and not returned either.
+ */
-void lexer_next_token(token_t *token)
+void lexer_next_token(void)
{
+	lexer_next_preprocessing_token();
+	if(lexer_token.type != '\n')
+		return;
+
+	/* a newline was seen: skip further newlines and handle every directive
+	 * that starts a line until an ordinary token shows up */
+	for(;;) {
		do {
-			lexer_next_preprocessing_token(token);
-		} while(token->type == '\n');
+			lexer_next_preprocessing_token();
+		} while(lexer_token.type == '\n');
+
+		if(lexer_token.type != '#')
+			return;
+		parse_preprocessor_directive();
+	}
}
void init_lexer(void)
void lexer_open_stream(FILE *stream, const char *input_name)
{
- input = stream;
- source_position.linenr = 0;
- source_position.input_name = input_name;
+ input = stream;
+ lexer_token.source_position.linenr = 0;
+ lexer_token.source_position.input_name = input_name;
/* we place a virtual '\n' at the beginning so the lexer knows we're at the
* beginning of a line */
+/*
+ * Debugging aid: writes "<input_name>:<linenr>" followed by a newline to
+ * stdout and flushes the stream.
+ */
static __attribute__((unused))
void dbg_pos(const source_position_t source_position)
{
-	fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);
+	/* linenr is handled as an unsigned value elsewhere in this file, so it
+	 * is printed with %u -- NOTE(review): confirm against the declaration
+	 * of source_position_t */
+	fprintf(stdout, "%s:%u\n", source_position.input_name,
+	        source_position.linenr);
	fflush(stdout);
}