X-Git-Url: http://nsz.repo.hu/git/?a=blobdiff_plain;f=lexer.c;h=7732c9dee4415c075bb216ad0f0496bc0d0239c5;hb=3e3bf65b33ece40f474c91dac37cf8ea716a5b03;hp=406bc7a984c7883f39fde4446e839b840150feec;hpb=b4e35bf2387e9b22ae6a97bad28daa8a438483ed;p=cparser diff --git a/lexer.c b/lexer.c index 406bc7a..7732c9d 100644 --- a/lexer.c +++ b/lexer.c @@ -1,9 +1,10 @@ #include -#include "lexer_t.h" +#include "lexer.h" #include "token_t.h" #include "symbol_table_t.h" #include "adt/error.h" +#include "adt/strset.h" #include #include @@ -13,15 +14,15 @@ //#define DEBUG_CHARS #define MAX_PUTBACK 3 -static int c; -source_position_t source_position; -static FILE *input; -static char buf[1027]; -static const char *bufend; -static const char *bufpos; -static strset_t stringset; -//static FILE **input_stack; -//static char **buf_stack; +static int c; +token_t lexer_token; +static FILE *input; +static char buf[1024 + MAX_PUTBACK]; +static const char *bufend; +static const char *bufpos; +static strset_t stringset; +//static FILE **input_stack; +//static char **buf_stack; static void error_prefix_at(const char *input_name, unsigned linenr) @@ -30,9 +31,10 @@ void error_prefix_at(const char *input_name, unsigned linenr) } static -void error_prefix() +void error_prefix(void) { - error_prefix_at(source_position.input_name, source_position.linenr); + error_prefix_at(lexer_token.source_position.input_name, + lexer_token.source_position.linenr); } static @@ -43,7 +45,7 @@ void parse_error(const char *msg) } static inline -void next_char() +void next_char(void) { bufpos++; if(bufpos >= bufend) { @@ -125,16 +127,83 @@ int replace_trigraph(void) next_char(); \ if(c == '\n') \ next_char(); \ - source_position.linenr++; \ + lexer_token.source_position.linenr++; \ newline_code; \ } else if(c == '\n') { \ next_char(); \ - source_position.linenr++; \ + lexer_token.source_position.linenr++; \ newline_code; \ } +#define SYMBOL_CHARS \ + case 'a': \ + case 'b': \ + case 'c': \ + case 'd': \ + case 'e': \ + case 'f': \ + case 'g': \ + case 'h': \ + case 'i': \ + case 'j': \ + case 'k': \ + case 'l': \ + case 'm': \ + case 'n': \ + case 'o': \ + case 'p': \ + case 'q': \ + case 'r': \ + case 's': \ + case 't': \ + case 'u': \ + case 'v': \ + case 'w': \ + case 'x': \ + case 'y': \ + case 'z': \ + case 'A': \ + case 'B': \ + case 'C': \ + case 'D': \ + case 'E': \ + case 'F': \ + case 'G': \ + case 'H': \ + case 'I': \ + case 'J': \ + case 'K': \ + case 'L': \ + case 'M': \ + case 'N': \ + case 'O': \ + case 'P': \ + case 'Q': \ + case 'R': \ + case 'S': \ + case 'T': \ + case 'U': \ + case 'V': \ + case 'W': \ + case 'X': \ + case 'Y': \ + case 'Z': \ + case '_': + +#define DIGITS \ + case '0': \ + case '1': \ + case '2': \ + case '3': \ + case '4': \ + case '5': \ + case '6': \ + case '7': \ + case '8': \ + case '9': + static -void parse_symbol(token_t *token) +void parse_symbol(void) { symbol_t *symbol; char *string; @@ -149,9 +218,8 @@ void parse_symbol(token_t *token) EAT_NEWLINE(break;) goto end_symbol; - case 'A' ... 'Z': - case 'a' ... 'z': - case '_': + DIGITS + SYMBOL_CHARS obstack_1grow(&symbol_obstack, c); next_char(); break; @@ -181,12 +249,8 @@ end_symbol: string = obstack_finish(&symbol_obstack); symbol = symbol_table_insert(string); - if(symbol->ID > 0) { - token->type = symbol->ID; - } else { - token->type = T_IDENTIFIER; - } - token->v.symbol = symbol; + lexer_token.type = symbol->ID; + lexer_token.v.symbol = symbol; if(symbol->string != string) { obstack_free(&symbol_obstack, string); @@ -194,7 +258,7 @@ end_symbol: } static -void parse_number_hex(token_t *token) +void parse_number_hex(void) { assert(c == 'x' || c == 'X'); next_char(); @@ -203,7 +267,7 @@ void parse_number_hex(token_t *token) !('A' <= c && c <= 'F') && !('a' <= c && c <= 'f')) { parse_error("premature end of hex number literal"); - token->type = T_ERROR; + lexer_token.type = T_ERROR; return; } @@ -216,8 +280,8 @@ void parse_number_hex(token_t *token) } else if ('a' <= c && c <= 'f') { value = 16 * value + c - 'a' + 10; } else { - token->type = T_INTEGER; - token->v.intvalue = value; + lexer_token.type = T_INTEGER; + lexer_token.v.intvalue = value; return; } next_char(); @@ -225,7 +289,7 @@ void parse_number_hex(token_t *token) } static -void parse_number_oct(token_t *token) +void parse_number_oct(void) { assert(c == 'o' || c == 'O'); next_char(); @@ -235,8 +299,8 @@ void parse_number_oct(token_t *token) if ('0' <= c && c <= '7') { value = 8 * value + c - '0'; } else { - token->type = T_INTEGER; - token->v.intvalue = value; + lexer_token.type = T_INTEGER; + lexer_token.v.intvalue = value; return; } next_char(); @@ -244,7 +308,7 @@ void parse_number_oct(token_t *token) } static -void parse_number_dec(token_t *token, int first_char) +void parse_number_dec(int first_char) { int value = 0; if(first_char > 0) { @@ -256,8 +320,8 @@ void parse_number_dec(token_t *token, int first_char) if (isdigit(c)) { value = 10 * value + c - '0'; } else { - token->type = T_INTEGER; - token->v.intvalue = value; + lexer_token.type = T_INTEGER; + lexer_token.v.intvalue = value; return; } next_char(); @@ -265,7 +329,7 @@ void parse_number_dec(token_t *token, int first_char) } static -void parse_number(token_t *token) +void parse_number(void) { // TODO check for overflow // TODO check for various invalid inputs sequences @@ -274,18 +338,18 @@ void parse_number(token_t *token) next_char(); switch (c) { case 'X': - case 'x': parse_number_hex(token); break; + case 'x': parse_number_hex(); break; case 'o': - case 'O': parse_number_oct(token); break; - default: parse_number_dec(token, '0'); + case 'O': parse_number_oct(); break; + default: parse_number_dec('0'); } } else { - parse_number_dec(token, 0); + parse_number_dec(0); } } static -int parse_escape_sequence() +int parse_escape_sequence(void) { while(1) { int ec = c; @@ -307,7 +371,15 @@ int parse_escape_sequence() case 'x': /* TODO parse hex number ... */ parse_error("hex escape sequences not implemented yet"); return EOF; - case 0 ... 8: /* TODO parse octal number ... */ + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + /* TODO parse octal number ... */ parse_error("octal escape sequences not implemented yet"); return EOF; case '?': @@ -333,10 +405,27 @@ int parse_escape_sequence() } } +const char *concat_strings(const char *s1, const char *s2) +{ + size_t len1 = strlen(s1); + size_t len2 = strlen(s2); + + char *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1); + memcpy(concat, s1, len1); + memcpy(concat + len1, s2, len2 + 1); + + const char *result = strset_insert(&stringset, concat); + if(result != concat) { + obstack_free(&symbol_obstack, concat); + } + + return result; +} + static -void parse_string_literal(token_t *token) +void parse_string_literal(void) { - unsigned start_linenr = source_position.linenr; + unsigned start_linenr = lexer_token.source_position.linenr; char *string; const char *result; @@ -359,9 +448,10 @@ void parse_string_literal(token_t *token) break; case EOF: - error_prefix_at(source_position.input_name, start_linenr); + error_prefix_at(lexer_token.source_position.input_name, + start_linenr); fprintf(stderr, "string has no end\n"); - token->type = T_ERROR; + lexer_token.type = T_ERROR; return; case '"': @@ -389,25 +479,25 @@ end_of_string: obstack_free(&symbol_obstack, string); } - token->type = T_STRING_LITERAL; - token->v.string = result; -} - -#define MATCH_NEWLINE(code) \ - case '\r': \ - next_char(); \ - if(c == '\n') { \ - next_char(); \ - } \ - source_position.linenr++; \ - code; \ - case '\n': \ - next_char(); \ - source_position.linenr++; \ + lexer_token.type = T_STRING_LITERAL; + lexer_token.v.string = result; +} + +#define MATCH_NEWLINE(code) \ + case '\r': \ + next_char(); \ + if(c == '\n') { \ + next_char(); \ + } \ + lexer_token.source_position.linenr++; \ + code; \ + case '\n': \ + next_char(); \ + lexer_token.source_position.linenr++; \ code; static -void parse_character_constant(token_t *token) +void parse_character_constant(void) { assert(c == '\''); next_char(); @@ -437,7 +527,7 @@ void parse_character_constant(token_t *token) case EOF: parse_error("EOF while parsing character constant"); - token->type = T_ERROR; + lexer_token.type = T_ERROR; return; default: @@ -454,14 +544,14 @@ void parse_character_constant(token_t *token) } end_of_char_constant: - token->type = T_INTEGER; - token->v.intvalue = found_char; + lexer_token.type = T_INTEGER; + lexer_token.v.intvalue = found_char; } static void skip_multiline_comment(void) { - unsigned start_linenr = source_position.linenr; + unsigned start_linenr = lexer_token.source_position.linenr; int had_star = 0; while(1) { @@ -504,7 +594,8 @@ void skip_multiline_comment(void) MATCH_NEWLINE(had_star = 0; break;) case EOF: - error_prefix_at(source_position.input_name, start_linenr); + error_prefix_at(lexer_token.source_position.input_name, + start_linenr); fprintf(stderr, "at end of file while looking for comment end\n"); return; default: @@ -536,7 +627,7 @@ void skip_line_comment(void) next_char(); if(c == '\n') { next_char(); - source_position.linenr++; + lexer_token.source_position.linenr++; } break; @@ -552,13 +643,21 @@ void skip_line_comment(void) } } -static -void lexer_next_preprocessing_token(token_t *token); +static token_t pp_token; + +static inline +void next_pp_token(void) +{ + lexer_next_preprocessing_token(); + pp_token = lexer_token; +} static void eat_until_newline(void) { - /* TODO */ + while(pp_token.type != '\n' && pp_token.type != T_EOF) { + next_pp_token(); + } } static @@ -573,10 +672,8 @@ void error_directive(void) static void define_directive(void) { - token_t temptoken; - - lexer_next_preprocessing_token(&temptoken); - if(temptoken.type != T_IDENTIFIER) { + lexer_next_preprocessing_token(); + if(lexer_token.type != T_IDENTIFIER) { parse_error("expected identifier after #define\n"); eat_until_newline(); } @@ -586,8 +683,7 @@ static void ifdef_directive(int is_ifndef) { (void) is_ifndef; - token_t temptoken; - lexer_next_preprocessing_token(&temptoken); + lexer_next_preprocessing_token(); //expect_identifier(); //extect_newline(); } @@ -599,8 +695,28 @@ void endif_directive(void) } static -void found_preprocessor_identifier(symbol_t *symbol) +void parse_line_directive(void) +{ + if(pp_token.type != T_INTEGER) { + parse_error("expected integer"); + } else { + lexer_token.source_position.linenr = pp_token.v.intvalue - 1; + next_pp_token(); + } + if(pp_token.type == T_STRING_LITERAL) { + lexer_token.source_position.input_name = pp_token.v.string; + next_pp_token(); + } + + eat_until_newline(); +} + +static +void parse_preprocessor_identifier(void) { + assert(pp_token.type == T_IDENTIFIER); + symbol_t *symbol = pp_token.v.symbol; + switch(symbol->pp_ID) { case TP_include: printf("include - enable header name parsing!\n"); @@ -617,11 +733,14 @@ void found_preprocessor_identifier(symbol_t *symbol) case TP_endif: endif_directive(); break; + case TP_line: + next_pp_token(); + parse_line_directive(); + break; case TP_if: case TP_else: case TP_elif: case TP_undef: - case TP_line: case TP_error: error_directive(); break; @@ -631,15 +750,20 @@ void found_preprocessor_identifier(symbol_t *symbol) } static -void parse_preprocessor_directive(token_t *result_token) +void parse_preprocessor_directive() { - token_t temptoken; + next_pp_token(); - (void) result_token; - lexer_next_preprocessing_token(&temptoken); - switch(temptoken.type) { + switch(pp_token.type) { case T_IDENTIFIER: - found_preprocessor_identifier(temptoken.v.symbol); + parse_preprocessor_identifier(); + break; + case T_INTEGER: + parse_line_directive(); + break; + default: + parse_error("invalid preprocessor directive"); + eat_until_newline(); break; } } @@ -652,7 +776,7 @@ void parse_preprocessor_directive(token_t *result_token) #define MAYBE(ch, set_type) \ case ch: \ next_char(); \ - token->type = set_type; \ + lexer_token.type = set_type; \ return; #define ELSE_CODE(code) \ @@ -672,80 +796,11 @@ void parse_preprocessor_directive(token_t *result_token) #define ELSE(set_type) \ ELSE_CODE( \ - token->type = set_type; \ + lexer_token.type = set_type; \ return; \ ) -static -void eat_whitespace() -{ - while(1) { - switch(c) { - case ' ': - case '\t': - next_char(); - break; - - case '\r': - case '\n': - return; - - case '\\': - next_char(); - if(c == '\n') { - next_char(); - source_position.linenr++; - break; - } - - put_back(c); - c = '\\'; - return; - - SKIP_TRIGRAPHS(, - return; - ) - - case '/': - next_char(); - while(1) { - switch(c) { - case '*': - next_char(); - skip_multiline_comment(); - eat_whitespace(); - return; - case '/': - next_char(); - skip_line_comment(); - eat_whitespace(); - return; - - SKIP_TRIGRAPHS( - put_back('?'); - , - c = '/'; - return; - ) - - case '\\': - next_char(); - EAT_NEWLINE(break;) - /* fallthrough */ - default: - return; - } - } - break; - - default: - return; - } - } -} - -static -void lexer_next_preprocessing_token(token_t *token) +void lexer_next_preprocessing_token(void) { while(1) { switch(c) { @@ -755,43 +810,35 @@ void lexer_next_preprocessing_token(token_t *token) break; MATCH_NEWLINE( - eat_whitespace(); - if(c == '#') { - next_char(); - parse_preprocessor_directive(token); - return; - } - token->type = '\n'; + lexer_token.type = '\n'; return; ) - case 'A' ... 'Z': - case 'a' ... 'z': - case '_': - parse_symbol(token); + SYMBOL_CHARS + parse_symbol(); return; - case '0' ... '9': - parse_number(token); + DIGITS + parse_number(); return; case '"': - parse_string_literal(token); + parse_string_literal(); return; case '\'': - parse_character_constant(token); + parse_character_constant(); return; case '\\': next_char(); if(c == '\n') { next_char(); - source_position.linenr++; + lexer_token.source_position.linenr++; break; } else { parse_error("unexpected '\\' found"); - token->type = T_ERROR; + lexer_token.type = T_ERROR; } return; @@ -803,7 +850,7 @@ void lexer_next_preprocessing_token(token_t *token) ELSE_CODE( put_back(c); c = '.'; - token->type = '.'; + lexer_token.type = '.'; return; ) ELSE('.') @@ -836,12 +883,12 @@ void lexer_next_preprocessing_token(token_t *token) case '*': next_char(); skip_multiline_comment(); - lexer_next_preprocessing_token(token); + lexer_next_preprocessing_token(); return; case '/': next_char(); skip_line_comment(); - lexer_next_preprocessing_token(token); + lexer_next_preprocessing_token(); return; ELSE('/') case '%': @@ -856,7 +903,7 @@ void lexer_next_preprocessing_token(token_t *token) ELSE_CODE( put_back(c); c = '%'; - token->type = T_PERCENTCOLON; + lexer_token.type = T_PERCENTCOLON; return; ) ELSE(T_PERCENTCOLON) @@ -903,7 +950,7 @@ void lexer_next_preprocessing_token(token_t *token) next_char(); /* just a simple ? */ if(c != '?') { - token->type = '?'; + lexer_token.type = '?'; return; } /* might be a trigraph */ @@ -913,7 +960,7 @@ void lexer_next_preprocessing_token(token_t *token) } put_back(c); c = '?'; - token->type = '?'; + lexer_token.type = '?'; return; case '[': @@ -925,29 +972,39 @@ void lexer_next_preprocessing_token(token_t *token) case '~': case ';': case ',': - token->type = c; + lexer_token.type = c; next_char(); return; case EOF: - token->type = T_EOF; + lexer_token.type = T_EOF; return; default: next_char(); error_prefix(); fprintf(stderr, "unknown character '%c' found\n", c); - token->type = T_ERROR; + lexer_token.type = T_ERROR; return; } } } -void lexer_next_token(token_t *token) +void lexer_next_token(void) { + lexer_next_preprocessing_token(); + if(lexer_token.type != '\n') + return; + +newline_found: do { - lexer_next_preprocessing_token(token); - } while(token->type == '\n'); + lexer_next_preprocessing_token(); + } while(lexer_token.type == '\n'); + + if(lexer_token.type == '#') { + parse_preprocessor_directive(); + goto newline_found; + } } void init_lexer(void) @@ -957,9 +1014,9 @@ void init_lexer(void) void lexer_open_stream(FILE *stream, const char *input_name) { - input = stream; - source_position.linenr = 0; - source_position.input_name = input_name; + input = stream; + lexer_token.source_position.linenr = 0; + lexer_token.source_position.input_name = input_name; /* we place a virtual '\n' at the beginning so the lexer knows we're at the * beginning of a line */ @@ -974,6 +1031,7 @@ void exit_lexer(void) static __attribute__((unused)) void dbg_pos(const source_position_t source_position) { - fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr); + fprintf(stdout, "%s:%d\n", source_position.input_name, + source_position.linenr); fflush(stdout); }