nsz Git - cparser/blob - lexer.c

   1 #include <config.h>
   2
   3 #include "lexer_t.h"
   4 #include "token_t.h"
   5 #include "symbol_table_t.h"
   6 #include "adt/error.h"
   7
   8 #include <assert.h>
   9 #include <errno.h>
  10 #include <string.h>
  11 #include <ctype.h>
  12
  13 //#define DEBUG_CHARS
  14 #define MAX_PUTBACK 1
  15
  16 static
  17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
  18 {
  19         (void) this;
  20         fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
  21 }
  22
  23 static
  24 void error_prefix(lexer_t *this)
  25 {
  26         error_prefix_at(this, this->source_position.input_name,
  27                         this->source_position.linenr);
  28 }
  29
  30 static
  31 void parse_error(lexer_t *this, const char *msg)
  32 {
  33         error_prefix(this);
  34         fprintf(stderr, "%s\n", msg);
  35 }
  36
  37 static inline
  38 void next_char(lexer_t *this)
  39 {
  40         this->bufpos++;
  41         if(this->bufpos >= this->bufend) {
  42                 size_t s = fread(this->buf + MAX_PUTBACK, 1,
  43                                  sizeof(this->buf) - MAX_PUTBACK, this->input);
  44                 if(s == 0) {
  45                         this->c = EOF;
  46                         return;
  47                 }
  48                 this->bufpos = this->buf + MAX_PUTBACK;
  49                 this->bufend = this->buf + MAX_PUTBACK + s;
  50         }
  51         this->c = *(this->bufpos);
  52 #ifdef DEBUG_CHARS
  53         printf("nchar '%c'\n", this->c);
  54 #endif
  55 }
  56
  57 static inline
  58 void put_back(lexer_t *this, int c)
  59 {
  60         char *p = (char*) this->bufpos - 1;
  61         this->bufpos--;
  62         assert(p >= this->buf);
  63         *p = c;
  64
  65 #ifdef DEBUG_CHARS
  66         printf("putback '%c'\n", c);
  67 #endif
  68 }
  69
  70 static
  71 int replace_trigraph(lexer_t *this)
  72 {
  73 #define MATCH_TRIGRAPH(ch,replacement)           \
  74         case ch:                                     \
  75                 this->c = replacement;                   \
  76                 return 1;
  77
  78         switch(this->c) {
  79         MATCH_TRIGRAPH('=', '#')
  80         MATCH_TRIGRAPH('(', '[')
  81         MATCH_TRIGRAPH('/', '\\')
  82         MATCH_TRIGRAPH(')', ']')
  83         MATCH_TRIGRAPH('\'', '^')
  84         MATCH_TRIGRAPH('<', '{')
  85         MATCH_TRIGRAPH('!', '|')
  86         MATCH_TRIGRAPH('>', '}')
  87         MATCH_TRIGRAPH('-', '~')
  88         default:
  89                 break;
  90         }
  91
  92         return 0;
  93 }
  94
  95 static
  96 void parse_symbol(lexer_t *this, token_t *token)
  97 {
  98         symbol_t *symbol;
  99         char     *string;
 100
 101         obstack_1grow(&symbol_obstack, this->c);
 102         next_char(this);
 103
 104         while(1) {
 105                 switch(this->c) {
 106                 case '\\':
 107                         next_char(this);
 108                         if(this->c == '\n') {
 109                                 next_char(this);
 110                                 this->source_position.linenr++;
 111                                 break;
 112                         }
 113
 114                 case 'A' ... 'Z':
 115                 case 'a' ... 'z':
 116                 case '_':
 117                         obstack_1grow(&symbol_obstack, this->c);
 118                         next_char(this);
 119                         break;
 120
 121                 case '?':
 122                         next_char(this);
 123                         if(this->c != '?') {
 124                                 put_back(this, this->c);
 125                                 this->c = '?';
 126                                 goto end_symbol;
 127                         }
 128                         next_char(this);
 129                         if(replace_trigraph(this))
 130                                 break;
 131                         put_back(this, '?');
 132                         put_back(this, this->c);
 133                         this->c = '?';
 134                         goto end_symbol;
 135
 136                 default:
 137                         goto end_symbol;
 138                 }
 139         }
 140 end_symbol:
 141         obstack_1grow(&symbol_obstack, '\0');
 142
 143         string = obstack_finish(&symbol_obstack);
 144         symbol = symbol_table_insert(string);
 145
 146         if(symbol->ID > 0) {
 147                 token->type = symbol->ID;
 148         } else {
 149                 token->type = T_IDENTIFIER;
 150         }
 151         token->v.symbol = symbol;
 152
 153         if(symbol->string != string) {
 154                 obstack_free(&symbol_obstack, string);
 155         }
 156 }
 157
 158 #if 0
 159 static
 160 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
 161 {
 162         do {
 163                 obstack_1grow(&symbol_obstack, this->c);
 164                 next_char(this);
 165         } while(is_ident_char(this->c));
 166         obstack_1grow(&symbol_obstack, '\0');
 167
 168         char     *string = obstack_finish(&symbol_obstack);
 169         symbol_t *symbol = preprocessor_symbol_table_find(string);
 170         obstack_free(&symbol_obstack, string);
 171
 172         if(symbol == 0)
 173                 return TP_ERROR;
 174
 175         return symbol->ID;
 176 }
 177 #endif
 178
 179 static
 180 void parse_number_hex(lexer_t *this, token_t *token)
 181 {
 182         assert(this->c == 'x' || this->c == 'X');
 183         next_char(this);
 184
 185         if (!isdigit(this->c) &&
 186                 !('A' <= this->c && this->c <= 'F') &&
 187                 !('a' <= this->c && this->c <= 'f')) {
 188                 parse_error(this, "premature end of hex number literal");
 189                 token->type = T_ERROR;
 190                 return;
 191         }
 192
 193         int value = 0;
 194         for(;;) {
 195                 if (isdigit(this->c)) {
 196                         value = 16 * value + this->c - '0';
 197                 } else if ('A' <= this->c && this->c <= 'F') {
 198                         value = 16 * value + this->c - 'A' + 10;
 199                 } else if ('a' <= this->c && this->c <= 'f') {
 200                         value = 16 * value + this->c - 'a' + 10;
 201                 } else {
 202                         token->type     = T_INTEGER;
 203                         token->v.intvalue = value;
 204                         return;
 205                 }
 206                 next_char(this);
 207         }
 208 }
 209
 210 static
 211 void parse_number_oct(lexer_t *this, token_t *token)
 212 {
 213         assert(this->c == 'o' || this->c == 'O');
 214         next_char(this);
 215
 216         int value = 0;
 217         for(;;) {
 218                 if ('0' <= this->c && this->c <= '7') {
 219                         value = 8 * value + this->c - '0';
 220                 } else {
 221                         token->type     = T_INTEGER;
 222                         token->v.intvalue = value;
 223                         return;
 224                 }
 225                 next_char(this);
 226         }
 227 }
 228
 229 static
 230 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
 231 {
 232         int value = 0;
 233         if(first_char > 0) {
 234                 assert(first_char >= '0' && first_char <= '9');
 235                 value = first_char - '0';
 236         }
 237
 238         for(;;) {
 239                 if (isdigit(this->c)) {
 240                         value = 10 * value + this->c - '0';
 241                 } else {
 242                         token->type     = T_INTEGER;
 243                         token->v.intvalue = value;
 244                         return;
 245                 }
 246                 next_char(this);
 247         }
 248 }
 249
 250 static
 251 void parse_number(lexer_t *this, token_t *token)
 252 {
 253         // TODO check for overflow
 254         // TODO check for various invalid inputs sequences
 255
 256         if (this->c == '0') {
 257                 next_char(this);
 258                 switch (this->c) {
 259                         case 'X':
 260                         case 'x': parse_number_hex(this, token); break;
 261                         case 'o':
 262                         case 'O': parse_number_oct(this, token); break;
 263                         default:  parse_number_dec(this, token, '0');
 264                 }
 265         } else {
 266                 parse_number_dec(this, token, 0);
 267         }
 268 }
 269
 270 static
 271 int parse_escape_sequence(lexer_t *this)
 272 {
 273         int c = this->c;
 274         next_char(this);
 275
 276         switch(c) {
 277         case '"': return '"';
 278         case '\'': return'\'';
 279         case '\\':
 280                 if(this->c == '\n') {
 281                         this->source_position.linenr++;
 282                         next_char(this);
 283                         return parse_escape_sequence(this);
 284                 }
 285                 return '\\';
 286         case 'a': return '\a';
 287         case 'b': return '\b';
 288         case 'f': return '\f';
 289         case 'n': return '\n';
 290         case 'r': return '\r';
 291         case 't': return '\t';
 292         case 'v': return '\v';
 293         case 'x': /* TODO parse hex number ... */
 294                 parse_error(this, "hex escape sequences not implemented yet");
 295                 return EOF;
 296         case 0 ... 8: /* TODO parse octal number ... */
 297                 parse_error(this, "octal escape sequences not implemented yet");
 298                 return EOF;
 299         case '?':
 300                 if(this->c != '?') {
 301                         return '?';
 302                 }
 303                 /* might be a trigraph */
 304                 next_char(this);
 305                 if(replace_trigraph(this)) {
 306                         return parse_escape_sequence(this);
 307                 }
 308                 put_back(this, '?');
 309                 return '?';
 310
 311         case EOF:
 312                 parse_error(this, "reached end of file while parsing escape sequence");
 313                 return EOF;
 314         default:
 315                 parse_error(this, "unknown escape sequence\n");
 316                 return EOF;
 317         }
 318 }
 319
 320 static
 321 void parse_string_literal(lexer_t *this, token_t *token)
 322 {
 323         unsigned    start_linenr = this->source_position.linenr;
 324         char       *string;
 325         const char *result;
 326
 327         assert(this->c == '"');
 328         next_char(this);
 329
 330         while(1) {
 331                 switch(this->c) {
 332                 case '?':
 333                         next_char(this);
 334                         if(this->c != '?') {
 335                                 obstack_1grow(&symbol_obstack, '?');
 336                                 break;
 337                         }
 338                         next_char(this);
 339                         if(replace_trigraph(this))
 340                                 break;
 341                         obstack_1grow(&symbol_obstack, '?');
 342                         put_back(this, this->c);
 343                         this->c = '?';
 344                         break;
 345
 346                 case '\\':
 347                         next_char(this);
 348                         if(this->c == '\n') {
 349                                 next_char(this);
 350                                 this->source_position.linenr++;
 351                                 break;
 352                         }
 353                         int c = parse_escape_sequence(this);
 354                         obstack_1grow(&symbol_obstack, c);
 355                         break;
 356
 357                 case EOF:
 358                         error_prefix_at(this, this->source_position.input_name,
 359                                         start_linenr);
 360                         fprintf(stderr, "string has no end\n");
 361                         token->type = T_ERROR;
 362                         return;
 363
 364                 case '"':
 365                         next_char(this);
 366                         goto end_of_string;
 367
 368                 default:
 369                         obstack_1grow(&symbol_obstack, this->c);
 370                         next_char(this);
 371                         break;
 372                 }
 373         }
 374
 375 end_of_string:
 376
 377         /* TODO: concatenate multiple strings separated by whitespace... */
 378
 379         /* add finishing 0 to the string */
 380         obstack_1grow(&symbol_obstack, '\0');
 381         string = obstack_finish(&symbol_obstack);
 382
 383         /* check if there is already a copy of the string */
 384         result = strset_insert(&this->stringset, string);
 385         if(result != string) {
 386                 obstack_free(&symbol_obstack, string);
 387         }
 388
 389         token->type     = T_STRING_LITERAL;
 390         token->v.string = result;
 391 }
 392
 393 static
 394 void skip_multiline_comment(lexer_t *this)
 395 {
 396         unsigned start_linenr = this->source_position.linenr;
 397
 398         while(1) {
 399                 switch(this->c) {
 400                 case '*':
 401                         next_char(this);
 402                         if(this->c == '/') {
 403                                 next_char(this);
 404                                 return;
 405                         }
 406                         break;
 407                 case '?':
 408                         next_char(this);
 409                         if(this->c != '?')
 410                                 break;
 411                         next_char(this);
 412                         if(replace_trigraph(this))
 413                                 break;
 414                         put_back(this, '?');
 415                         /* we don't put back the 2nd ? as the comment text is discarded
 416                          * anyway */
 417                         break;
 418
 419                 case '\n':
 420                         this->source_position.linenr++;
 421                         next_char(this);
 422                         break;
 423                 case EOF:
 424                         error_prefix_at(this, this->source_position.input_name,
 425                                         start_linenr);
 426                         fprintf(stderr, "at end of file while looking for comment end\n");
 427                         return;
 428                 default:
 429                         next_char(this);
 430                         break;
 431                 }
 432         }
 433 }
 434
 435 static
 436 void skip_line_comment(lexer_t *this)
 437 {
 438         while(1) {
 439                 switch(this->c) {
 440                 case '?':
 441                         next_char(this);
 442                         if(this->c != '?')
 443                                 break;
 444                         next_char(this);
 445                         if(replace_trigraph(this))
 446                                 break;
 447                         put_back(this, '?');
 448                         /* we don't put back the 2nd ? as the comment text is discarded
 449                          * anyway */
 450                         break;
 451
 452                 case '\\':
 453                         next_char(this);
 454                         if(this->c == '\n') {
 455                                 next_char(this);
 456                                 this->source_position.linenr++;
 457                         }
 458                         break;
 459
 460                 case EOF:
 461                 case '\n':
 462                         return;
 463                 default:
 464                         next_char(this);
 465                         break;
 466                 }
 467         }
 468 }
 469
 470 #if 0
 471 static
 472 void eat_until_newline(lexer_t *this)
 473 {
 474         while(this->c != '\n') {
 475                 next_char(this);
 476         }
 477 }
 478 #endif
 479
 480 #if 0
 481 static
 482 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
 483 {
 484         (void) result_token;
 485         /* skip whitespaces */
 486         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
 487                 next_char(this);
 488         }
 489 }
 490 #endif
 491
 492 void preprocessor_next_token(lexer_t *this, token_t *token)
 493 {
 494         /* skip whitespaces */
 495         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
 496                 next_char(this);
 497         }
 498
 499         switch(this->c) {
 500         case 'A' ... 'Z':
 501         case 'a' ... 'z':
 502         case '_':
 503                 parse_symbol(this, token);
 504         }
 505 }
 506
 507 void lexer_next_token(lexer_t *this, token_t *token)
 508 {
 509         int line_begin = 0;
 510
 511         /* skip whitespaces */
 512         while(this->c == ' ' || this->c == '\t' || this->c == '\n'
 513               || this->c == '\r') {
 514                 if(this->c == '\n') {
 515                         line_begin = 1;
 516                         this->source_position.linenr++;
 517                 }
 518                 next_char(this);
 519         }
 520
 521         switch(this->c) {
 522         case 'A' ... 'Z':
 523         case 'a' ... 'z':
 524         case '_':
 525                 parse_symbol(this, token);
 526                 break;
 527
 528         case '0' ... '9':
 529                 parse_number(this, token);
 530                 break;
 531
 532         case '"':
 533                 parse_string_literal(this, token);
 534                 break;
 535
 536         case '\'':
 537                 next_char(this);
 538                 if(this->c == '\\') {
 539                         next_char(this);
 540                         token->type       = T_INTEGER;
 541                         token->v.intvalue = parse_escape_sequence(this);
 542                 } else {
 543                         if(this->c == '\n') {
 544                                 parse_error(this, "newline while parsing character constant");
 545                                 this->source_position.linenr++;
 546                         }
 547                         token->type       = T_INTEGER;
 548                         token->v.intvalue = this->c;
 549                         next_char(this);
 550                 }
 551                 if(this->c != '\'') {
 552                         parse_error(this, "multibyte character constant");
 553                         token->type = T_ERROR;
 554                 } else {
 555                         next_char(this);
 556                 }
 557                 break;
 558
 559         case '\\':
 560                 next_char(this);
 561                 if(this->c == '\n') {
 562                         next_char(this);
 563                         this->source_position.linenr++;
 564                         lexer_next_token(this, token);
 565                         return;
 566                 } else {
 567                         parse_error(this, "unexpected '\\' found");
 568                         token->type = T_ERROR;
 569                 }
 570                 break;
 571
 572 #define MAYBE1(ch, set_type)                           \
 573                 next_char(this);                               \
 574                 while(1) {                                     \
 575                         switch(this->c) {                          \
 576                         case ch:                                   \
 577                                 next_char(this);                       \
 578                                 token->type = set_type;                \
 579                                 return;                                \
 580
 581 #define MAYBE(ch, set_type)                            \
 582                         case ch:                                   \
 583                                 next_char(this);                       \
 584                                 token->type = set_type;                \
 585                                 return;
 586
 587 #define ELSE(set_type)                                 \
 588                         case '?':                                  \
 589                                 next_char(this);                       \
 590                                 if(this->c != '?') {                   \
 591                                         put_back(this, this->c);           \
 592                                         this->c = '?';                     \
 593                                         token->type = set_type;            \
 594                                         return;                            \
 595                                 }                                      \
 596                                 next_char(this);                       \
 597                                 if(replace_trigraph(this))             \
 598                                         break;                             \
 599                                 put_back(this, '?');                   \
 600                                 put_back(this, this->c);               \
 601                                 this->c = '?';                         \
 602                                 token->type = set_type;                \
 603                                 return;                                \
 604                                                        \
 605                         case '\\':                                 \
 606                                 next_char(this);                       \
 607                                 if(this->c == '\n') {                  \
 608                                         next_char(this);                   \
 609                                         this->source_position.linenr++;    \
 610                                         break;                             \
 611                                 }                                      \
 612                                 /* fallthrough */                      \
 613                         default:                                   \
 614                                 token->type = set_type;                \
 615                                 return;                                \
 616                         }                                          \
 617                 } /* end of while(1) */                        \
 618                 break;
 619
 620         case '.':
 621                 next_char(this);
 622                 if(this->c == '.') {
 623                         next_char(this);
 624                         if(this->c == '.') {
 625                                 next_char(this);
 626                                 token->type = T_DOTDOTDOT;
 627                         } else {
 628                                 put_back(this, '.');
 629                                 token->type = '.';
 630                         }
 631                 } else {
 632                         token->type = '.';
 633                 }
 634                 break;
 635         case '&':
 636                 MAYBE1('&', T_ANDAND)
 637                 MAYBE('=', T_ANDEQUAL)
 638                 ELSE('&')
 639         case '*':
 640                 MAYBE1('=', T_ASTERISKEQUAL)
 641                 ELSE('*')
 642         case '+':
 643                 MAYBE1('+', T_PLUSPLUS)
 644                 MAYBE('=', T_PLUSEQUAL)
 645                 ELSE('+')
 646         case '-':
 647                 MAYBE1('-', T_MINUSMINUS)
 648                 MAYBE('=', T_MINUSEQUAL)
 649                 ELSE('-')
 650         case '!':
 651                 MAYBE1('=', T_EXCLAMATIONMARKEQUAL)
 652                 ELSE('!')
 653         case '/':
 654                 MAYBE1('=', T_SLASHEQUAL)
 655                         case '*':
 656                                 next_char(this);
 657                                 skip_multiline_comment(this);
 658                                 lexer_next_token(this, token);
 659                                 return;
 660                         case '/':
 661                                 next_char(this);
 662                                 skip_line_comment(this);
 663                                 lexer_next_token(this, token);
 664                                 return;
 665                 ELSE('/')
 666         case '%':
 667                 MAYBE1('=', T_PERCENTEQUAL)
 668                         case ':':
 669                                 /* TODO find trigraphs... */
 670                                 next_char(this);
 671                                 if(this->c == '%') {
 672                                         next_char(this);
 673                                         if(this->c == ':') {
 674                                                 next_char(this);
 675                                                 token->type = T_PERCENTCOLONPERCENTCOLON;
 676                                         } else {
 677                                                 put_back(this, '%');
 678                                                 token->type = T_PERCENTCOLON;
 679                                         }
 680                                         return;
 681                                 }
 682                                 token->type = T_PERCENTCOLON;
 683                                 return;
 684                 MAYBE('>', T_PERCENTGREATER)
 685                 ELSE('%')
 686         case '<':
 687                 MAYBE1(':', T_LESSCOLON)
 688                 MAYBE('%', T_LESSPERCENT)
 689                         case '<':
 690                                 /* TODO trigraphs... */
 691                                 next_char(this);
 692                                 if(this->c == '<') {
 693                                         next_char(this);
 694                                         if(this->c == '=') {
 695                                                 next_char(this);
 696                                                 token->type = T_LESSLESSEQUAL;
 697                                         } else {
 698                                                 token->type = T_LESSLESS;
 699                                         }
 700                                 } else {
 701                                         token->type = T_LESS;
 702                                 }
 703                                 return;
 704                 ELSE('<')
 705         case '>':
 706                 next_char(this);
 707                 while(1) {
 708                         switch(this->c) {
 709                         case '>':
 710                                 next_char(this);
 711                                 /* TODO trigraphs */
 712                                 if(this->c == '=') {
 713                                         next_char(this);
 714                                         token->type = T_GREATERGREATEREQUAL;
 715                                 } else {
 716                                         token->type = T_GREATERGREATER;
 717                                 }
 718                                 break;
 719                 ELSE('>')
 720         case '^':
 721                 MAYBE1('=', T_CARETEQUAL)
 722                 ELSE('^')
 723         case '|':
 724                 MAYBE1('=', T_PIPEEQUAL)
 725                 MAYBE('|', T_PIPEPIPE)
 726                 ELSE('|')
 727         case ':':
 728                 MAYBE1('>', T_COLONGREATER)
 729                 ELSE(':')
 730         case '=':
 731                 MAYBE1('=', T_EQUALEQUAL)
 732                 ELSE('=')
 733         case '#':
 734                 MAYBE1('#', T_HASHHASH)
 735 #if 0
 736                 else {
 737                         if(line_begin) {
 738                                 parse_preprocessor_directive(this, token);
 739                                 return;
 740                         } else {
 741                                 token->type = '#';
 742                         }
 743 #else
 744                 ELSE('#')
 745 #endif
 746
 747         case '?':
 748                 next_char(this);
 749                 /* just a simple ? */
 750                 if(this->c != '?') {
 751                         token->type = '?';
 752                         break;
 753                 }
 754                 /* might be a trigraph */
 755                 next_char(this);
 756                 if(replace_trigraph(this)) {
 757                         lexer_next_token(this, token);
 758                         return;
 759                 }
 760                 put_back(this, this->c);
 761                 this->c = '?';
 762                 token->type = '?';
 763                 break;
 764
 765         case '[':
 766         case ']':
 767         case '(':
 768         case ')':
 769         case '{':
 770         case '}':
 771         case '~':
 772         case ';':
 773         case ',':
 774                 token->type = this->c;
 775                 next_char(this);
 776                 break;
 777
 778         case EOF:
 779                 token->type = T_EOF;
 780                 break;
 781
 782         default:
 783                 error_prefix(this);
 784                 fprintf(stderr, "unknown character '%c' found\n", this->c);
 785                 token->type = T_ERROR;
 786                 next_char(this);
 787                 break;
 788         }
 789 }
 790
 791 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
 792 {
 793         memset(this, 0, sizeof(this[0]));
 794
 795         this->input = stream;
 796
 797         this->source_position.linenr     = 0;
 798         this->source_position.input_name = input_name;
 799         strset_init(&this->stringset);
 800
 801         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
 802          * beginning of a line */
 803         this->c = '\n';
 804 }
 805
 806 void lexer_destroy(lexer_t *this)
 807 {
 808         (void) this;
 809 }
 810
 811 static __attribute__((unused))
 812 void dbg_pos(const source_position_t source_position)
 813 {
 814         fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);
 815         fflush(stdout);
 816 }