acd2ef6954136adec7d308d2e56b94185f0c15d3
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer_t.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7
8 #include <assert.h>
9 #include <errno.h>
10 #include <string.h>
11 #include <ctype.h>
12
13 //#define DEBUG_CHARS
14 #define MAX_PUTBACK 1
15
16 static
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
18 {
19         (void) this;
20         fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
21 }
22
23 static
24 void error_prefix(lexer_t *this)
25 {
26         error_prefix_at(this, this->source_position.input_name,
27                         this->source_position.linenr);
28 }
29
30 static
31 void parse_error(lexer_t *this, const char *msg)
32 {
33         error_prefix(this);
34         fprintf(stderr, "%s\n", msg);
35 }
36
37 static inline
38 void next_char(lexer_t *this)
39 {
40         this->bufpos++;
41         if(this->bufpos >= this->bufend) {
42                 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43                                  sizeof(this->buf) - MAX_PUTBACK, this->input);
44                 if(s == 0) {
45                         this->c = EOF;
46                         return;
47                 }
48                 this->bufpos = this->buf + MAX_PUTBACK;
49                 this->bufend = this->buf + MAX_PUTBACK + s;
50         }
51         this->c = *(this->bufpos);
52 #ifdef DEBUG_CHARS
53         printf("nchar '%c'\n", this->c);
54 #endif
55 }
56
57 static inline
58 void put_back(lexer_t *this, int c)
59 {
60         char *p = (char*) this->bufpos - 1;
61         this->bufpos--;
62         assert(p >= this->buf);
63         *p = c;
64
65 #ifdef DEBUG_CHARS
66         printf("putback '%c'\n", c);
67 #endif
68 }
69
70 static
71 int replace_trigraph(lexer_t *this)
72 {
73 #define MATCH_TRIGRAPH(ch,replacement)           \
74         case ch:                                     \
75                 this->c = replacement;                   \
76                 return 1;
77
78         switch(this->c) {
79         MATCH_TRIGRAPH('=', '#')
80         MATCH_TRIGRAPH('(', '[')
81         MATCH_TRIGRAPH('/', '\\')
82         MATCH_TRIGRAPH(')', ']')
83         MATCH_TRIGRAPH('\'', '^')
84         MATCH_TRIGRAPH('<', '{')
85         MATCH_TRIGRAPH('!', '|')
86         MATCH_TRIGRAPH('>', '}')
87         MATCH_TRIGRAPH('-', '~')
88         default:
89                 break;
90         }
91
92         return 0;
93 }
94
95 static
96 void parse_symbol(lexer_t *this, token_t *token)
97 {
98         symbol_t *symbol;
99         char     *string;
100
101         obstack_1grow(&symbol_obstack, this->c);
102         next_char(this);
103
104         while(1) {
105                 switch(this->c) {
106                 case '\\':
107                         next_char(this);
108                         if(this->c == '\n') {
109                                 next_char(this);
110                                 this->source_position.linenr++;
111                                 break;
112                         }
113
114                 case 'A' ... 'Z':
115                 case 'a' ... 'z':
116                 case '_':
117                         obstack_1grow(&symbol_obstack, this->c);
118                         next_char(this);
119                         break;
120
121                 case '?':
122                         next_char(this);
123                         if(this->c != '?') {
124                                 put_back(this, this->c);
125                                 this->c = '?';
126                                 goto end_symbol;
127                         }
128                         next_char(this);
129                         if(replace_trigraph(this))
130                                 break;
131                         put_back(this, '?');
132                         put_back(this, this->c);
133                         this->c = '?';
134                         goto end_symbol;
135
136                 default:
137                         goto end_symbol;
138                 }
139         }
140 end_symbol:
141         obstack_1grow(&symbol_obstack, '\0');
142
143         string = obstack_finish(&symbol_obstack);
144         symbol = symbol_table_insert(string);
145
146         if(symbol->ID > 0) {
147                 token->type = symbol->ID;
148         } else {
149                 token->type = T_IDENTIFIER;
150         }
151         token->v.symbol = symbol;
152
153         if(symbol->string != string) {
154                 obstack_free(&symbol_obstack, string);
155         }
156 }
157
158 #if 0
159 static
160 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
161 {
162         do {
163                 obstack_1grow(&symbol_obstack, this->c);
164                 next_char(this);
165         } while(is_ident_char(this->c));
166         obstack_1grow(&symbol_obstack, '\0');
167
168         char     *string = obstack_finish(&symbol_obstack);
169         symbol_t *symbol = preprocessor_symbol_table_find(string);
170         obstack_free(&symbol_obstack, string);
171
172         if(symbol == 0)
173                 return TP_ERROR;
174
175         return symbol->ID;
176 }
177 #endif
178
179 static
180 void parse_number_hex(lexer_t *this, token_t *token)
181 {
182         assert(this->c == 'x' || this->c == 'X');
183         next_char(this);
184
185         if (!isdigit(this->c) &&
186                 !('A' <= this->c && this->c <= 'F') &&
187                 !('a' <= this->c && this->c <= 'f')) {
188                 parse_error(this, "premature end of hex number literal");
189                 token->type = T_ERROR;
190                 return;
191         }
192
193         int value = 0;
194         for(;;) {
195                 if (isdigit(this->c)) {
196                         value = 16 * value + this->c - '0';
197                 } else if ('A' <= this->c && this->c <= 'F') {
198                         value = 16 * value + this->c - 'A' + 10;
199                 } else if ('a' <= this->c && this->c <= 'f') {
200                         value = 16 * value + this->c - 'a' + 10;
201                 } else {
202                         token->type     = T_INTEGER;
203                         token->v.intvalue = value;
204                         return;
205                 }
206                 next_char(this);
207         }
208 }
209
210 static
211 void parse_number_oct(lexer_t *this, token_t *token)
212 {
213         assert(this->c == 'o' || this->c == 'O');
214         next_char(this);
215
216         int value = 0;
217         for(;;) {
218                 if ('0' <= this->c && this->c <= '7') {
219                         value = 8 * value + this->c - '0';
220                 } else {
221                         token->type     = T_INTEGER;
222                         token->v.intvalue = value;
223                         return;
224                 }
225                 next_char(this);
226         }
227 }
228
229 static
230 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
231 {
232         int value = 0;
233         if(first_char > 0) {
234                 assert(first_char >= '0' && first_char <= '9');
235                 value = first_char - '0';
236         }
237
238         for(;;) {
239                 if (isdigit(this->c)) {
240                         value = 10 * value + this->c - '0';
241                 } else {
242                         token->type     = T_INTEGER;
243                         token->v.intvalue = value;
244                         return;
245                 }
246                 next_char(this);
247         }
248 }
249
250 static
251 void parse_number(lexer_t *this, token_t *token)
252 {
253         // TODO check for overflow
254         // TODO check for various invalid inputs sequences
255
256         if (this->c == '0') {
257                 next_char(this);
258                 switch (this->c) {
259                         case 'X':
260                         case 'x': parse_number_hex(this, token); break;
261                         case 'o':
262                         case 'O': parse_number_oct(this, token); break;
263                         default:  parse_number_dec(this, token, '0');
264                 }
265         } else {
266                 parse_number_dec(this, token, 0);
267         }
268 }
269
270 static
271 int parse_escape_sequence(lexer_t *this)
272 {
273         int c = this->c;
274         next_char(this);
275
276         switch(c) {
277         case '"': return '"';
278         case '\'': return'\'';
279         case '\\':
280                 if(this->c == '\n') {
281                         this->source_position.linenr++;
282                         next_char(this);
283                         return parse_escape_sequence(this);
284                 }
285                 return '\\';
286         case 'a': return '\a';
287         case 'b': return '\b';
288         case 'f': return '\f';
289         case 'n': return '\n';
290         case 'r': return '\r';
291         case 't': return '\t';
292         case 'v': return '\v';
293         case 'x': /* TODO parse hex number ... */
294                 parse_error(this, "hex escape sequences not implemented yet");
295                 return EOF;
296         case 0 ... 8: /* TODO parse octal number ... */
297                 parse_error(this, "octal escape sequences not implemented yet");
298                 return EOF;
299         case '?':
300                 if(this->c != '?') {
301                         return '?';
302                 }
303                 /* might be a trigraph */
304                 next_char(this);
305                 if(replace_trigraph(this)) {
306                         return parse_escape_sequence(this);
307                 }
308                 put_back(this, '?');
309                 return '?';
310
311         case EOF:
312                 parse_error(this, "reached end of file while parsing escape sequence");
313                 return EOF;
314         default:
315                 parse_error(this, "unknown escape sequence\n");
316                 return EOF;
317         }
318 }
319
320 static
321 void parse_string_literal(lexer_t *this, token_t *token)
322 {
323         unsigned    start_linenr = this->source_position.linenr;
324         char       *string;
325         const char *result;
326
327         assert(this->c == '"');
328         next_char(this);
329
330         while(1) {
331                 switch(this->c) {
332                 case '?':
333                         next_char(this);
334                         if(this->c != '?') {
335                                 obstack_1grow(&symbol_obstack, '?');
336                                 break;
337                         }
338                         next_char(this);
339                         if(replace_trigraph(this))
340                                 break;
341                         obstack_1grow(&symbol_obstack, '?');
342                         put_back(this, this->c);
343                         this->c = '?';
344                         break;
345
346                 case '\\':
347                         next_char(this);
348                         if(this->c == '\n') {
349                                 next_char(this);
350                                 this->source_position.linenr++;
351                                 break;
352                         }
353                         int c = parse_escape_sequence(this);
354                         obstack_1grow(&symbol_obstack, c);
355                         break;
356
357                 case EOF:
358                         error_prefix_at(this, this->source_position.input_name,
359                                         start_linenr);
360                         fprintf(stderr, "string has no end\n");
361                         token->type = T_ERROR;
362                         return;
363
364                 case '"':
365                         next_char(this);
366                         goto end_of_string;
367
368                 default:
369                         obstack_1grow(&symbol_obstack, this->c);
370                         next_char(this);
371                         break;
372                 }
373         }
374
375 end_of_string:
376
377         /* TODO: concatenate multiple strings separated by whitespace... */
378
379         /* add finishing 0 to the string */
380         obstack_1grow(&symbol_obstack, '\0');
381         string = obstack_finish(&symbol_obstack);
382
383         /* check if there is already a copy of the string */
384         result = strset_insert(&this->stringset, string);
385         if(result != string) {
386                 obstack_free(&symbol_obstack, string);
387         }
388
389         token->type     = T_STRING_LITERAL;
390         token->v.string = result;
391 }
392
393 static
394 void skip_multiline_comment(lexer_t *this)
395 {
396         unsigned start_linenr = this->source_position.linenr;
397
398         while(1) {
399                 switch(this->c) {
400                 case '*':
401                         next_char(this);
402                         if(this->c == '/') {
403                                 next_char(this);
404                                 return;
405                         }
406                         break;
407                 case '?':
408                         next_char(this);
409                         if(this->c != '?')
410                                 break;
411                         next_char(this);
412                         if(replace_trigraph(this))
413                                 break;
414                         put_back(this, '?');
415                         /* we don't put back the 2nd ? as the comment text is discarded
416                          * anyway */
417                         break;
418
419                 case '\n':
420                         this->source_position.linenr++;
421                         next_char(this);
422                         break;
423                 case EOF:
424                         error_prefix_at(this, this->source_position.input_name,
425                                         start_linenr);
426                         fprintf(stderr, "at end of file while looking for comment end\n");
427                         return;
428                 default:
429                         next_char(this);
430                         break;
431                 }
432         }
433 }
434
435 static
436 void skip_line_comment(lexer_t *this)
437 {
438         while(1) {
439                 switch(this->c) {
440                 case '?':
441                         next_char(this);
442                         if(this->c != '?')
443                                 break;
444                         next_char(this);
445                         if(replace_trigraph(this))
446                                 break;
447                         put_back(this, '?');
448                         /* we don't put back the 2nd ? as the comment text is discarded
449                          * anyway */
450                         break;
451
452                 case '\\':
453                         next_char(this);
454                         if(this->c == '\n') {
455                                 next_char(this);
456                                 this->source_position.linenr++;
457                         }
458                         break;
459
460                 case EOF:
461                 case '\n':
462                         return;
463                 default:
464                         next_char(this);
465                         break;
466                 }
467         }
468 }
469
470 #if 0
471 static
472 void eat_until_newline(lexer_t *this)
473 {
474         while(this->c != '\n') {
475                 next_char(this);
476         }
477 }
478 #endif
479
480 #if 0
481 static
482 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
483 {
484         (void) result_token;
485         /* skip whitespaces */
486         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
487                 next_char(this);
488         }
489 }
490 #endif
491
492 void preprocessor_next_token(lexer_t *this, token_t *token)
493 {
494         /* skip whitespaces */
495         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
496                 next_char(this);
497         }
498
499         switch(this->c) {
500         case 'A' ... 'Z':
501         case 'a' ... 'z':
502         case '_':
503                 parse_symbol(this, token);
504         }
505 }
506
507 void lexer_next_token(lexer_t *this, token_t *token)
508 {
509         int line_begin = 0;
510
511         /* skip whitespaces */
512         while(this->c == ' ' || this->c == '\t' || this->c == '\n'
513               || this->c == '\r') {
514                 if(this->c == '\n') {
515                         line_begin = 1;
516                         this->source_position.linenr++;
517                 }
518                 next_char(this);
519         }
520
521         switch(this->c) {
522         case 'A' ... 'Z':
523         case 'a' ... 'z':
524         case '_':
525                 parse_symbol(this, token);
526                 break;
527
528         case '0' ... '9':
529                 parse_number(this, token);
530                 break;
531
532         case '"':
533                 parse_string_literal(this, token);
534                 break;
535
536         case '\'':
537                 next_char(this);
538                 if(this->c == '\\') {
539                         next_char(this);
540                         token->type       = T_INTEGER;
541                         token->v.intvalue = parse_escape_sequence(this);
542                 } else {
543                         if(this->c == '\n') {
544                                 parse_error(this, "newline while parsing character constant");
545                                 this->source_position.linenr++;
546                         }
547                         token->type       = T_INTEGER;
548                         token->v.intvalue = this->c;
549                         next_char(this);
550                 }
551                 if(this->c != '\'') {
552                         parse_error(this, "multibyte character constant");
553                         token->type = T_ERROR;
554                 } else {
555                         next_char(this);
556                 }
557                 break;
558
559         case '\\':
560                 next_char(this);
561                 if(this->c == '\n') {
562                         next_char(this);
563                         this->source_position.linenr++;
564                         lexer_next_token(this, token);
565                         return;
566                 } else {
567                         parse_error(this, "unexpected '\\' found");
568                         token->type = T_ERROR;
569                 }
570                 break;
571
572 #define MAYBE1(ch, set_type)                           \
573                 next_char(this);                               \
574                 while(1) {                                     \
575                         switch(this->c) {                          \
576                         case ch:                                   \
577                                 next_char(this);                       \
578                                 token->type = set_type;                \
579                                 return;                                \
580
581 #define MAYBE(ch, set_type)                            \
582                         case ch:                                   \
583                                 next_char(this);                       \
584                                 token->type = set_type;                \
585                                 return;
586
587 #define ELSE(set_type)                                 \
588                         case '?':                                  \
589                                 next_char(this);                       \
590                                 if(this->c != '?') {                   \
591                                         put_back(this, this->c);           \
592                                         this->c = '?';                     \
593                                         token->type = set_type;            \
594                                         return;                            \
595                                 }                                      \
596                                 next_char(this);                       \
597                                 if(replace_trigraph(this))             \
598                                         break;                             \
599                                 put_back(this, '?');                   \
600                                 put_back(this, this->c);               \
601                                 this->c = '?';                         \
602                                 token->type = set_type;                \
603                                 return;                                \
604                                                        \
605                         case '\\':                                 \
606                                 next_char(this);                       \
607                                 if(this->c == '\n') {                  \
608                                         next_char(this);                   \
609                                         this->source_position.linenr++;    \
610                                         break;                             \
611                                 }                                      \
612                                 /* fallthrough */                      \
613                         default:                                   \
614                                 token->type = set_type;                \
615                                 return;                                \
616                         }                                          \
617                 } /* end of while(1) */                        \
618                 break;
619
620         case '.':
621                 next_char(this);
622                 if(this->c == '.') {
623                         next_char(this);
624                         if(this->c == '.') {
625                                 next_char(this);
626                                 token->type = T_DOTDOTDOT;
627                         } else {
628                                 put_back(this, '.');
629                                 token->type = '.';
630                         }
631                 } else {
632                         token->type = '.';
633                 }
634                 break;
635         case '&':
636                 MAYBE1('&', T_ANDAND)
637                 MAYBE('=', T_ANDEQUAL)
638                 ELSE('&')
639         case '*':
640                 MAYBE1('=', T_ASTERISKEQUAL)
641                 ELSE('*')
642         case '+':
643                 MAYBE1('+', T_PLUSPLUS)
644                 MAYBE('=', T_PLUSEQUAL)
645                 ELSE('+')
646         case '-':
647                 MAYBE1('-', T_MINUSMINUS)
648                 MAYBE('=', T_MINUSEQUAL)
649                 ELSE('-')
650         case '!':
651                 MAYBE1('=', T_EXCLAMATIONMARKEQUAL)
652                 ELSE('!')
653         case '/':
654                 MAYBE1('=', T_SLASHEQUAL)
655                         case '*':
656                                 next_char(this);
657                                 skip_multiline_comment(this);
658                                 lexer_next_token(this, token);
659                                 return;
660                         case '/':
661                                 next_char(this);
662                                 skip_line_comment(this);
663                                 lexer_next_token(this, token);
664                                 return;
665                 ELSE('/')
666         case '%':
667                 MAYBE1('=', T_PERCENTEQUAL)
668                         case ':':
669                                 /* TODO find trigraphs... */
670                                 next_char(this);
671                                 if(this->c == '%') {
672                                         next_char(this);
673                                         if(this->c == ':') {
674                                                 next_char(this);
675                                                 token->type = T_PERCENTCOLONPERCENTCOLON;
676                                         } else {
677                                                 put_back(this, '%');
678                                                 token->type = T_PERCENTCOLON;
679                                         }
680                                         return;
681                                 }
682                                 token->type = T_PERCENTCOLON;
683                                 return;
684                 MAYBE('>', T_PERCENTGREATER)
685                 ELSE('%')
686         case '<':
687                 MAYBE1(':', T_LESSCOLON)
688                 MAYBE('%', T_LESSPERCENT)
689                         case '<':
690                                 /* TODO trigraphs... */
691                                 next_char(this);
692                                 if(this->c == '<') {
693                                         next_char(this);
694                                         if(this->c == '=') {
695                                                 next_char(this);
696                                                 token->type = T_LESSLESSEQUAL;
697                                         } else {
698                                                 token->type = T_LESSLESS;
699                                         }
700                                 } else {
701                                         token->type = T_LESS;
702                                 }
703                                 return;
704                 ELSE('<')
705         case '>':
706                 next_char(this);
707                 while(1) {
708                         switch(this->c) {
709                         case '>':
710                                 next_char(this);
711                                 /* TODO trigraphs */
712                                 if(this->c == '=') {
713                                         next_char(this);
714                                         token->type = T_GREATERGREATEREQUAL;
715                                 } else {
716                                         token->type = T_GREATERGREATER;
717                                 }
718                                 break;
719                 ELSE('>')
720         case '^':
721                 MAYBE1('=', T_CARETEQUAL)
722                 ELSE('^')
723         case '|':
724                 MAYBE1('=', T_PIPEEQUAL)
725                 MAYBE('|', T_PIPEPIPE)
726                 ELSE('|')
727         case ':':
728                 MAYBE1('>', T_COLONGREATER)
729                 ELSE(':')
730         case '=':
731                 MAYBE1('=', T_EQUALEQUAL)
732                 ELSE('=')
733         case '#':
734                 MAYBE1('#', T_HASHHASH)
735 #if 0
736                 else {
737                         if(line_begin) {
738                                 parse_preprocessor_directive(this, token);
739                                 return;
740                         } else {
741                                 token->type = '#';
742                         }
743 #else
744                 ELSE('#')
745 #endif
746
747         case '?':
748                 next_char(this);
749                 /* just a simple ? */
750                 if(this->c != '?') {
751                         token->type = '?';
752                         break;
753                 }
754                 /* might be a trigraph */
755                 next_char(this);
756                 if(replace_trigraph(this)) {
757                         lexer_next_token(this, token);
758                         return;
759                 }
760                 put_back(this, this->c);
761                 this->c = '?';
762                 token->type = '?';
763                 break;
764
765         case '[':
766         case ']':
767         case '(':
768         case ')':
769         case '{':
770         case '}':
771         case '~':
772         case ';':
773         case ',':
774                 token->type = this->c;
775                 next_char(this);
776                 break;
777
778         case EOF:
779                 token->type = T_EOF;
780                 break;
781
782         default:
783                 error_prefix(this);
784                 fprintf(stderr, "unknown character '%c' found\n", this->c);
785                 token->type = T_ERROR;
786                 next_char(this);
787                 break;
788         }
789 }
790
791 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
792 {
793         memset(this, 0, sizeof(this[0]));
794
795         this->input = stream;
796
797         this->source_position.linenr     = 0;
798         this->source_position.input_name = input_name;
799         strset_init(&this->stringset);
800
801         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
802          * beginning of a line */
803         this->c = '\n';
804 }
805
806 void lexer_destroy(lexer_t *this)
807 {
808         (void) this;
809 }
810
811 static __attribute__((unused))
812 void dbg_pos(const source_position_t source_position)
813 {
814         fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);
815         fflush(stdout);
816 }