7e3ee4b490ac4ccc17b5771d55b9d8582fca7e4f
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer_t.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7
8 #include <assert.h>
9 #include <errno.h>
10 #include <string.h>
11 #include <ctype.h>
12
13 //#define DEBUG_CHARS
14 #define MAX_PUTBACK 2
15
16 static
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
18 {
19         (void) this;
20         fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
21 }
22
23 static
24 void error_prefix(lexer_t *this)
25 {
26         error_prefix_at(this, this->source_position.input_name,
27                         this->source_position.linenr);
28 }
29
30 static
31 void parse_error(lexer_t *this, const char *msg)
32 {
33         error_prefix(this);
34         fprintf(stderr, "%s\n", msg);
35 }
36
37 static inline
38 void next_char(lexer_t *this)
39 {
40         this->bufpos++;
41         if(this->bufpos >= this->bufend) {
42                 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43                                  sizeof(this->buf) - MAX_PUTBACK, this->input);
44                 if(s == 0) {
45                         this->c = EOF;
46                         return;
47                 }
48                 this->bufpos = this->buf + MAX_PUTBACK;
49                 this->bufend = this->buf + MAX_PUTBACK + s;
50         }
51         this->c = *(this->bufpos);
52 #ifdef DEBUG_CHARS
53         printf("nchar '%c'\n", this->c);
54 #endif
55 }
56
57 static inline
58 void put_back(lexer_t *this, int c)
59 {
60         char *p = (char*) this->bufpos - 1;
61         this->bufpos--;
62         assert(p >= this->buf);
63         *p = c;
64
65 #ifdef DEBUG_CHARS
66         printf("putback '%c'\n", c);
67 #endif
68 }
69
70 static
71 int replace_trigraph(lexer_t *this)
72 {
73 #define MATCH_TRIGRAPH(ch,replacement)           \
74         case ch:                                     \
75                 this->c = replacement;                   \
76                 return 1;
77
78         switch(this->c) {
79         MATCH_TRIGRAPH('=', '#')
80         MATCH_TRIGRAPH('(', '[')
81         MATCH_TRIGRAPH('/', '\\')
82         MATCH_TRIGRAPH(')', ']')
83         MATCH_TRIGRAPH('\'', '^')
84         MATCH_TRIGRAPH('<', '{')
85         MATCH_TRIGRAPH('!', '|')
86         MATCH_TRIGRAPH('>', '}')
87         MATCH_TRIGRAPH('-', '~')
88         default:
89                 break;
90         }
91
92         return 0;
93 }
94
95 static
96 void parse_symbol(lexer_t *this, token_t *token)
97 {
98         symbol_t *symbol;
99         char     *string;
100
101         obstack_1grow(&symbol_obstack, this->c);
102         next_char(this);
103
104         while(1) {
105                 switch(this->c) {
106                 case '\\':
107                         next_char(this);
108                         if(this->c == '\n') {
109                                 next_char(this);
110                                 this->source_position.linenr++;
111                                 break;
112                         }
113
114                 case 'A' ... 'Z':
115                 case 'a' ... 'z':
116                 case '_':
117                         obstack_1grow(&symbol_obstack, this->c);
118                         next_char(this);
119                         break;
120
121                 case '?':
122                         next_char(this);
123                         if(this->c != '?') {
124                                 put_back(this, this->c);
125                                 this->c = '?';
126                                 goto end_symbol;
127                         }
128                         next_char(this);
129                         if(replace_trigraph(this))
130                                 break;
131                         put_back(this, '?');
132                         put_back(this, this->c);
133                         this->c = '?';
134                         goto end_symbol;
135
136                 default:
137                         goto end_symbol;
138                 }
139         }
140 end_symbol:
141         obstack_1grow(&symbol_obstack, '\0');
142
143         string = obstack_finish(&symbol_obstack);
144         symbol = symbol_table_insert(string);
145
146         if(symbol->ID > 0) {
147                 token->type = symbol->ID;
148         } else {
149                 token->type = T_IDENTIFIER;
150         }
151         token->v.symbol = symbol;
152
153         if(symbol->string != string) {
154                 obstack_free(&symbol_obstack, string);
155         }
156 }
157
158 #if 0
159 static
160 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
161 {
162         do {
163                 obstack_1grow(&symbol_obstack, this->c);
164                 next_char(this);
165         } while(is_ident_char(this->c));
166         obstack_1grow(&symbol_obstack, '\0');
167
168         char     *string = obstack_finish(&symbol_obstack);
169         symbol_t *symbol = preprocessor_symbol_table_find(string);
170         obstack_free(&symbol_obstack, string);
171
172         if(symbol == 0)
173                 return TP_ERROR;
174
175         return symbol->ID;
176 }
177 #endif
178
179 static
180 void parse_number_hex(lexer_t *this, token_t *token)
181 {
182         assert(this->c == 'x' || this->c == 'X');
183         next_char(this);
184
185         if (!isdigit(this->c) &&
186                 !('A' <= this->c && this->c <= 'F') &&
187                 !('a' <= this->c && this->c <= 'f')) {
188                 parse_error(this, "premature end of hex number literal");
189                 token->type = T_ERROR;
190                 return;
191         }
192
193         int value = 0;
194         for(;;) {
195                 if (isdigit(this->c)) {
196                         value = 16 * value + this->c - '0';
197                 } else if ('A' <= this->c && this->c <= 'F') {
198                         value = 16 * value + this->c - 'A' + 10;
199                 } else if ('a' <= this->c && this->c <= 'f') {
200                         value = 16 * value + this->c - 'a' + 10;
201                 } else {
202                         token->type     = T_INTEGER;
203                         token->v.intvalue = value;
204                         return;
205                 }
206                 next_char(this);
207         }
208 }
209
210 static
211 void parse_number_oct(lexer_t *this, token_t *token)
212 {
213         assert(this->c == 'o' || this->c == 'O');
214         next_char(this);
215
216         int value = 0;
217         for(;;) {
218                 if ('0' <= this->c && this->c <= '7') {
219                         value = 8 * value + this->c - '0';
220                 } else {
221                         token->type     = T_INTEGER;
222                         token->v.intvalue = value;
223                         return;
224                 }
225                 next_char(this);
226         }
227 }
228
229 static
230 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
231 {
232         int value = 0;
233         if(first_char > 0) {
234                 assert(first_char >= '0' && first_char <= '9');
235                 value = first_char - '0';
236         }
237
238         for(;;) {
239                 if (isdigit(this->c)) {
240                         value = 10 * value + this->c - '0';
241                 } else {
242                         token->type     = T_INTEGER;
243                         token->v.intvalue = value;
244                         return;
245                 }
246                 next_char(this);
247         }
248 }
249
250 static
251 void parse_number(lexer_t *this, token_t *token)
252 {
253         // TODO check for overflow
254         // TODO check for various invalid inputs sequences
255
256         if (this->c == '0') {
257                 next_char(this);
258                 switch (this->c) {
259                         case 'X':
260                         case 'x': parse_number_hex(this, token); break;
261                         case 'o':
262                         case 'O': parse_number_oct(this, token); break;
263                         default:  parse_number_dec(this, token, '0');
264                 }
265         } else {
266                 parse_number_dec(this, token, 0);
267         }
268 }
269
270 static
271 int parse_escape_sequence(lexer_t *this)
272 {
273         int c = this->c;
274         next_char(this);
275
276         switch(c) {
277         case '"': return '"';
278         case '\'': return'\'';
279         case '\\':
280                 if(this->c == '\n') {
281                         this->source_position.linenr++;
282                         next_char(this);
283                         return parse_escape_sequence(this);
284                 }
285                 return '\\';
286         case 'a': return '\a';
287         case 'b': return '\b';
288         case 'f': return '\f';
289         case 'n': return '\n';
290         case 'r': return '\r';
291         case 't': return '\t';
292         case 'v': return '\v';
293         case 'x': /* TODO parse hex number ... */
294                 parse_error(this, "hex escape sequences not implemented yet");
295                 return EOF;
296         case 0 ... 8: /* TODO parse octal number ... */
297                 parse_error(this, "octal escape sequences not implemented yet");
298                 return EOF;
299         case '?':
300                 if(this->c != '?') {
301                         return '?';
302                 }
303                 /* might be a trigraph */
304                 next_char(this);
305                 if(replace_trigraph(this)) {
306                         return parse_escape_sequence(this);
307                 }
308                 put_back(this, this->c);
309                 this->c = '?';
310                 return '?';
311
312         case EOF:
313                 parse_error(this, "reached end of file while parsing escape sequence");
314                 return EOF;
315         default:
316                 parse_error(this, "unknown escape sequence\n");
317                 return EOF;
318         }
319 }
320
321 #define SKIP_TRIGRAPHS(no_trigraph_code)       \
322         case '?':                                  \
323                 next_char(this);                       \
324                 if(this->c != '?') {                   \
325                         put_back(this, this->c);           \
326                         this->c = '?';                     \
327                         no_trigraph_code;                  \
328                 }                                      \
329                 next_char(this);                       \
330                 if(replace_trigraph(this))             \
331                         break;                             \
332                 put_back(this, '?');                   \
333                 put_back(this, this->c);               \
334                 this->c = '?';                         \
335                 no_trigraph_code;                      \
336
337 static
338 void parse_string_literal(lexer_t *this, token_t *token)
339 {
340         unsigned    start_linenr = this->source_position.linenr;
341         char       *string;
342         const char *result;
343
344         assert(this->c == '"');
345         next_char(this);
346
347         while(1) {
348                 switch(this->c) {
349                 SKIP_TRIGRAPHS(
350                         obstack_1grow(&symbol_obstack, '?');
351                         break;
352                 )
353
354                 case '\\':
355                         next_char(this);
356                         if(this->c == '\n') {
357                                 next_char(this);
358                                 this->source_position.linenr++;
359                                 break;
360                         }
361                         int c = parse_escape_sequence(this);
362                         obstack_1grow(&symbol_obstack, c);
363                         break;
364
365                 case EOF:
366                         error_prefix_at(this, this->source_position.input_name,
367                                         start_linenr);
368                         fprintf(stderr, "string has no end\n");
369                         token->type = T_ERROR;
370                         return;
371
372                 case '"':
373                         next_char(this);
374                         goto end_of_string;
375
376                 default:
377                         obstack_1grow(&symbol_obstack, this->c);
378                         next_char(this);
379                         break;
380                 }
381         }
382
383 end_of_string:
384
385         /* TODO: concatenate multiple strings separated by whitespace... */
386
387         /* add finishing 0 to the string */
388         obstack_1grow(&symbol_obstack, '\0');
389         string = obstack_finish(&symbol_obstack);
390
391         /* check if there is already a copy of the string */
392         result = strset_insert(&this->stringset, string);
393         if(result != string) {
394                 obstack_free(&symbol_obstack, string);
395         }
396
397         token->type     = T_STRING_LITERAL;
398         token->v.string = result;
399 }
400
401 static
402 void parse_character_constant(lexer_t *this, token_t *token)
403 {
404         assert(this->c == '\'');
405         next_char(this);
406
407         while(1) {
408                 switch(this->c) {
409                 SKIP_TRIGRAPHS(
410                         token->type       = T_INTEGER;
411                         token->v.intvalue = '?';
412                         goto end_of_char_constant;
413                 )
414
415                 case '\\':
416                         next_char(this);
417                         if(this->c == '\n') {
418                                 this->source_position.linenr++;
419                                 break;
420                         }
421                         token->type       = T_INTEGER;
422                         token->v.intvalue = parse_escape_sequence(this);
423                         goto end_of_char_constant;
424
425                 case '\n':
426                         next_char(this);
427                         parse_error(this, "newline while parsing character constant");
428                         this->source_position.linenr++;
429                         goto end_of_char_constant;
430
431                 case EOF:
432                         parse_error(this, "EOF while parsing character constant");
433                         token->type = T_ERROR;
434                         return;
435
436                 default:
437                         token->type       = T_INTEGER;
438                         token->v.intvalue = this->c;
439                         next_char(this);
440                         goto end_of_char_constant;
441                 }
442         }
443
444 end_of_char_constant:
445         if(this->c != '\'') {
446                 parse_error(this, "multibyte character constant");
447         } else {
448                 next_char(this);
449         }
450 }
451
452 static
453 void skip_multiline_comment(lexer_t *this)
454 {
455         unsigned start_linenr = this->source_position.linenr;
456
457         while(1) {
458                 switch(this->c) {
459                 case '*':
460                         next_char(this);
461                         if(this->c == '/') {
462                                 next_char(this);
463                                 return;
464                         }
465                         break;
466                 case '?':
467                         next_char(this);
468                         if(this->c != '?')
469                                 break;
470                         next_char(this);
471                         if(replace_trigraph(this))
472                                 break;
473                         put_back(this, '?');
474                         /* we don't put back the 2nd ? as the comment text is discarded
475                          * anyway */
476                         break;
477
478                 case '\n':
479                         this->source_position.linenr++;
480                         next_char(this);
481                         break;
482                 case EOF:
483                         error_prefix_at(this, this->source_position.input_name,
484                                         start_linenr);
485                         fprintf(stderr, "at end of file while looking for comment end\n");
486                         return;
487                 default:
488                         next_char(this);
489                         break;
490                 }
491         }
492 }
493
494 static
495 void skip_line_comment(lexer_t *this)
496 {
497         while(1) {
498                 switch(this->c) {
499                 case '?':
500                         next_char(this);
501                         if(this->c != '?')
502                                 break;
503                         next_char(this);
504                         if(replace_trigraph(this))
505                                 break;
506                         put_back(this, '?');
507                         /* we don't put back the 2nd ? as the comment text is discarded
508                          * anyway */
509                         break;
510
511                 case '\\':
512                         next_char(this);
513                         if(this->c == '\n') {
514                                 next_char(this);
515                                 this->source_position.linenr++;
516                         }
517                         break;
518
519                 case EOF:
520                 case '\n':
521                         return;
522                 default:
523                         next_char(this);
524                         break;
525                 }
526         }
527 }
528
529 #if 0
530 static
531 void eat_until_newline(lexer_t *this)
532 {
533         while(this->c != '\n') {
534                 next_char(this);
535         }
536 }
537 #endif
538
539 #if 0
540 static
541 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
542 {
543         (void) result_token;
544         /* skip whitespaces */
545         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
546                 next_char(this);
547         }
548 }
549 #endif
550
551 void preprocessor_next_token(lexer_t *this, token_t *token)
552 {
553         /* skip whitespaces */
554         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
555                 next_char(this);
556         }
557
558         switch(this->c) {
559         case 'A' ... 'Z':
560         case 'a' ... 'z':
561         case '_':
562                 parse_symbol(this, token);
563         }
564 }
565
566 void lexer_next_token(lexer_t *this, token_t *token)
567 {
568         while(1) {
569                 switch(this->c) {
570                 case ' ':
571                 case '\t':
572                 case '\r':
573                         next_char(this);
574                         break;
575
576                 case '\n':
577                         this->source_position.linenr++;
578                         next_char(this);
579                         break;
580
581                 case 'A' ... 'Z':
582                 case 'a' ... 'z':
583                 case '_':
584                         parse_symbol(this, token);
585                         return;
586
587                 case '0' ... '9':
588                         parse_number(this, token);
589                         return;
590
591                 case '"':
592                         parse_string_literal(this, token);
593                         return;
594
595                 case '\'':
596                         parse_character_constant(this, token);
597                         return;
598
599                 case '\\':
600                         next_char(this);
601                         if(this->c == '\n') {
602                                 next_char(this);
603                                 this->source_position.linenr++;
604                                 break;
605                         } else {
606                                 parse_error(this, "unexpected '\\' found");
607                                 token->type = T_ERROR;
608                         }
609                         return;
610
611 #define MAYBE_PROLOG                                       \
612                         next_char(this);                               \
613                         while(1) {                                     \
614                                 switch(this->c) {
615
616 #define MAYBE(ch, set_type)                                \
617                                 case ch:                                   \
618                                         next_char(this);                       \
619                                         token->type = set_type;                \
620                                         return;
621
622 #define ELSE_CODE(code)                                    \
623                                 SKIP_TRIGRAPHS(                            \
624                                         code;                                  \
625                                 )                                          \
626                                                                                                                    \
627                                 case '\\':                                 \
628                                         next_char(this);                       \
629                                         if(this->c == '\n') {                  \
630                                                 next_char(this);                   \
631                                                 this->source_position.linenr++;    \
632                                                 break;                             \
633                                         }                                      \
634                                         /* fallthrough */                      \
635                                 default:                                   \
636                                         code;                                  \
637                                 }                                          \
638                         } /* end of while(1) */                        \
639                         break;
640
641 #define ELSE(set_type)                                     \
642                 ELSE_CODE(                                         \
643                         token->type = set_type;                        \
644                         return;                                        \
645                 )
646
647                 case '.':
648                         MAYBE_PROLOG
649                                 case '.':
650                                         MAYBE_PROLOG
651                                         MAYBE('.', T_DOTDOTDOT)
652                                         ELSE_CODE(
653                                                 put_back(this, this->c);
654                                                 this->c = '.';
655                                                 token->type = '.';
656                                                 return;
657                                         )
658                         ELSE('.')
659                 case '&':
660                         MAYBE_PROLOG
661                         MAYBE('&', T_ANDAND)
662                         MAYBE('=', T_ANDEQUAL)
663                         ELSE('&')
664                 case '*':
665                         MAYBE_PROLOG
666                         MAYBE('=', T_ASTERISKEQUAL)
667                         ELSE('*')
668                 case '+':
669                         MAYBE_PROLOG
670                         MAYBE('+', T_PLUSPLUS)
671                         MAYBE('=', T_PLUSEQUAL)
672                         ELSE('+')
673                 case '-':
674                         MAYBE_PROLOG
675                         MAYBE('-', T_MINUSMINUS)
676                         MAYBE('=', T_MINUSEQUAL)
677                         ELSE('-')
678                 case '!':
679                         MAYBE_PROLOG
680                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
681                         ELSE('!')
682                 case '/':
683                         MAYBE_PROLOG
684                         MAYBE('=', T_SLASHEQUAL)
685                                 case '*':
686                                         next_char(this);
687                                         skip_multiline_comment(this);
688                                         lexer_next_token(this, token);
689                                         return;
690                                 case '/':
691                                         next_char(this);
692                                         skip_line_comment(this);
693                                         lexer_next_token(this, token);
694                                         return;
695                         ELSE('/')
696                 case '%':
697                         MAYBE_PROLOG
698                         MAYBE('>', T_PERCENTGREATER)
699                         MAYBE('=', T_PERCENTEQUAL)
700                                 case ':':
701                                         MAYBE_PROLOG
702                                                 case '%':
703                                                         MAYBE_PROLOG
704                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
705                                                         ELSE_CODE(
706                                                                 put_back(this, this->c);
707                                                                 this->c = '%';
708                                                                 token->type = T_PERCENTCOLON;
709                                                                 return;
710                                                         )
711                                         ELSE(T_PERCENTCOLON)
712                         ELSE('%')
713                 case '<':
714                         MAYBE_PROLOG
715                         MAYBE(':', T_LESSCOLON)
716                         MAYBE('%', T_LESSPERCENT)
717                                 case '<':
718                                         MAYBE_PROLOG
719                                         MAYBE('=', T_LESSLESSEQUAL)
720                                         ELSE(T_LESSLESS)
721                         ELSE('<')
722                 case '>':
723                         MAYBE_PROLOG
724                                 case '>':
725                                         MAYBE_PROLOG
726                                         MAYBE('=', T_GREATERGREATEREQUAL)
727                                         ELSE(T_GREATERGREATER)
728                         ELSE('>')
729                 case '^':
730                         MAYBE_PROLOG
731                         MAYBE('=', T_CARETEQUAL)
732                         ELSE('^')
733                 case '|':
734                         MAYBE_PROLOG
735                         MAYBE('=', T_PIPEEQUAL)
736                         MAYBE('|', T_PIPEPIPE)
737                         ELSE('|')
738                 case ':':
739                         MAYBE_PROLOG
740                         MAYBE('>', T_COLONGREATER)
741                         ELSE(':')
742                 case '=':
743                         MAYBE_PROLOG
744                         MAYBE('=', T_EQUALEQUAL)
745                         ELSE('=')
746                 case '#':
747                         MAYBE_PROLOG
748                         MAYBE('#', T_HASHHASH)
749 #if 0
750                         else {
751                                 if(line_begin) {
752                                         parse_preprocessor_directive(this, token);
753                                         return;
754                                 } else {
755                                         token->type = '#';
756                                 }
757 #else
758                         ELSE('#')
759 #endif
760
761                 case '?':
762                         next_char(this);
763                         /* just a simple ? */
764                         if(this->c != '?') {
765                                 token->type = '?';
766                                 return;
767                         }
768                         /* might be a trigraph */
769                         next_char(this);
770                         if(replace_trigraph(this)) {
771                                 break;
772                         }
773                         put_back(this, this->c);
774                         this->c = '?';
775                         token->type = '?';
776                         return;
777
778                 case '[':
779                 case ']':
780                 case '(':
781                 case ')':
782                 case '{':
783                 case '}':
784                 case '~':
785                 case ';':
786                 case ',':
787                         token->type = this->c;
788                         next_char(this);
789                         return;
790
791                 case EOF:
792                         token->type = T_EOF;
793                         return;
794
795                 default:
796                         next_char(this);
797                         error_prefix(this);
798                         fprintf(stderr, "unknown character '%c' found\n", this->c);
799                         token->type = T_ERROR;
800                         return;
801                 }
802         }
803 }
804
805 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
806 {
807         memset(this, 0, sizeof(this[0]));
808
809         this->input = stream;
810
811         this->source_position.linenr     = 0;
812         this->source_position.input_name = input_name;
813         strset_init(&this->stringset);
814
815         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
816          * beginning of a line */
817         this->c = '\n';
818 }
819
820 void lexer_destroy(lexer_t *this)
821 {
822         (void) this;
823 }
824
825 static __attribute__((unused))
826 void dbg_pos(const source_position_t source_position)
827 {
828         fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);
829         fflush(stdout);
830 }