filter trigraphs in advance and simplify lexer code because of that
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7 #include "adt/strset.h"
8 #include "adt/util.h"
9
10 #include <assert.h>
11 #include <errno.h>
12 #include <string.h>
13 #include <ctype.h>
14
15 //#define DEBUG_CHARS
16 #define MAX_PUTBACK 3
17
18 static int         c;
19 token_t            lexer_token;
20 static FILE       *input;
21 static char        buf[1024 + MAX_PUTBACK];
22 static const char *bufend;
23 static const char *bufpos;
24 static strset_t    stringset;
25 //static FILE      **input_stack;
26 //static char      **buf_stack;
27
28 static void error_prefix_at(const char *input_name, unsigned linenr)
29 {
30         fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
31 }
32
33 static void error_prefix(void)
34 {
35         error_prefix_at(lexer_token.source_position.input_name,
36                         lexer_token.source_position.linenr);
37 }
38
39 static void parse_error(const char *msg)
40 {
41         error_prefix();
42         fprintf(stderr, "%s\n", msg);
43 }
44
45 static inline void next_real_char(void)
46 {
47         bufpos++;
48         if(bufpos >= bufend) {
49                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
50                                  input);
51                 if(s == 0) {
52                         c = EOF;
53                         return;
54                 }
55                 bufpos = buf + MAX_PUTBACK;
56                 bufend = buf + MAX_PUTBACK + s;
57         }
58         c = *(bufpos);
59 }
60
61 static inline void put_back(int pc)
62 {
63         char *p = (char*) bufpos - 1;
64         bufpos--;
65         assert(p >= buf);
66         *p = pc;
67
68 #ifdef DEBUG_CHARS
69         printf("putback '%c'\n", pc);
70 #endif
71 }
72
73 static inline void next_char(void);
74
75 #define MATCH_NEWLINE(code)                   \
76         case '\r':                                \
77                 next_char();                          \
78                 if(c == '\n') {                       \
79                         next_char();                      \
80                 }                                     \
81                 lexer_token.source_position.linenr++; \
82                 code;                                 \
83         case '\n':                                \
84                 next_char();                          \
85                 lexer_token.source_position.linenr++; \
86                 code;
87
88 static void maybe_concat_lines(void)
89 {
90         next_char();
91         switch(c) {
92         MATCH_NEWLINE(return;)
93
94         default:
95                 break;
96         }
97
98         put_back(c);
99         c = '\\';
100 }
101
102 static inline void next_char(void)
103 {
104         next_real_char();
105
106         /* filter trigraphs */
107         if(UNLIKELY(c == '\\')) {
108                 maybe_concat_lines();
109                 goto end_of_next_char;
110         }
111
112         if(LIKELY(c != '?'))
113                 goto end_of_next_char;
114
115         next_real_char();
116         if(LIKELY(c != '?')) {
117                 put_back(c);
118                 c = '?';
119                 goto end_of_next_char;
120         }
121
122         next_real_char();
123         switch(c) {
124         case '=': c = '#'; break;
125         case '(': c = '['; break;
126         case '/': c = '\\'; maybe_concat_lines(); break;
127         case ')': c = ']'; break;
128         case '\'': c = '^'; break;
129         case '<': c = '{'; break;
130         case '!': c = '|'; break;
131         case '>': c = '}'; break;
132         case '-': c = '~'; break;
133         default:
134                 put_back('?');
135                 put_back(c);
136                 c = '?';
137                 break;
138         }
139
140 end_of_next_char:
141 #ifdef DEBUG_CHARS
142         printf("nchar '%c'\n", c);
143 #else
144         ;
145 #endif
146 }
147
148 #define SYMBOL_CHARS  \
149         case 'a':         \
150         case 'b':         \
151         case 'c':         \
152         case 'd':         \
153         case 'e':         \
154         case 'f':         \
155         case 'g':         \
156         case 'h':         \
157         case 'i':         \
158         case 'j':         \
159         case 'k':         \
160         case 'l':         \
161         case 'm':         \
162         case 'n':         \
163         case 'o':         \
164         case 'p':         \
165         case 'q':         \
166         case 'r':         \
167         case 's':         \
168         case 't':         \
169         case 'u':         \
170         case 'v':         \
171         case 'w':         \
172         case 'x':         \
173         case 'y':         \
174         case 'z':         \
175         case 'A':         \
176         case 'B':         \
177         case 'C':         \
178         case 'D':         \
179         case 'E':         \
180         case 'F':         \
181         case 'G':         \
182         case 'H':         \
183         case 'I':         \
184         case 'J':         \
185         case 'K':         \
186         case 'L':         \
187         case 'M':         \
188         case 'N':         \
189         case 'O':         \
190         case 'P':         \
191         case 'Q':         \
192         case 'R':         \
193         case 'S':         \
194         case 'T':         \
195         case 'U':         \
196         case 'V':         \
197         case 'W':         \
198         case 'X':         \
199         case 'Y':         \
200         case 'Z':         \
201         case '_':
202
203 #define DIGITS        \
204         case '0':         \
205         case '1':         \
206         case '2':         \
207         case '3':         \
208         case '4':         \
209         case '5':         \
210         case '6':         \
211         case '7':         \
212         case '8':         \
213         case '9':
214
215 static void parse_symbol(void)
216 {
217         symbol_t *symbol;
218         char     *string;
219
220         obstack_1grow(&symbol_obstack, c);
221         next_char();
222
223         while(1) {
224                 switch(c) {
225                 DIGITS
226                 SYMBOL_CHARS
227                         obstack_1grow(&symbol_obstack, c);
228                         next_char();
229                         break;
230
231                 default:
232                         goto end_symbol;
233                 }
234         }
235
236 end_symbol:
237         obstack_1grow(&symbol_obstack, '\0');
238
239         string = obstack_finish(&symbol_obstack);
240         symbol = symbol_table_insert(string);
241
242         lexer_token.type     = symbol->ID;
243         lexer_token.v.symbol = symbol;
244
245         if(symbol->string != string) {
246                 obstack_free(&symbol_obstack, string);
247         }
248 }
249
250 static void parse_number_hex(void)
251 {
252         assert(c == 'x' || c == 'X');
253         next_char();
254
255         if (!isdigit(c) &&
256                 !('A' <= c && c <= 'F') &&
257                 !('a' <= c && c <= 'f')) {
258                 parse_error("premature end of hex number literal");
259                 lexer_token.type = T_ERROR;
260                 return;
261         }
262
263         int value = 0;
264         while(1) {
265                 if (isdigit(c)) {
266                         value = 16 * value + c - '0';
267                 } else if ('A' <= c && c <= 'F') {
268                         value = 16 * value + c - 'A' + 10;
269                 } else if ('a' <= c && c <= 'f') {
270                         value = 16 * value + c - 'a' + 10;
271                 } else {
272                         lexer_token.type     = T_INTEGER;
273                         lexer_token.v.intvalue = value;
274                         return;
275                 }
276                 next_char();
277         }
278 }
279
280 static void parse_number_oct(void)
281 {
282         int value = 0;
283         while(1) {
284                 if ('0' <= c && c <= '7') {
285                         value = 8 * value + c - '0';
286                 } else if (c == '8' || c == '9') {
287                         parse_error("invalid octal number");
288                         lexer_token.type = T_ERROR;
289                         return;
290                 } else {
291                         lexer_token.type       = T_INTEGER;
292                         lexer_token.v.intvalue = value;
293                         return;
294                 }
295                 next_char();
296         }
297 }
298
299 static void parse_number_dec(void)
300 {
301         int value = 0;
302
303         for(;;) {
304                 if (isdigit(c)) {
305                         value = 10 * value + c - '0';
306                 } else {
307                         lexer_token.type       = T_INTEGER;
308                         lexer_token.v.intvalue = value;
309                         return;
310                 }
311                 next_char();
312         }
313 }
314
315 static void parse_integer_suffix(void)
316 {
317         if(c == 'U' || c == 'U') {
318                 /* TODO do something with the suffixes... */
319                 next_char();
320                 if(c == 'L' || c == 'l') {
321                         next_char();
322                         if(c == 'L' || c == 'l') {
323                                 next_char();
324                         }
325                 }
326         } else if(c == 'l' || c == 'L') {
327                 next_char();
328                 if(c == 'l' || c == 'L') {
329                         next_char();
330                         if(c == 'u' || c == 'U') {
331                                 next_char();
332                         }
333                 } else if(c == 'u' || c == 'U') {
334                         next_char();
335                 }
336         }
337 }
338
339 static void parse_number(void)
340 {
341         if (c == '0') {
342                 next_char();
343                 switch (c) {
344                         case 'X':
345                         case 'x': parse_number_hex(); break;
346                         default:  parse_number_oct(); break;
347                 }
348         } else {
349                 parse_number_dec();
350         }
351
352         parse_integer_suffix();
353 }
354
355 static int parse_octal_sequence(void)
356 {
357         int value = 0;
358         while(1) {
359                 if(c < '0' || c > '7')
360                         break;
361                 value = 8 * value + c - '0';
362                 next_char();
363         }
364
365         return value;
366 }
367
368 static int parse_hex_sequence(void)
369 {
370         int value = 0;
371         while(1) {
372                 if (c >= '0' && c <= '9') {
373                         value = 16 * value + c - '0';
374                 } else if ('A' <= c && c <= 'F') {
375                         value = 16 * value + c - 'A' + 10;
376                 } else if ('a' <= c && c <= 'f') {
377                         value = 16 * value + c - 'a' + 10;
378                 } else {
379                         break;
380                 }
381                 next_char();
382         }
383
384         return value;
385 }
386
387 static int parse_escape_sequence(void)
388 {
389         while(1) {
390                 int ec = c;
391                 next_char();
392
393                 switch(ec) {
394                 case '"':  return '"';
395                 case '\'': return'\'';
396                 case '\\': return '\\';
397                 case '?': return '\?';
398                 case 'a': return '\a';
399                 case 'b': return '\b';
400                 case 'f': return '\f';
401                 case 'n': return '\n';
402                 case 'r': return '\r';
403                 case 't': return '\t';
404                 case 'v': return '\v';
405                 case 'x':
406                         return parse_hex_sequence();
407                 case '0':
408                 case '1':
409                 case '2':
410                 case '3':
411                 case '4':
412                 case '5':
413                 case '6':
414                 case '7':
415                         return parse_octal_sequence();
416                 case EOF:
417                         parse_error("reached end of file while parsing escape sequence");
418                         return EOF;
419                 default:
420                         parse_error("unknown escape sequence");
421                         return EOF;
422                 }
423         }
424 }
425
426 const char *concat_strings(const char *s1, const char *s2)
427 {
428         size_t  len1   = strlen(s1);
429         size_t  len2   = strlen(s2);
430
431         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
432         memcpy(concat, s1, len1);
433         memcpy(concat + len1, s2, len2 + 1);
434
435         const char *result = strset_insert(&stringset, concat);
436         if(result != concat) {
437                 obstack_free(&symbol_obstack, concat);
438         }
439
440         return result;
441 }
442
443 static void parse_string_literal(void)
444 {
445         unsigned    start_linenr = lexer_token.source_position.linenr;
446         char       *string;
447         const char *result;
448
449         assert(c == '"');
450         next_char();
451
452         while(1) {
453                 switch(c) {
454                 case '\\':
455                         next_char();
456                         int ec = parse_escape_sequence();
457                         obstack_1grow(&symbol_obstack, ec);
458                         break;
459
460                 case EOF:
461                         error_prefix_at(lexer_token.source_position.input_name,
462                                         start_linenr);
463                         fprintf(stderr, "string has no end\n");
464                         lexer_token.type = T_ERROR;
465                         return;
466
467                 case '"':
468                         next_char();
469                         goto end_of_string;
470
471                 default:
472                         obstack_1grow(&symbol_obstack, c);
473                         next_char();
474                         break;
475                 }
476         }
477
478 end_of_string:
479
480         /* TODO: concatenate multiple strings separated by whitespace... */
481
482         /* add finishing 0 to the string */
483         obstack_1grow(&symbol_obstack, '\0');
484         string = obstack_finish(&symbol_obstack);
485
486         /* check if there is already a copy of the string */
487         result = strset_insert(&stringset, string);
488         if(result != string) {
489                 obstack_free(&symbol_obstack, string);
490         }
491
492         lexer_token.type     = T_STRING_LITERAL;
493         lexer_token.v.string = result;
494 }
495
496 static void parse_character_constant(void)
497 {
498         assert(c == '\'');
499         next_char();
500
501         int found_char = 0;
502         while(1) {
503                 switch(c) {
504                 case '\\':
505                         next_char();
506                         found_char = parse_escape_sequence();
507                         break;
508
509                 MATCH_NEWLINE(
510                         parse_error("newline while parsing character constant");
511                         break;
512                 )
513
514                 case '\'':
515                         next_char();
516                         goto end_of_char_constant;
517
518                 case EOF:
519                         parse_error("EOF while parsing character constant");
520                         lexer_token.type = T_ERROR;
521                         return;
522
523                 default:
524                         if(found_char != 0) {
525                                 parse_error("more than 1 characters in character "
526                                             "constant");
527                                 goto end_of_char_constant;
528                         } else {
529                                 found_char = c;
530                                 next_char();
531                         }
532                         break;
533                 }
534         }
535
536 end_of_char_constant:
537         lexer_token.type       = T_INTEGER;
538         lexer_token.v.intvalue = found_char;
539 }
540
541 static void skip_multiline_comment(void)
542 {
543         unsigned start_linenr = lexer_token.source_position.linenr;
544
545         while(1) {
546                 switch(c) {
547                 case '*':
548                         next_char();
549                         if(c == '/') {
550                                 next_char();
551                                 return;
552                         }
553                         break;
554
555                 MATCH_NEWLINE(break;)
556
557                 case EOF:
558                         error_prefix_at(lexer_token.source_position.input_name,
559                                         start_linenr);
560                         fprintf(stderr, "at end of file while looking for comment end\n");
561                         return;
562
563                 default:
564                         next_char();
565                         break;
566                 }
567         }
568 }
569
570 static void skip_line_comment(void)
571 {
572         while(1) {
573                 switch(c) {
574                 case EOF:
575                         return;
576
577                 case '\n':
578                 case '\r':
579                         return;
580
581                 default:
582                         next_char();
583                         break;
584                 }
585         }
586 }
587
588 static token_t pp_token;
589
590 static inline void next_pp_token(void)
591 {
592         lexer_next_preprocessing_token();
593         pp_token = lexer_token;
594 }
595
596 static void eat_until_newline(void)
597 {
598         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
599                 next_pp_token();
600         }
601 }
602
603 static void error_directive(void)
604 {
605         error_prefix();
606         fprintf(stderr, "#error directive: \n");
607
608         /* parse pp-tokens until new-line */
609 }
610
611 static void define_directive(void)
612 {
613         lexer_next_preprocessing_token();
614         if(lexer_token.type != T_IDENTIFIER) {
615                 parse_error("expected identifier after #define\n");
616                 eat_until_newline();
617         }
618 }
619
620 static void ifdef_directive(int is_ifndef)
621 {
622         (void) is_ifndef;
623         lexer_next_preprocessing_token();
624         //expect_identifier();
625         //extect_newline();
626 }
627
628 static void endif_directive(void)
629 {
630         //expect_newline();
631 }
632
633 static void parse_line_directive(void)
634 {
635         if(pp_token.type != T_INTEGER) {
636                 parse_error("expected integer");
637         } else {
638                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
639                 next_pp_token();
640         }
641         if(pp_token.type == T_STRING_LITERAL) {
642                 lexer_token.source_position.input_name = pp_token.v.string;
643                 next_pp_token();
644         }
645
646         eat_until_newline();
647 }
648
649 static void parse_preprocessor_identifier(void)
650 {
651         assert(pp_token.type == T_IDENTIFIER);
652         symbol_t *symbol = pp_token.v.symbol;
653
654         switch(symbol->pp_ID) {
655         case TP_include:
656                 printf("include - enable header name parsing!\n");
657                 break;
658         case TP_define:
659                 define_directive();
660                 break;
661         case TP_ifdef:
662                 ifdef_directive(0);
663                 break;
664         case TP_ifndef:
665                 ifdef_directive(1);
666                 break;
667         case TP_endif:
668                 endif_directive();
669                 break;
670         case TP_line:
671                 next_pp_token();
672                 parse_line_directive();
673                 break;
674         case TP_if:
675         case TP_else:
676         case TP_elif:
677         case TP_undef:
678         case TP_error:
679                 error_directive();
680                 break;
681         case TP_pragma:
682                 break;
683         }
684 }
685
686 static void parse_preprocessor_directive()
687 {
688         next_pp_token();
689
690         switch(pp_token.type) {
691         case T_IDENTIFIER:
692                 parse_preprocessor_identifier();
693                 break;
694         case T_INTEGER:
695                 parse_line_directive();
696                 break;
697         default:
698                 parse_error("invalid preprocessor directive");
699                 eat_until_newline();
700                 break;
701         }
702 }
703
704 #define MAYBE_PROLOG                                       \
705                         next_char();                                   \
706                         while(1) {                                     \
707                                 switch(c) {
708
709 #define MAYBE(ch, set_type)                                \
710                                 case ch:                                   \
711                                         next_char();                           \
712                                         lexer_token.type = set_type;           \
713                                         return;
714
715 #define ELSE_CODE(code)                                    \
716                                 default:                                   \
717                                         code;                                  \
718                                 }                                          \
719                         } /* end of while(1) */                        \
720                         break;
721
722 #define ELSE(set_type)                                     \
723                 ELSE_CODE(                                         \
724                         lexer_token.type = set_type;                   \
725                         return;                                        \
726                 )
727
728 void lexer_next_preprocessing_token(void)
729 {
730         while(1) {
731                 switch(c) {
732                 case ' ':
733                 case '\t':
734                         next_char();
735                         break;
736
737                 MATCH_NEWLINE(
738                         lexer_token.type = '\n';
739                         return;
740                 )
741
742                 SYMBOL_CHARS
743                         parse_symbol();
744                         return;
745
746                 DIGITS
747                         parse_number();
748                         return;
749
750                 case '"':
751                         parse_string_literal();
752                         return;
753
754                 case '\'':
755                         parse_character_constant();
756                         return;
757
758                 case '.':
759                         MAYBE_PROLOG
760                                 case '.':
761                                         MAYBE_PROLOG
762                                         MAYBE('.', T_DOTDOTDOT)
763                                         ELSE_CODE(
764                                                 put_back(c);
765                                                 c = '.';
766                                                 lexer_token.type = '.';
767                                                 return;
768                                         )
769                         ELSE('.')
770                 case '&':
771                         MAYBE_PROLOG
772                         MAYBE('&', T_ANDAND)
773                         MAYBE('=', T_ANDEQUAL)
774                         ELSE('&')
775                 case '*':
776                         MAYBE_PROLOG
777                         MAYBE('=', T_ASTERISKEQUAL)
778                         ELSE('*')
779                 case '+':
780                         MAYBE_PROLOG
781                         MAYBE('+', T_PLUSPLUS)
782                         MAYBE('=', T_PLUSEQUAL)
783                         ELSE('+')
784                 case '-':
785                         MAYBE_PROLOG
786                         MAYBE('>', T_MINUSGREATER)
787                         MAYBE('-', T_MINUSMINUS)
788                         MAYBE('=', T_MINUSEQUAL)
789                         ELSE('-')
790                 case '!':
791                         MAYBE_PROLOG
792                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
793                         ELSE('!')
794                 case '/':
795                         MAYBE_PROLOG
796                         MAYBE('=', T_SLASHEQUAL)
797                                 case '*':
798                                         next_char();
799                                         skip_multiline_comment();
800                                         lexer_next_preprocessing_token();
801                                         return;
802                                 case '/':
803                                         next_char();
804                                         skip_line_comment();
805                                         lexer_next_preprocessing_token();
806                                         return;
807                         ELSE('/')
808                 case '%':
809                         MAYBE_PROLOG
810                         MAYBE('>', T_PERCENTGREATER)
811                         MAYBE('=', T_PERCENTEQUAL)
812                                 case ':':
813                                         MAYBE_PROLOG
814                                                 case '%':
815                                                         MAYBE_PROLOG
816                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
817                                                         ELSE_CODE(
818                                                                 put_back(c);
819                                                                 c = '%';
820                                                                 lexer_token.type = T_PERCENTCOLON;
821                                                                 return;
822                                                         )
823                                         ELSE(T_PERCENTCOLON)
824                         ELSE('%')
825                 case '<':
826                         MAYBE_PROLOG
827                         MAYBE(':', T_LESSCOLON)
828                         MAYBE('%', T_LESSPERCENT)
829                         MAYBE('=', T_LESSEQUAL)
830                                 case '<':
831                                         MAYBE_PROLOG
832                                         MAYBE('=', T_LESSLESSEQUAL)
833                                         ELSE(T_LESSLESS)
834                         ELSE('<')
835                 case '>':
836                         MAYBE_PROLOG
837                         MAYBE('=', T_GREATEREQUAL)
838                                 case '>':
839                                         MAYBE_PROLOG
840                                         MAYBE('=', T_GREATERGREATEREQUAL)
841                                         ELSE(T_GREATERGREATER)
842                         ELSE('>')
843                 case '^':
844                         MAYBE_PROLOG
845                         MAYBE('=', T_CARETEQUAL)
846                         ELSE('^')
847                 case '|':
848                         MAYBE_PROLOG
849                         MAYBE('=', T_PIPEEQUAL)
850                         MAYBE('|', T_PIPEPIPE)
851                         ELSE('|')
852                 case ':':
853                         MAYBE_PROLOG
854                         MAYBE('>', T_COLONGREATER)
855                         ELSE(':')
856                 case '=':
857                         MAYBE_PROLOG
858                         MAYBE('=', T_EQUALEQUAL)
859                         ELSE('=')
860                 case '#':
861                         MAYBE_PROLOG
862                         MAYBE('#', T_HASHHASH)
863                         ELSE('#')
864
865                 case '?':
866                 case '[':
867                 case ']':
868                 case '(':
869                 case ')':
870                 case '{':
871                 case '}':
872                 case '~':
873                 case ';':
874                 case ',':
875                 case '\\':
876                         lexer_token.type = c;
877                         next_char();
878                         return;
879
880                 case EOF:
881                         lexer_token.type = T_EOF;
882                         return;
883
884                 default:
885                         next_char();
886                         error_prefix();
887                         fprintf(stderr, "unknown character '%c' found\n", c);
888                         lexer_token.type = T_ERROR;
889                         return;
890                 }
891         }
892 }
893
894 void lexer_next_token(void)
895 {
896         lexer_next_preprocessing_token();
897         if(lexer_token.type != '\n')
898                 return;
899
900 newline_found:
901         do {
902                 lexer_next_preprocessing_token();
903         } while(lexer_token.type == '\n');
904
905         if(lexer_token.type == '#') {
906                 parse_preprocessor_directive();
907                 goto newline_found;
908         }
909 }
910
911 void init_lexer(void)
912 {
913         strset_init(&stringset);
914 }
915
916 void lexer_open_stream(FILE *stream, const char *input_name)
917 {
918         input                                  = stream;
919         lexer_token.source_position.linenr     = 1;
920         lexer_token.source_position.input_name = input_name;
921
922         next_char();
923 }
924
925 void exit_lexer(void)
926 {
927         strset_destroy(&stringset);
928 }
929
930 static __attribute__((unused))
931 void dbg_pos(const source_position_t source_position)
932 {
933         fprintf(stdout, "%s:%d\n", source_position.input_name,
934                 source_position.linenr);
935         fflush(stdout);
936 }