406bc7a984c7883f39fde4446e839b840150feec
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer_t.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7
8 #include <assert.h>
9 #include <errno.h>
10 #include <string.h>
11 #include <ctype.h>
12
13 //#define DEBUG_CHARS
14 #define MAX_PUTBACK 3
15
16 static int               c;
17 source_position_t source_position;
18 static FILE             *input;
19 static char              buf[1027];
20 static const char       *bufend;
21 static const char       *bufpos;
22 static strset_t          stringset;
23 //static FILE            **input_stack;
24 //static char            **buf_stack;
25
26 static
27 void error_prefix_at(const char *input_name, unsigned linenr)
28 {
29         fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
30 }
31
32 static
33 void error_prefix()
34 {
35         error_prefix_at(source_position.input_name, source_position.linenr);
36 }
37
38 static
39 void parse_error(const char *msg)
40 {
41         error_prefix();
42         fprintf(stderr, "%s\n", msg);
43 }
44
45 static inline
46 void next_char()
47 {
48         bufpos++;
49         if(bufpos >= bufend) {
50                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
51                                  input);
52                 if(s == 0) {
53                         c = EOF;
54                         return;
55                 }
56                 bufpos = buf + MAX_PUTBACK;
57                 bufend = buf + MAX_PUTBACK + s;
58         }
59         c = *(bufpos);
60 #ifdef DEBUG_CHARS
61         printf("nchar '%c'\n", c);
62 #endif
63 }
64
65 static inline
66 void put_back(int pc)
67 {
68         char *p = (char*) bufpos - 1;
69         bufpos--;
70         assert(p >= buf);
71         *p = pc;
72
73 #ifdef DEBUG_CHARS
74         printf("putback '%c'\n", pc);
75 #endif
76 }
77
78
79 static
80 int replace_trigraph(void)
81 {
82 #define MATCH_TRIGRAPH(ch,replacement)           \
83         case ch:                                     \
84                 c = replacement;                         \
85                 return 1;
86
87         switch(c) {
88         MATCH_TRIGRAPH('=', '#')
89         MATCH_TRIGRAPH('(', '[')
90         MATCH_TRIGRAPH('/', '\\')
91         MATCH_TRIGRAPH(')', ']')
92         MATCH_TRIGRAPH('\'', '^')
93         MATCH_TRIGRAPH('<', '{')
94         MATCH_TRIGRAPH('!', '|')
95         MATCH_TRIGRAPH('>', '}')
96         MATCH_TRIGRAPH('-', '~')
97         default:
98                 break;
99         }
100
101         return 0;
102 }
103
104 #define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
105         case '?':                                  \
106                 next_char();                           \
107                 if(c != '?') {                         \
108                         custom_putback;                    \
109                         put_back(c);                       \
110                         c = '?';                           \
111                         no_trigraph_code;                  \
112                 }                                      \
113                 next_char();                           \
114                 if(replace_trigraph()) {               \
115                         break;                             \
116                 }                                      \
117                 custom_putback;                        \
118                 put_back('?');                         \
119                 put_back(c);                           \
120                 c = '?';                               \
121                 no_trigraph_code;
122
123 #define EAT_NEWLINE(newline_code)              \
124         if(c == '\r') {                            \
125                 next_char();                           \
126                 if(c == '\n')                          \
127                         next_char();                       \
128                 source_position.linenr++;              \
129                 newline_code;                          \
130         } else if(c == '\n') {                     \
131                 next_char();                           \
132                 source_position.linenr++;              \
133                 newline_code;                          \
134         }
135
136 static
137 void parse_symbol(token_t *token)
138 {
139         symbol_t *symbol;
140         char     *string;
141
142         obstack_1grow(&symbol_obstack, c);
143         next_char();
144
145         while(1) {
146                 switch(c) {
147                 case '\\':
148                         next_char();
149                         EAT_NEWLINE(break;)
150                         goto end_symbol;
151
152                 case 'A' ... 'Z':
153                 case 'a' ... 'z':
154                 case '_':
155                         obstack_1grow(&symbol_obstack, c);
156                         next_char();
157                         break;
158
159                 case '?':
160                         next_char();
161                         if(c != '?') {
162                                 put_back(c);
163                                 c = '?';
164                                 goto end_symbol;
165                         }
166                         next_char();
167                         if(replace_trigraph())
168                                 break;
169                         put_back('?');
170                         put_back(c);
171                         c = '?';
172                         goto end_symbol;
173
174                 default:
175                         goto end_symbol;
176                 }
177         }
178 end_symbol:
179         obstack_1grow(&symbol_obstack, '\0');
180
181         string = obstack_finish(&symbol_obstack);
182         symbol = symbol_table_insert(string);
183
184         if(symbol->ID > 0) {
185                 token->type = symbol->ID;
186         } else {
187                 token->type = T_IDENTIFIER;
188         }
189         token->v.symbol = symbol;
190
191         if(symbol->string != string) {
192                 obstack_free(&symbol_obstack, string);
193         }
194 }
195
196 static
197 void parse_number_hex(token_t *token)
198 {
199         assert(c == 'x' || c == 'X');
200         next_char();
201
202         if (!isdigit(c) &&
203                 !('A' <= c && c <= 'F') &&
204                 !('a' <= c && c <= 'f')) {
205                 parse_error("premature end of hex number literal");
206                 token->type = T_ERROR;
207                 return;
208         }
209
210         int value = 0;
211         for(;;) {
212                 if (isdigit(c)) {
213                         value = 16 * value + c - '0';
214                 } else if ('A' <= c && c <= 'F') {
215                         value = 16 * value + c - 'A' + 10;
216                 } else if ('a' <= c && c <= 'f') {
217                         value = 16 * value + c - 'a' + 10;
218                 } else {
219                         token->type     = T_INTEGER;
220                         token->v.intvalue = value;
221                         return;
222                 }
223                 next_char();
224         }
225 }
226
227 static
228 void parse_number_oct(token_t *token)
229 {
230         assert(c == 'o' || c == 'O');
231         next_char();
232
233         int value = 0;
234         for(;;) {
235                 if ('0' <= c && c <= '7') {
236                         value = 8 * value + c - '0';
237                 } else {
238                         token->type     = T_INTEGER;
239                         token->v.intvalue = value;
240                         return;
241                 }
242                 next_char();
243         }
244 }
245
246 static
247 void parse_number_dec(token_t *token, int first_char)
248 {
249         int value = 0;
250         if(first_char > 0) {
251                 assert(first_char >= '0' && first_char <= '9');
252                 value = first_char - '0';
253         }
254
255         for(;;) {
256                 if (isdigit(c)) {
257                         value = 10 * value + c - '0';
258                 } else {
259                         token->type     = T_INTEGER;
260                         token->v.intvalue = value;
261                         return;
262                 }
263                 next_char();
264         }
265 }
266
267 static
268 void parse_number(token_t *token)
269 {
270         // TODO check for overflow
271         // TODO check for various invalid inputs sequences
272
273         if (c == '0') {
274                 next_char();
275                 switch (c) {
276                         case 'X':
277                         case 'x': parse_number_hex(token); break;
278                         case 'o':
279                         case 'O': parse_number_oct(token); break;
280                         default:  parse_number_dec(token, '0');
281                 }
282         } else {
283                 parse_number_dec(token, 0);
284         }
285 }
286
287 static
288 int parse_escape_sequence()
289 {
290         while(1) {
291                 int ec = c;
292                 next_char();
293
294                 switch(ec) {
295                 case '"': return '"';
296                 case '\'': return'\'';
297                 case '\\':
298                         EAT_NEWLINE(break;)
299                         return '\\';
300                 case 'a': return '\a';
301                 case 'b': return '\b';
302                 case 'f': return '\f';
303                 case 'n': return '\n';
304                 case 'r': return '\r';
305                 case 't': return '\t';
306                 case 'v': return '\v';
307                 case 'x': /* TODO parse hex number ... */
308                         parse_error("hex escape sequences not implemented yet");
309                         return EOF;
310                 case 0 ... 8: /* TODO parse octal number ... */
311                         parse_error("octal escape sequences not implemented yet");
312                         return EOF;
313                 case '?':
314                         if(c != '?') {
315                                 return '?';
316                         }
317                         /* might be a trigraph */
318                         next_char();
319                         if(replace_trigraph()) {
320                                 break;
321                         }
322                         put_back(c);
323                         c = '?';
324                         return '?';
325
326                 case EOF:
327                         parse_error("reached end of file while parsing escape sequence");
328                         return EOF;
329                 default:
330                         parse_error("unknown escape sequence");
331                         return EOF;
332                 }
333         }
334 }
335
336 static
337 void parse_string_literal(token_t *token)
338 {
339         unsigned    start_linenr = source_position.linenr;
340         char       *string;
341         const char *result;
342
343         assert(c == '"');
344         next_char();
345
346         while(1) {
347                 switch(c) {
348                 SKIP_TRIGRAPHS(,
349                         obstack_1grow(&symbol_obstack, '?');
350                         next_char();
351                         break;
352                 )
353
354                 case '\\':
355                         next_char();
356                         EAT_NEWLINE(break;)
357                         int ec = parse_escape_sequence();
358                         obstack_1grow(&symbol_obstack, ec);
359                         break;
360
361                 case EOF:
362                         error_prefix_at(source_position.input_name, start_linenr);
363                         fprintf(stderr, "string has no end\n");
364                         token->type = T_ERROR;
365                         return;
366
367                 case '"':
368                         next_char();
369                         goto end_of_string;
370
371                 default:
372                         obstack_1grow(&symbol_obstack, c);
373                         next_char();
374                         break;
375                 }
376         }
377
378 end_of_string:
379
380         /* TODO: concatenate multiple strings separated by whitespace... */
381
382         /* add finishing 0 to the string */
383         obstack_1grow(&symbol_obstack, '\0');
384         string = obstack_finish(&symbol_obstack);
385
386         /* check if there is already a copy of the string */
387         result = strset_insert(&stringset, string);
388         if(result != string) {
389                 obstack_free(&symbol_obstack, string);
390         }
391
392         token->type     = T_STRING_LITERAL;
393         token->v.string = result;
394 }
395
396 #define MATCH_NEWLINE(code)                 \
397         case '\r':                              \
398                 next_char();                        \
399                 if(c == '\n') {                     \
400                         next_char();                    \
401                 }                                   \
402                 source_position.linenr++;           \
403                 code;                               \
404         case '\n':                              \
405                 next_char();                        \
406                 source_position.linenr++;           \
407                 code;
408
409 static
410 void parse_character_constant(token_t *token)
411 {
412         assert(c == '\'');
413         next_char();
414
415         int found_char = 0;
416         while(1) {
417                 switch(c) {
418                 SKIP_TRIGRAPHS(,
419                         found_char = '?';
420                         break;
421                 )
422
423                 case '\\':
424                         next_char();
425                         EAT_NEWLINE(break;)
426                         found_char = '\\';
427                         break;
428
429                 MATCH_NEWLINE(
430                         parse_error("newline while parsing character constant");
431                         break;
432                 )
433
434                 case '\'':
435                         next_char();
436                         goto end_of_char_constant;
437
438                 case EOF:
439                         parse_error("EOF while parsing character constant");
440                         token->type = T_ERROR;
441                         return;
442
443                 default:
444                         if(found_char != 0) {
445                                 parse_error("more than 1 characters in character "
446                                             "constant");
447                                 goto end_of_char_constant;
448                         } else {
449                                 found_char = c;
450                                 next_char();
451                         }
452                         break;
453                 }
454         }
455
456 end_of_char_constant:
457         token->type       = T_INTEGER;
458         token->v.intvalue = found_char;
459 }
460
461 static
462 void skip_multiline_comment(void)
463 {
464         unsigned start_linenr = source_position.linenr;
465         int had_star = 0;
466
467         while(1) {
468                 switch(c) {
469                 case '*':
470                         next_char();
471                         had_star = 1;
472                         break;
473
474                 case '/':
475                         next_char();
476                         if(had_star) {
477                                 return;
478                         }
479                         had_star = 0;
480                         break;
481
482                 case '\\':
483                         next_char();
484                         EAT_NEWLINE(break;)
485                         had_star = 0;
486                         break;
487
488                 case '?':
489                         next_char();
490                         if(c != '?') {
491                                 had_star = 0;
492                                 break;
493                         }
494                         next_char();
495                         if(replace_trigraph())
496                                 break;
497                         put_back(c);
498                         c = '?';
499                         had_star = 0;
500                         /* we don't put back the 2nd ? as the comment text is discarded
501                          * anyway */
502                         break;
503
504                 MATCH_NEWLINE(had_star = 0; break;)
505
506                 case EOF:
507                         error_prefix_at(source_position.input_name, start_linenr);
508                         fprintf(stderr, "at end of file while looking for comment end\n");
509                         return;
510                 default:
511                         had_star = 0;
512                         next_char();
513                         break;
514                 }
515         }
516 }
517
518 static
519 void skip_line_comment(void)
520 {
521         while(1) {
522                 switch(c) {
523                 case '?':
524                         next_char();
525                         if(c != '?')
526                                 break;
527                         next_char();
528                         if(replace_trigraph())
529                                 break;
530                         put_back('?');
531                         /* we don't put back the 2nd ? as the comment text is discarded
532                          * anyway */
533                         break;
534
535                 case '\\':
536                         next_char();
537                         if(c == '\n') {
538                                 next_char();
539                                 source_position.linenr++;
540                         }
541                         break;
542
543                 case EOF:
544                 case '\r':
545                 case '\n':
546                         return;
547
548                 default:
549                         next_char();
550                         break;
551                 }
552         }
553 }
554
555 static
556 void lexer_next_preprocessing_token(token_t *token);
557
558 static
559 void eat_until_newline(void)
560 {
561         /* TODO */
562 }
563
564 static
565 void error_directive(void)
566 {
567         error_prefix();
568         fprintf(stderr, "#error directive: \n");
569
570         /* parse pp-tokens until new-line */
571 }
572
573 static
574 void define_directive(void)
575 {
576         token_t temptoken;
577
578         lexer_next_preprocessing_token(&temptoken);
579         if(temptoken.type != T_IDENTIFIER) {
580                 parse_error("expected identifier after #define\n");
581                 eat_until_newline();
582         }
583 }
584
585 static
586 void ifdef_directive(int is_ifndef)
587 {
588         (void) is_ifndef;
589         token_t temptoken;
590         lexer_next_preprocessing_token(&temptoken);
591         //expect_identifier();
592         //extect_newline();
593 }
594
595 static
596 void endif_directive(void)
597 {
598         //expect_newline();
599 }
600
601 static
602 void found_preprocessor_identifier(symbol_t *symbol)
603 {
604         switch(symbol->pp_ID) {
605         case TP_include:
606                 printf("include - enable header name parsing!\n");
607                 break;
608         case TP_define:
609                 define_directive();
610                 break;
611         case TP_ifdef:
612                 ifdef_directive(0);
613                 break;
614         case TP_ifndef:
615                 ifdef_directive(1);
616                 break;
617         case TP_endif:
618                 endif_directive();
619                 break;
620         case TP_if:
621         case TP_else:
622         case TP_elif:
623         case TP_undef:
624         case TP_line:
625         case TP_error:
626                 error_directive();
627                 break;
628         case TP_pragma:
629                 break;
630         }
631 }
632
633 static
634 void parse_preprocessor_directive(token_t *result_token)
635 {
636         token_t temptoken;
637
638         (void) result_token;
639         lexer_next_preprocessing_token(&temptoken);
640         switch(temptoken.type) {
641         case T_IDENTIFIER:
642                 found_preprocessor_identifier(temptoken.v.symbol);
643                 break;
644         }
645 }
646
647 #define MAYBE_PROLOG                                       \
648                         next_char();                                   \
649                         while(1) {                                     \
650                                 switch(c) {
651
652 #define MAYBE(ch, set_type)                                \
653                                 case ch:                                   \
654                                         next_char();                           \
655                                         token->type = set_type;                \
656                                         return;
657
658 #define ELSE_CODE(code)                                    \
659                                 SKIP_TRIGRAPHS(,                           \
660                                         code;                                  \
661                                 )                                          \
662                                                                                                                    \
663                                 case '\\':                                 \
664                                         next_char();                           \
665                                         EAT_NEWLINE(break;)                    \
666                                         /* fallthrough */                      \
667                                 default:                                   \
668                                         code;                                  \
669                                 }                                          \
670                         } /* end of while(1) */                        \
671                         break;
672
673 #define ELSE(set_type)                                     \
674                 ELSE_CODE(                                         \
675                         token->type = set_type;                        \
676                         return;                                        \
677                 )
678
679 static
680 void eat_whitespace()
681 {
682         while(1) {
683                 switch(c) {
684                 case ' ':
685                 case '\t':
686                         next_char();
687                         break;
688
689                 case '\r':
690                 case '\n':
691                         return;
692
693                 case '\\':
694                         next_char();
695                         if(c == '\n') {
696                                 next_char();
697                                 source_position.linenr++;
698                                 break;
699                         }
700
701                         put_back(c);
702                         c = '\\';
703                         return;
704
705                 SKIP_TRIGRAPHS(,
706                         return;
707                 )
708
709                 case '/':
710                         next_char();
711                         while(1) {
712                                 switch(c) {
713                                 case '*':
714                                         next_char();
715                                         skip_multiline_comment();
716                                         eat_whitespace();
717                                         return;
718                                 case '/':
719                                         next_char();
720                                         skip_line_comment();
721                                         eat_whitespace();
722                                         return;
723
724                                 SKIP_TRIGRAPHS(
725                                                 put_back('?');
726                                         ,
727                                                 c = '/';
728                                                 return;
729                                 )
730
731                                 case '\\':
732                                         next_char();
733                                         EAT_NEWLINE(break;)
734                                         /* fallthrough */
735                                 default:
736                                         return;
737                                 }
738                         }
739                         break;
740
741                 default:
742                         return;
743                 }
744         }
745 }
746
747 static
748 void lexer_next_preprocessing_token(token_t *token)
749 {
750         while(1) {
751                 switch(c) {
752                 case ' ':
753                 case '\t':
754                         next_char();
755                         break;
756
757                 MATCH_NEWLINE(
758                         eat_whitespace();
759                         if(c == '#') {
760                                 next_char();
761                                 parse_preprocessor_directive(token);
762                                 return;
763                         }
764                         token->type = '\n';
765                         return;
766                 )
767
768                 case 'A' ... 'Z':
769                 case 'a' ... 'z':
770                 case '_':
771                         parse_symbol(token);
772                         return;
773
774                 case '0' ... '9':
775                         parse_number(token);
776                         return;
777
778                 case '"':
779                         parse_string_literal(token);
780                         return;
781
782                 case '\'':
783                         parse_character_constant(token);
784                         return;
785
786                 case '\\':
787                         next_char();
788                         if(c == '\n') {
789                                 next_char();
790                                 source_position.linenr++;
791                                 break;
792                         } else {
793                                 parse_error("unexpected '\\' found");
794                                 token->type = T_ERROR;
795                         }
796                         return;
797
798                 case '.':
799                         MAYBE_PROLOG
800                                 case '.':
801                                         MAYBE_PROLOG
802                                         MAYBE('.', T_DOTDOTDOT)
803                                         ELSE_CODE(
804                                                 put_back(c);
805                                                 c = '.';
806                                                 token->type = '.';
807                                                 return;
808                                         )
809                         ELSE('.')
810                 case '&':
811                         MAYBE_PROLOG
812                         MAYBE('&', T_ANDAND)
813                         MAYBE('=', T_ANDEQUAL)
814                         ELSE('&')
815                 case '*':
816                         MAYBE_PROLOG
817                         MAYBE('=', T_ASTERISKEQUAL)
818                         ELSE('*')
819                 case '+':
820                         MAYBE_PROLOG
821                         MAYBE('+', T_PLUSPLUS)
822                         MAYBE('=', T_PLUSEQUAL)
823                         ELSE('+')
824                 case '-':
825                         MAYBE_PROLOG
826                         MAYBE('-', T_MINUSMINUS)
827                         MAYBE('=', T_MINUSEQUAL)
828                         ELSE('-')
829                 case '!':
830                         MAYBE_PROLOG
831                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
832                         ELSE('!')
833                 case '/':
834                         MAYBE_PROLOG
835                         MAYBE('=', T_SLASHEQUAL)
836                                 case '*':
837                                         next_char();
838                                         skip_multiline_comment();
839                                         lexer_next_preprocessing_token(token);
840                                         return;
841                                 case '/':
842                                         next_char();
843                                         skip_line_comment();
844                                         lexer_next_preprocessing_token(token);
845                                         return;
846                         ELSE('/')
847                 case '%':
848                         MAYBE_PROLOG
849                         MAYBE('>', T_PERCENTGREATER)
850                         MAYBE('=', T_PERCENTEQUAL)
851                                 case ':':
852                                         MAYBE_PROLOG
853                                                 case '%':
854                                                         MAYBE_PROLOG
855                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
856                                                         ELSE_CODE(
857                                                                 put_back(c);
858                                                                 c = '%';
859                                                                 token->type = T_PERCENTCOLON;
860                                                                 return;
861                                                         )
862                                         ELSE(T_PERCENTCOLON)
863                         ELSE('%')
864                 case '<':
865                         MAYBE_PROLOG
866                         MAYBE(':', T_LESSCOLON)
867                         MAYBE('%', T_LESSPERCENT)
868                                 case '<':
869                                         MAYBE_PROLOG
870                                         MAYBE('=', T_LESSLESSEQUAL)
871                                         ELSE(T_LESSLESS)
872                         ELSE('<')
873                 case '>':
874                         MAYBE_PROLOG
875                                 case '>':
876                                         MAYBE_PROLOG
877                                         MAYBE('=', T_GREATERGREATEREQUAL)
878                                         ELSE(T_GREATERGREATER)
879                         ELSE('>')
880                 case '^':
881                         MAYBE_PROLOG
882                         MAYBE('=', T_CARETEQUAL)
883                         ELSE('^')
884                 case '|':
885                         MAYBE_PROLOG
886                         MAYBE('=', T_PIPEEQUAL)
887                         MAYBE('|', T_PIPEPIPE)
888                         ELSE('|')
889                 case ':':
890                         MAYBE_PROLOG
891                         MAYBE('>', T_COLONGREATER)
892                         ELSE(':')
893                 case '=':
894                         MAYBE_PROLOG
895                         MAYBE('=', T_EQUALEQUAL)
896                         ELSE('=')
897                 case '#':
898                         MAYBE_PROLOG
899                         MAYBE('#', T_HASHHASH)
900                         ELSE('#')
901
902                 case '?':
903                         next_char();
904                         /* just a simple ? */
905                         if(c != '?') {
906                                 token->type = '?';
907                                 return;
908                         }
909                         /* might be a trigraph */
910                         next_char();
911                         if(replace_trigraph()) {
912                                 break;
913                         }
914                         put_back(c);
915                         c = '?';
916                         token->type = '?';
917                         return;
918
919                 case '[':
920                 case ']':
921                 case '(':
922                 case ')':
923                 case '{':
924                 case '}':
925                 case '~':
926                 case ';':
927                 case ',':
928                         token->type = c;
929                         next_char();
930                         return;
931
932                 case EOF:
933                         token->type = T_EOF;
934                         return;
935
936                 default:
937                         next_char();
938                         error_prefix();
939                         fprintf(stderr, "unknown character '%c' found\n", c);
940                         token->type = T_ERROR;
941                         return;
942                 }
943         }
944 }
945
946 void lexer_next_token(token_t *token)
947 {
948         do {
949                 lexer_next_preprocessing_token(token);
950         } while(token->type == '\n');
951 }
952
953 void init_lexer(void)
954 {
955         strset_init(&stringset);
956 }
957
958 void lexer_open_stream(FILE *stream, const char *input_name)
959 {
960         input                      = stream;
961         source_position.linenr     = 0;
962         source_position.input_name = input_name;
963
964         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
965          * beginning of a line */
966         c = '\n';
967 }
968
969 void exit_lexer(void)
970 {
971         strset_destroy(&stringset);
972 }
973
974 static __attribute__((unused))
975 void dbg_pos(const source_position_t source_position)
976 {
977         fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);
978         fflush(stdout);
979 }