715c851f73ebdf623c69be8afc0b6779f56a2946
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer_t.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7
8 #include <assert.h>
9 #include <errno.h>
10 #include <string.h>
11 #include <ctype.h>
12
13 //#define DEBUG_CHARS
14 #define MAX_PUTBACK 2
15
16 static
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
18 {
19         (void) this;
20         fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
21 }
22
23 static
24 void error_prefix(lexer_t *this)
25 {
26         error_prefix_at(this, this->source_position.input_name,
27                         this->source_position.linenr);
28 }
29
30 static
31 void parse_error(lexer_t *this, const char *msg)
32 {
33         error_prefix(this);
34         fprintf(stderr, "%s\n", msg);
35 }
36
37 static inline
38 void next_char(lexer_t *this)
39 {
40         this->bufpos++;
41         if(this->bufpos >= this->bufend) {
42                 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43                                  sizeof(this->buf) - MAX_PUTBACK, this->input);
44                 if(s == 0) {
45                         this->c = EOF;
46                         return;
47                 }
48                 this->bufpos = this->buf + MAX_PUTBACK;
49                 this->bufend = this->buf + MAX_PUTBACK + s;
50         }
51         this->c = *(this->bufpos);
52 #ifdef DEBUG_CHARS
53         printf("nchar '%c'\n", this->c);
54 #endif
55 }
56
57 static inline
58 void put_back(lexer_t *this, int c)
59 {
60         char *p = (char*) this->bufpos - 1;
61         this->bufpos--;
62         assert(p >= this->buf);
63         *p = c;
64
65 #ifdef DEBUG_CHARS
66         printf("putback '%c'\n", c);
67 #endif
68 }
69
70 static
71 int replace_trigraph(lexer_t *this)
72 {
73 #define MATCH_TRIGRAPH(ch,replacement)           \
74         case ch:                                     \
75                 this->c = replacement;                   \
76                 return 1;
77
78         switch(this->c) {
79         MATCH_TRIGRAPH('=', '#')
80         MATCH_TRIGRAPH('(', '[')
81         MATCH_TRIGRAPH('/', '\\')
82         MATCH_TRIGRAPH(')', ']')
83         MATCH_TRIGRAPH('\'', '^')
84         MATCH_TRIGRAPH('<', '{')
85         MATCH_TRIGRAPH('!', '|')
86         MATCH_TRIGRAPH('>', '}')
87         MATCH_TRIGRAPH('-', '~')
88         default:
89                 break;
90         }
91
92         return 0;
93 }
94
95 #define SKIP_TRIGRAPHS(no_trigraph_code)       \
96         case '?':                                  \
97                 next_char(this);                       \
98                 if(this->c != '?') {                   \
99                         put_back(this, this->c);           \
100                         this->c = '?';                     \
101                         no_trigraph_code;                  \
102                 }                                      \
103                 next_char(this);                       \
104                 if(replace_trigraph(this))             \
105                         break;                             \
106                 put_back(this, '?');                   \
107                 put_back(this, this->c);               \
108                 this->c = '?';                         \
109                 no_trigraph_code;
110
111 #define EAT_NEWLINE(newline_code)              \
112         if(this->c == '\r') {                      \
113                 next_char(this);                       \
114                 if(this->c == '\n')                    \
115                         next_char(this);                   \
116                 this->source_position.linenr++;        \
117                 newline_code;                          \
118         } else if(this->c == '\n') {               \
119                 next_char(this);                       \
120                 this->source_position.linenr++;        \
121                 newline_code;                          \
122         }
123
124 static
125 void parse_symbol(lexer_t *this, token_t *token)
126 {
127         symbol_t *symbol;
128         char     *string;
129
130         obstack_1grow(&symbol_obstack, this->c);
131         next_char(this);
132
133         while(1) {
134                 switch(this->c) {
135                 case '\\':
136                         next_char(this);
137                         EAT_NEWLINE(break;)
138                         goto end_symbol;
139
140                 case 'A' ... 'Z':
141                 case 'a' ... 'z':
142                 case '_':
143                         obstack_1grow(&symbol_obstack, this->c);
144                         next_char(this);
145                         break;
146
147                 case '?':
148                         next_char(this);
149                         if(this->c != '?') {
150                                 put_back(this, this->c);
151                                 this->c = '?';
152                                 goto end_symbol;
153                         }
154                         next_char(this);
155                         if(replace_trigraph(this))
156                                 break;
157                         put_back(this, '?');
158                         put_back(this, this->c);
159                         this->c = '?';
160                         goto end_symbol;
161
162                 default:
163                         goto end_symbol;
164                 }
165         }
166 end_symbol:
167         obstack_1grow(&symbol_obstack, '\0');
168
169         string = obstack_finish(&symbol_obstack);
170         symbol = symbol_table_insert(string);
171
172         if(symbol->ID > 0) {
173                 token->type = symbol->ID;
174         } else {
175                 token->type = T_IDENTIFIER;
176         }
177         token->v.symbol = symbol;
178
179         if(symbol->string != string) {
180                 obstack_free(&symbol_obstack, string);
181         }
182 }
183
184 #if 0
185 static
186 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
187 {
188         do {
189                 obstack_1grow(&symbol_obstack, this->c);
190                 next_char(this);
191         } while(is_ident_char(this->c));
192         obstack_1grow(&symbol_obstack, '\0');
193
194         char     *string = obstack_finish(&symbol_obstack);
195         symbol_t *symbol = preprocessor_symbol_table_find(string);
196         obstack_free(&symbol_obstack, string);
197
198         if(symbol == 0)
199                 return TP_ERROR;
200
201         return symbol->ID;
202 }
203 #endif
204
205 static
206 void parse_number_hex(lexer_t *this, token_t *token)
207 {
208         assert(this->c == 'x' || this->c == 'X');
209         next_char(this);
210
211         if (!isdigit(this->c) &&
212                 !('A' <= this->c && this->c <= 'F') &&
213                 !('a' <= this->c && this->c <= 'f')) {
214                 parse_error(this, "premature end of hex number literal");
215                 token->type = T_ERROR;
216                 return;
217         }
218
219         int value = 0;
220         for(;;) {
221                 if (isdigit(this->c)) {
222                         value = 16 * value + this->c - '0';
223                 } else if ('A' <= this->c && this->c <= 'F') {
224                         value = 16 * value + this->c - 'A' + 10;
225                 } else if ('a' <= this->c && this->c <= 'f') {
226                         value = 16 * value + this->c - 'a' + 10;
227                 } else {
228                         token->type     = T_INTEGER;
229                         token->v.intvalue = value;
230                         return;
231                 }
232                 next_char(this);
233         }
234 }
235
236 static
237 void parse_number_oct(lexer_t *this, token_t *token)
238 {
239         assert(this->c == 'o' || this->c == 'O');
240         next_char(this);
241
242         int value = 0;
243         for(;;) {
244                 if ('0' <= this->c && this->c <= '7') {
245                         value = 8 * value + this->c - '0';
246                 } else {
247                         token->type     = T_INTEGER;
248                         token->v.intvalue = value;
249                         return;
250                 }
251                 next_char(this);
252         }
253 }
254
255 static
256 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
257 {
258         int value = 0;
259         if(first_char > 0) {
260                 assert(first_char >= '0' && first_char <= '9');
261                 value = first_char - '0';
262         }
263
264         for(;;) {
265                 if (isdigit(this->c)) {
266                         value = 10 * value + this->c - '0';
267                 } else {
268                         token->type     = T_INTEGER;
269                         token->v.intvalue = value;
270                         return;
271                 }
272                 next_char(this);
273         }
274 }
275
276 static
277 void parse_number(lexer_t *this, token_t *token)
278 {
279         // TODO check for overflow
280         // TODO check for various invalid inputs sequences
281
282         if (this->c == '0') {
283                 next_char(this);
284                 switch (this->c) {
285                         case 'X':
286                         case 'x': parse_number_hex(this, token); break;
287                         case 'o':
288                         case 'O': parse_number_oct(this, token); break;
289                         default:  parse_number_dec(this, token, '0');
290                 }
291         } else {
292                 parse_number_dec(this, token, 0);
293         }
294 }
295
296 static
297 int parse_escape_sequence(lexer_t *this)
298 {
299         while(1) {
300                 int c = this->c;
301                 next_char(this);
302
303                 switch(c) {
304                 case '"': return '"';
305                 case '\'': return'\'';
306                 case '\\':
307                         EAT_NEWLINE(break;)
308                         return '\\';
309                 case 'a': return '\a';
310                 case 'b': return '\b';
311                 case 'f': return '\f';
312                 case 'n': return '\n';
313                 case 'r': return '\r';
314                 case 't': return '\t';
315                 case 'v': return '\v';
316                 case 'x': /* TODO parse hex number ... */
317                         parse_error(this, "hex escape sequences not implemented yet");
318                         return EOF;
319                 case 0 ... 8: /* TODO parse octal number ... */
320                         parse_error(this, "octal escape sequences not implemented yet");
321                         return EOF;
322                 case '?':
323                         if(this->c != '?') {
324                                 return '?';
325                         }
326                         /* might be a trigraph */
327                         next_char(this);
328                         if(replace_trigraph(this)) {
329                                 break;
330                         }
331                         put_back(this, this->c);
332                         this->c = '?';
333                         return '?';
334
335                 case EOF:
336                         parse_error(this, "reached end of file while parsing escape sequence");
337                         return EOF;
338                 default:
339                         parse_error(this, "unknown escape sequence");
340                         return EOF;
341                 }
342         }
343 }
344
345 static
346 void parse_string_literal(lexer_t *this, token_t *token)
347 {
348         unsigned    start_linenr = this->source_position.linenr;
349         char       *string;
350         const char *result;
351
352         assert(this->c == '"');
353         next_char(this);
354
355         while(1) {
356                 switch(this->c) {
357                 SKIP_TRIGRAPHS(
358                         obstack_1grow(&symbol_obstack, '?');
359                         next_char(this);
360                         break;
361                 )
362
363                 case '\\':
364                         next_char(this);
365                         EAT_NEWLINE(break;)
366                         int c = parse_escape_sequence(this);
367                         obstack_1grow(&symbol_obstack, c);
368                         break;
369
370                 case EOF:
371                         error_prefix_at(this, this->source_position.input_name,
372                                         start_linenr);
373                         fprintf(stderr, "string has no end\n");
374                         token->type = T_ERROR;
375                         return;
376
377                 case '"':
378                         next_char(this);
379                         goto end_of_string;
380
381                 default:
382                         obstack_1grow(&symbol_obstack, this->c);
383                         next_char(this);
384                         break;
385                 }
386         }
387
388 end_of_string:
389
390         /* TODO: concatenate multiple strings separated by whitespace... */
391
392         /* add finishing 0 to the string */
393         obstack_1grow(&symbol_obstack, '\0');
394         string = obstack_finish(&symbol_obstack);
395
396         /* check if there is already a copy of the string */
397         result = strset_insert(&this->stringset, string);
398         if(result != string) {
399                 obstack_free(&symbol_obstack, string);
400         }
401
402         token->type     = T_STRING_LITERAL;
403         token->v.string = result;
404 }
405
406 #define MATCH_NEWLINE(code)                 \
407         case '\r':                              \
408                 next_char(this);                    \
409                 if(this->c == '\n') {               \
410                         next_char(this);                \
411                 }                                   \
412                 this->source_position.linenr++;     \
413                 code;                               \
414         case '\n':                              \
415                 next_char(this);                    \
416                 this->source_position.linenr++;     \
417                 code;
418
419 static
420 void parse_character_constant(lexer_t *this, token_t *token)
421 {
422         assert(this->c == '\'');
423         next_char(this);
424
425         int found_char = 0;
426         while(1) {
427                 switch(this->c) {
428                 SKIP_TRIGRAPHS(
429                         found_char = '?';
430                         break;
431                 )
432
433                 case '\\':
434                         next_char(this);
435                         EAT_NEWLINE(break;)
436                         found_char = '\\';
437                         break;
438
439                 MATCH_NEWLINE(
440                         parse_error(this, "newline while parsing character constant");
441                         break;
442                 )
443
444                 case '\'':
445                         next_char(this);
446                         goto end_of_char_constant;
447
448                 case EOF:
449                         parse_error(this, "EOF while parsing character constant");
450                         token->type = T_ERROR;
451                         return;
452
453                 default:
454                         if(found_char != 0) {
455                                 parse_error(this, "more than 1 characters in character "
456                                             "constant");
457                                 goto end_of_char_constant;
458                         } else {
459                                 found_char = this->c;
460                                 next_char(this);
461                         }
462                         break;
463                 }
464         }
465
466 end_of_char_constant:
467         token->type       = T_INTEGER;
468         token->v.intvalue = found_char;
469 }
470
471 static
472 void skip_multiline_comment(lexer_t *this)
473 {
474         unsigned start_linenr = this->source_position.linenr;
475         int had_star = 0;
476
477         while(1) {
478                 switch(this->c) {
479                 case '*':
480                         next_char(this);
481                         had_star = 1;
482                         break;
483
484                 case '/':
485                         next_char(this);
486                         if(had_star) {
487                                 return;
488                         }
489                         had_star = 0;
490                         break;
491
492                 case '\\':
493                         next_char(this);
494                         EAT_NEWLINE(break;)
495                         had_star = 0;
496                         break;
497
498                 case '?':
499                         next_char(this);
500                         if(this->c != '?') {
501                                 had_star = 0;
502                                 break;
503                         }
504                         next_char(this);
505                         if(replace_trigraph(this))
506                                 break;
507                         put_back(this, this->c);
508                         this->c = '?';
509                         had_star = 0;
510                         /* we don't put back the 2nd ? as the comment text is discarded
511                          * anyway */
512                         break;
513
514                 MATCH_NEWLINE(had_star = 0; break;)
515
516                 case EOF:
517                         error_prefix_at(this, this->source_position.input_name,
518                                         start_linenr);
519                         fprintf(stderr, "at end of file while looking for comment end\n");
520                         return;
521                 default:
522                         had_star = 0;
523                         next_char(this);
524                         break;
525                 }
526         }
527 }
528
529 static
530 void skip_line_comment(lexer_t *this)
531 {
532         while(1) {
533                 switch(this->c) {
534                 case '?':
535                         next_char(this);
536                         if(this->c != '?')
537                                 break;
538                         next_char(this);
539                         if(replace_trigraph(this))
540                                 break;
541                         put_back(this, '?');
542                         /* we don't put back the 2nd ? as the comment text is discarded
543                          * anyway */
544                         break;
545
546                 case '\\':
547                         next_char(this);
548                         if(this->c == '\n') {
549                                 next_char(this);
550                                 this->source_position.linenr++;
551                         }
552                         break;
553
554                 case EOF:
555                 case '\r':
556                 case '\n':
557                         return;
558
559                 default:
560                         next_char(this);
561                         break;
562                 }
563         }
564 }
565
566 static
567 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
568 {
569         (void) result_token;
570         /* skip whitespaces */
571         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
572                 next_char(this);
573         }
574 }
575
576 void preprocessor_next_token(lexer_t *this, token_t *token)
577 {
578         /* skip whitespaces */
579         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
580                 next_char(this);
581         }
582
583         switch(this->c) {
584         case 'A' ... 'Z':
585         case 'a' ... 'z':
586         case '_':
587                 parse_symbol(this, token);
588         }
589 }
590
591 void lexer_next_token(lexer_t *this, token_t *token)
592 {
593         int line_begin = 0;
594
595         while(1) {
596                 switch(this->c) {
597                 case ' ':
598                 case '\t':
599                         next_char(this);
600                         break;
601
602                 MATCH_NEWLINE(break;)
603
604                 case 'A' ... 'Z':
605                 case 'a' ... 'z':
606                 case '_':
607                         parse_symbol(this, token);
608                         return;
609
610                 case '0' ... '9':
611                         parse_number(this, token);
612                         return;
613
614                 case '"':
615                         parse_string_literal(this, token);
616                         return;
617
618                 case '\'':
619                         parse_character_constant(this, token);
620                         return;
621
622                 case '\\':
623                         next_char(this);
624                         if(this->c == '\n') {
625                                 next_char(this);
626                                 this->source_position.linenr++;
627                                 break;
628                         } else {
629                                 parse_error(this, "unexpected '\\' found");
630                                 token->type = T_ERROR;
631                         }
632                         return;
633
634 #define MAYBE_PROLOG                                       \
635                         next_char(this);                               \
636                         while(1) {                                     \
637                                 switch(this->c) {
638
639 #define MAYBE(ch, set_type)                                \
640                                 case ch:                                   \
641                                         next_char(this);                       \
642                                         token->type = set_type;                \
643                                         return;
644
645 #define ELSE_CODE(code)                                    \
646                                 SKIP_TRIGRAPHS(                            \
647                                         code;                                  \
648                                 )                                          \
649                                                                                                                    \
650                                 case '\\':                                 \
651                                         next_char(this);                       \
652                                         if(this->c == '\n') {                  \
653                                                 next_char(this);                   \
654                                                 this->source_position.linenr++;    \
655                                                 break;                             \
656                                         }                                      \
657                                         /* fallthrough */                      \
658                                 default:                                   \
659                                         code;                                  \
660                                 }                                          \
661                         } /* end of while(1) */                        \
662                         break;
663
664 #define ELSE(set_type)                                     \
665                 ELSE_CODE(                                         \
666                         token->type = set_type;                        \
667                         return;                                        \
668                 )
669
670                 case '.':
671                         MAYBE_PROLOG
672                                 case '.':
673                                         MAYBE_PROLOG
674                                         MAYBE('.', T_DOTDOTDOT)
675                                         ELSE_CODE(
676                                                 put_back(this, this->c);
677                                                 this->c = '.';
678                                                 token->type = '.';
679                                                 return;
680                                         )
681                         ELSE('.')
682                 case '&':
683                         MAYBE_PROLOG
684                         MAYBE('&', T_ANDAND)
685                         MAYBE('=', T_ANDEQUAL)
686                         ELSE('&')
687                 case '*':
688                         MAYBE_PROLOG
689                         MAYBE('=', T_ASTERISKEQUAL)
690                         ELSE('*')
691                 case '+':
692                         MAYBE_PROLOG
693                         MAYBE('+', T_PLUSPLUS)
694                         MAYBE('=', T_PLUSEQUAL)
695                         ELSE('+')
696                 case '-':
697                         MAYBE_PROLOG
698                         MAYBE('-', T_MINUSMINUS)
699                         MAYBE('=', T_MINUSEQUAL)
700                         ELSE('-')
701                 case '!':
702                         MAYBE_PROLOG
703                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
704                         ELSE('!')
705                 case '/':
706                         MAYBE_PROLOG
707                         MAYBE('=', T_SLASHEQUAL)
708                                 case '*':
709                                         next_char(this);
710                                         skip_multiline_comment(this);
711                                         lexer_next_token(this, token);
712                                         return;
713                                 case '/':
714                                         next_char(this);
715                                         skip_line_comment(this);
716                                         lexer_next_token(this, token);
717                                         return;
718                         ELSE('/')
719                 case '%':
720                         MAYBE_PROLOG
721                         MAYBE('>', T_PERCENTGREATER)
722                         MAYBE('=', T_PERCENTEQUAL)
723                                 case ':':
724                                         MAYBE_PROLOG
725                                                 case '%':
726                                                         MAYBE_PROLOG
727                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
728                                                         ELSE_CODE(
729                                                                 put_back(this, this->c);
730                                                                 this->c = '%';
731                                                                 token->type = T_PERCENTCOLON;
732                                                                 return;
733                                                         )
734                                         ELSE(T_PERCENTCOLON)
735                         ELSE('%')
736                 case '<':
737                         MAYBE_PROLOG
738                         MAYBE(':', T_LESSCOLON)
739                         MAYBE('%', T_LESSPERCENT)
740                                 case '<':
741                                         MAYBE_PROLOG
742                                         MAYBE('=', T_LESSLESSEQUAL)
743                                         ELSE(T_LESSLESS)
744                         ELSE('<')
745                 case '>':
746                         MAYBE_PROLOG
747                                 case '>':
748                                         MAYBE_PROLOG
749                                         MAYBE('=', T_GREATERGREATEREQUAL)
750                                         ELSE(T_GREATERGREATER)
751                         ELSE('>')
752                 case '^':
753                         MAYBE_PROLOG
754                         MAYBE('=', T_CARETEQUAL)
755                         ELSE('^')
756                 case '|':
757                         MAYBE_PROLOG
758                         MAYBE('=', T_PIPEEQUAL)
759                         MAYBE('|', T_PIPEPIPE)
760                         ELSE('|')
761                 case ':':
762                         MAYBE_PROLOG
763                         MAYBE('>', T_COLONGREATER)
764                         ELSE(':')
765                 case '=':
766                         MAYBE_PROLOG
767                         MAYBE('=', T_EQUALEQUAL)
768                         ELSE('=')
769                 case '#':
770                         MAYBE_PROLOG
771                         MAYBE('#', T_HASHHASH)
772                         ELSE_CODE(
773                                 if(line_begin) {
774                                         parse_preprocessor_directive(this, token);
775                                         return;
776                                 } else {
777                                         token->type = '#';
778                                         return;
779                                 }
780                         )
781
782                 case '?':
783                         next_char(this);
784                         /* just a simple ? */
785                         if(this->c != '?') {
786                                 token->type = '?';
787                                 return;
788                         }
789                         /* might be a trigraph */
790                         next_char(this);
791                         if(replace_trigraph(this)) {
792                                 break;
793                         }
794                         put_back(this, this->c);
795                         this->c = '?';
796                         token->type = '?';
797                         return;
798
799                 case '[':
800                 case ']':
801                 case '(':
802                 case ')':
803                 case '{':
804                 case '}':
805                 case '~':
806                 case ';':
807                 case ',':
808                         token->type = this->c;
809                         next_char(this);
810                         return;
811
812                 case EOF:
813                         token->type = T_EOF;
814                         return;
815
816                 default:
817                         next_char(this);
818                         error_prefix(this);
819                         fprintf(stderr, "unknown character '%c' found\n", this->c);
820                         token->type = T_ERROR;
821                         return;
822                 }
823         }
824 }
825
826 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
827 {
828         memset(this, 0, sizeof(this[0]));
829
830         this->input = stream;
831
832         this->source_position.linenr     = 0;
833         this->source_position.input_name = input_name;
834         strset_init(&this->stringset);
835
836         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
837          * beginning of a line */
838         this->c = '\n';
839 }
840
841 void lexer_destroy(lexer_t *this)
842 {
843         (void) this;
844 }
845
846 static __attribute__((unused))
847 void dbg_pos(const source_position_t source_position)
848 {
849         fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);
850         fflush(stdout);
851 }