8968131cb1240c679acd468486a09be0955899dc
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer_t.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7
8 #include <assert.h>
9 #include <errno.h>
10 #include <string.h>
11 #include <ctype.h>
12
13 //#define DEBUG_CHARS
14 #define MAX_PUTBACK 2
15
16 static
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
18 {
19         (void) this;
20         fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
21 }
22
23 static
24 void error_prefix(lexer_t *this)
25 {
26         error_prefix_at(this, this->source_position.input_name,
27                         this->source_position.linenr);
28 }
29
30 static
31 void parse_error(lexer_t *this, const char *msg)
32 {
33         error_prefix(this);
34         fprintf(stderr, "%s\n", msg);
35 }
36
37 static inline
38 void next_char(lexer_t *this)
39 {
40         this->bufpos++;
41         if(this->bufpos >= this->bufend) {
42                 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43                                  sizeof(this->buf) - MAX_PUTBACK, this->input);
44                 if(s == 0) {
45                         this->c = EOF;
46                         return;
47                 }
48                 this->bufpos = this->buf + MAX_PUTBACK;
49                 this->bufend = this->buf + MAX_PUTBACK + s;
50         }
51         this->c = *(this->bufpos);
52 #ifdef DEBUG_CHARS
53         printf("nchar '%c'\n", this->c);
54 #endif
55 }
56
57 static inline
58 void put_back(lexer_t *this, int c)
59 {
60         char *p = (char*) this->bufpos - 1;
61         this->bufpos--;
62         assert(p >= this->buf);
63         *p = c;
64
65 #ifdef DEBUG_CHARS
66         printf("putback '%c'\n", c);
67 #endif
68 }
69
70 static
71 int replace_trigraph(lexer_t *this)
72 {
73 #define MATCH_TRIGRAPH(ch,replacement)           \
74         case ch:                                     \
75                 this->c = replacement;                   \
76                 return 1;
77
78         switch(this->c) {
79         MATCH_TRIGRAPH('=', '#')
80         MATCH_TRIGRAPH('(', '[')
81         MATCH_TRIGRAPH('/', '\\')
82         MATCH_TRIGRAPH(')', ']')
83         MATCH_TRIGRAPH('\'', '^')
84         MATCH_TRIGRAPH('<', '{')
85         MATCH_TRIGRAPH('!', '|')
86         MATCH_TRIGRAPH('>', '}')
87         MATCH_TRIGRAPH('-', '~')
88         default:
89                 break;
90         }
91
92         return 0;
93 }
94
95 #define SKIP_TRIGRAPHS(no_trigraph_code)       \
96         case '?':                                  \
97                 next_char(this);                       \
98                 if(this->c != '?') {                   \
99                         put_back(this, this->c);           \
100                         this->c = '?';                     \
101                         no_trigraph_code;                  \
102                 }                                      \
103                 next_char(this);                       \
104                 if(replace_trigraph(this))             \
105                         break;                             \
106                 put_back(this, '?');                   \
107                 put_back(this, this->c);               \
108                 this->c = '?';                         \
109                 no_trigraph_code;                      \
110
111 static
112 void parse_symbol(lexer_t *this, token_t *token)
113 {
114         symbol_t *symbol;
115         char     *string;
116
117         obstack_1grow(&symbol_obstack, this->c);
118         next_char(this);
119
120         while(1) {
121                 switch(this->c) {
122                 case '\\':
123                         next_char(this);
124                         if(this->c == '\n') {
125                                 next_char(this);
126                                 this->source_position.linenr++;
127                                 break;
128                         }
129
130                 case 'A' ... 'Z':
131                 case 'a' ... 'z':
132                 case '_':
133                         obstack_1grow(&symbol_obstack, this->c);
134                         next_char(this);
135                         break;
136
137                 case '?':
138                         next_char(this);
139                         if(this->c != '?') {
140                                 put_back(this, this->c);
141                                 this->c = '?';
142                                 goto end_symbol;
143                         }
144                         next_char(this);
145                         if(replace_trigraph(this))
146                                 break;
147                         put_back(this, '?');
148                         put_back(this, this->c);
149                         this->c = '?';
150                         goto end_symbol;
151
152                 default:
153                         goto end_symbol;
154                 }
155         }
156 end_symbol:
157         obstack_1grow(&symbol_obstack, '\0');
158
159         string = obstack_finish(&symbol_obstack);
160         symbol = symbol_table_insert(string);
161
162         if(symbol->ID > 0) {
163                 token->type = symbol->ID;
164         } else {
165                 token->type = T_IDENTIFIER;
166         }
167         token->v.symbol = symbol;
168
169         if(symbol->string != string) {
170                 obstack_free(&symbol_obstack, string);
171         }
172 }
173
174 #if 0
175 static
176 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
177 {
178         do {
179                 obstack_1grow(&symbol_obstack, this->c);
180                 next_char(this);
181         } while(is_ident_char(this->c));
182         obstack_1grow(&symbol_obstack, '\0');
183
184         char     *string = obstack_finish(&symbol_obstack);
185         symbol_t *symbol = preprocessor_symbol_table_find(string);
186         obstack_free(&symbol_obstack, string);
187
188         if(symbol == 0)
189                 return TP_ERROR;
190
191         return symbol->ID;
192 }
193 #endif
194
195 static
196 void parse_number_hex(lexer_t *this, token_t *token)
197 {
198         assert(this->c == 'x' || this->c == 'X');
199         next_char(this);
200
201         if (!isdigit(this->c) &&
202                 !('A' <= this->c && this->c <= 'F') &&
203                 !('a' <= this->c && this->c <= 'f')) {
204                 parse_error(this, "premature end of hex number literal");
205                 token->type = T_ERROR;
206                 return;
207         }
208
209         int value = 0;
210         for(;;) {
211                 if (isdigit(this->c)) {
212                         value = 16 * value + this->c - '0';
213                 } else if ('A' <= this->c && this->c <= 'F') {
214                         value = 16 * value + this->c - 'A' + 10;
215                 } else if ('a' <= this->c && this->c <= 'f') {
216                         value = 16 * value + this->c - 'a' + 10;
217                 } else {
218                         token->type     = T_INTEGER;
219                         token->v.intvalue = value;
220                         return;
221                 }
222                 next_char(this);
223         }
224 }
225
226 static
227 void parse_number_oct(lexer_t *this, token_t *token)
228 {
229         assert(this->c == 'o' || this->c == 'O');
230         next_char(this);
231
232         int value = 0;
233         for(;;) {
234                 if ('0' <= this->c && this->c <= '7') {
235                         value = 8 * value + this->c - '0';
236                 } else {
237                         token->type     = T_INTEGER;
238                         token->v.intvalue = value;
239                         return;
240                 }
241                 next_char(this);
242         }
243 }
244
245 static
246 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
247 {
248         int value = 0;
249         if(first_char > 0) {
250                 assert(first_char >= '0' && first_char <= '9');
251                 value = first_char - '0';
252         }
253
254         for(;;) {
255                 if (isdigit(this->c)) {
256                         value = 10 * value + this->c - '0';
257                 } else {
258                         token->type     = T_INTEGER;
259                         token->v.intvalue = value;
260                         return;
261                 }
262                 next_char(this);
263         }
264 }
265
266 static
267 void parse_number(lexer_t *this, token_t *token)
268 {
269         // TODO check for overflow
270         // TODO check for various invalid inputs sequences
271
272         if (this->c == '0') {
273                 next_char(this);
274                 switch (this->c) {
275                         case 'X':
276                         case 'x': parse_number_hex(this, token); break;
277                         case 'o':
278                         case 'O': parse_number_oct(this, token); break;
279                         default:  parse_number_dec(this, token, '0');
280                 }
281         } else {
282                 parse_number_dec(this, token, 0);
283         }
284 }
285
286 static
287 int parse_escape_sequence(lexer_t *this)
288 {
289         int c = this->c;
290         next_char(this);
291
292         switch(c) {
293         case '"': return '"';
294         case '\'': return'\'';
295         case '\\':
296                 if(this->c == '\n') {
297                         this->source_position.linenr++;
298                         next_char(this);
299                         return parse_escape_sequence(this);
300                 }
301                 return '\\';
302         case 'a': return '\a';
303         case 'b': return '\b';
304         case 'f': return '\f';
305         case 'n': return '\n';
306         case 'r': return '\r';
307         case 't': return '\t';
308         case 'v': return '\v';
309         case 'x': /* TODO parse hex number ... */
310                 parse_error(this, "hex escape sequences not implemented yet");
311                 return EOF;
312         case 0 ... 8: /* TODO parse octal number ... */
313                 parse_error(this, "octal escape sequences not implemented yet");
314                 return EOF;
315         case '?':
316                 if(this->c != '?') {
317                         return '?';
318                 }
319                 /* might be a trigraph */
320                 next_char(this);
321                 if(replace_trigraph(this)) {
322                         return parse_escape_sequence(this);
323                 }
324                 put_back(this, this->c);
325                 this->c = '?';
326                 return '?';
327
328         case EOF:
329                 parse_error(this, "reached end of file while parsing escape sequence");
330                 return EOF;
331         default:
332                 parse_error(this, "unknown escape sequence\n");
333                 return EOF;
334         }
335 }
336
337 static
338 void parse_string_literal(lexer_t *this, token_t *token)
339 {
340         unsigned    start_linenr = this->source_position.linenr;
341         char       *string;
342         const char *result;
343
344         assert(this->c == '"');
345         next_char(this);
346
347         while(1) {
348                 switch(this->c) {
349                 SKIP_TRIGRAPHS(
350                         obstack_1grow(&symbol_obstack, '?');
351                         next_char(this);
352                         break;
353                 )
354
355                 case '\\':
356                         next_char(this);
357                         if(this->c == '\n') {
358                                 next_char(this);
359                                 this->source_position.linenr++;
360                                 break;
361                         }
362                         int c = parse_escape_sequence(this);
363                         obstack_1grow(&symbol_obstack, c);
364                         break;
365
366                 case EOF:
367                         error_prefix_at(this, this->source_position.input_name,
368                                         start_linenr);
369                         fprintf(stderr, "string has no end\n");
370                         token->type = T_ERROR;
371                         return;
372
373                 case '"':
374                         next_char(this);
375                         goto end_of_string;
376
377                 default:
378                         obstack_1grow(&symbol_obstack, this->c);
379                         next_char(this);
380                         break;
381                 }
382         }
383
384 end_of_string:
385
386         /* TODO: concatenate multiple strings separated by whitespace... */
387
388         /* add finishing 0 to the string */
389         obstack_1grow(&symbol_obstack, '\0');
390         string = obstack_finish(&symbol_obstack);
391
392         /* check if there is already a copy of the string */
393         result = strset_insert(&this->stringset, string);
394         if(result != string) {
395                 obstack_free(&symbol_obstack, string);
396         }
397
398         token->type     = T_STRING_LITERAL;
399         token->v.string = result;
400 }
401
402 static
403 void parse_character_constant(lexer_t *this, token_t *token)
404 {
405         assert(this->c == '\'');
406         next_char(this);
407
408         int found_char = 0;
409         while(1) {
410                 switch(this->c) {
411                 SKIP_TRIGRAPHS(
412                         found_char = '?';
413                         break;
414                 )
415
416                 case '\\':
417                         next_char(this);
418                         if(this->c == '\n') {
419                                 next_char(this);
420                                 this->source_position.linenr++;
421                                 break;
422                         }
423                         found_char = '\\';
424                         break;
425
426                 case '\n':
427                         next_char(this);
428                         parse_error(this, "newline while parsing character constant");
429                         this->source_position.linenr++;
430                         break;
431
432                 case '\'':
433                         next_char(this);
434                         goto end_of_char_constant;
435
436                 case EOF:
437                         parse_error(this, "EOF while parsing character constant");
438                         token->type = T_ERROR;
439                         return;
440
441                 default:
442                         if(found_char != 0) {
443                                 parse_error(this, "more than 1 characters in character "
444                                             "constant");
445                                 goto end_of_char_constant;
446                         } else {
447                                 found_char = this->c;
448                                 next_char(this);
449                         }
450                         break;
451                 }
452         }
453
454 end_of_char_constant:
455         token->type       = T_INTEGER;
456         token->v.intvalue = found_char;
457 }
458
459 static
460 void skip_multiline_comment(lexer_t *this)
461 {
462         unsigned start_linenr = this->source_position.linenr;
463
464         while(1) {
465                 switch(this->c) {
466                 case '*':
467                         next_char(this);
468                         if(this->c == '/') {
469                                 next_char(this);
470                                 return;
471                         }
472                         break;
473                 case '?':
474                         next_char(this);
475                         if(this->c != '?')
476                                 break;
477                         next_char(this);
478                         if(replace_trigraph(this))
479                                 break;
480                         put_back(this, '?');
481                         /* we don't put back the 2nd ? as the comment text is discarded
482                          * anyway */
483                         break;
484
485                 case '\n':
486                         this->source_position.linenr++;
487                         next_char(this);
488                         break;
489                 case EOF:
490                         error_prefix_at(this, this->source_position.input_name,
491                                         start_linenr);
492                         fprintf(stderr, "at end of file while looking for comment end\n");
493                         return;
494                 default:
495                         next_char(this);
496                         break;
497                 }
498         }
499 }
500
501 static
502 void skip_line_comment(lexer_t *this)
503 {
504         while(1) {
505                 switch(this->c) {
506                 case '?':
507                         next_char(this);
508                         if(this->c != '?')
509                                 break;
510                         next_char(this);
511                         if(replace_trigraph(this))
512                                 break;
513                         put_back(this, '?');
514                         /* we don't put back the 2nd ? as the comment text is discarded
515                          * anyway */
516                         break;
517
518                 case '\\':
519                         next_char(this);
520                         if(this->c == '\n') {
521                                 next_char(this);
522                                 this->source_position.linenr++;
523                         }
524                         break;
525
526                 case EOF:
527                 case '\n':
528                         return;
529                 default:
530                         next_char(this);
531                         break;
532                 }
533         }
534 }
535
536 #if 0
537 static
538 void eat_until_newline(lexer_t *this)
539 {
540         while(this->c != '\n') {
541                 next_char(this);
542         }
543 }
544 #endif
545
546 #if 0
547 static
548 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
549 {
550         (void) result_token;
551         /* skip whitespaces */
552         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
553                 next_char(this);
554         }
555 }
556 #endif
557
558 void preprocessor_next_token(lexer_t *this, token_t *token)
559 {
560         /* skip whitespaces */
561         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
562                 next_char(this);
563         }
564
565         switch(this->c) {
566         case 'A' ... 'Z':
567         case 'a' ... 'z':
568         case '_':
569                 parse_symbol(this, token);
570         }
571 }
572
573 void lexer_next_token(lexer_t *this, token_t *token)
574 {
575         while(1) {
576                 switch(this->c) {
577                 case ' ':
578                 case '\t':
579                 case '\r':
580                         next_char(this);
581                         break;
582
583                 case '\n':
584                         this->source_position.linenr++;
585                         next_char(this);
586                         break;
587
588                 case 'A' ... 'Z':
589                 case 'a' ... 'z':
590                 case '_':
591                         parse_symbol(this, token);
592                         return;
593
594                 case '0' ... '9':
595                         parse_number(this, token);
596                         return;
597
598                 case '"':
599                         parse_string_literal(this, token);
600                         return;
601
602                 case '\'':
603                         parse_character_constant(this, token);
604                         return;
605
606                 case '\\':
607                         next_char(this);
608                         if(this->c == '\n') {
609                                 next_char(this);
610                                 this->source_position.linenr++;
611                                 break;
612                         } else {
613                                 parse_error(this, "unexpected '\\' found");
614                                 token->type = T_ERROR;
615                         }
616                         return;
617
618 #define MAYBE_PROLOG                                       \
619                         next_char(this);                               \
620                         while(1) {                                     \
621                                 switch(this->c) {
622
623 #define MAYBE(ch, set_type)                                \
624                                 case ch:                                   \
625                                         next_char(this);                       \
626                                         token->type = set_type;                \
627                                         return;
628
629 #define ELSE_CODE(code)                                    \
630                                 SKIP_TRIGRAPHS(                            \
631                                         code;                                  \
632                                 )                                          \
633                                                                                                                    \
634                                 case '\\':                                 \
635                                         next_char(this);                       \
636                                         if(this->c == '\n') {                  \
637                                                 next_char(this);                   \
638                                                 this->source_position.linenr++;    \
639                                                 break;                             \
640                                         }                                      \
641                                         /* fallthrough */                      \
642                                 default:                                   \
643                                         code;                                  \
644                                 }                                          \
645                         } /* end of while(1) */                        \
646                         break;
647
648 #define ELSE(set_type)                                     \
649                 ELSE_CODE(                                         \
650                         token->type = set_type;                        \
651                         return;                                        \
652                 )
653
654                 case '.':
655                         MAYBE_PROLOG
656                                 case '.':
657                                         MAYBE_PROLOG
658                                         MAYBE('.', T_DOTDOTDOT)
659                                         ELSE_CODE(
660                                                 put_back(this, this->c);
661                                                 this->c = '.';
662                                                 token->type = '.';
663                                                 return;
664                                         )
665                         ELSE('.')
666                 case '&':
667                         MAYBE_PROLOG
668                         MAYBE('&', T_ANDAND)
669                         MAYBE('=', T_ANDEQUAL)
670                         ELSE('&')
671                 case '*':
672                         MAYBE_PROLOG
673                         MAYBE('=', T_ASTERISKEQUAL)
674                         ELSE('*')
675                 case '+':
676                         MAYBE_PROLOG
677                         MAYBE('+', T_PLUSPLUS)
678                         MAYBE('=', T_PLUSEQUAL)
679                         ELSE('+')
680                 case '-':
681                         MAYBE_PROLOG
682                         MAYBE('-', T_MINUSMINUS)
683                         MAYBE('=', T_MINUSEQUAL)
684                         ELSE('-')
685                 case '!':
686                         MAYBE_PROLOG
687                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
688                         ELSE('!')
689                 case '/':
690                         MAYBE_PROLOG
691                         MAYBE('=', T_SLASHEQUAL)
692                                 case '*':
693                                         next_char(this);
694                                         skip_multiline_comment(this);
695                                         lexer_next_token(this, token);
696                                         return;
697                                 case '/':
698                                         next_char(this);
699                                         skip_line_comment(this);
700                                         lexer_next_token(this, token);
701                                         return;
702                         ELSE('/')
703                 case '%':
704                         MAYBE_PROLOG
705                         MAYBE('>', T_PERCENTGREATER)
706                         MAYBE('=', T_PERCENTEQUAL)
707                                 case ':':
708                                         MAYBE_PROLOG
709                                                 case '%':
710                                                         MAYBE_PROLOG
711                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
712                                                         ELSE_CODE(
713                                                                 put_back(this, this->c);
714                                                                 this->c = '%';
715                                                                 token->type = T_PERCENTCOLON;
716                                                                 return;
717                                                         )
718                                         ELSE(T_PERCENTCOLON)
719                         ELSE('%')
720                 case '<':
721                         MAYBE_PROLOG
722                         MAYBE(':', T_LESSCOLON)
723                         MAYBE('%', T_LESSPERCENT)
724                                 case '<':
725                                         MAYBE_PROLOG
726                                         MAYBE('=', T_LESSLESSEQUAL)
727                                         ELSE(T_LESSLESS)
728                         ELSE('<')
729                 case '>':
730                         MAYBE_PROLOG
731                                 case '>':
732                                         MAYBE_PROLOG
733                                         MAYBE('=', T_GREATERGREATEREQUAL)
734                                         ELSE(T_GREATERGREATER)
735                         ELSE('>')
736                 case '^':
737                         MAYBE_PROLOG
738                         MAYBE('=', T_CARETEQUAL)
739                         ELSE('^')
740                 case '|':
741                         MAYBE_PROLOG
742                         MAYBE('=', T_PIPEEQUAL)
743                         MAYBE('|', T_PIPEPIPE)
744                         ELSE('|')
745                 case ':':
746                         MAYBE_PROLOG
747                         MAYBE('>', T_COLONGREATER)
748                         ELSE(':')
749                 case '=':
750                         MAYBE_PROLOG
751                         MAYBE('=', T_EQUALEQUAL)
752                         ELSE('=')
753                 case '#':
754                         MAYBE_PROLOG
755                         MAYBE('#', T_HASHHASH)
756 #if 0
757                         else {
758                                 if(line_begin) {
759                                         parse_preprocessor_directive(this, token);
760                                         return;
761                                 } else {
762                                         token->type = '#';
763                                 }
764 #else
765                         ELSE('#')
766 #endif
767
768                 case '?':
769                         next_char(this);
770                         /* just a simple ? */
771                         if(this->c != '?') {
772                                 token->type = '?';
773                                 return;
774                         }
775                         /* might be a trigraph */
776                         next_char(this);
777                         if(replace_trigraph(this)) {
778                                 break;
779                         }
780                         put_back(this, this->c);
781                         this->c = '?';
782                         token->type = '?';
783                         return;
784
785                 case '[':
786                 case ']':
787                 case '(':
788                 case ')':
789                 case '{':
790                 case '}':
791                 case '~':
792                 case ';':
793                 case ',':
794                         token->type = this->c;
795                         next_char(this);
796                         return;
797
798                 case EOF:
799                         token->type = T_EOF;
800                         return;
801
802                 default:
803                         next_char(this);
804                         error_prefix(this);
805                         fprintf(stderr, "unknown character '%c' found\n", this->c);
806                         token->type = T_ERROR;
807                         return;
808                 }
809         }
810 }
811
812 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
813 {
814         memset(this, 0, sizeof(this[0]));
815
816         this->input = stream;
817
818         this->source_position.linenr     = 0;
819         this->source_position.input_name = input_name;
820         strset_init(&this->stringset);
821
822         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
823          * beginning of a line */
824         this->c = '\n';
825 }
826
827 void lexer_destroy(lexer_t *this)
828 {
829         (void) this;
830 }
831
832 static __attribute__((unused))
833 void dbg_pos(const source_position_t source_position)
834 {
835         fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);
836         fflush(stdout);
837 }