proper newline handling
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer_t.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7
8 #include <assert.h>
9 #include <errno.h>
10 #include <string.h>
11 #include <ctype.h>
12
13 //#define DEBUG_CHARS
14 #define MAX_PUTBACK 2
15
16 static
17 void error_prefix_at(lexer_t *this, const char *input_name, unsigned linenr)
18 {
19         (void) this;
20         fprintf(stderr, "%s:%d: Error: ", input_name, linenr);
21 }
22
23 static
24 void error_prefix(lexer_t *this)
25 {
26         error_prefix_at(this, this->source_position.input_name,
27                         this->source_position.linenr);
28 }
29
30 static
31 void parse_error(lexer_t *this, const char *msg)
32 {
33         error_prefix(this);
34         fprintf(stderr, "%s\n", msg);
35 }
36
37 static inline
38 void next_char(lexer_t *this)
39 {
40         this->bufpos++;
41         if(this->bufpos >= this->bufend) {
42                 size_t s = fread(this->buf + MAX_PUTBACK, 1,
43                                  sizeof(this->buf) - MAX_PUTBACK, this->input);
44                 if(s == 0) {
45                         this->c = EOF;
46                         return;
47                 }
48                 this->bufpos = this->buf + MAX_PUTBACK;
49                 this->bufend = this->buf + MAX_PUTBACK + s;
50         }
51         this->c = *(this->bufpos);
52 #ifdef DEBUG_CHARS
53         printf("nchar '%c'\n", this->c);
54 #endif
55 }
56
57 static inline
58 void put_back(lexer_t *this, int c)
59 {
60         char *p = (char*) this->bufpos - 1;
61         this->bufpos--;
62         assert(p >= this->buf);
63         *p = c;
64
65 #ifdef DEBUG_CHARS
66         printf("putback '%c'\n", c);
67 #endif
68 }
69
70 static
71 int replace_trigraph(lexer_t *this)
72 {
73 #define MATCH_TRIGRAPH(ch,replacement)           \
74         case ch:                                     \
75                 this->c = replacement;                   \
76                 return 1;
77
78         switch(this->c) {
79         MATCH_TRIGRAPH('=', '#')
80         MATCH_TRIGRAPH('(', '[')
81         MATCH_TRIGRAPH('/', '\\')
82         MATCH_TRIGRAPH(')', ']')
83         MATCH_TRIGRAPH('\'', '^')
84         MATCH_TRIGRAPH('<', '{')
85         MATCH_TRIGRAPH('!', '|')
86         MATCH_TRIGRAPH('>', '}')
87         MATCH_TRIGRAPH('-', '~')
88         default:
89                 break;
90         }
91
92         return 0;
93 }
94
95 #define SKIP_TRIGRAPHS(no_trigraph_code)       \
96         case '?':                                  \
97                 next_char(this);                       \
98                 if(this->c != '?') {                   \
99                         put_back(this, this->c);           \
100                         this->c = '?';                     \
101                         no_trigraph_code;                  \
102                 }                                      \
103                 next_char(this);                       \
104                 if(replace_trigraph(this))             \
105                         break;                             \
106                 put_back(this, '?');                   \
107                 put_back(this, this->c);               \
108                 this->c = '?';                         \
109                 no_trigraph_code;
110
111 #define EAT_NEWLINE(newline_code)              \
112         if(this->c == '\r') {                      \
113                 next_char(this);                       \
114                 if(this->c == '\n')                    \
115                         next_char(this);                   \
116                 this->source_position.linenr++;        \
117                 newline_code;                          \
118         } else if(this->c == '\n') {               \
119                 next_char(this);                       \
120                 this->source_position.linenr++;        \
121                 newline_code;                          \
122         }
123
124 static
125 void parse_symbol(lexer_t *this, token_t *token)
126 {
127         symbol_t *symbol;
128         char     *string;
129
130         obstack_1grow(&symbol_obstack, this->c);
131         next_char(this);
132
133         while(1) {
134                 switch(this->c) {
135                 case '\\':
136                         next_char(this);
137                         EAT_NEWLINE(break;)
138                         goto end_symbol;
139
140                 case 'A' ... 'Z':
141                 case 'a' ... 'z':
142                 case '_':
143                         obstack_1grow(&symbol_obstack, this->c);
144                         next_char(this);
145                         break;
146
147                 case '?':
148                         next_char(this);
149                         if(this->c != '?') {
150                                 put_back(this, this->c);
151                                 this->c = '?';
152                                 goto end_symbol;
153                         }
154                         next_char(this);
155                         if(replace_trigraph(this))
156                                 break;
157                         put_back(this, '?');
158                         put_back(this, this->c);
159                         this->c = '?';
160                         goto end_symbol;
161
162                 default:
163                         goto end_symbol;
164                 }
165         }
166 end_symbol:
167         obstack_1grow(&symbol_obstack, '\0');
168
169         string = obstack_finish(&symbol_obstack);
170         symbol = symbol_table_insert(string);
171
172         if(symbol->ID > 0) {
173                 token->type = symbol->ID;
174         } else {
175                 token->type = T_IDENTIFIER;
176         }
177         token->v.symbol = symbol;
178
179         if(symbol->string != string) {
180                 obstack_free(&symbol_obstack, string);
181         }
182 }
183
184 #if 0
185 static
186 preprocessor_token_type_t parse_pp_symbol(lexer_t *this)
187 {
188         do {
189                 obstack_1grow(&symbol_obstack, this->c);
190                 next_char(this);
191         } while(is_ident_char(this->c));
192         obstack_1grow(&symbol_obstack, '\0');
193
194         char     *string = obstack_finish(&symbol_obstack);
195         symbol_t *symbol = preprocessor_symbol_table_find(string);
196         obstack_free(&symbol_obstack, string);
197
198         if(symbol == 0)
199                 return TP_ERROR;
200
201         return symbol->ID;
202 }
203 #endif
204
205 static
206 void parse_number_hex(lexer_t *this, token_t *token)
207 {
208         assert(this->c == 'x' || this->c == 'X');
209         next_char(this);
210
211         if (!isdigit(this->c) &&
212                 !('A' <= this->c && this->c <= 'F') &&
213                 !('a' <= this->c && this->c <= 'f')) {
214                 parse_error(this, "premature end of hex number literal");
215                 token->type = T_ERROR;
216                 return;
217         }
218
219         int value = 0;
220         for(;;) {
221                 if (isdigit(this->c)) {
222                         value = 16 * value + this->c - '0';
223                 } else if ('A' <= this->c && this->c <= 'F') {
224                         value = 16 * value + this->c - 'A' + 10;
225                 } else if ('a' <= this->c && this->c <= 'f') {
226                         value = 16 * value + this->c - 'a' + 10;
227                 } else {
228                         token->type     = T_INTEGER;
229                         token->v.intvalue = value;
230                         return;
231                 }
232                 next_char(this);
233         }
234 }
235
236 static
237 void parse_number_oct(lexer_t *this, token_t *token)
238 {
239         assert(this->c == 'o' || this->c == 'O');
240         next_char(this);
241
242         int value = 0;
243         for(;;) {
244                 if ('0' <= this->c && this->c <= '7') {
245                         value = 8 * value + this->c - '0';
246                 } else {
247                         token->type     = T_INTEGER;
248                         token->v.intvalue = value;
249                         return;
250                 }
251                 next_char(this);
252         }
253 }
254
255 static
256 void parse_number_dec(lexer_t *this, token_t *token, int first_char)
257 {
258         int value = 0;
259         if(first_char > 0) {
260                 assert(first_char >= '0' && first_char <= '9');
261                 value = first_char - '0';
262         }
263
264         for(;;) {
265                 if (isdigit(this->c)) {
266                         value = 10 * value + this->c - '0';
267                 } else {
268                         token->type     = T_INTEGER;
269                         token->v.intvalue = value;
270                         return;
271                 }
272                 next_char(this);
273         }
274 }
275
276 static
277 void parse_number(lexer_t *this, token_t *token)
278 {
279         // TODO check for overflow
280         // TODO check for various invalid inputs sequences
281
282         if (this->c == '0') {
283                 next_char(this);
284                 switch (this->c) {
285                         case 'X':
286                         case 'x': parse_number_hex(this, token); break;
287                         case 'o':
288                         case 'O': parse_number_oct(this, token); break;
289                         default:  parse_number_dec(this, token, '0');
290                 }
291         } else {
292                 parse_number_dec(this, token, 0);
293         }
294 }
295
296 static
297 int parse_escape_sequence(lexer_t *this)
298 {
299         while(1) {
300                 int c = this->c;
301                 next_char(this);
302
303                 switch(c) {
304                 case '"': return '"';
305                 case '\'': return'\'';
306                 case '\\':
307                         EAT_NEWLINE(break;)
308                         return '\\';
309                 case 'a': return '\a';
310                 case 'b': return '\b';
311                 case 'f': return '\f';
312                 case 'n': return '\n';
313                 case 'r': return '\r';
314                 case 't': return '\t';
315                 case 'v': return '\v';
316                 case 'x': /* TODO parse hex number ... */
317                         parse_error(this, "hex escape sequences not implemented yet");
318                         return EOF;
319                 case 0 ... 8: /* TODO parse octal number ... */
320                         parse_error(this, "octal escape sequences not implemented yet");
321                         return EOF;
322                 case '?':
323                         if(this->c != '?') {
324                                 return '?';
325                         }
326                         /* might be a trigraph */
327                         next_char(this);
328                         if(replace_trigraph(this)) {
329                                 break;
330                         }
331                         put_back(this, this->c);
332                         this->c = '?';
333                         return '?';
334
335                 case EOF:
336                         parse_error(this, "reached end of file while parsing escape sequence");
337                         return EOF;
338                 default:
339                         parse_error(this, "unknown escape sequence");
340                         return EOF;
341                 }
342         }
343 }
344
345 static
346 void parse_string_literal(lexer_t *this, token_t *token)
347 {
348         unsigned    start_linenr = this->source_position.linenr;
349         char       *string;
350         const char *result;
351
352         assert(this->c == '"');
353         next_char(this);
354
355         while(1) {
356                 switch(this->c) {
357                 SKIP_TRIGRAPHS(
358                         obstack_1grow(&symbol_obstack, '?');
359                         next_char(this);
360                         break;
361                 )
362
363                 case '\\':
364                         next_char(this);
365                         EAT_NEWLINE(break;)
366                         int c = parse_escape_sequence(this);
367                         obstack_1grow(&symbol_obstack, c);
368                         break;
369
370                 case EOF:
371                         error_prefix_at(this, this->source_position.input_name,
372                                         start_linenr);
373                         fprintf(stderr, "string has no end\n");
374                         token->type = T_ERROR;
375                         return;
376
377                 case '"':
378                         next_char(this);
379                         goto end_of_string;
380
381                 default:
382                         obstack_1grow(&symbol_obstack, this->c);
383                         next_char(this);
384                         break;
385                 }
386         }
387
388 end_of_string:
389
390         /* TODO: concatenate multiple strings separated by whitespace... */
391
392         /* add finishing 0 to the string */
393         obstack_1grow(&symbol_obstack, '\0');
394         string = obstack_finish(&symbol_obstack);
395
396         /* check if there is already a copy of the string */
397         result = strset_insert(&this->stringset, string);
398         if(result != string) {
399                 obstack_free(&symbol_obstack, string);
400         }
401
402         token->type     = T_STRING_LITERAL;
403         token->v.string = result;
404 }
405
406 #define MATCH_NEWLINE(code)                 \
407         case '\r':                              \
408                 next_char(this);                    \
409                 if(this->c == '\n') {               \
410                         next_char(this);                \
411                 }                                   \
412                 this->source_position.linenr++;     \
413                 code;                               \
414         case '\n':                              \
415                 next_char(this);                    \
416                 this->source_position.linenr++;     \
417                 code;
418
419 static
420 void parse_character_constant(lexer_t *this, token_t *token)
421 {
422         assert(this->c == '\'');
423         next_char(this);
424
425         int found_char = 0;
426         while(1) {
427                 switch(this->c) {
428                 SKIP_TRIGRAPHS(
429                         found_char = '?';
430                         break;
431                 )
432
433                 case '\\':
434                         next_char(this);
435                         EAT_NEWLINE(break;)
436                         found_char = '\\';
437                         break;
438
439                 MATCH_NEWLINE(
440                         parse_error(this, "newline while parsing character constant");
441                         break;
442                 )
443
444                 case '\'':
445                         next_char(this);
446                         goto end_of_char_constant;
447
448                 case EOF:
449                         parse_error(this, "EOF while parsing character constant");
450                         token->type = T_ERROR;
451                         return;
452
453                 default:
454                         if(found_char != 0) {
455                                 parse_error(this, "more than 1 characters in character "
456                                             "constant");
457                                 goto end_of_char_constant;
458                         } else {
459                                 found_char = this->c;
460                                 next_char(this);
461                         }
462                         break;
463                 }
464         }
465
466 end_of_char_constant:
467         token->type       = T_INTEGER;
468         token->v.intvalue = found_char;
469 }
470
471 static
472 void skip_multiline_comment(lexer_t *this)
473 {
474         unsigned start_linenr = this->source_position.linenr;
475
476         while(1) {
477                 switch(this->c) {
478                 case '*':
479                         next_char(this);
480                         if(this->c == '/') {
481                                 next_char(this);
482                                 return;
483                         }
484                         break;
485                 case '?':
486                         next_char(this);
487                         if(this->c != '?')
488                                 break;
489                         next_char(this);
490                         if(replace_trigraph(this))
491                                 break;
492                         put_back(this, '?');
493                         /* we don't put back the 2nd ? as the comment text is discarded
494                          * anyway */
495                         break;
496
497                 MATCH_NEWLINE(break;)
498
499                 case EOF:
500                         error_prefix_at(this, this->source_position.input_name,
501                                         start_linenr);
502                         fprintf(stderr, "at end of file while looking for comment end\n");
503                         return;
504                 default:
505                         next_char(this);
506                         break;
507                 }
508         }
509 }
510
511 static
512 void skip_line_comment(lexer_t *this)
513 {
514         while(1) {
515                 switch(this->c) {
516                 case '?':
517                         next_char(this);
518                         if(this->c != '?')
519                                 break;
520                         next_char(this);
521                         if(replace_trigraph(this))
522                                 break;
523                         put_back(this, '?');
524                         /* we don't put back the 2nd ? as the comment text is discarded
525                          * anyway */
526                         break;
527
528                 case '\\':
529                         next_char(this);
530                         if(this->c == '\n') {
531                                 next_char(this);
532                                 this->source_position.linenr++;
533                         }
534                         break;
535
536                 case EOF:
537                 case '\r':
538                 case '\n':
539                         return;
540
541                 default:
542                         next_char(this);
543                         break;
544                 }
545         }
546 }
547
548 static
549 void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
550 {
551         (void) result_token;
552         /* skip whitespaces */
553         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
554                 next_char(this);
555         }
556 }
557
558 void preprocessor_next_token(lexer_t *this, token_t *token)
559 {
560         /* skip whitespaces */
561         while(this->c == ' ' || this->c == '\t' || this->c == '\r') {
562                 next_char(this);
563         }
564
565         switch(this->c) {
566         case 'A' ... 'Z':
567         case 'a' ... 'z':
568         case '_':
569                 parse_symbol(this, token);
570         }
571 }
572
573 void lexer_next_token(lexer_t *this, token_t *token)
574 {
575         int line_begin = 0;
576
577         while(1) {
578                 switch(this->c) {
579                 case ' ':
580                 case '\t':
581                         next_char(this);
582                         break;
583
584                 MATCH_NEWLINE(break;)
585
586                 case 'A' ... 'Z':
587                 case 'a' ... 'z':
588                 case '_':
589                         parse_symbol(this, token);
590                         return;
591
592                 case '0' ... '9':
593                         parse_number(this, token);
594                         return;
595
596                 case '"':
597                         parse_string_literal(this, token);
598                         return;
599
600                 case '\'':
601                         parse_character_constant(this, token);
602                         return;
603
604                 case '\\':
605                         next_char(this);
606                         if(this->c == '\n') {
607                                 next_char(this);
608                                 this->source_position.linenr++;
609                                 break;
610                         } else {
611                                 parse_error(this, "unexpected '\\' found");
612                                 token->type = T_ERROR;
613                         }
614                         return;
615
616 #define MAYBE_PROLOG                                       \
617                         next_char(this);                               \
618                         while(1) {                                     \
619                                 switch(this->c) {
620
621 #define MAYBE(ch, set_type)                                \
622                                 case ch:                                   \
623                                         next_char(this);                       \
624                                         token->type = set_type;                \
625                                         return;
626
627 #define ELSE_CODE(code)                                    \
628                                 SKIP_TRIGRAPHS(                            \
629                                         code;                                  \
630                                 )                                          \
631                                                                                                                    \
632                                 case '\\':                                 \
633                                         next_char(this);                       \
634                                         if(this->c == '\n') {                  \
635                                                 next_char(this);                   \
636                                                 this->source_position.linenr++;    \
637                                                 break;                             \
638                                         }                                      \
639                                         /* fallthrough */                      \
640                                 default:                                   \
641                                         code;                                  \
642                                 }                                          \
643                         } /* end of while(1) */                        \
644                         break;
645
646 #define ELSE(set_type)                                     \
647                 ELSE_CODE(                                         \
648                         token->type = set_type;                        \
649                         return;                                        \
650                 )
651
652                 case '.':
653                         MAYBE_PROLOG
654                                 case '.':
655                                         MAYBE_PROLOG
656                                         MAYBE('.', T_DOTDOTDOT)
657                                         ELSE_CODE(
658                                                 put_back(this, this->c);
659                                                 this->c = '.';
660                                                 token->type = '.';
661                                                 return;
662                                         )
663                         ELSE('.')
664                 case '&':
665                         MAYBE_PROLOG
666                         MAYBE('&', T_ANDAND)
667                         MAYBE('=', T_ANDEQUAL)
668                         ELSE('&')
669                 case '*':
670                         MAYBE_PROLOG
671                         MAYBE('=', T_ASTERISKEQUAL)
672                         ELSE('*')
673                 case '+':
674                         MAYBE_PROLOG
675                         MAYBE('+', T_PLUSPLUS)
676                         MAYBE('=', T_PLUSEQUAL)
677                         ELSE('+')
678                 case '-':
679                         MAYBE_PROLOG
680                         MAYBE('-', T_MINUSMINUS)
681                         MAYBE('=', T_MINUSEQUAL)
682                         ELSE('-')
683                 case '!':
684                         MAYBE_PROLOG
685                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
686                         ELSE('!')
687                 case '/':
688                         MAYBE_PROLOG
689                         MAYBE('=', T_SLASHEQUAL)
690                                 case '*':
691                                         next_char(this);
692                                         skip_multiline_comment(this);
693                                         lexer_next_token(this, token);
694                                         return;
695                                 case '/':
696                                         next_char(this);
697                                         skip_line_comment(this);
698                                         lexer_next_token(this, token);
699                                         return;
700                         ELSE('/')
701                 case '%':
702                         MAYBE_PROLOG
703                         MAYBE('>', T_PERCENTGREATER)
704                         MAYBE('=', T_PERCENTEQUAL)
705                                 case ':':
706                                         MAYBE_PROLOG
707                                                 case '%':
708                                                         MAYBE_PROLOG
709                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
710                                                         ELSE_CODE(
711                                                                 put_back(this, this->c);
712                                                                 this->c = '%';
713                                                                 token->type = T_PERCENTCOLON;
714                                                                 return;
715                                                         )
716                                         ELSE(T_PERCENTCOLON)
717                         ELSE('%')
718                 case '<':
719                         MAYBE_PROLOG
720                         MAYBE(':', T_LESSCOLON)
721                         MAYBE('%', T_LESSPERCENT)
722                                 case '<':
723                                         MAYBE_PROLOG
724                                         MAYBE('=', T_LESSLESSEQUAL)
725                                         ELSE(T_LESSLESS)
726                         ELSE('<')
727                 case '>':
728                         MAYBE_PROLOG
729                                 case '>':
730                                         MAYBE_PROLOG
731                                         MAYBE('=', T_GREATERGREATEREQUAL)
732                                         ELSE(T_GREATERGREATER)
733                         ELSE('>')
734                 case '^':
735                         MAYBE_PROLOG
736                         MAYBE('=', T_CARETEQUAL)
737                         ELSE('^')
738                 case '|':
739                         MAYBE_PROLOG
740                         MAYBE('=', T_PIPEEQUAL)
741                         MAYBE('|', T_PIPEPIPE)
742                         ELSE('|')
743                 case ':':
744                         MAYBE_PROLOG
745                         MAYBE('>', T_COLONGREATER)
746                         ELSE(':')
747                 case '=':
748                         MAYBE_PROLOG
749                         MAYBE('=', T_EQUALEQUAL)
750                         ELSE('=')
751                 case '#':
752                         MAYBE_PROLOG
753                         MAYBE('#', T_HASHHASH)
754                         ELSE_CODE(
755                                 if(line_begin) {
756                                         parse_preprocessor_directive(this, token);
757                                         return;
758                                 } else {
759                                         token->type = '#';
760                                         return;
761                                 }
762                         )
763
764                 case '?':
765                         next_char(this);
766                         /* just a simple ? */
767                         if(this->c != '?') {
768                                 token->type = '?';
769                                 return;
770                         }
771                         /* might be a trigraph */
772                         next_char(this);
773                         if(replace_trigraph(this)) {
774                                 break;
775                         }
776                         put_back(this, this->c);
777                         this->c = '?';
778                         token->type = '?';
779                         return;
780
781                 case '[':
782                 case ']':
783                 case '(':
784                 case ')':
785                 case '{':
786                 case '}':
787                 case '~':
788                 case ';':
789                 case ',':
790                         token->type = this->c;
791                         next_char(this);
792                         return;
793
794                 case EOF:
795                         token->type = T_EOF;
796                         return;
797
798                 default:
799                         next_char(this);
800                         error_prefix(this);
801                         fprintf(stderr, "unknown character '%c' found\n", this->c);
802                         token->type = T_ERROR;
803                         return;
804                 }
805         }
806 }
807
808 void lexer_init(lexer_t *this, FILE *stream, const char *input_name)
809 {
810         memset(this, 0, sizeof(this[0]));
811
812         this->input = stream;
813
814         this->source_position.linenr     = 0;
815         this->source_position.input_name = input_name;
816         strset_init(&this->stringset);
817
818         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
819          * beginning of a line */
820         this->c = '\n';
821 }
822
823 void lexer_destroy(lexer_t *this)
824 {
825         (void) this;
826 }
827
828 static __attribute__((unused))
829 void dbg_pos(const source_position_t source_position)
830 {
831         fprintf(stdout, "%s:%d\n", source_position.input_name, source_position.linenr);
832         fflush(stdout);
833 }