handle multiple strings in a row
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7 #include "adt/strset.h"
8
9 #include <assert.h>
10 #include <errno.h>
11 #include <string.h>
12 #include <ctype.h>
13
14 //#define DEBUG_CHARS
15 #define MAX_PUTBACK 3
16
17 static int         c;
18 token_t            lexer_token;
19 static FILE       *input;
20 static char        buf[1024 + MAX_PUTBACK];
21 static const char *bufend;
22 static const char *bufpos;
23 static strset_t    stringset;
24 //static FILE      **input_stack;
25 //static char      **buf_stack;
26
/**
 * Writes the "<input>:<line>: Error: " prefix of a diagnostic to stderr.
 *
 * @param input_name  name of the input the diagnostic refers to
 * @param linenr      line number the diagnostic refers to
 */
static
void error_prefix_at(const char *input_name, unsigned linenr)
{
	/* linenr is unsigned, so the matching conversion is %u (not %d) */
	fprintf(stderr, "%s:%u: Error: ", input_name, linenr);
}
32
33 static
34 void error_prefix(void)
35 {
36         error_prefix_at(lexer_token.source_position.input_name,
37                         lexer_token.source_position.linenr);
38 }
39
/**
 * Reports an error at the current source position on stderr.
 *
 * @param msg  error text; a newline is appended when printing
 */
static
void parse_error(const char *msg)
{
	/* first "<input>:<line>: Error: ", then the message itself */
	error_prefix();
	fprintf(stderr, "%s\n", msg);
}
46
47 static inline
48 void next_char(void)
49 {
50         bufpos++;
51         if(bufpos >= bufend) {
52                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
53                                  input);
54                 if(s == 0) {
55                         c = EOF;
56                         return;
57                 }
58                 bufpos = buf + MAX_PUTBACK;
59                 bufend = buf + MAX_PUTBACK + s;
60         }
61         c = *(bufpos);
62 #ifdef DEBUG_CHARS
63         printf("nchar '%c'\n", c);
64 #endif
65 }
66
67 static inline
68 void put_back(int pc)
69 {
70         char *p = (char*) bufpos - 1;
71         bufpos--;
72         assert(p >= buf);
73         *p = pc;
74
75 #ifdef DEBUG_CHARS
76         printf("putback '%c'\n", pc);
77 #endif
78 }
79
80
81 static
82 int replace_trigraph(void)
83 {
84 #define MATCH_TRIGRAPH(ch,replacement)           \
85         case ch:                                     \
86                 c = replacement;                         \
87                 return 1;
88
89         switch(c) {
90         MATCH_TRIGRAPH('=', '#')
91         MATCH_TRIGRAPH('(', '[')
92         MATCH_TRIGRAPH('/', '\\')
93         MATCH_TRIGRAPH(')', ']')
94         MATCH_TRIGRAPH('\'', '^')
95         MATCH_TRIGRAPH('<', '{')
96         MATCH_TRIGRAPH('!', '|')
97         MATCH_TRIGRAPH('>', '}')
98         MATCH_TRIGRAPH('-', '~')
99         default:
100                 break;
101         }
102
103         return 0;
104 }
105
/*
 * Expands to a "case '?':" for a switch over c that deals with trigraphs.
 *
 *  - "?x"  (no second '?'): x is put back, c is reset to '?' and
 *    no_trigraph_code runs.
 *  - "??x" with x completing a trigraph: c becomes the replacement character
 *    and the case is left with break, so the surrounding loop sees it next.
 *  - "??x" otherwise: both characters are put back, c is reset to '?' and
 *    no_trigraph_code runs.
 *
 * custom_putback is executed before each of the put backs. The uses in this
 * file end no_trigraph_code with break or return, so control never runs on
 * into the statements that follow it.
 */
#define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
	case '?':                                            \
		next_char();                                     \
		if(c != '?') {                                   \
			custom_putback;                              \
			put_back(c);                                 \
			c = '?';                                     \
			no_trigraph_code;                            \
		}                                                \
		next_char();                                     \
		if(replace_trigraph()) {                         \
			break;                                       \
		}                                                \
		custom_putback;                                  \
		put_back('?');                                   \
		put_back(c);                                     \
		c = '?';                                         \
		no_trigraph_code;
124
/*
 * If c is at a line break ("\r", "\r\n" or "\n") the break is consumed, the
 * line counter of lexer_token is incremented and newline_code is executed.
 * Does nothing when c is any other character.
 */
#define EAT_NEWLINE(newline_code)                 \
	if(c == '\r') {                               \
		next_char();                              \
		if(c == '\n')                             \
			next_char();                          \
		lexer_token.source_position.linenr++;     \
		newline_code;                             \
	} else if(c == '\n') {                        \
		next_char();                              \
		lexer_token.source_position.linenr++;     \
		newline_code;                             \
	}
137
/* case labels for every character that may start a symbol:
 * the letters a-z and A-Z and the underscore */
#define SYMBOL_CHARS                                                          \
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':     \
	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':     \
	case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':     \
	case 'v': case 'w': case 'x': case 'y': case 'z':                         \
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':     \
	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':     \
	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':     \
	case 'V': case 'W': case 'X': case 'Y': case 'Z':                         \
	case '_':
192
/* case labels for the decimal digits 0-9 */
#define DIGITS                                                    \
	case '0': case '1': case '2': case '3': case '4':             \
	case '5': case '6': case '7': case '8': case '9':
204
205 static
206 void parse_symbol(void)
207 {
208         symbol_t *symbol;
209         char     *string;
210
211         obstack_1grow(&symbol_obstack, c);
212         next_char();
213
214         while(1) {
215                 switch(c) {
216                 case '\\':
217                         next_char();
218                         EAT_NEWLINE(break;)
219                         goto end_symbol;
220
221                 DIGITS
222                 SYMBOL_CHARS
223                         obstack_1grow(&symbol_obstack, c);
224                         next_char();
225                         break;
226
227                 case '?':
228                         next_char();
229                         if(c != '?') {
230                                 put_back(c);
231                                 c = '?';
232                                 goto end_symbol;
233                         }
234                         next_char();
235                         if(replace_trigraph())
236                                 break;
237                         put_back('?');
238                         put_back(c);
239                         c = '?';
240                         goto end_symbol;
241
242                 default:
243                         goto end_symbol;
244                 }
245         }
246 end_symbol:
247         obstack_1grow(&symbol_obstack, '\0');
248
249         string = obstack_finish(&symbol_obstack);
250         symbol = symbol_table_insert(string);
251
252         lexer_token.type     = symbol->ID;
253         lexer_token.v.symbol = symbol;
254
255         if(symbol->string != string) {
256                 obstack_free(&symbol_obstack, string);
257         }
258 }
259
260 static
261 void parse_number_hex(void)
262 {
263         assert(c == 'x' || c == 'X');
264         next_char();
265
266         if (!isdigit(c) &&
267                 !('A' <= c && c <= 'F') &&
268                 !('a' <= c && c <= 'f')) {
269                 parse_error("premature end of hex number literal");
270                 lexer_token.type = T_ERROR;
271                 return;
272         }
273
274         int value = 0;
275         for(;;) {
276                 if (isdigit(c)) {
277                         value = 16 * value + c - '0';
278                 } else if ('A' <= c && c <= 'F') {
279                         value = 16 * value + c - 'A' + 10;
280                 } else if ('a' <= c && c <= 'f') {
281                         value = 16 * value + c - 'a' + 10;
282                 } else {
283                         lexer_token.type     = T_INTEGER;
284                         lexer_token.v.intvalue = value;
285                         return;
286                 }
287                 next_char();
288         }
289 }
290
291 static
292 void parse_number_oct(void)
293 {
294         assert(c == 'o' || c == 'O');
295         next_char();
296
297         int value = 0;
298         for(;;) {
299                 if ('0' <= c && c <= '7') {
300                         value = 8 * value + c - '0';
301                 } else {
302                         lexer_token.type       = T_INTEGER;
303                         lexer_token.v.intvalue = value;
304                         return;
305                 }
306                 next_char();
307         }
308 }
309
310 static
311 void parse_number_dec(int first_char)
312 {
313         int value = 0;
314         if(first_char > 0) {
315                 assert(first_char >= '0' && first_char <= '9');
316                 value = first_char - '0';
317         }
318
319         for(;;) {
320                 if (isdigit(c)) {
321                         value = 10 * value + c - '0';
322                 } else {
323                         lexer_token.type       = T_INTEGER;
324                         lexer_token.v.intvalue = value;
325                         return;
326                 }
327                 next_char();
328         }
329 }
330
331 static
332 void parse_number(void)
333 {
334         // TODO check for overflow
335         // TODO check for various invalid inputs sequences
336
337         if (c == '0') {
338                 next_char();
339                 switch (c) {
340                         case 'X':
341                         case 'x': parse_number_hex(); break;
342                         case 'o':
343                         case 'O': parse_number_oct(); break;
344                         default:  parse_number_dec('0');
345                 }
346         } else {
347                 parse_number_dec(0);
348         }
349 }
350
351 static
352 int parse_escape_sequence(void)
353 {
354         while(1) {
355                 int ec = c;
356                 next_char();
357
358                 switch(ec) {
359                 case '"': return '"';
360                 case '\'': return'\'';
361                 case '\\':
362                         EAT_NEWLINE(break;)
363                         return '\\';
364                 case 'a': return '\a';
365                 case 'b': return '\b';
366                 case 'f': return '\f';
367                 case 'n': return '\n';
368                 case 'r': return '\r';
369                 case 't': return '\t';
370                 case 'v': return '\v';
371                 case 'x': /* TODO parse hex number ... */
372                         parse_error("hex escape sequences not implemented yet");
373                         return EOF;
374                 case '0':
375                 case '1':
376                 case '2':
377                 case '3':
378                 case '4':
379                 case '5':
380                 case '6':
381                 case '7':
382                         /* TODO parse octal number ... */
383                         parse_error("octal escape sequences not implemented yet");
384                         return EOF;
385                 case '?':
386                         if(c != '?') {
387                                 return '?';
388                         }
389                         /* might be a trigraph */
390                         next_char();
391                         if(replace_trigraph()) {
392                                 break;
393                         }
394                         put_back(c);
395                         c = '?';
396                         return '?';
397
398                 case EOF:
399                         parse_error("reached end of file while parsing escape sequence");
400                         return EOF;
401                 default:
402                         parse_error("unknown escape sequence");
403                         return EOF;
404                 }
405         }
406 }
407
408 const char *concat_strings(const char *s1, const char *s2)
409 {
410         size_t  len1   = strlen(s1);
411         size_t  len2   = strlen(s2);
412
413         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
414         memcpy(concat, s1, len1);
415         memcpy(concat + len1, s2, len2 + 1);
416
417         const char *result = strset_insert(&stringset, concat);
418         if(result != concat) {
419                 obstack_free(&symbol_obstack, concat);
420         }
421
422         return result;
423 }
424
425 static
426 void parse_string_literal(void)
427 {
428         unsigned    start_linenr = lexer_token.source_position.linenr;
429         char       *string;
430         const char *result;
431
432         assert(c == '"');
433         next_char();
434
435         while(1) {
436                 switch(c) {
437                 SKIP_TRIGRAPHS(,
438                         obstack_1grow(&symbol_obstack, '?');
439                         next_char();
440                         break;
441                 )
442
443                 case '\\':
444                         next_char();
445                         EAT_NEWLINE(break;)
446                         int ec = parse_escape_sequence();
447                         obstack_1grow(&symbol_obstack, ec);
448                         break;
449
450                 case EOF:
451                         error_prefix_at(lexer_token.source_position.input_name,
452                                         start_linenr);
453                         fprintf(stderr, "string has no end\n");
454                         lexer_token.type = T_ERROR;
455                         return;
456
457                 case '"':
458                         next_char();
459                         goto end_of_string;
460
461                 default:
462                         obstack_1grow(&symbol_obstack, c);
463                         next_char();
464                         break;
465                 }
466         }
467
468 end_of_string:
469
470         /* TODO: concatenate multiple strings separated by whitespace... */
471
472         /* add finishing 0 to the string */
473         obstack_1grow(&symbol_obstack, '\0');
474         string = obstack_finish(&symbol_obstack);
475
476         /* check if there is already a copy of the string */
477         result = strset_insert(&stringset, string);
478         if(result != string) {
479                 obstack_free(&symbol_obstack, string);
480         }
481
482         lexer_token.type     = T_STRING_LITERAL;
483         lexer_token.v.string = result;
484 }
485
/*
 * Expands to the case labels '\r' and '\n' for a switch over c. Each of them
 * consumes the line break ("\r\n" counts as one), increments the line counter
 * of lexer_token and then runs code. The code passed in this file always
 * leaves the case (break or return), so '\r' never runs on into '\n'.
 */
#define MATCH_NEWLINE(code)                       \
	case '\r':                                    \
		next_char();                              \
		if(c == '\n') {                           \
			next_char();                          \
		}                                         \
		lexer_token.source_position.linenr++;     \
		code;                                     \
	case '\n':                                    \
		next_char();                              \
		lexer_token.source_position.linenr++;     \
		code;
498
499 static
500 void parse_character_constant(void)
501 {
502         assert(c == '\'');
503         next_char();
504
505         int found_char = 0;
506         while(1) {
507                 switch(c) {
508                 SKIP_TRIGRAPHS(,
509                         found_char = '?';
510                         break;
511                 )
512
513                 case '\\':
514                         next_char();
515                         EAT_NEWLINE(break;)
516                         found_char = '\\';
517                         break;
518
519                 MATCH_NEWLINE(
520                         parse_error("newline while parsing character constant");
521                         break;
522                 )
523
524                 case '\'':
525                         next_char();
526                         goto end_of_char_constant;
527
528                 case EOF:
529                         parse_error("EOF while parsing character constant");
530                         lexer_token.type = T_ERROR;
531                         return;
532
533                 default:
534                         if(found_char != 0) {
535                                 parse_error("more than 1 characters in character "
536                                             "constant");
537                                 goto end_of_char_constant;
538                         } else {
539                                 found_char = c;
540                                 next_char();
541                         }
542                         break;
543                 }
544         }
545
546 end_of_char_constant:
547         lexer_token.type       = T_INTEGER;
548         lexer_token.v.intvalue = found_char;
549 }
550
551 static
552 void skip_multiline_comment(void)
553 {
554         unsigned start_linenr = lexer_token.source_position.linenr;
555         int had_star = 0;
556
557         while(1) {
558                 switch(c) {
559                 case '*':
560                         next_char();
561                         had_star = 1;
562                         break;
563
564                 case '/':
565                         next_char();
566                         if(had_star) {
567                                 return;
568                         }
569                         had_star = 0;
570                         break;
571
572                 case '\\':
573                         next_char();
574                         EAT_NEWLINE(break;)
575                         had_star = 0;
576                         break;
577
578                 case '?':
579                         next_char();
580                         if(c != '?') {
581                                 had_star = 0;
582                                 break;
583                         }
584                         next_char();
585                         if(replace_trigraph())
586                                 break;
587                         put_back(c);
588                         c = '?';
589                         had_star = 0;
590                         /* we don't put back the 2nd ? as the comment text is discarded
591                          * anyway */
592                         break;
593
594                 MATCH_NEWLINE(had_star = 0; break;)
595
596                 case EOF:
597                         error_prefix_at(lexer_token.source_position.input_name,
598                                         start_linenr);
599                         fprintf(stderr, "at end of file while looking for comment end\n");
600                         return;
601                 default:
602                         had_star = 0;
603                         next_char();
604                         break;
605                 }
606         }
607 }
608
609 static
610 void skip_line_comment(void)
611 {
612         while(1) {
613                 switch(c) {
614                 case '?':
615                         next_char();
616                         if(c != '?')
617                                 break;
618                         next_char();
619                         if(replace_trigraph())
620                                 break;
621                         put_back('?');
622                         /* we don't put back the 2nd ? as the comment text is discarded
623                          * anyway */
624                         break;
625
626                 case '\\':
627                         next_char();
628                         if(c == '\n') {
629                                 next_char();
630                                 lexer_token.source_position.linenr++;
631                         }
632                         break;
633
634                 case EOF:
635                 case '\r':
636                 case '\n':
637                         return;
638
639                 default:
640                         next_char();
641                         break;
642                 }
643         }
644 }
645
646 static token_t pp_token;
647
648 static inline
649 void next_pp_token(void)
650 {
651         lexer_next_preprocessing_token();
652         pp_token = lexer_token;
653 }
654
655 static
656 void eat_until_newline(void)
657 {
658         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
659                 next_pp_token();
660         }
661 }
662
/**
 * Handles an #error directive: reports it and skips the rest of the line.
 */
static
void error_directive(void)
{
	error_prefix();
	fprintf(stderr, "#error directive: \n");

	/* consume the pp-tokens until new-line, otherwise the text of the
	 * directive would be lexed as ordinary tokens afterwards */
	eat_until_newline();
}
671
672 static
673 void define_directive(void)
674 {
675         lexer_next_preprocessing_token();
676         if(lexer_token.type != T_IDENTIFIER) {
677                 parse_error("expected identifier after #define\n");
678                 eat_until_newline();
679         }
680 }
681
/**
 * Placeholder for #ifdef / #ifndef: only reads the token that follows the
 * directive name.
 *
 * @param is_ifndef  nonzero for #ifndef, zero for #ifdef (not used yet)
 */
static
void ifdef_directive(int is_ifndef)
{
	(void) is_ifndef;

	lexer_next_preprocessing_token();
	//expect_identifier();
	//extect_newline();
}
690
/**
 * Placeholder for #endif: does nothing yet.
 */
static
void endif_directive(void)
{
	//expect_newline();
}
696
697 static
698 void parse_line_directive(void)
699 {
700         if(pp_token.type != T_INTEGER) {
701                 parse_error("expected integer");
702         } else {
703                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
704                 next_pp_token();
705         }
706         if(pp_token.type == T_STRING_LITERAL) {
707                 lexer_token.source_position.input_name = pp_token.v.string;
708                 next_pp_token();
709         }
710
711         eat_until_newline();
712 }
713
714 static
715 void parse_preprocessor_identifier(void)
716 {
717         assert(pp_token.type == T_IDENTIFIER);
718         symbol_t *symbol = pp_token.v.symbol;
719
720         switch(symbol->pp_ID) {
721         case TP_include:
722                 printf("include - enable header name parsing!\n");
723                 break;
724         case TP_define:
725                 define_directive();
726                 break;
727         case TP_ifdef:
728                 ifdef_directive(0);
729                 break;
730         case TP_ifndef:
731                 ifdef_directive(1);
732                 break;
733         case TP_endif:
734                 endif_directive();
735                 break;
736         case TP_line:
737                 next_pp_token();
738                 parse_line_directive();
739                 break;
740         case TP_if:
741         case TP_else:
742         case TP_elif:
743         case TP_undef:
744         case TP_error:
745                 error_directive();
746                 break;
747         case TP_pragma:
748                 break;
749         }
750 }
751
752 static
753 void parse_preprocessor_directive()
754 {
755         next_pp_token();
756
757         switch(pp_token.type) {
758         case T_IDENTIFIER:
759                 parse_preprocessor_identifier();
760                 break;
761         case T_INTEGER:
762                 parse_line_directive();
763                 break;
764         default:
765                 parse_error("invalid preprocessor directive");
766                 eat_until_newline();
767                 break;
768         }
769 }
770
/*
 * Building blocks for lexing punctuators that consist of several characters.
 * They are used inside the case of the first character:
 *
 *   MAYBE_PROLOG     consumes the first character and opens a loop with a
 *                    switch over the following one
 *   MAYBE(ch, type)  if the following character is ch: consume it and return
 *                    a token of the given type
 *   ELSE_CODE(code)  closes the switch and the loop; code runs when no MAYBE
 *                    matched. Trigraphs and line continuations in between are
 *                    resolved first and the following character is examined
 *                    again
 *   ELSE(type)       ELSE_CODE that returns a token of the given type
 */
#define MAYBE_PROLOG                                       \
			next_char();                                   \
			while(1) {                                     \
				switch(c) {

#define MAYBE(ch, set_type)                                \
				case ch:                                   \
					next_char();                           \
					lexer_token.type = set_type;           \
					return;

/* A backslash that does not start a line continuation is put back before
 * code runs, so that it is not lost for the token that follows. */
#define ELSE_CODE(code)                                    \
				SKIP_TRIGRAPHS(,                           \
					code;                                  \
				)                                          \
				                                           \
				case '\\':                                 \
					next_char();                           \
					EAT_NEWLINE(break;)                    \
					put_back(c);                           \
					c = '\\';                              \
					/* fallthrough */                      \
				default:                                   \
					code;                                  \
				}                                          \
			} /* end of while(1) */                        \
			break;

#define ELSE(set_type)                                     \
		ELSE_CODE(                                         \
			lexer_token.type = set_type;                   \
			return;                                        \
		)
802
803 void lexer_next_preprocessing_token(void)
804 {
805         while(1) {
806                 switch(c) {
807                 case ' ':
808                 case '\t':
809                         next_char();
810                         break;
811
812                 MATCH_NEWLINE(
813                         lexer_token.type = '\n';
814                         return;
815                 )
816
817                 SYMBOL_CHARS
818                         parse_symbol();
819                         return;
820
821                 DIGITS
822                         parse_number();
823                         return;
824
825                 case '"':
826                         parse_string_literal();
827                         return;
828
829                 case '\'':
830                         parse_character_constant();
831                         return;
832
833                 case '\\':
834                         next_char();
835                         if(c == '\n') {
836                                 next_char();
837                                 lexer_token.source_position.linenr++;
838                                 break;
839                         } else {
840                                 parse_error("unexpected '\\' found");
841                                 lexer_token.type = T_ERROR;
842                         }
843                         return;
844
845                 case '.':
846                         MAYBE_PROLOG
847                                 case '.':
848                                         MAYBE_PROLOG
849                                         MAYBE('.', T_DOTDOTDOT)
850                                         ELSE_CODE(
851                                                 put_back(c);
852                                                 c = '.';
853                                                 lexer_token.type = '.';
854                                                 return;
855                                         )
856                         ELSE('.')
857                 case '&':
858                         MAYBE_PROLOG
859                         MAYBE('&', T_ANDAND)
860                         MAYBE('=', T_ANDEQUAL)
861                         ELSE('&')
862                 case '*':
863                         MAYBE_PROLOG
864                         MAYBE('=', T_ASTERISKEQUAL)
865                         ELSE('*')
866                 case '+':
867                         MAYBE_PROLOG
868                         MAYBE('+', T_PLUSPLUS)
869                         MAYBE('=', T_PLUSEQUAL)
870                         ELSE('+')
871                 case '-':
872                         MAYBE_PROLOG
873                         MAYBE('-', T_MINUSMINUS)
874                         MAYBE('=', T_MINUSEQUAL)
875                         ELSE('-')
876                 case '!':
877                         MAYBE_PROLOG
878                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
879                         ELSE('!')
880                 case '/':
881                         MAYBE_PROLOG
882                         MAYBE('=', T_SLASHEQUAL)
883                                 case '*':
884                                         next_char();
885                                         skip_multiline_comment();
886                                         lexer_next_preprocessing_token();
887                                         return;
888                                 case '/':
889                                         next_char();
890                                         skip_line_comment();
891                                         lexer_next_preprocessing_token();
892                                         return;
893                         ELSE('/')
894                 case '%':
895                         MAYBE_PROLOG
896                         MAYBE('>', T_PERCENTGREATER)
897                         MAYBE('=', T_PERCENTEQUAL)
898                                 case ':':
899                                         MAYBE_PROLOG
900                                                 case '%':
901                                                         MAYBE_PROLOG
902                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
903                                                         ELSE_CODE(
904                                                                 put_back(c);
905                                                                 c = '%';
906                                                                 lexer_token.type = T_PERCENTCOLON;
907                                                                 return;
908                                                         )
909                                         ELSE(T_PERCENTCOLON)
910                         ELSE('%')
911                 case '<':
912                         MAYBE_PROLOG
913                         MAYBE(':', T_LESSCOLON)
914                         MAYBE('%', T_LESSPERCENT)
915                                 case '<':
916                                         MAYBE_PROLOG
917                                         MAYBE('=', T_LESSLESSEQUAL)
918                                         ELSE(T_LESSLESS)
919                         ELSE('<')
920                 case '>':
921                         MAYBE_PROLOG
922                                 case '>':
923                                         MAYBE_PROLOG
924                                         MAYBE('=', T_GREATERGREATEREQUAL)
925                                         ELSE(T_GREATERGREATER)
926                         ELSE('>')
927                 case '^':
928                         MAYBE_PROLOG
929                         MAYBE('=', T_CARETEQUAL)
930                         ELSE('^')
931                 case '|':
932                         MAYBE_PROLOG
933                         MAYBE('=', T_PIPEEQUAL)
934                         MAYBE('|', T_PIPEPIPE)
935                         ELSE('|')
936                 case ':':
937                         MAYBE_PROLOG
938                         MAYBE('>', T_COLONGREATER)
939                         ELSE(':')
940                 case '=':
941                         MAYBE_PROLOG
942                         MAYBE('=', T_EQUALEQUAL)
943                         ELSE('=')
944                 case '#':
945                         MAYBE_PROLOG
946                         MAYBE('#', T_HASHHASH)
947                         ELSE('#')
948
949                 case '?':
950                         next_char();
951                         /* just a simple ? */
952                         if(c != '?') {
953                                 lexer_token.type = '?';
954                                 return;
955                         }
956                         /* might be a trigraph */
957                         next_char();
958                         if(replace_trigraph()) {
959                                 break;
960                         }
961                         put_back(c);
962                         c = '?';
963                         lexer_token.type = '?';
964                         return;
965
966                 case '[':
967                 case ']':
968                 case '(':
969                 case ')':
970                 case '{':
971                 case '}':
972                 case '~':
973                 case ';':
974                 case ',':
975                         lexer_token.type = c;
976                         next_char();
977                         return;
978
979                 case EOF:
980                         lexer_token.type = T_EOF;
981                         return;
982
983                 default:
984                         next_char();
985                         error_prefix();
986                         fprintf(stderr, "unknown character '%c' found\n", c);
987                         lexer_token.type = T_ERROR;
988                         return;
989                 }
990         }
991 }
992
993 void lexer_next_token(void)
994 {
995         lexer_next_preprocessing_token();
996         if(lexer_token.type != '\n')
997                 return;
998
999 newline_found:
1000         do {
1001                 lexer_next_preprocessing_token();
1002         } while(lexer_token.type == '\n');
1003
1004         if(lexer_token.type == '#') {
1005                 parse_preprocessor_directive();
1006                 goto newline_found;
1007         }
1008 }
1009
1010 void init_lexer(void)
1011 {
1012         strset_init(&stringset);
1013 }
1014
1015 void lexer_open_stream(FILE *stream, const char *input_name)
1016 {
1017         input                                  = stream;
1018         lexer_token.source_position.linenr     = 0;
1019         lexer_token.source_position.input_name = input_name;
1020
1021         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
1022          * beginning of a line */
1023         c = '\n';
1024 }
1025
1026 void exit_lexer(void)
1027 {
1028         strset_destroy(&stringset);
1029 }
1030
1031 static __attribute__((unused))
1032 void dbg_pos(const source_position_t source_position)
1033 {
1034         fprintf(stdout, "%s:%d\n", source_position.input_name,
1035                 source_position.linenr);
1036         fflush(stdout);
1037 }