- countless bugfixes
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7 #include "adt/strset.h"
8
9 #include <assert.h>
10 #include <errno.h>
11 #include <string.h>
12 #include <ctype.h>
13
/* Define DEBUG_CHARS to make next_char() and put_back() print every
 * character they handle. */
//#define DEBUG_CHARS
/* Number of bytes kept free at the start of buf; next_char() always refills
 * the buffer starting at buf + MAX_PUTBACK. */
#define MAX_PUTBACK 3

/* current input character, or EOF once the input is exhausted */
static int         c;
/* the token that the parse_* routines below fill in */
token_t            lexer_token;
/* stream that next_char() reads from */
static FILE       *input;
/* read buffer, including the MAX_PUTBACK reserve at its start */
static char        buf[1024 + MAX_PUTBACK];
/* one past the last valid byte in buf */
static const char *bufend;
/* position in buf of the character currently held in c */
static const char *bufpos;
/* set used to unify string literals (see the strset_insert() calls) */
static strset_t    stringset;
//static FILE      **input_stack;
//static char      **buf_stack;
26
/**
 * Writes the "<file>:<line>: Error: " prefix of a diagnostic to stderr.
 * No newline is written; the caller appends the actual message.
 *
 * @param input_name  name of the input being lexed
 * @param linenr      line number the diagnostic refers to
 */
static void error_prefix_at(const char *input_name, unsigned linenr)
{
        FILE *const out = stderr;

        fprintf(out, "%s:%u: Error: ", input_name, linenr);
}
32
33 static
34 void error_prefix(void)
35 {
36         error_prefix_at(lexer_token.source_position.input_name,
37                         lexer_token.source_position.linenr);
38 }
39
/**
 * Reports a lexer error at the current token position on stderr.
 *
 * @param msg  message text without trailing newline (one is appended here)
 */
static void parse_error(const char *msg)
{
        FILE *const out = stderr;

        error_prefix();
        fprintf(out, "%s\n", msg);
}
46
47 static inline
48 void next_char(void)
49 {
50         bufpos++;
51         if(bufpos >= bufend) {
52                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
53                                  input);
54                 if(s == 0) {
55                         c = EOF;
56                         return;
57                 }
58                 bufpos = buf + MAX_PUTBACK;
59                 bufend = buf + MAX_PUTBACK + s;
60         }
61         c = *(bufpos);
62 #ifdef DEBUG_CHARS
63         printf("nchar '%c'\n", c);
64 #endif
65 }
66
67 static inline
68 void put_back(int pc)
69 {
70         char *p = (char*) bufpos - 1;
71         bufpos--;
72         assert(p >= buf);
73         *p = pc;
74
75 #ifdef DEBUG_CHARS
76         printf("putback '%c'\n", pc);
77 #endif
78 }
79
80
81 static
82 int replace_trigraph(void)
83 {
84 #define MATCH_TRIGRAPH(ch,replacement)           \
85         case ch:                                     \
86                 c = replacement;                         \
87                 return 1;
88
89         switch(c) {
90         MATCH_TRIGRAPH('=', '#')
91         MATCH_TRIGRAPH('(', '[')
92         MATCH_TRIGRAPH('/', '\\')
93         MATCH_TRIGRAPH(')', ']')
94         MATCH_TRIGRAPH('\'', '^')
95         MATCH_TRIGRAPH('<', '{')
96         MATCH_TRIGRAPH('!', '|')
97         MATCH_TRIGRAPH('>', '}')
98         MATCH_TRIGRAPH('-', '~')
99         default:
100                 break;
101         }
102
103         return 0;
104 }
105
/*
 * Expands to a complete `case '?':` for use inside a switch on c.
 *
 * - "?" followed by something other than '?': the character is handed back
 *   with put_back(), c is reset to '?' and no_trigraph_code runs.
 * - "??x" where x completes a trigraph: replace_trigraph() has put the
 *   replacement into c and the case is left with `break`.
 * - "??x" otherwise: the second '?' and x are handed back, c is reset to
 *   '?' and no_trigraph_code runs.
 *
 * custom_putback is executed right before the put_back() calls (it may be
 * empty). no_trigraph_code is expected to leave the case itself (break,
 * return or goto); in the first branch control would otherwise continue
 * into the trigraph handling below it.
 */
#define SKIP_TRIGRAPHS(custom_putback, no_trigraph_code) \
        case '?':                                  \
                next_char();                           \
                if(c != '?') {                         \
                        custom_putback;                    \
                        put_back(c);                       \
                        c = '?';                           \
                        no_trigraph_code;                  \
                }                                      \
                next_char();                           \
                if(replace_trigraph()) {               \
                        break;                             \
                }                                      \
                custom_putback;                        \
                put_back('?');                         \
                put_back(c);                           \
                c = '?';                               \
                no_trigraph_code;
124
/*
 * If c is at a line ending ("\r", "\r\n" or "\n"), consumes it, increments
 * the line number in lexer_token.source_position and then runs
 * newline_code. Does nothing when c is not a line ending.
 *
 * Expands to a bare if / else-if statement, so a `break` passed as
 * newline_code applies to the switch or loop enclosing the use site.
 */
#define EAT_NEWLINE(newline_code)              \
        if(c == '\r') {                            \
                next_char();                           \
                if(c == '\n')                          \
                        next_char();                       \
                lexer_token.source_position.linenr++;  \
                newline_code;                          \
        } else if(c == '\n') {                     \
                next_char();                           \
                lexer_token.source_position.linenr++;  \
                newline_code;                          \
        }
137
/*
 * Case labels for the letters a-z, A-Z and '_'. Meant to be placed inside
 * a switch, directly in front of the statements handling these characters.
 */
#define SYMBOL_CHARS                                                          \
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': \
        case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': \
        case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': \
        case 'v': case 'w': case 'x': case 'y': case 'z':                     \
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': \
        case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': \
        case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': \
        case 'V': case 'W': case 'X': case 'Y': case 'Z':                     \
        case '_':
192
/*
 * Case labels for the decimal digits '0' to '9'. Meant to be placed inside
 * a switch, directly in front of the statements handling these characters.
 */
#define DIGITS                                                    \
        case '0': case '1': case '2': case '3': case '4':         \
        case '5': case '6': case '7': case '8': case '9':
204
205 static
206 void parse_symbol(void)
207 {
208         symbol_t *symbol;
209         char     *string;
210
211         obstack_1grow(&symbol_obstack, c);
212         next_char();
213
214         while(1) {
215                 switch(c) {
216                 case '\\':
217                         next_char();
218                         EAT_NEWLINE(break;)
219                         goto end_symbol;
220
221                 DIGITS
222                 SYMBOL_CHARS
223                         obstack_1grow(&symbol_obstack, c);
224                         next_char();
225                         break;
226
227                 case '?':
228                         next_char();
229                         if(c != '?') {
230                                 put_back(c);
231                                 c = '?';
232                                 goto end_symbol;
233                         }
234                         next_char();
235                         if(replace_trigraph())
236                                 break;
237                         put_back('?');
238                         put_back(c);
239                         c = '?';
240                         goto end_symbol;
241
242                 default:
243                         goto end_symbol;
244                 }
245         }
246 end_symbol:
247         obstack_1grow(&symbol_obstack, '\0');
248
249         string = obstack_finish(&symbol_obstack);
250         symbol = symbol_table_insert(string);
251
252         lexer_token.type     = symbol->ID;
253         lexer_token.v.symbol = symbol;
254
255         if(symbol->string != string) {
256                 obstack_free(&symbol_obstack, string);
257         }
258 }
259
260 static
261 void parse_number_hex(void)
262 {
263         assert(c == 'x' || c == 'X');
264         next_char();
265
266         if (!isdigit(c) &&
267                 !('A' <= c && c <= 'F') &&
268                 !('a' <= c && c <= 'f')) {
269                 parse_error("premature end of hex number literal");
270                 lexer_token.type = T_ERROR;
271                 return;
272         }
273
274         int value = 0;
275         while(1) {
276                 if (isdigit(c)) {
277                         value = 16 * value + c - '0';
278                 } else if ('A' <= c && c <= 'F') {
279                         value = 16 * value + c - 'A' + 10;
280                 } else if ('a' <= c && c <= 'f') {
281                         value = 16 * value + c - 'a' + 10;
282                 } else {
283                         lexer_token.type     = T_INTEGER;
284                         lexer_token.v.intvalue = value;
285                         return;
286                 }
287                 next_char();
288         }
289 }
290
291 static
292 void parse_number_oct(void)
293 {
294         assert(c == 'o' || c == 'O');
295         next_char();
296
297         int value = 0;
298         while(1) {
299                 if ('0' <= c && c <= '7') {
300                         value = 8 * value + c - '0';
301                 } else {
302                         lexer_token.type       = T_INTEGER;
303                         lexer_token.v.intvalue = value;
304                         return;
305                 }
306                 next_char();
307         }
308 }
309
310 static
311 void parse_number_dec(int first_char)
312 {
313         int value = 0;
314         if(first_char > 0) {
315                 assert(first_char >= '0' && first_char <= '9');
316                 value = first_char - '0';
317         }
318
319         for(;;) {
320                 if (isdigit(c)) {
321                         value = 10 * value + c - '0';
322                 } else {
323                         lexer_token.type       = T_INTEGER;
324                         lexer_token.v.intvalue = value;
325                         return;
326                 }
327                 next_char();
328         }
329 }
330
331 static
332 void parse_number(void)
333 {
334         // TODO check for overflow
335         // TODO check for various invalid inputs sequences
336
337         if (c == '0') {
338                 next_char();
339                 switch (c) {
340                         case 'X':
341                         case 'x': parse_number_hex(); break;
342                         case 'o':
343                         case 'O': parse_number_oct(); break;
344                         default:  parse_number_dec('0');
345                 }
346         } else {
347                 parse_number_dec(0);
348         }
349         if(c == 'U' || c == 'U') {
350                 /* TODO do something with the suffixes... */
351                 next_char();
352                 if(c == 'L' || c == 'l') {
353                         next_char();
354                         if(c == 'L' || c == 'l') {
355                                 next_char();
356                         }
357                 }
358         } else if(c == 'l' || c == 'L') {
359                 next_char();
360                 if(c == 'l' || c == 'L') {
361                         next_char();
362                         if(c == 'u' || c == 'U') {
363                                 next_char();
364                         }
365                 } else if(c == 'u' || c == 'U') {
366                         next_char();
367                 }
368         }
369 }
370
371 static int parse_octal_sequence(void)
372 {
373         int value = 0;
374         while(1) {
375                 if(c < '0' || c > '7')
376                         break;
377                 value = 8 * value + c - '0';
378                 next_char();
379         }
380
381         return value;
382 }
383
384 static int parse_hex_sequence(void)
385 {
386         int value = 0;
387         while(1) {
388                 if (c >= '0' && c <= '9') {
389                         value = 16 * value + c - '0';
390                 } else if ('A' <= c && c <= 'F') {
391                         value = 16 * value + c - 'A' + 10;
392                 } else if ('a' <= c && c <= 'f') {
393                         value = 16 * value + c - 'a' + 10;
394                 } else {
395                         break;
396                 }
397                 next_char();
398         }
399
400         return value;
401 }
402
403 static int parse_escape_sequence(void)
404 {
405         while(1) {
406                 int ec = c;
407                 next_char();
408
409                 switch(ec) {
410                 case '"': return '"';
411                 case '\'': return'\'';
412                 case '\\':
413                         EAT_NEWLINE(break;)
414                         return '\\';
415                 case 'a': return '\a';
416                 case 'b': return '\b';
417                 case 'f': return '\f';
418                 case 'n': return '\n';
419                 case 'r': return '\r';
420                 case 't': return '\t';
421                 case 'v': return '\v';
422                 case 'x':
423                         return parse_hex_sequence();
424                 case '0':
425                 case '1':
426                 case '2':
427                 case '3':
428                 case '4':
429                 case '5':
430                 case '6':
431                 case '7':
432                         return parse_octal_sequence();
433                 case '?':
434                         if(c != '?') {
435                                 return '?';
436                         }
437                         /* might be a trigraph */
438                         next_char();
439                         if(replace_trigraph()) {
440                                 break;
441                         }
442                         put_back(c);
443                         c = '?';
444                         return '?';
445
446                 case EOF:
447                         parse_error("reached end of file while parsing escape sequence");
448                         return EOF;
449                 default:
450                         parse_error("unknown escape sequence");
451                         return EOF;
452                 }
453         }
454 }
455
456 const char *concat_strings(const char *s1, const char *s2)
457 {
458         size_t  len1   = strlen(s1);
459         size_t  len2   = strlen(s2);
460
461         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
462         memcpy(concat, s1, len1);
463         memcpy(concat + len1, s2, len2 + 1);
464
465         const char *result = strset_insert(&stringset, concat);
466         if(result != concat) {
467                 obstack_free(&symbol_obstack, concat);
468         }
469
470         return result;
471 }
472
473 static
474 void parse_string_literal(void)
475 {
476         unsigned    start_linenr = lexer_token.source_position.linenr;
477         char       *string;
478         const char *result;
479
480         assert(c == '"');
481         next_char();
482
483         while(1) {
484                 switch(c) {
485                 SKIP_TRIGRAPHS(,
486                         obstack_1grow(&symbol_obstack, '?');
487                         next_char();
488                         break;
489                 )
490
491                 case '\\':
492                         next_char();
493                         EAT_NEWLINE(break;)
494                         int ec = parse_escape_sequence();
495                         obstack_1grow(&symbol_obstack, ec);
496                         break;
497
498                 case EOF:
499                         error_prefix_at(lexer_token.source_position.input_name,
500                                         start_linenr);
501                         fprintf(stderr, "string has no end\n");
502                         lexer_token.type = T_ERROR;
503                         return;
504
505                 case '"':
506                         next_char();
507                         goto end_of_string;
508
509                 default:
510                         obstack_1grow(&symbol_obstack, c);
511                         next_char();
512                         break;
513                 }
514         }
515
516 end_of_string:
517
518         /* TODO: concatenate multiple strings separated by whitespace... */
519
520         /* add finishing 0 to the string */
521         obstack_1grow(&symbol_obstack, '\0');
522         string = obstack_finish(&symbol_obstack);
523
524         /* check if there is already a copy of the string */
525         result = strset_insert(&stringset, string);
526         if(result != string) {
527                 obstack_free(&symbol_obstack, string);
528         }
529
530         lexer_token.type     = T_STRING_LITERAL;
531         lexer_token.v.string = result;
532 }
533
/*
 * Expands to `case '\r':` and `case '\n':` for use inside a switch on c.
 * Either case consumes the line ending ("\r", "\r\n" or "\n"), increments
 * the line number in lexer_token.source_position and then runs code.
 *
 * code has to leave the case itself (break, return or goto): the '\r' case
 * is directly followed by the '\n' case and would fall through into it.
 */
#define MATCH_NEWLINE(code)                   \
        case '\r':                                \
                next_char();                          \
                if(c == '\n') {                       \
                        next_char();                      \
                }                                     \
                lexer_token.source_position.linenr++; \
                code;                                 \
        case '\n':                                \
                next_char();                          \
                lexer_token.source_position.linenr++; \
                code;
546
547 static
548 void parse_character_constant(void)
549 {
550         assert(c == '\'');
551         next_char();
552
553         int found_char = 0;
554         while(1) {
555                 switch(c) {
556                 SKIP_TRIGRAPHS(,
557                         next_char();
558                         found_char = '?';
559                         break;
560                 )
561
562                 case '\\':
563                         next_char();
564                         EAT_NEWLINE(break;)
565                         found_char = parse_escape_sequence();
566                         break;
567
568                 MATCH_NEWLINE(
569                         parse_error("newline while parsing character constant");
570                         break;
571                 )
572
573                 case '\'':
574                         next_char();
575                         goto end_of_char_constant;
576
577                 case EOF:
578                         parse_error("EOF while parsing character constant");
579                         lexer_token.type = T_ERROR;
580                         return;
581
582                 default:
583                         if(found_char != 0) {
584                                 parse_error("more than 1 characters in character "
585                                             "constant");
586                                 goto end_of_char_constant;
587                         } else {
588                                 found_char = c;
589                                 next_char();
590                         }
591                         break;
592                 }
593         }
594
595 end_of_char_constant:
596         lexer_token.type       = T_INTEGER;
597         lexer_token.v.intvalue = found_char;
598 }
599
600 static
601 void skip_multiline_comment(void)
602 {
603         unsigned start_linenr = lexer_token.source_position.linenr;
604         int had_star = 0;
605
606         while(1) {
607                 switch(c) {
608                 case '*':
609                         next_char();
610                         had_star = 1;
611                         break;
612
613                 case '/':
614                         next_char();
615                         if(had_star) {
616                                 return;
617                         }
618                         had_star = 0;
619                         break;
620
621                 case '\\':
622                         next_char();
623                         EAT_NEWLINE(break;)
624                         had_star = 0;
625                         break;
626
627                 case '?':
628                         next_char();
629                         if(c != '?') {
630                                 had_star = 0;
631                                 break;
632                         }
633                         next_char();
634                         if(replace_trigraph())
635                                 break;
636                         put_back(c);
637                         c = '?';
638                         had_star = 0;
639                         /* we don't put back the 2nd ? as the comment text is discarded
640                          * anyway */
641                         break;
642
643                 MATCH_NEWLINE(had_star = 0; break;)
644
645                 case EOF:
646                         error_prefix_at(lexer_token.source_position.input_name,
647                                         start_linenr);
648                         fprintf(stderr, "at end of file while looking for comment end\n");
649                         return;
650                 default:
651                         had_star = 0;
652                         next_char();
653                         break;
654                 }
655         }
656 }
657
658 static
659 void skip_line_comment(void)
660 {
661         while(1) {
662                 switch(c) {
663                 case '?':
664                         next_char();
665                         if(c != '?')
666                                 break;
667                         next_char();
668                         if(replace_trigraph())
669                                 break;
670                         put_back('?');
671                         /* we don't put back the 2nd ? as the comment text is discarded
672                          * anyway */
673                         break;
674
675                 case '\\':
676                         next_char();
677                         if(c == '\n') {
678                                 next_char();
679                                 lexer_token.source_position.linenr++;
680                         }
681                         break;
682
683                 case EOF:
684                 case '\r':
685                 case '\n':
686                         return;
687
688                 default:
689                         next_char();
690                         break;
691                 }
692         }
693 }
694
695 static token_t pp_token;
696
697 static inline
698 void next_pp_token(void)
699 {
700         lexer_next_preprocessing_token();
701         pp_token = lexer_token;
702 }
703
704 static
705 void eat_until_newline(void)
706 {
707         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
708                 next_pp_token();
709         }
710 }
711
/**
 * Reports an #error directive on stderr. The tokens of the directive are
 * not read yet, so only a fixed text is printed.
 */
static
void error_directive(void)
{
        error_prefix();
        fputs("#error directive: \n", stderr);

        /* TODO: parse pp-tokens until new-line */
}
720
721 static
722 void define_directive(void)
723 {
724         lexer_next_preprocessing_token();
725         if(lexer_token.type != T_IDENTIFIER) {
726                 parse_error("expected identifier after #define\n");
727                 eat_until_newline();
728         }
729 }
730
/**
 * Handles #ifdef / #ifndef. Not implemented yet: only the token following
 * the directive name is fetched.
 *
 * @param is_ifndef  non-zero for #ifndef, zero for #ifdef (currently unused)
 */
static
void ifdef_directive(int is_ifndef)
{
        lexer_next_preprocessing_token();
        (void) is_ifndef;
        //expect_identifier();
        //extect_newline();
}
739
/**
 * Handles an #endif directive. Not implemented yet: the body is empty.
 */
static
void endif_directive(void)
{
        //expect_newline();
}
745
746 static
747 void parse_line_directive(void)
748 {
749         if(pp_token.type != T_INTEGER) {
750                 parse_error("expected integer");
751         } else {
752                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
753                 next_pp_token();
754         }
755         if(pp_token.type == T_STRING_LITERAL) {
756                 lexer_token.source_position.input_name = pp_token.v.string;
757                 next_pp_token();
758         }
759
760         eat_until_newline();
761 }
762
763 static
764 void parse_preprocessor_identifier(void)
765 {
766         assert(pp_token.type == T_IDENTIFIER);
767         symbol_t *symbol = pp_token.v.symbol;
768
769         switch(symbol->pp_ID) {
770         case TP_include:
771                 printf("include - enable header name parsing!\n");
772                 break;
773         case TP_define:
774                 define_directive();
775                 break;
776         case TP_ifdef:
777                 ifdef_directive(0);
778                 break;
779         case TP_ifndef:
780                 ifdef_directive(1);
781                 break;
782         case TP_endif:
783                 endif_directive();
784                 break;
785         case TP_line:
786                 next_pp_token();
787                 parse_line_directive();
788                 break;
789         case TP_if:
790         case TP_else:
791         case TP_elif:
792         case TP_undef:
793         case TP_error:
794                 error_directive();
795                 break;
796         case TP_pragma:
797                 break;
798         }
799 }
800
801 static
802 void parse_preprocessor_directive()
803 {
804         next_pp_token();
805
806         switch(pp_token.type) {
807         case T_IDENTIFIER:
808                 parse_preprocessor_identifier();
809                 break;
810         case T_INTEGER:
811                 parse_line_directive();
812                 break;
813         default:
814                 parse_error("invalid preprocessor directive");
815                 eat_until_newline();
816                 break;
817         }
818 }
819
/*
 * Building blocks for lexing operators that consist of several characters.
 * They are used inside a `case` of a switch on c and intentionally contain
 * unbalanced braces:
 *
 *   MAYBE_PROLOG     consumes the current character and opens
 *                    `while(1) { switch(c) {` for looking at the next one.
 *   MAYBE(ch, type)  if the next character is ch: consume it, set the
 *                    token type to type and return.
 *   ELSE_CODE(code)  closes the switch and the loop opened by MAYBE_PROLOG.
 *                    Before that it adds trigraph handling (SKIP_TRIGRAPHS),
 *                    a '\\' case that skips a backslash-newline and re-runs
 *                    the switch, and a default case. code runs when none of
 *                    the MAYBE cases matched.
 *   ELSE(type)       ELSE_CODE that sets the token type to type and returns.
 */
#define MAYBE_PROLOG                                       \
                        next_char();                                   \
                        while(1) {                                     \
                                switch(c) {

#define MAYBE(ch, set_type)                                \
                                case ch:                                   \
                                        next_char();                           \
                                        lexer_token.type = set_type;           \
                                        return;

#define ELSE_CODE(code)                                    \
                                SKIP_TRIGRAPHS(,                           \
                                        code;                                  \
                                )                                          \
                                                                                                                   \
                                case '\\':                                 \
                                        next_char();                           \
                                        EAT_NEWLINE(break;)                    \
                                        /* fallthrough */                      \
                                default:                                   \
                                        code;                                  \
                                }                                          \
                        } /* end of while(1) */                        \
                        break;

#define ELSE(set_type)                                     \
                ELSE_CODE(                                         \
                        lexer_token.type = set_type;                   \
                        return;                                        \
                )
851
852 void lexer_next_preprocessing_token(void)
853 {
854         while(1) {
855                 switch(c) {
856                 case ' ':
857                 case '\t':
858                         next_char();
859                         break;
860
861                 MATCH_NEWLINE(
862                         lexer_token.type = '\n';
863                         return;
864                 )
865
866                 SYMBOL_CHARS
867                         parse_symbol();
868                         return;
869
870                 DIGITS
871                         parse_number();
872                         return;
873
874                 case '"':
875                         parse_string_literal();
876                         return;
877
878                 case '\'':
879                         parse_character_constant();
880                         return;
881
882                 case '\\':
883                         next_char();
884                         if(c == '\n') {
885                                 next_char();
886                                 lexer_token.source_position.linenr++;
887                                 break;
888                         } else {
889                                 parse_error("unexpected '\\' found");
890                                 lexer_token.type = T_ERROR;
891                         }
892                         return;
893
894                 case '.':
895                         MAYBE_PROLOG
896                                 case '.':
897                                         MAYBE_PROLOG
898                                         MAYBE('.', T_DOTDOTDOT)
899                                         ELSE_CODE(
900                                                 put_back(c);
901                                                 c = '.';
902                                                 lexer_token.type = '.';
903                                                 return;
904                                         )
905                         ELSE('.')
906                 case '&':
907                         MAYBE_PROLOG
908                         MAYBE('&', T_ANDAND)
909                         MAYBE('=', T_ANDEQUAL)
910                         ELSE('&')
911                 case '*':
912                         MAYBE_PROLOG
913                         MAYBE('=', T_ASTERISKEQUAL)
914                         ELSE('*')
915                 case '+':
916                         MAYBE_PROLOG
917                         MAYBE('+', T_PLUSPLUS)
918                         MAYBE('=', T_PLUSEQUAL)
919                         ELSE('+')
920                 case '-':
921                         MAYBE_PROLOG
922                         MAYBE('>', T_MINUSGREATER)
923                         MAYBE('-', T_MINUSMINUS)
924                         MAYBE('=', T_MINUSEQUAL)
925                         ELSE('-')
926                 case '!':
927                         MAYBE_PROLOG
928                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
929                         ELSE('!')
930                 case '/':
931                         MAYBE_PROLOG
932                         MAYBE('=', T_SLASHEQUAL)
933                                 case '*':
934                                         next_char();
935                                         skip_multiline_comment();
936                                         lexer_next_preprocessing_token();
937                                         return;
938                                 case '/':
939                                         next_char();
940                                         skip_line_comment();
941                                         lexer_next_preprocessing_token();
942                                         return;
943                         ELSE('/')
944                 case '%':
945                         MAYBE_PROLOG
946                         MAYBE('>', T_PERCENTGREATER)
947                         MAYBE('=', T_PERCENTEQUAL)
948                                 case ':':
949                                         MAYBE_PROLOG
950                                                 case '%':
951                                                         MAYBE_PROLOG
952                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
953                                                         ELSE_CODE(
954                                                                 put_back(c);
955                                                                 c = '%';
956                                                                 lexer_token.type = T_PERCENTCOLON;
957                                                                 return;
958                                                         )
959                                         ELSE(T_PERCENTCOLON)
960                         ELSE('%')
961                 case '<':
962                         MAYBE_PROLOG
963                         MAYBE(':', T_LESSCOLON)
964                         MAYBE('%', T_LESSPERCENT)
965                         MAYBE('=', T_LESSEQUAL)
966                                 case '<':
967                                         MAYBE_PROLOG
968                                         MAYBE('=', T_LESSLESSEQUAL)
969                                         ELSE(T_LESSLESS)
970                         ELSE('<')
971                 case '>':
972                         MAYBE_PROLOG
973                         MAYBE('=', T_GREATEREQUAL)
974                                 case '>':
975                                         MAYBE_PROLOG
976                                         MAYBE('=', T_GREATERGREATEREQUAL)
977                                         ELSE(T_GREATERGREATER)
978                         ELSE('>')
979                 case '^':
980                         MAYBE_PROLOG
981                         MAYBE('=', T_CARETEQUAL)
982                         ELSE('^')
983                 case '|':
984                         MAYBE_PROLOG
985                         MAYBE('=', T_PIPEEQUAL)
986                         MAYBE('|', T_PIPEPIPE)
987                         ELSE('|')
988                 case ':':
989                         MAYBE_PROLOG
990                         MAYBE('>', T_COLONGREATER)
991                         ELSE(':')
992                 case '=':
993                         MAYBE_PROLOG
994                         MAYBE('=', T_EQUALEQUAL)
995                         ELSE('=')
996                 case '#':
997                         MAYBE_PROLOG
998                         MAYBE('#', T_HASHHASH)
999                         ELSE('#')
1000
1001                 case '?':
1002                         next_char();
1003                         /* just a simple ? */
1004                         if(c != '?') {
1005                                 lexer_token.type = '?';
1006                                 return;
1007                         }
1008                         /* might be a trigraph */
1009                         next_char();
1010                         if(replace_trigraph()) {
1011                                 break;
1012                         }
1013                         put_back(c);
1014                         c = '?';
1015                         lexer_token.type = '?';
1016                         return;
1017
1018                 case '[':
1019                 case ']':
1020                 case '(':
1021                 case ')':
1022                 case '{':
1023                 case '}':
1024                 case '~':
1025                 case ';':
1026                 case ',':
1027                         lexer_token.type = c;
1028                         next_char();
1029                         return;
1030
1031                 case EOF:
1032                         lexer_token.type = T_EOF;
1033                         return;
1034
1035                 default:
1036                         next_char();
1037                         error_prefix();
1038                         fprintf(stderr, "unknown character '%c' found\n", c);
1039                         lexer_token.type = T_ERROR;
1040                         return;
1041                 }
1042         }
1043 }
1044
1045 void lexer_next_token(void)
1046 {
1047         lexer_next_preprocessing_token();
1048         if(lexer_token.type != '\n')
1049                 return;
1050
1051 newline_found:
1052         do {
1053                 lexer_next_preprocessing_token();
1054         } while(lexer_token.type == '\n');
1055
1056         if(lexer_token.type == '#') {
1057                 parse_preprocessor_directive();
1058                 goto newline_found;
1059         }
1060 }
1061
1062 void init_lexer(void)
1063 {
1064         strset_init(&stringset);
1065 }
1066
1067 void lexer_open_stream(FILE *stream, const char *input_name)
1068 {
1069         input                                  = stream;
1070         lexer_token.source_position.linenr     = 0;
1071         lexer_token.source_position.input_name = input_name;
1072
1073         /* we place a virtual '\n' at the beginning so the lexer knows we're at the
1074          * beginning of a line */
1075         c = '\n';
1076 }
1077
1078 void exit_lexer(void)
1079 {
1080         strset_destroy(&stringset);
1081 }
1082
1083 static __attribute__((unused))
1084 void dbg_pos(const source_position_t source_position)
1085 {
1086         fprintf(stdout, "%s:%d\n", source_position.input_name,
1087                 source_position.linenr);
1088         fflush(stdout);
1089 }