parse wide strings
[cparser] / lexer.c
1 #include <config.h>
2
3 #include "lexer.h"
4 #include "token_t.h"
5 #include "symbol_table_t.h"
6 #include "adt/error.h"
7 #include "adt/strset.h"
8 #include "adt/util.h"
9
10 #include <assert.h>
11 #include <errno.h>
12 #include <string.h>
13 #include <ctype.h>
14
15 //#define DEBUG_CHARS
16 #define MAX_PUTBACK 3
17
18 static int         c;
19 token_t            lexer_token;
20 symbol_t          *symbol_L;
21 static FILE       *input;
22 static char        buf[1024 + MAX_PUTBACK];
23 static const char *bufend;
24 static const char *bufpos;
25 static strset_t    stringset;
26
/**
 * Prints the "<input_name>:<linenr>: Error: " prefix of an error message
 * to stderr (no trailing newline).
 */
static void error_prefix_at(const char *input_name, unsigned linenr)
{
	fprintf(stderr, "%s:%u", input_name, linenr);
	fputs(": Error: ", stderr);
}
31
32 static void error_prefix(void)
33 {
34         error_prefix_at(lexer_token.source_position.input_name,
35                         lexer_token.source_position.linenr);
36 }
37
/**
 * Reports an error at the current source position: prints the error prefix
 * followed by msg and a newline to stderr.
 */
static void parse_error(const char *msg)
{
	error_prefix();
	fputs(msg, stderr);
	fputc('\n', stderr);
}
43
44 static inline void next_real_char(void)
45 {
46         bufpos++;
47         if(bufpos >= bufend) {
48                 size_t s = fread(buf + MAX_PUTBACK, 1, sizeof(buf) - MAX_PUTBACK,
49                                  input);
50                 if(s == 0) {
51                         c = EOF;
52                         return;
53                 }
54                 bufpos = buf + MAX_PUTBACK;
55                 bufend = buf + MAX_PUTBACK + s;
56         }
57         c = *(bufpos);
58 }
59
60 static inline void put_back(int pc)
61 {
62         assert(bufpos >= buf);
63         assert(bufpos < buf+MAX_PUTBACK || *bufpos == pc);
64
65         char *p = buf + (bufpos - buf);
66         *p = pc;
67
68         /* going backwards in the buffer is legal as long as it's not more often
69          * than MAX_PUTBACK */
70         bufpos--;
71
72 #ifdef DEBUG_CHARS
73         printf("putback '%c'\n", pc);
74 #endif
75 }
76
static inline void next_char(void);

/*
 * Expands to the `case` labels for both newline forms inside a switch(c):
 * "\r" (optionally followed by "\n") and a lone "\n". Each form consumes the
 * newline, increments the line number of lexer_token and then executes
 * `code`. `code` has to leave the case itself (break/return/goto),
 * otherwise '\r' falls through into '\n'.
 */
#define MATCH_NEWLINE(code) \
	case '\r': \
		next_char(); \
		if(c == '\n') \
			next_char(); \
		lexer_token.source_position.linenr++; \
		code; \
	case '\n': \
		next_char(); \
		lexer_token.source_position.linenr++; \
		code;
91
/* Asserts that the current character is c_type and advances to the next one.
 * The argument is parenthesized so that any expression can be passed. */
#define eat(c_type)  do { assert(c == (c_type)); next_char(); } while(0)
93
94 static void maybe_concat_lines(void)
95 {
96         eat('\\');
97
98         switch(c) {
99         MATCH_NEWLINE(return;)
100
101         default:
102                 break;
103         }
104
105         put_back(c);
106         c = '\\';
107 }
108
109 static inline void next_char(void)
110 {
111         next_real_char();
112
113 #if 0
114         /* filter trigraphs */
115         if(UNLIKELY(c == '\\')) {
116                 maybe_concat_lines();
117                 goto end_of_next_char;
118         }
119
120         if(LIKELY(c != '?'))
121                 goto end_of_next_char;
122
123         next_real_char();
124         if(LIKELY(c != '?')) {
125                 put_back(c);
126                 c = '?';
127                 goto end_of_next_char;
128         }
129
130         next_real_char();
131         switch(c) {
132         case '=': c = '#'; break;
133         case '(': c = '['; break;
134         case '/': c = '\\'; maybe_concat_lines(); break;
135         case ')': c = ']'; break;
136         case '\'': c = '^'; break;
137         case '<': c = '{'; break;
138         case '!': c = '|'; break;
139         case '>': c = '}'; break;
140         case '-': c = '~'; break;
141         default:
142                 put_back('?');
143                 put_back(c);
144                 c = '?';
145                 break;
146         }
147
148 end_of_next_char:
149 #endif
150         (void) maybe_concat_lines;
151 #ifdef DEBUG_CHARS
152         printf("nchar '%c'\n", c);
153 #else
154         ;
155 #endif
156 }
157
/* `case` labels for every character that may start an identifier:
 * a-z, A-Z and the underscore */
#define SYMBOL_CHARS \
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': \
	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': \
	case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': \
	case 'v': case 'w': case 'x': case 'y': case 'z': \
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': \
	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': \
	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': \
	case 'V': case 'W': case 'X': case 'Y': case 'Z': \
	case '_':
212
/* `case` labels for the decimal digits 0-9 */
#define DIGITS \
	case '0': case '1': case '2': case '3': case '4': \
	case '5': case '6': case '7': case '8': case '9':
224
225 static void parse_symbol(void)
226 {
227         symbol_t *symbol;
228         char     *string;
229
230         obstack_1grow(&symbol_obstack, c);
231         next_char();
232
233         while(1) {
234                 switch(c) {
235                 DIGITS
236                 SYMBOL_CHARS
237                         obstack_1grow(&symbol_obstack, c);
238                         next_char();
239                         break;
240
241                 default:
242                         goto end_symbol;
243                 }
244         }
245
246 end_symbol:
247         obstack_1grow(&symbol_obstack, '\0');
248
249         string = obstack_finish(&symbol_obstack);
250         symbol = symbol_table_insert(string);
251
252         lexer_token.type     = symbol->ID;
253         lexer_token.v.symbol = symbol;
254
255         if(symbol->string != string) {
256                 obstack_free(&symbol_obstack, string);
257         }
258 }
259
260 static void parse_integer_suffix(void)
261 {
262         if(c == 'U' || c == 'U') {
263                 /* TODO do something with the suffixes... */
264                 next_char();
265                 if(c == 'L' || c == 'l') {
266                         next_char();
267                         if(c == 'L' || c == 'l') {
268                                 next_char();
269                         }
270                 }
271         } else if(c == 'l' || c == 'L') {
272                 next_char();
273                 if(c == 'l' || c == 'L') {
274                         next_char();
275                         if(c == 'u' || c == 'U') {
276                                 next_char();
277                         }
278                 } else if(c == 'u' || c == 'U') {
279                         next_char();
280                 }
281         }
282 }
283
284 static void parse_floating_suffix(void)
285 {
286         switch(c) {
287         /* TODO: do something usefull with the suffixes... */
288         case 'f':
289         case 'F':
290         case 'l':
291         case 'L':
292                 next_char();
293                 break;
294         default:
295                 break;
296         }
297 }
298
299 static void parse_number_hex(void)
300 {
301         assert(c == 'x' || c == 'X');
302         next_char();
303
304         if (!isdigit(c) &&
305                 !('A' <= c && c <= 'F') &&
306                 !('a' <= c && c <= 'f')) {
307                 parse_error("premature end of hex number literal");
308                 lexer_token.type = T_ERROR;
309                 return;
310         }
311
312         int value = 0;
313         while(1) {
314                 if (isdigit(c)) {
315                         value = 16 * value + c - '0';
316                 } else if ('A' <= c && c <= 'F') {
317                         value = 16 * value + c - 'A' + 10;
318                 } else if ('a' <= c && c <= 'f') {
319                         value = 16 * value + c - 'a' + 10;
320                 } else {
321                         parse_integer_suffix();
322
323                         lexer_token.type       = T_INTEGER;
324                         lexer_token.v.intvalue = value;
325                         return;
326                 }
327                 next_char();
328         }
329
330         if(c == '.' || c == 'p' || c == 'P') {
331                 next_char();
332                 panic("Hex floating point numbers not implemented yet");
333         }
334 }
335
336 static void parse_number_oct(void)
337 {
338         int value = 0;
339         while(c >= '0' && c <= '7') {
340                 value = 8 * value + c - '0';
341                 next_char();
342         }
343         if (c == '8' || c == '9') {
344                 parse_error("invalid octal number");
345                 lexer_token.type = T_ERROR;
346                 return;
347         }
348
349         lexer_token.type       = T_INTEGER;
350         lexer_token.v.intvalue = value;
351
352         parse_integer_suffix();
353 }
354
355 static void parse_floatingpoint_exponent(long double value)
356 {
357         unsigned int expo = 0;
358         long double  factor = 10.;
359
360         if(c == '-') {
361                 next_char();
362                 factor = 0.1;
363         } else if(c == '+') {
364                 next_char();
365         }
366
367         while(c >= '0' && c <= '9') {
368                 expo = 10 * expo + (c - '0');
369                 next_char();
370         }
371
372         while(1) {
373                 if(expo & 1)
374                         value *= factor;
375                 expo >>= 1;
376                 if(expo == 0)
377                         break;
378                 factor *= factor;
379         }
380
381         lexer_token.type         = T_FLOATINGPOINT;
382         lexer_token.v.floatvalue = value;
383
384         parse_floating_suffix();
385 }
386
387 static void parse_floatingpoint_fract(int integer_part)
388 {
389         long double value  = integer_part;
390         long double factor = 1.;
391
392         while(c >= '0' && c <= '9') {
393                 factor *= 0.1;
394                 value  += (c - '0') * factor;
395                 next_char();
396         }
397
398         if(c == 'e' || c == 'E') {
399                 next_char();
400                 parse_floatingpoint_exponent(value);
401                 return;
402         }
403
404         lexer_token.type         = T_FLOATINGPOINT;
405         lexer_token.v.floatvalue = value;
406
407         parse_floating_suffix();
408 }
409
410 static void parse_number_dec(void)
411 {
412         int value = 0;
413
414         while(isdigit(c)) {
415                 value = 10 * value + c - '0';
416                 next_char();
417         }
418
419         if(c == '.') {
420                 next_char();
421                 parse_floatingpoint_fract(value);
422                 return;
423         }
424         if(c == 'e' || c == 'E') {
425                 next_char();
426                 parse_floatingpoint_exponent(value);
427                 return;
428         }
429         parse_integer_suffix();
430
431         lexer_token.type       = T_INTEGER;
432         lexer_token.v.intvalue = value;
433 }
434
435 static void parse_number(void)
436 {
437         if (c == '0') {
438                 next_char();
439                 switch (c) {
440                         case 'X':
441                         case 'x':
442                                 parse_number_hex();
443                                 break;
444                         case '0':
445                         case '1':
446                         case '2':
447                         case '3':
448                         case '4':
449                         case '5':
450                         case '6':
451                         case '7':
452                                 parse_number_oct();
453                                 break;
454                         case '.':
455                                 next_char();
456                                 parse_floatingpoint_fract(0);
457                                 break;
458                         case 'e':
459                         case 'E':
460                                 parse_floatingpoint_exponent(0);
461                                 break;
462                         case '8':
463                         case '9':
464                                 next_char();
465                                 parse_error("invalid octal number");
466                                 lexer_token.type = T_ERROR;
467                                 return;
468                         default:
469                                 put_back(c);
470                                 c = '0';
471                                 parse_number_dec();
472                                 return;
473                 }
474         } else {
475                 parse_number_dec();
476         }
477 }
478
479 static int parse_octal_sequence(void)
480 {
481         int value = 0;
482         while(1) {
483                 if(c < '0' || c > '7')
484                         break;
485                 value = 8 * value + c - '0';
486                 next_char();
487         }
488
489         return value;
490 }
491
492 static int parse_hex_sequence(void)
493 {
494         int value = 0;
495         while(1) {
496                 if (c >= '0' && c <= '9') {
497                         value = 16 * value + c - '0';
498                 } else if ('A' <= c && c <= 'F') {
499                         value = 16 * value + c - 'A' + 10;
500                 } else if ('a' <= c && c <= 'f') {
501                         value = 16 * value + c - 'a' + 10;
502                 } else {
503                         break;
504                 }
505                 next_char();
506         }
507
508         return value;
509 }
510
511 static int parse_escape_sequence(void)
512 {
513         eat('\\');
514
515         int ec = c;
516         next_char();
517
518         switch(ec) {
519         case '"':  return '"';
520         case '\'': return'\'';
521         case '\\': return '\\';
522         case '?': return '\?';
523         case 'a': return '\a';
524         case 'b': return '\b';
525         case 'f': return '\f';
526         case 'n': return '\n';
527         case 'r': return '\r';
528         case 't': return '\t';
529         case 'v': return '\v';
530         case 'x':
531                 return parse_hex_sequence();
532         case '0':
533         case '1':
534         case '2':
535         case '3':
536         case '4':
537         case '5':
538         case '6':
539         case '7':
540                 return parse_octal_sequence();
541         case EOF:
542                 parse_error("reached end of file while parsing escape sequence");
543                 return EOF;
544         default:
545                 parse_error("unknown escape sequence");
546                 return EOF;
547         }
548 }
549
550 const char *concat_strings(const char *s1, const char *s2)
551 {
552         size_t  len1   = strlen(s1);
553         size_t  len2   = strlen(s2);
554
555         char   *concat = obstack_alloc(&symbol_obstack, len1 + len2 + 1);
556         memcpy(concat, s1, len1);
557         memcpy(concat + len1, s2, len2 + 1);
558
559         const char *result = strset_insert(&stringset, concat);
560         if(result != concat) {
561                 obstack_free(&symbol_obstack, concat);
562         }
563
564         return result;
565 }
566
567 static void parse_string_literal(void)
568 {
569         unsigned    start_linenr = lexer_token.source_position.linenr;
570         char       *string;
571         const char *result;
572
573         assert(c == '"');
574         next_char();
575
576         int tc;
577         while(1) {
578                 switch(c) {
579                 case '\\':
580                         tc = parse_escape_sequence();
581                         obstack_1grow(&symbol_obstack, tc);
582                         break;
583
584                 case EOF:
585                         error_prefix_at(lexer_token.source_position.input_name,
586                                         start_linenr);
587                         fprintf(stderr, "string has no end\n");
588                         lexer_token.type = T_ERROR;
589                         return;
590
591                 case '"':
592                         next_char();
593                         goto end_of_string;
594
595                 default:
596                         obstack_1grow(&symbol_obstack, c);
597                         next_char();
598                         break;
599                 }
600         }
601
602 end_of_string:
603
604         /* TODO: concatenate multiple strings separated by whitespace... */
605
606         /* add finishing 0 to the string */
607         obstack_1grow(&symbol_obstack, '\0');
608         string = obstack_finish(&symbol_obstack);
609
610         /* check if there is already a copy of the string */
611         result = strset_insert(&stringset, string);
612         if(result != string) {
613                 obstack_free(&symbol_obstack, string);
614         }
615
616         lexer_token.type     = T_STRING_LITERAL;
617         lexer_token.v.string = result;
618 }
619
620 static void parse_character_constant(void)
621 {
622         eat('\'');
623
624         int found_char = 0;
625         while(1) {
626                 switch(c) {
627                 case '\\':
628                         found_char = parse_escape_sequence();
629                         break;
630
631                 MATCH_NEWLINE(
632                         parse_error("newline while parsing character constant");
633                         break;
634                 )
635
636                 case '\'':
637                         next_char();
638                         goto end_of_char_constant;
639
640                 case EOF:
641                         parse_error("EOF while parsing character constant");
642                         lexer_token.type = T_ERROR;
643                         return;
644
645                 default:
646                         if(found_char != 0) {
647                                 parse_error("more than 1 characters in character "
648                                             "constant");
649                                 goto end_of_char_constant;
650                         } else {
651                                 found_char = c;
652                                 next_char();
653                         }
654                         break;
655                 }
656         }
657
658 end_of_char_constant:
659         lexer_token.type       = T_INTEGER;
660         lexer_token.v.intvalue = found_char;
661 }
662
663 static void skip_multiline_comment(void)
664 {
665         unsigned start_linenr = lexer_token.source_position.linenr;
666
667         while(1) {
668                 switch(c) {
669                 case '*':
670                         next_char();
671                         if(c == '/') {
672                                 next_char();
673                                 return;
674                         }
675                         break;
676
677                 MATCH_NEWLINE(break;)
678
679                 case EOF:
680                         error_prefix_at(lexer_token.source_position.input_name,
681                                         start_linenr);
682                         fprintf(stderr, "at end of file while looking for comment end\n");
683                         return;
684
685                 default:
686                         next_char();
687                         break;
688                 }
689         }
690 }
691
692 static void skip_line_comment(void)
693 {
694         while(1) {
695                 switch(c) {
696                 case EOF:
697                         return;
698
699                 case '\n':
700                 case '\r':
701                         return;
702
703                 default:
704                         next_char();
705                         break;
706                 }
707         }
708 }
709
710 static token_t pp_token;
711
712 static inline void next_pp_token(void)
713 {
714         lexer_next_preprocessing_token();
715         pp_token = lexer_token;
716 }
717
718 static void eat_until_newline(void)
719 {
720         while(pp_token.type != '\n' && pp_token.type != T_EOF) {
721                 next_pp_token();
722         }
723 }
724
/**
 * Handles an #error directive: prints an error message and skips the
 * remaining tokens of the directive line, so they are not handed to the
 * parser as ordinary tokens.
 */
static void error_directive(void)
{
	error_prefix();
	fprintf(stderr, "#error directive: \n");

	/* parse pp-tokens until new-line */
	eat_until_newline();
}
732
733 static void define_directive(void)
734 {
735         lexer_next_preprocessing_token();
736         if(lexer_token.type != T_IDENTIFIER) {
737                 parse_error("expected identifier after #define\n");
738                 eat_until_newline();
739         }
740 }
741
/**
 * Placeholder for #ifdef/#ifndef: reads one preprocessing token and does
 * nothing else yet.
 *
 * @param is_ifndef  non-zero for #ifndef, zero for #ifdef (currently unused)
 */
static void ifdef_directive(int is_ifndef)
{
	/* TODO: expect an identifier followed by a newline */
	lexer_next_preprocessing_token();

	(void) is_ifndef;
}
749
/**
 * Placeholder for #endif: does nothing yet.
 */
static void endif_directive(void)
{
	/* TODO: expect a newline */
}
754
755 static void parse_line_directive(void)
756 {
757         if(pp_token.type != T_INTEGER) {
758                 parse_error("expected integer");
759         } else {
760                 lexer_token.source_position.linenr = pp_token.v.intvalue - 1;
761                 next_pp_token();
762         }
763         if(pp_token.type == T_STRING_LITERAL) {
764                 lexer_token.source_position.input_name = pp_token.v.string;
765                 next_pp_token();
766         }
767
768         eat_until_newline();
769 }
770
771 static void parse_preprocessor_identifier(void)
772 {
773         assert(pp_token.type == T_IDENTIFIER);
774         symbol_t *symbol = pp_token.v.symbol;
775
776         switch(symbol->pp_ID) {
777         case TP_include:
778                 printf("include - enable header name parsing!\n");
779                 break;
780         case TP_define:
781                 define_directive();
782                 break;
783         case TP_ifdef:
784                 ifdef_directive(0);
785                 break;
786         case TP_ifndef:
787                 ifdef_directive(1);
788                 break;
789         case TP_endif:
790                 endif_directive();
791                 break;
792         case TP_line:
793                 next_pp_token();
794                 parse_line_directive();
795                 break;
796         case TP_if:
797         case TP_else:
798         case TP_elif:
799         case TP_undef:
800         case TP_error:
801                 error_directive();
802                 break;
803         case TP_pragma:
804                 break;
805         }
806 }
807
808 static void parse_preprocessor_directive(void)
809 {
810         next_pp_token();
811
812         switch(pp_token.type) {
813         case T_IDENTIFIER:
814                 parse_preprocessor_identifier();
815                 break;
816         case T_INTEGER:
817                 parse_line_directive();
818                 break;
819         default:
820                 parse_error("invalid preprocessor directive");
821                 eat_until_newline();
822                 break;
823         }
824 }
825
/*
 * Helper macros for tokens consisting of several punctuation characters.
 * They are used inside a `case` of a switch(c) and have to be combined as
 *
 *     MAYBE_PROLOG  MAYBE(...)*  ELSE(...) | ELSE_CODE(...)
 *
 * MAYBE_PROLOG consumes the current character and opens a loop with a nested
 * switch(c) on the following character.
 */
#define MAYBE_PROLOG \
	next_char(); \
	while(1) { \
		switch(c) {

/* if the following character is ch: consume it, set the token type to
 * set_type and return from the enclosing function */
#define MAYBE(ch, set_type) \
		case ch: \
			next_char(); \
			lexer_token.type = set_type; \
			return;

/* any other following character: execute code, then close the switch and
 * the loop opened by MAYBE_PROLOG */
#define ELSE_CODE(code) \
		default: \
			code; \
		} \
	} /* end of while(1) */ \
	break;

/* any other following character: set the token type to set_type and return
 * from the enclosing function */
#define ELSE(set_type) \
	ELSE_CODE( \
		lexer_token.type = set_type; \
		return; \
	)
849
850 void lexer_next_preprocessing_token(void)
851 {
852         while(1) {
853                 switch(c) {
854                 case ' ':
855                 case '\t':
856                         next_char();
857                         break;
858
859                 MATCH_NEWLINE(
860                         lexer_token.type = '\n';
861                         return;
862                 )
863
864                 SYMBOL_CHARS
865                         parse_symbol();
866                         /* might be a wide string ( L"string" ) */
867                         if(c == '"' && (lexer_token.type == T_IDENTIFIER &&
868                            lexer_token.v.symbol == symbol_L)) {
869                                 parse_string_literal();
870                                 return;
871                         }
872                         return;
873
874                 DIGITS
875                         parse_number();
876                         return;
877
878                 case '"':
879                         parse_string_literal();
880                         return;
881
882                 case '\'':
883                         parse_character_constant();
884                         return;
885
886                 case '.':
887                         MAYBE_PROLOG
888                                 case '.':
889                                         MAYBE_PROLOG
890                                         MAYBE('.', T_DOTDOTDOT)
891                                         ELSE_CODE(
892                                                 put_back(c);
893                                                 c = '.';
894                                                 lexer_token.type = '.';
895                                                 return;
896                                         )
897                         ELSE('.')
898                 case '&':
899                         MAYBE_PROLOG
900                         MAYBE('&', T_ANDAND)
901                         MAYBE('=', T_ANDEQUAL)
902                         ELSE('&')
903                 case '*':
904                         MAYBE_PROLOG
905                         MAYBE('=', T_ASTERISKEQUAL)
906                         ELSE('*')
907                 case '+':
908                         MAYBE_PROLOG
909                         MAYBE('+', T_PLUSPLUS)
910                         MAYBE('=', T_PLUSEQUAL)
911                         ELSE('+')
912                 case '-':
913                         MAYBE_PROLOG
914                         MAYBE('>', T_MINUSGREATER)
915                         MAYBE('-', T_MINUSMINUS)
916                         MAYBE('=', T_MINUSEQUAL)
917                         ELSE('-')
918                 case '!':
919                         MAYBE_PROLOG
920                         MAYBE('=', T_EXCLAMATIONMARKEQUAL)
921                         ELSE('!')
922                 case '/':
923                         MAYBE_PROLOG
924                         MAYBE('=', T_SLASHEQUAL)
925                                 case '*':
926                                         next_char();
927                                         skip_multiline_comment();
928                                         lexer_next_preprocessing_token();
929                                         return;
930                                 case '/':
931                                         next_char();
932                                         skip_line_comment();
933                                         lexer_next_preprocessing_token();
934                                         return;
935                         ELSE('/')
936                 case '%':
937                         MAYBE_PROLOG
938                         MAYBE('>', T_PERCENTGREATER)
939                         MAYBE('=', T_PERCENTEQUAL)
940                                 case ':':
941                                         MAYBE_PROLOG
942                                                 case '%':
943                                                         MAYBE_PROLOG
944                                                         MAYBE(':', T_PERCENTCOLONPERCENTCOLON)
945                                                         ELSE_CODE(
946                                                                 put_back(c);
947                                                                 c = '%';
948                                                                 lexer_token.type = T_PERCENTCOLON;
949                                                                 return;
950                                                         )
951                                         ELSE(T_PERCENTCOLON)
952                         ELSE('%')
953                 case '<':
954                         MAYBE_PROLOG
955                         MAYBE(':', T_LESSCOLON)
956                         MAYBE('%', T_LESSPERCENT)
957                         MAYBE('=', T_LESSEQUAL)
958                                 case '<':
959                                         MAYBE_PROLOG
960                                         MAYBE('=', T_LESSLESSEQUAL)
961                                         ELSE(T_LESSLESS)
962                         ELSE('<')
963                 case '>':
964                         MAYBE_PROLOG
965                         MAYBE('=', T_GREATEREQUAL)
966                                 case '>':
967                                         MAYBE_PROLOG
968                                         MAYBE('=', T_GREATERGREATEREQUAL)
969                                         ELSE(T_GREATERGREATER)
970                         ELSE('>')
971                 case '^':
972                         MAYBE_PROLOG
973                         MAYBE('=', T_CARETEQUAL)
974                         ELSE('^')
975                 case '|':
976                         MAYBE_PROLOG
977                         MAYBE('=', T_PIPEEQUAL)
978                         MAYBE('|', T_PIPEPIPE)
979                         ELSE('|')
980                 case ':':
981                         MAYBE_PROLOG
982                         MAYBE('>', T_COLONGREATER)
983                         ELSE(':')
984                 case '=':
985                         MAYBE_PROLOG
986                         MAYBE('=', T_EQUALEQUAL)
987                         ELSE('=')
988                 case '#':
989                         MAYBE_PROLOG
990                         MAYBE('#', T_HASHHASH)
991                         ELSE('#')
992
993                 case '?':
994                 case '[':
995                 case ']':
996                 case '(':
997                 case ')':
998                 case '{':
999                 case '}':
1000                 case '~':
1001                 case ';':
1002                 case ',':
1003                 case '\\':
1004                         lexer_token.type = c;
1005                         next_char();
1006                         return;
1007
1008                 case EOF:
1009                         lexer_token.type = T_EOF;
1010                         return;
1011
1012                 default:
1013                         next_char();
1014                         error_prefix();
1015                         fprintf(stderr, "unknown character '%c' found\n", c);
1016                         lexer_token.type = T_ERROR;
1017                         return;
1018                 }
1019         }
1020 }
1021
1022 void lexer_next_token(void)
1023 {
1024         lexer_next_preprocessing_token();
1025         if(lexer_token.type != '\n')
1026                 return;
1027
1028 newline_found:
1029         do {
1030                 lexer_next_preprocessing_token();
1031         } while(lexer_token.type == '\n');
1032
1033         if(lexer_token.type == '#') {
1034                 parse_preprocessor_directive();
1035                 goto newline_found;
1036         }
1037 }
1038
1039 void init_lexer(void)
1040 {
1041         strset_init(&stringset);
1042 }
1043
1044 void lexer_open_stream(FILE *stream, const char *input_name)
1045 {
1046         input                                  = stream;
1047         lexer_token.source_position.linenr     = 0;
1048         lexer_token.source_position.input_name = input_name;
1049
1050         symbol_L = symbol_table_insert("L");
1051
1052         /* place a virtual \n at the beginning so the lexer knows that we're
1053          * at the beginning of a line */
1054         c = '\n';
1055 }
1056
1057 void exit_lexer(void)
1058 {
1059         strset_destroy(&stringset);
1060 }
1061
1062 static __attribute__((unused))
1063 void dbg_pos(const source_position_t source_position)
1064 {
1065         fprintf(stdout, "%s:%d\n", source_position.input_name,
1066                 source_position.linenr);
1067         fflush(stdout);
1068 }