proper newline handling

author Matthias Braun <matze@braunis.de>

Sun, 10 Jun 2007 14:07:13 +0000 (14:07 +0000)

committer Matthias Braun <matze@braunis.de>

Sun, 10 Jun 2007 14:07:13 +0000 (14:07 +0000)
author Matthias Braun <matze@braunis.de>
Sun, 10 Jun 2007 14:07:13 +0000 (14:07 +0000)
committer Matthias Braun <matze@braunis.de>
Sun, 10 Jun 2007 14:07:13 +0000 (14:07 +0000)
diff --git a/TODO b/TODO

index 1dd8883..5c0ba24 100644 (file)
--- a/TODO
+++ b/TODO
@@ -2,4 +2,5 @@ Lexer:
  - proper support of preprocessor
  - parse float numbers
  - octal&hex escape sequences
-- handling of newline is probably not correct for all strange OSes
+- wide string constants
+- proper handling of different file encodings
diff --git a/lexer.c b/lexer.c

index 8968131..eeab9ef 100644 (file)
--- a/lexer.c
+++ b/lexer.c
@@ -106,7 +106,20 @@ int replace_trigraph(lexer_t *this)
                 put_back(this, '?');                   \
                 put_back(this, this->c);               \
                 this->c = '?';                         \
-               no_trigraph_code;                      \
+               no_trigraph_code;
+
+#define EAT_NEWLINE(newline_code)              \
+       if(this->c == '\r') {                      \
+               next_char(this);                       \
+               if(this->c == '\n')                    \
+                       next_char(this);                   \
+               this->source_position.linenr++;        \
+               newline_code;                          \
+       } else if(this->c == '\n') {               \
+               next_char(this);                       \
+               this->source_position.linenr++;        \
+               newline_code;                          \
+       }
  
  static
  void parse_symbol(lexer_t *this, token_t *token)
@@ -121,11 +134,8 @@ void parse_symbol(lexer_t *this, token_t *token)
                 switch(this->c) {
                 case '\\':
                         next_char(this);
-                       if(this->c == '\n') {
-                               next_char(this);
-                               this->source_position.linenr++;
-                               break;
-                       }
+                       EAT_NEWLINE(break;)
+                       goto end_symbol;
  
                 case 'A' ... 'Z':
                 case 'a' ... 'z':
@@ -286,51 +296,49 @@ void parse_number(lexer_t *this, token_t *token)
  static
  int parse_escape_sequence(lexer_t *this)
  {
-       int c = this->c;
-       next_char(this);
+       while(1) {
+               int c = this->c;
+               next_char(this);
  
-       switch(c) {
-       case '"': return '"';
-       case '\'': return'\'';
-       case '\\':
-               if(this->c == '\n') {
-                       this->source_position.linenr++;
+               switch(c) {
+               case '"': return '"';
+               case '\'': return'\'';
+               case '\\':
+                       EAT_NEWLINE(break;)
+                       return '\\';
+               case 'a': return '\a';
+               case 'b': return '\b';
+               case 'f': return '\f';
+               case 'n': return '\n';
+               case 'r': return '\r';
+               case 't': return '\t';
+               case 'v': return '\v';
+               case 'x': /* TODO parse hex number ... */
+                       parse_error(this, "hex escape sequences not implemented yet");
+                       return EOF;
+               case 0 ... 8: /* TODO parse octal number ... */
+                       parse_error(this, "octal escape sequences not implemented yet");
+                       return EOF;
+               case '?':
+                       if(this->c != '?') {
+                               return '?';
+                       }
+                       /* might be a trigraph */
                         next_char(this);
-                       return parse_escape_sequence(this);
-               }
-               return '\\';
-       case 'a': return '\a';
-       case 'b': return '\b';
-       case 'f': return '\f';
-       case 'n': return '\n';
-       case 'r': return '\r';
-       case 't': return '\t';
-       case 'v': return '\v';
-       case 'x': /* TODO parse hex number ... */
-               parse_error(this, "hex escape sequences not implemented yet");
-               return EOF;
-       case 0 ... 8: /* TODO parse octal number ... */
-               parse_error(this, "octal escape sequences not implemented yet");
-               return EOF;
-       case '?':
-               if(this->c != '?') {
+                       if(replace_trigraph(this)) {
+                               break;
+                       }
+                       put_back(this, this->c);
+                       this->c = '?';
                         return '?';
-               }
-               /* might be a trigraph */
-               next_char(this);
-               if(replace_trigraph(this)) {
-                       return parse_escape_sequence(this);
-               }
-               put_back(this, this->c);
-               this->c = '?';
-               return '?';
  
-       case EOF:
-               parse_error(this, "reached end of file while parsing escape sequence");
-               return EOF;
-       default:
-               parse_error(this, "unknown escape sequence\n");
-               return EOF;
+               case EOF:
+                       parse_error(this, "reached end of file while parsing escape sequence");
+                       return EOF;
+               default:
+                       parse_error(this, "unknown escape sequence");
+                       return EOF;
+               }
         }
  }
  
@@ -354,11 +362,7 @@ void parse_string_literal(lexer_t *this, token_t *token)
  
                 case '\\':
                         next_char(this);
-                       if(this->c == '\n') {
-                               next_char(this);
-                               this->source_position.linenr++;
-                               break;
-                       }
+                       EAT_NEWLINE(break;)
                         int c = parse_escape_sequence(this);
                         obstack_1grow(&symbol_obstack, c);
                         break;
@@ -399,6 +403,19 @@ end_of_string:
         token->v.string = result;
  }
  
+#define MATCH_NEWLINE(code)                 \
+       case '\r':                              \
+               next_char(this);                    \
+               if(this->c == '\n') {               \
+                       next_char(this);                \
+               }                                   \
+               this->source_position.linenr++;     \
+               code;                               \
+       case '\n':                              \
+               next_char(this);                    \
+               this->source_position.linenr++;     \
+               code;
+
  static
  void parse_character_constant(lexer_t *this, token_t *token)
  {
@@ -415,19 +432,14 @@ void parse_character_constant(lexer_t *this, token_t *token)
  
                 case '\\':
                         next_char(this);
-                       if(this->c == '\n') {
-                               next_char(this);
-                               this->source_position.linenr++;
-                               break;
-                       }
+                       EAT_NEWLINE(break;)
                         found_char = '\\';
                         break;
  
-               case '\n':
-                       next_char(this);
+               MATCH_NEWLINE(
                         parse_error(this, "newline while parsing character constant");
-                       this->source_position.linenr++;
                         break;
+               )
  
                 case '\'':
                         next_char(this);
@@ -482,10 +494,8 @@ void skip_multiline_comment(lexer_t *this)
                          * anyway */
                         break;
  
-               case '\n':
-                       this->source_position.linenr++;
-                       next_char(this);
-                       break;
+               MATCH_NEWLINE(break;)
+
                 case EOF:
                         error_prefix_at(this, this->source_position.input_name,
                                         start_linenr);
@@ -524,8 +534,10 @@ void skip_line_comment(lexer_t *this)
                         break;
  
                 case EOF:
+               case '\r':
                 case '\n':
                         return;
+
                 default:
                         next_char(this);
                         break;
@@ -533,17 +545,6 @@ void skip_line_comment(lexer_t *this)
         }
  }
  
-#if 0
-static
-void eat_until_newline(lexer_t *this)
-{
-       while(this->c != '\n') {
-               next_char(this);
-       }
-}
-#endif
-
-#if 0
  static
  void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
  {
@@ -553,7 +554,6 @@ void parse_preprocessor_directive(lexer_t *this, token_t *result_token)
                 next_char(this);
         }
  }
-#endif
  
  void preprocessor_next_token(lexer_t *this, token_t *token)
  {
@@ -572,18 +572,16 @@ void preprocessor_next_token(lexer_t *this, token_t *token)
  
  void lexer_next_token(lexer_t *this, token_t *token)
  {
+       int line_begin = 0;
+
         while(1) {
                 switch(this->c) {
                 case ' ':
                 case '\t':
-               case '\r':
                         next_char(this);
                         break;
  
-               case '\n':
-                       this->source_position.linenr++;
-                       next_char(this);
-                       break;
+               MATCH_NEWLINE(break;)
  
                 case 'A' ... 'Z':
                 case 'a' ... 'z':
@@ -753,17 +751,15 @@ void lexer_next_token(lexer_t *this, token_t *token)
                 case '#':
                         MAYBE_PROLOG
                         MAYBE('#', T_HASHHASH)
-#if 0
-                       else {
+                       ELSE_CODE(
                                 if(line_begin) {
                                         parse_preprocessor_directive(this, token);
                                         return;
                                 } else {
                                         token->type = '#';
+                                       return;
                                 }
-#else
-                       ELSE('#')
-#endif
+                       )
  
                 case '?':
                         next_char(this);
author	Matthias Braun <matze@braunis.de>
	Sun, 10 Jun 2007 14:07:13 +0000 (14:07 +0000)
committer	Matthias Braun <matze@braunis.de>
	Sun, 10 Jun 2007 14:07:13 +0000 (14:07 +0000)