nsz Git - cparser/blob - input.c

   1 #include "config.h"
   2
   3 #include "input.h"
   4
   5 #include <ctype.h>
   6 #include "lexer.h"
   7 #include "diagnostic.h"
   8
   9 typedef size_t (*decode_func)(input_t *input, utf32 *buffer, size_t buffer_size);
  10
  11 struct input_t {
  12         FILE       *file;
  13         decode_func decode;
  14
  15         /* state for utf-8 decoder */
  16         utf32  utf8_part_decoded_min_code;
  17         utf32  utf8_part_decoded_char;
  18         size_t utf8_part_decoded_rest_len;
  19 };
  20
  21 static input_error_callback_func input_error;
  22
  23 void set_input_error_callback(input_error_callback_func new_func)
  24 {
  25         input_error = new_func;
  26 }
  27
  28 static size_t read_block(FILE *file, unsigned char *const read_buf,
  29                          size_t const n)
  30 {
  31         size_t const s = fread(read_buf, 1, n, file);
  32         if (s == 0) {
  33                 /* on OS/X ferror appears to return true on eof as well when running
  34                  * the application in gdb... */
  35                 if (!feof(file) && ferror(file))
  36                         input_error(0, 0, "read from input failed");
  37                 return 0;
  38         }
  39         return s;
  40 }
  41
  42 static size_t decode_iso_8859_1(input_t *input, utf32 *buffer,
  43                                 size_t buffer_size)
  44 {
  45         unsigned char read_buf[buffer_size];
  46         size_t const s = read_block(input->file, read_buf, sizeof(read_buf));
  47
  48         unsigned char const *src = read_buf;
  49         unsigned char const *end = read_buf + s;
  50         utf32               *dst = buffer;
  51         while (src != end)
  52                 *dst++ = *src++;
  53
  54         return s;
  55 }
  56
  57 static size_t decode_iso_8859_15(input_t *input, utf32 *buffer,
  58                                  size_t buffer_size)
  59 {
  60         unsigned char read_buf[buffer_size];
  61         size_t const s = read_block(input->file, read_buf, sizeof(read_buf));
  62
  63         unsigned char const *src = read_buf;
  64         unsigned char const *end = read_buf + s;
  65         utf32               *dst = buffer;
  66         while (src != end) {
  67                 utf32 tc = *src++;
  68                 switch (tc) {
  69                         case 0xA4: tc = 0x20AC; break; // €
  70                         case 0xA6: tc = 0x0160; break; // Š
  71                         case 0xA8: tc = 0x0161; break; // š
  72                         case 0xB4: tc = 0x017D; break; // Ž
  73                         case 0xB8: tc = 0x017E; break; // ž
  74                         case 0xBC: tc = 0x0152; break; // Œ
  75                         case 0xBD: tc = 0x0153; break; // œ
  76                         case 0xBE: tc = 0x0178; break; // Ÿ
  77                 }
  78                 *dst++ = tc;
  79         }
  80
  81         return s;
  82 }
  83
  84 static size_t decode_utf8(input_t *input, utf32 *buffer, size_t buffer_size)
  85 {
  86         unsigned char read_buf[buffer_size];
  87
  88         while (true) {
  89                 size_t const s = read_block(input->file, read_buf, sizeof(read_buf));
  90                 if (s == 0) {
  91                         if (input->utf8_part_decoded_rest_len > 0)
  92                                 input_error(0, 0, "incomplete input char at end of input");
  93                         return 0;
  94                 }
  95
  96                 unsigned char const *src = read_buf;
  97                 unsigned char const *end = read_buf + s;
  98                 utf32               *dst = buffer;
  99                 utf32                decoded;
 100                 utf32                min_code;
 101
 102                 if (input->utf8_part_decoded_rest_len != 0) {
 103                         min_code              = input->utf8_part_decoded_min_code;
 104                         decoded               = input->utf8_part_decoded_char;
 105                         size_t const rest_len = input->utf8_part_decoded_rest_len;
 106                         input->utf8_part_decoded_rest_len = 0;
 107                         switch (rest_len) {
 108                                 case 4:  goto realign;
 109                                 case 3:  goto three_more;
 110                                 case 2:  goto two_more;
 111                                 default: goto one_more;
 112                         }
 113                 }
 114
 115                 while (src != end) {
 116                         if ((*src & 0x80) == 0) {
 117                                 decoded = *src++;
 118                         } else if ((*src & 0xE0) == 0xC0) {
 119                                 min_code = 0x80;
 120                                 decoded  = *src++ & 0x1F;
 121 one_more:
 122                                 if (src == end) {
 123                                         input->utf8_part_decoded_min_code = min_code;
 124                                         input->utf8_part_decoded_char     = decoded;
 125                                         input->utf8_part_decoded_rest_len = 1;
 126                                         break;
 127                                 }
 128                                 if ((*src & 0xC0) == 0x80) {
 129                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 130                                 } else {
 131                                         goto invalid_char;
 132                                 }
 133                                 if (decoded < min_code                      ||
 134                                                 decoded > 0x10FFFF                      ||
 135                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
 136                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
 137                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
 138                                         input_error(0, 0, "invalid byte sequence in input");
 139                                 }
 140                         } else if ((*src & 0xF0) == 0xE0) {
 141                                 min_code = 0x800;
 142                                 decoded  = *src++ & 0x0F;
 143 two_more:
 144                                 if (src == end) {
 145                                         input->utf8_part_decoded_min_code = min_code;
 146                                         input->utf8_part_decoded_char     = decoded;
 147                                         input->utf8_part_decoded_rest_len = 2;
 148                                         break;
 149                                 }
 150                                 if ((*src & 0xC0) == 0x80) {
 151                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 152                                 } else {
 153                                         goto invalid_char;
 154                                 }
 155                                 goto one_more;
 156                         } else if ((*src & 0xF8) == 0xF0) {
 157                                 min_code = 0x10000;
 158                                 decoded  = *src++ & 0x07;
 159 three_more:
 160                                 if (src == end) {
 161                                         input->utf8_part_decoded_min_code = min_code;
 162                                         input->utf8_part_decoded_char     = decoded;
 163                                         input->utf8_part_decoded_rest_len = 3;
 164                                         break;
 165                                 }
 166                                 if ((*src & 0xC0) == 0x80) {
 167                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 168                                 } else {
 169                                         goto invalid_char;
 170                                 }
 171                                 goto two_more;
 172                         } else {
 173 invalid_char:
 174                                 input_error(0, 0, "invalid byte sequence in input");
 175 realign:
 176                                 do {
 177                                         ++src;
 178                                         if (src == end) {
 179                                                 input->utf8_part_decoded_rest_len = 4;
 180                                                 break;
 181                                         }
 182                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
 183                                 continue;
 184                         }
 185                         *dst++ = decoded;
 186                 }
 187
 188                 /* we're done when we could read more than 1 char */
 189                 if (buffer != dst)
 190                         return dst - buffer;
 191         }
 192 }
 193
 194 static size_t decode_windows_1252(input_t *input, utf32 *buffer,
 195                                   size_t buffer_size)
 196 {
 197         unsigned char read_buf[buffer_size];
 198         size_t const s = read_block(input->file, read_buf, sizeof(read_buf));
 199
 200         unsigned char const *src = read_buf;
 201         unsigned char const *end = read_buf + s;
 202         utf32               *dst = buffer;
 203         while (src != end) {
 204                 utf32 tc = *src++;
 205                 switch (tc) {
 206                         case 0x80: tc = 0x20AC; break; // €
 207                         case 0x82: tc = 0x201A; break; // ‚
 208                         case 0x83: tc = 0x0192; break; // ƒ
 209                         case 0x84: tc = 0x201E; break; // „
 210                         case 0x85: tc = 0x2026; break; // …
 211                         case 0x86: tc = 0x2020; break; // †
 212                         case 0x87: tc = 0x2021; break; // ‡
 213                         case 0x88: tc = 0x02C6; break; // ˆ
 214                         case 0x89: tc = 0x2030; break; // ‰
 215                         case 0x8A: tc = 0x0160; break; // Š
 216                         case 0x8B: tc = 0x2039; break; // ‹
 217                         case 0x8C: tc = 0x0152; break; // Œ
 218                         case 0x8E: tc = 0x017D; break; // Ž
 219                         case 0x91: tc = 0x2018; break; // ‘
 220                         case 0x92: tc = 0x2019; break; // ’
 221                         case 0x93: tc = 0x201C; break; // “
 222                         case 0x94: tc = 0x201D; break; // ”
 223                         case 0x95: tc = 0x2022; break; // •
 224                         case 0x96: tc = 0x2013; break; // –
 225                         case 0x97: tc = 0x2014; break; // —
 226                         case 0x98: tc = 0x02DC; break; // ˜
 227                         case 0x99: tc = 0x2122; break; // ™
 228                         case 0x9A: tc = 0x0161; break; // š
 229                         case 0x9B: tc = 0x203A; break; // ›
 230                         case 0x9C: tc = 0x0153; break; // œ
 231                         case 0x9E: tc = 0x017E; break; // ž
 232                         case 0x9F: tc = 0x0178; break; // Ÿ
 233                 }
 234                 *dst++ = tc;
 235         }
 236
 237         return s;
 238 }
 239
 240 typedef struct named_decoder_t {
 241         char const *name;
 242         decode_func decoder;
 243 } named_decoder_t;
 244
 245 static named_decoder_t const decoders[] = {
 246         { "CP819",           decode_iso_8859_1   }, // official alias
 247         { "IBM819",          decode_iso_8859_1   }, // official alias
 248         { "ISO-8859-1",      decode_iso_8859_1   }, // official alias
 249         { "ISO-8859-15",     decode_iso_8859_15  }, // official name
 250         { "ISO8859-1",       decode_iso_8859_1   },
 251         { "ISO8859-15",      decode_iso_8859_15  },
 252         { "ISO_8859-1",      decode_iso_8859_1   }, // official alias
 253         { "ISO_8859-15",     decode_iso_8859_15  }, // official alias
 254         { "ISO_8859-1:1987", decode_iso_8859_1   }, // official name
 255         { "Latin-9",         decode_iso_8859_15  }, // official alias
 256         { "UTF-8",           decode_utf8         }, // official name
 257         { "csISOLatin1",     decode_iso_8859_1   }, // official alias
 258         { "cp1252",          decode_windows_1252 },
 259         { "iso-ir-100",      decode_iso_8859_1   }, // official alias
 260         { "l1",              decode_iso_8859_1   }, // official alias
 261         { "latin1",          decode_iso_8859_1   }, // official alias
 262         { "windows-1252",    decode_windows_1252 }, // official name
 263
 264         { NULL,              NULL                }
 265 };
 266
 267 /** strcasecmp is not part of C99 so we need our own implementation here */
 268 static int my_strcasecmp(const char *s1, const char *s2)
 269 {
 270         for ( ; *s1 != 0; ++s1, ++s2) {
 271                 if (tolower(*s1) != tolower(*s2))
 272                         break;
 273         }
 274         return (unsigned char)*s1 - (unsigned char)*s2;
 275 }
 276
 277 input_t *input_from_stream(FILE *file, const char *encoding)
 278 {
 279         input_t *result = XMALLOCZ(input_t);
 280         result->file = file;
 281
 282         if (encoding == NULL) {
 283                 result->decode = decode_utf8;
 284         } else {
 285                 for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
 286                         if (my_strcasecmp(encoding, i->name) != 0)
 287                                 continue;
 288                         result->decode = i->decoder;
 289                         break;
 290                 }
 291                 if (result->decode == NULL) {
 292                         fprintf(stderr, "error: input encoding \"%s\" not supported\n",
 293                                         encoding);
 294                         result->decode = decode_utf8;
 295                 }
 296         }
 297
 298         return result;
 299 }
 300
 301 size_t decode(input_t *input, utf32 *buffer, size_t buffer_size)
 302 {
 303         return input->decode(input, buffer, buffer_size);
 304 }
 305
 306 void input_free(input_t *input)
 307 {
 308         xfree(input);
 309 }