nsz Git - cparser/blob - input.c

   1 #include "config.h"
   2
   3 #include "input.h"
   4
   5 #include <ctype.h>
   6 #include "lexer.h"
   7 #include "diagnostic.h"
   8
/**
 * Signature shared by all input decoders in this file.
 *
 * A decoder pulls raw bytes from @p input, converts them and stores the
 * resulting code points in @p buffer, which has room for @p buffer_size
 * entries. It returns the number of entries written.
 */
typedef size_t (*decode_func)(input_t *input, utf32 *buffer, size_t buffer_size);
  10
/** Selects which member of the union in struct input_t is in use. */
typedef enum {
        INPUT_FILE,   /* bytes are read from a FILE* */
        INPUT_STRING  /* bytes are read from a NUL-terminated string */
} input_kind_t;
  15
/**
 * An input source together with the decoder used to turn its bytes into
 * code points.
 */
struct input_t {
        input_kind_t kind;      /* tells which member of `in` is valid */
        union {
                FILE *file;         /* used when kind == INPUT_FILE */
                const char *string; /* used when kind == INPUT_STRING; read_block()
                                     * advances it past the bytes already consumed */
        } in;
        decode_func decode;     /* decoder selected by choose_decoder() */

        /* state for utf-8 decoder: describes a multi-byte sequence that was cut
         * off by the end of a read block and must be completed with the next
         * block */
        utf32  utf8_part_decoded_min_code; /* smallest code point allowed for the
                                            * sequence length (overlong check) */
        utf32  utf8_part_decoded_char;     /* payload bits collected so far */
        size_t utf8_part_decoded_rest_len; /* 0: no pending state; 1-3: number of
                                            * continuation bytes still missing;
                                            * 4: the decoder was skipping bytes
                                            * after an invalid sequence */
};
  29
/* Callback used by this file to report read and decoding errors. It has no
 * initializer here and is called without a NULL check, so a callback
 * presumably has to be installed before any input is decoded. */
static input_error_callback_func input_error;

/**
 * Install the function that is called for every input error.
 *
 * @param new_func  replaces the previously installed callback
 */
void set_input_error_callback(input_error_callback_func new_func)
{
        input_error = new_func;
}
  36
  37 static size_t read_block(input_t *input, unsigned char *const read_buf,
  38                          size_t const n)
  39 {
  40         if (input->kind == INPUT_FILE) {
  41                 FILE *file = input->in.file;
  42                 size_t const s = fread(read_buf, 1, n, file);
  43                 if (s == 0) {
  44                         /* on OS/X ferror appears to return true on eof as well when running
  45                          * the application in gdb... */
  46                         if (!feof(file) && ferror(file))
  47                                 input_error(0, 0, "read from input failed");
  48                         return 0;
  49                 }
  50                 return s;
  51         } else {
  52                 assert(input->kind == INPUT_STRING);
  53                 size_t len = strlen(input->in.string);
  54                 if (len > n)
  55                         len = n;
  56                 memcpy(read_buf, input->in.string, len);
  57                 input->in.string += len;
  58                 return len;
  59         }
  60 }
  61
  62 static size_t decode_iso_8859_1(input_t *input, utf32 *buffer,
  63                                 size_t buffer_size)
  64 {
  65         unsigned char read_buf[buffer_size];
  66         size_t const s = read_block(input, read_buf, sizeof(read_buf));
  67
  68         unsigned char const *src = read_buf;
  69         unsigned char const *end = read_buf + s;
  70         utf32               *dst = buffer;
  71         while (src != end)
  72                 *dst++ = *src++;
  73
  74         return s;
  75 }
  76
  77 static size_t decode_iso_8859_15(input_t *input, utf32 *buffer,
  78                                  size_t buffer_size)
  79 {
  80         unsigned char read_buf[buffer_size];
  81         size_t const s = read_block(input, read_buf, sizeof(read_buf));
  82
  83         unsigned char const *src = read_buf;
  84         unsigned char const *end = read_buf + s;
  85         utf32               *dst = buffer;
  86         while (src != end) {
  87                 utf32 tc = *src++;
  88                 switch (tc) {
  89                         case 0xA4: tc = 0x20AC; break; // €
  90                         case 0xA6: tc = 0x0160; break; // Š
  91                         case 0xA8: tc = 0x0161; break; // š
  92                         case 0xB4: tc = 0x017D; break; // Ž
  93                         case 0xB8: tc = 0x017E; break; // ž
  94                         case 0xBC: tc = 0x0152; break; // Œ
  95                         case 0xBD: tc = 0x0153; break; // œ
  96                         case 0xBE: tc = 0x0178; break; // Ÿ
  97                 }
  98                 *dst++ = tc;
  99         }
 100
 101         return s;
 102 }
 103
 104 static size_t decode_utf8(input_t *input, utf32 *buffer, size_t buffer_size)
 105 {
 106         unsigned char read_buf[buffer_size];
 107
 108         while (true) {
 109                 size_t const s = read_block(input, read_buf, sizeof(read_buf));
 110                 if (s == 0) {
 111                         if (input->utf8_part_decoded_rest_len > 0)
 112                                 input_error(0, 0, "incomplete input char at end of input");
 113                         return 0;
 114                 }
 115
 116                 unsigned char const *src = read_buf;
 117                 unsigned char const *end = read_buf + s;
 118                 utf32               *dst = buffer;
 119                 utf32                decoded;
 120                 utf32                min_code;
 121
 122                 if (input->utf8_part_decoded_rest_len != 0) {
 123                         min_code              = input->utf8_part_decoded_min_code;
 124                         decoded               = input->utf8_part_decoded_char;
 125                         size_t const rest_len = input->utf8_part_decoded_rest_len;
 126                         input->utf8_part_decoded_rest_len = 0;
 127                         switch (rest_len) {
 128                                 case 4:  goto realign;
 129                                 case 3:  goto three_more;
 130                                 case 2:  goto two_more;
 131                                 default: goto one_more;
 132                         }
 133                 }
 134
 135                 while (src != end) {
 136                         if ((*src & 0x80) == 0) {
 137                                 decoded = *src++;
 138                         } else if ((*src & 0xE0) == 0xC0) {
 139                                 min_code = 0x80;
 140                                 decoded  = *src++ & 0x1F;
 141 one_more:
 142                                 if (src == end) {
 143                                         input->utf8_part_decoded_min_code = min_code;
 144                                         input->utf8_part_decoded_char     = decoded;
 145                                         input->utf8_part_decoded_rest_len = 1;
 146                                         break;
 147                                 }
 148                                 if ((*src & 0xC0) == 0x80) {
 149                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 150                                 } else {
 151                                         goto invalid_char;
 152                                 }
 153                                 if (decoded < min_code                      ||
 154                                                 decoded > 0x10FFFF                      ||
 155                                                 (0xD800 <= decoded && decoded < 0xE000) || // high/low surrogates
 156                                                 (0xFDD0 <= decoded && decoded < 0xFDF0) || // noncharacters
 157                                                 (decoded & 0xFFFE) == 0xFFFE) {            // noncharacters
 158                                         input_error(0, 0, "invalid byte sequence in input");
 159                                 }
 160                         } else if ((*src & 0xF0) == 0xE0) {
 161                                 min_code = 0x800;
 162                                 decoded  = *src++ & 0x0F;
 163 two_more:
 164                                 if (src == end) {
 165                                         input->utf8_part_decoded_min_code = min_code;
 166                                         input->utf8_part_decoded_char     = decoded;
 167                                         input->utf8_part_decoded_rest_len = 2;
 168                                         break;
 169                                 }
 170                                 if ((*src & 0xC0) == 0x80) {
 171                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 172                                 } else {
 173                                         goto invalid_char;
 174                                 }
 175                                 goto one_more;
 176                         } else if ((*src & 0xF8) == 0xF0) {
 177                                 min_code = 0x10000;
 178                                 decoded  = *src++ & 0x07;
 179 three_more:
 180                                 if (src == end) {
 181                                         input->utf8_part_decoded_min_code = min_code;
 182                                         input->utf8_part_decoded_char     = decoded;
 183                                         input->utf8_part_decoded_rest_len = 3;
 184                                         break;
 185                                 }
 186                                 if ((*src & 0xC0) == 0x80) {
 187                                         decoded = (decoded << 6) | (*src++ & 0x3F);
 188                                 } else {
 189                                         goto invalid_char;
 190                                 }
 191                                 goto two_more;
 192                         } else {
 193 invalid_char:
 194                                 input_error(0, 0, "invalid byte sequence in input");
 195 realign:
 196                                 do {
 197                                         ++src;
 198                                         if (src == end) {
 199                                                 input->utf8_part_decoded_rest_len = 4;
 200                                                 break;
 201                                         }
 202                                 } while ((*src & 0xC0) == 0x80 || (*src & 0xF8) == 0xF8);
 203                                 continue;
 204                         }
 205                         *dst++ = decoded;
 206                 }
 207
 208                 /* we're done when we could read more than 1 char */
 209                 if (buffer != dst)
 210                         return dst - buffer;
 211         }
 212 }
 213
 214 static size_t decode_windows_1252(input_t *input, utf32 *buffer,
 215                                   size_t buffer_size)
 216 {
 217         unsigned char read_buf[buffer_size];
 218         size_t const s = read_block(input, read_buf, sizeof(read_buf));
 219
 220         unsigned char const *src = read_buf;
 221         unsigned char const *end = read_buf + s;
 222         utf32               *dst = buffer;
 223         while (src != end) {
 224                 utf32 tc = *src++;
 225                 switch (tc) {
 226                         case 0x80: tc = 0x20AC; break; // €
 227                         case 0x82: tc = 0x201A; break; // ‚
 228                         case 0x83: tc = 0x0192; break; // ƒ
 229                         case 0x84: tc = 0x201E; break; // „
 230                         case 0x85: tc = 0x2026; break; // …
 231                         case 0x86: tc = 0x2020; break; // †
 232                         case 0x87: tc = 0x2021; break; // ‡
 233                         case 0x88: tc = 0x02C6; break; // ˆ
 234                         case 0x89: tc = 0x2030; break; // ‰
 235                         case 0x8A: tc = 0x0160; break; // Š
 236                         case 0x8B: tc = 0x2039; break; // ‹
 237                         case 0x8C: tc = 0x0152; break; // Œ
 238                         case 0x8E: tc = 0x017D; break; // Ž
 239                         case 0x91: tc = 0x2018; break; // ‘
 240                         case 0x92: tc = 0x2019; break; // ’
 241                         case 0x93: tc = 0x201C; break; // “
 242                         case 0x94: tc = 0x201D; break; // ”
 243                         case 0x95: tc = 0x2022; break; // •
 244                         case 0x96: tc = 0x2013; break; // –
 245                         case 0x97: tc = 0x2014; break; // —
 246                         case 0x98: tc = 0x02DC; break; // ˜
 247                         case 0x99: tc = 0x2122; break; // ™
 248                         case 0x9A: tc = 0x0161; break; // š
 249                         case 0x9B: tc = 0x203A; break; // ›
 250                         case 0x9C: tc = 0x0153; break; // œ
 251                         case 0x9E: tc = 0x017E; break; // ž
 252                         case 0x9F: tc = 0x0178; break; // Ÿ
 253                 }
 254                 *dst++ = tc;
 255         }
 256
 257         return s;
 258 }
 259
 260 typedef struct named_decoder_t {
 261         char const *name;
 262         decode_func decoder;
 263 } named_decoder_t;
 264
 265 static named_decoder_t const decoders[] = {
 266         { "CP819",           decode_iso_8859_1   }, // official alias
 267         { "IBM819",          decode_iso_8859_1   }, // official alias
 268         { "ISO-8859-1",      decode_iso_8859_1   }, // official alias
 269         { "ISO-8859-15",     decode_iso_8859_15  }, // official name
 270         { "ISO8859-1",       decode_iso_8859_1   },
 271         { "ISO8859-15",      decode_iso_8859_15  },
 272         { "ISO_8859-1",      decode_iso_8859_1   }, // official alias
 273         { "ISO_8859-15",     decode_iso_8859_15  }, // official alias
 274         { "ISO_8859-1:1987", decode_iso_8859_1   }, // official name
 275         { "Latin-9",         decode_iso_8859_15  }, // official alias
 276         { "UTF-8",           decode_utf8         }, // official name
 277         { "csISOLatin1",     decode_iso_8859_1   }, // official alias
 278         { "cp1252",          decode_windows_1252 },
 279         { "iso-ir-100",      decode_iso_8859_1   }, // official alias
 280         { "l1",              decode_iso_8859_1   }, // official alias
 281         { "latin1",          decode_iso_8859_1   }, // official alias
 282         { "windows-1252",    decode_windows_1252 }, // official name
 283
 284         { NULL,              NULL                }
 285 };
 286
 287 /** strcasecmp is not part of C99 so we need our own implementation here */
 288 static int my_strcasecmp(const char *s1, const char *s2)
 289 {
 290         for ( ; *s1 != 0; ++s1, ++s2) {
 291                 if (tolower(*s1) != tolower(*s2))
 292                         break;
 293         }
 294         return (unsigned char)*s1 - (unsigned char)*s2;
 295 }
 296
 297 static void choose_decoder(input_t *result, const char *encoding)
 298 {
 299         if (encoding == NULL) {
 300                 result->decode = decode_utf8;
 301         } else {
 302                 for (named_decoder_t const *i = decoders; i->name != NULL; ++i) {
 303                         if (my_strcasecmp(encoding, i->name) != 0)
 304                                 continue;
 305                         result->decode = i->decoder;
 306                         break;
 307                 }
 308                 if (result->decode == NULL) {
 309                         fprintf(stderr, "error: input encoding \"%s\" not supported\n",
 310                                         encoding);
 311                         result->decode = decode_utf8;
 312                 }
 313         }
 314 }
 315
 316 input_t *input_from_stream(FILE *file, const char *encoding)
 317 {
 318         input_t *result = XMALLOCZ(input_t);
 319         result->kind    = INPUT_FILE;
 320         result->in.file = file;
 321
 322         choose_decoder(result, encoding);
 323
 324         return result;
 325 }
 326
 327 input_t *input_from_string(const char *string, const char *encoding)
 328 {
 329         input_t *result   = XMALLOCZ(input_t);
 330         result->kind      = INPUT_STRING;
 331         result->in.string = string;
 332
 333         choose_decoder(result, encoding);
 334
 335         return result;
 336 }
 337
/**
 * Decode the next block of an input by calling the decoder stored in it.
 *
 * @param input        input to decode from
 * @param buffer       receives the decoded code points
 * @param buffer_size  number of entries available in @p buffer
 * @return whatever the selected decoder returns: the number of code points
 *         written to @p buffer
 */
size_t decode(input_t *input, utf32 *buffer, size_t buffer_size)
{
        return input->decode(input, buffer, buffer_size);
}
 342
/**
 * Release an input object.
 *
 * Only the input_t itself is passed to xfree(); the FILE or string it refers
 * to is not touched here.
 */
void input_free(input_t *input)
{
        xfree(input);
}