new scanf implementation and corresponding integer parser/converter
author Rich Felker <dalias@aerifal.cx>
Mon, 16 Apr 2012 20:03:45 +0000 (16:03 -0400)
committer Rich Felker <dalias@aerifal.cx>
Mon, 16 Apr 2012 20:03:45 +0000 (16:03 -0400)
advantages over the old code:
- correct results for floating point (old code was bogus)
- wide/regular scanf separated so scanf does not pull in wide code
- well-defined behavior on integers that overflow dest type
- support for %[a-b] ranges with %[ (impl-defined but widely used)
- no intermediate conversion of fmt string to wide string
- cleaner, easier to share code with strto* functions
- better standards conformance for corner cases

the old code remains in the source tree, as the wide versions of the
scanf-family functions are still using it. it will be removed when no
longer needed.

src/internal/intscan.c [new file with mode: 0644]
src/internal/intscan.h [new file with mode: 0644]
src/internal/stdio_impl.h
src/stdio/__string_read.c [new file with mode: 0644]
src/stdio/vfscanf.c
src/stdio/vsscanf.c

diff --git a/src/internal/intscan.c b/src/internal/intscan.c
new file mode 100644 (file)
index 0000000..a00f2cc
--- /dev/null
@@ -0,0 +1,97 @@
+#include <limits.h>
+#include <errno.h>
+#include "shgetc.h"
+
+/* Lookup table for digit values. -1==255>=36 -> invalid */
+static const unsigned char table[] = { -1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,
+-1,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,
+25,26,27,28,29,30,31,32,33,34,35,-1,-1,-1,-1,-1,
+-1,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,
+25,26,27,28,29,30,31,32,33,34,35,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+};
+
+unsigned long long __intscan(FILE *f, unsigned base, int pok, unsigned long long lim)
+{
+       const unsigned char *val = table+1;
+       int c, neg=0;
+       unsigned x;
+       unsigned long long y;
+       if (base > 36) {
+               errno = EINVAL;
+               return 0;
+       }
+       c = shgetc(f);
+       if (c=='+' || c=='-') {
+               neg = -(c=='-');
+               c = shgetc(f);
+       }
+       if ((base == 0 || base == 16) && c=='0') {
+               c = shgetc(f);
+               if ((c|32)=='x') {
+                       c = shgetc(f);
+                       if (val[c]>=16) {
+                               shunget(f);
+                               if (pok) shunget(f);
+                               else shlim(f, 0);
+                               return 0;
+                       }
+                       base = 16;
+               } else if (base == 0) {
+                       base = 8;
+               }
+       } else {
+               if (base == 0) base = 10;
+               if (val[c] >= base) {
+                       shlim(f, 0);
+                       errno = EINVAL;
+                       return 0;
+               }
+       }
+       if (base == 10) {
+               for (x=0; c-'0'<10U && x<=UINT_MAX/10-1; c=shgetc(f))
+                       x = x*10 + (c-'0');
+               for (y=x; c-'0'<10U && y<=ULLONG_MAX/10 && 10*y<=ULLONG_MAX-(c-'0'); c=shgetc(f))
+                       y = y*10 + (c-'0');
+               if (c-'0'>=10U) goto done;
+       } else if (!(base & base-1)) {
+               int bs = "\0\1\2\4\7\3\6\5"[(0x17*base)>>5&7];
+               for (x=0; val[c]<base && x<=UINT_MAX/32; c=shgetc(f))
+                       x = x<<bs | val[c];
+               for (y=x; val[c]<base && y<=ULLONG_MAX>>bs; c=shgetc(f))
+                       y = y<<bs | val[c];
+       } else {
+               for (x=0; val[c]<base && x<=UINT_MAX/36-1; c=shgetc(f))
+                       x = x*base + val[c];
+               for (y=x; val[c]<base && y<=ULLONG_MAX/base && base*y<=ULLONG_MAX-val[c]; c=shgetc(f))
+                       y = y*base + val[c];
+       }
+       if (val[c]<base) {
+               for (; val[c]<base; c=shgetc(f));
+               errno = ERANGE;
+               y = lim;
+       }
+done:
+       shunget(f);
+       if (y>=lim) {
+               if (!(lim&1) && !neg) {
+                       errno = ERANGE;
+                       return lim-1;
+               } else if (y>lim) {
+                       errno = ERANGE;
+                       return lim;
+               }
+       }
+       return (y^neg)-neg;
+}
diff --git a/src/internal/intscan.h b/src/internal/intscan.h
new file mode 100644 (file)
index 0000000..994c5e7
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef INTSCAN_H
+#define INTSCAN_H
+
+#include <stdio.h>
+
+unsigned long long __intscan(FILE *, unsigned, int, unsigned long long);
+
+#endif
diff --git a/src/internal/stdio_impl.h b/src/internal/stdio_impl.h
index 5ec296f..af7aacc 100644 (file)
@@ -69,6 +69,8 @@ size_t __stdout_write(FILE *, const unsigned char *, size_t);
 off_t __stdio_seek(FILE *, off_t, int);
 int __stdio_close(FILE *);
 
+size_t __string_read(FILE *, unsigned char *, size_t);
+
 int __toread(FILE *);
 int __towrite(FILE *);
 
diff --git a/src/stdio/__string_read.c b/src/stdio/__string_read.c
new file mode 100644 (file)
index 0000000..5c3728d
--- /dev/null
@@ -0,0 +1,13 @@
+#include "stdio_impl.h"
+
+size_t __string_read(FILE *f, unsigned char *buf, size_t len)
+{
+       char *src = f->cookie;
+       size_t k = strnlen(src, len+256);
+       if (k < len) len = k;
+       memcpy(buf, src, len);
+       f->rpos = (void *)(src+len);
+       f->rend = (void *)(src+k);
+       f->cookie = src+k;
+       return len;
+}
diff --git a/src/stdio/vfscanf.c b/src/stdio/vfscanf.c
index 414c2a3..5c1e49b 100644 (file)
 #include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include <wchar.h>
+#include <wctype.h>
+#include <limits.h>
 #include <string.h>
 #include <errno.h>
-#include <ctype.h>
+#include <math.h>
+#include <float.h>
 
 #include "stdio_impl.h"
-#include "__scanf.h"
+#include "shgetc.h"
+#include "intscan.h"
+#include "floatscan.h"
 
-static void f_read(rctx_t *r)
+#define SIZE_hh -2
+#define SIZE_h  -1
+#define SIZE_def 0
+#define SIZE_l   1
+#define SIZE_L   2
+#define SIZE_ll  3
+
+static void store_int(void *dest, int size, unsigned long long i)
 {
-       FILE *f = r->opaque;
-       if ((r->c = getc_unlocked(f)) >= 0) r->l++;
+       if (!dest) return;
+       switch (size) {
+       case SIZE_hh:
+               *(char *)dest = i;
+               break;
+       case SIZE_h:
+               *(short *)dest = i;
+               break;
+       case SIZE_def:
+               *(int *)dest = i;
+               break;
+       case SIZE_l:
+               *(long *)dest = i;
+               break;
+       case SIZE_ll:
+               *(long long *)dest = i;
+               break;
+       }
 }
 
-int vfscanf(FILE *f, const char *fmt, va_list ap)
+static void *arg_n(va_list ap, unsigned int n)
 {
-       size_t l = strlen(fmt), i, result;
-       rctx_t r = { f_read, (void *)f, 0, isspace };
-       wchar_t fmt2[l+1];
+       void *p;
+       unsigned int i;
+       va_list ap2;
+       va_copy(ap2, ap);
+       for (i=n; i>1; i--) va_arg(ap2, void *);
+       p = va_arg(ap2, void *);
+       va_end(ap2);
+       return p;
+}
 
-       if (l > 0x100000) {
-               errno = ENOMEM;
+static int readwc(int c, wchar_t **wcs, mbstate_t *st)
+{
+       char ch = c;
+       wchar_t wc;
+       switch (mbrtowc(&wc, &ch, 1, st)) {
+       case -1:
                return -1;
+       case -2:
+               break;
+       default:
+               if (*wcs) *(*wcs)++ = wc;
        }
-       for (i=0; i<=l; i++) fmt2[i] = (unsigned char)fmt[i];
+       return 0;
+}
+
+int vfscanf(FILE *f, const char *fmt, va_list ap)
+{
+       int width;
+       int size;
+       int alloc;
+       int base;
+       const unsigned char *p;
+       int c, t;
+       char *s;
+       wchar_t *wcs;
+       mbstate_t st;
+       void *dest=NULL;
+       int invert;
+       int matches=0;
+       unsigned long long x;
+       long double y;
+       off_t pos = 0;
 
        FLOCK(f);
 
-       result = __scanf(&r, fmt2, ap);
+       for (p=(const unsigned char *)fmt; *p; p++) {
+
+               if (isspace(*p)) {
+                       while (isspace(p[1])) p++;
+                       shlim(f, 0);
+                       while (isspace(shgetc(f)));
+                       shunget(f);
+                       pos += shcnt(f);
+                       continue;
+               }
+               if (*p != '%' || p[1] == '%') {
+                       p += *p=='%';
+                       c = shgetc(f);
+                       if (c!=*p) {
+                               shunget(f);
+                               if (c<0) goto input_fail;
+                               goto match_fail;
+                       }
+                       pos++;
+                       continue;
+               }
+
+               p++;
+               if (*p=='*') {
+                       dest = 0; p++;
+               } else if (isdigit(*p) && p[1]=='$') {
+                       dest = arg_n(ap, *p-'0'); p+=2;
+               } else {
+                       dest = va_arg(ap, void *);
+               }
+
+               for (width=0; isdigit(*p); p++) {
+                       width = 10*width + *p - '0';
+               }
 
-       if (r.u && r.c >= 0)
-               ungetc(r.c, f);
+               if (*p=='m') {
+                       alloc = 1;
+                       p++;
+               } else {
+                       alloc = 0;
+               }
 
+               size = SIZE_def;
+               switch (*p++) {
+               case 'h':
+                       if (*p == 'h') p++, size = SIZE_hh;
+                       else size = SIZE_h;
+                       break;
+               case 'l':
+                       if (*p == 'l') p++, size = SIZE_ll;
+                       else size = SIZE_l;
+                       break;
+               case 'j':
+                       size = SIZE_ll;
+                       break;
+               case 'z':
+               case 't':
+                       size = SIZE_l;
+                       break;
+               case 'L':
+                       size = SIZE_L;
+                       break;
+               case 'd': case 'i': case 'o': case 'u': case 'x':
+               case 'a': case 'e': case 'f': case 'g':
+               case 'A': case 'E': case 'F': case 'G': case 'X':
+               case 's': case 'c': case '[':
+               case 'S': case 'C':
+               case 'p': case 'n':
+                       p--;
+                       break;
+               default:
+                       goto fmt_fail;
+               }
+
+               t = *p;
+
+               switch (t) {
+               case 'C':
+               case 'c':
+                       if (width < 1) width = 1;
+               case 's':
+                       if (size == SIZE_l) t &= ~0x20;
+               case 'd': case 'i': case 'o': case 'u': case 'x':
+               case 'a': case 'e': case 'f': case 'g':
+               case 'A': case 'E': case 'F': case 'G': case 'X':
+               case '[': case 'S':
+               case 'p': case 'n':
+                       if (width < 1) width = 0;
+                       break;
+               default:
+                       goto fmt_fail;
+               }
+
+               shlim(f, width);
+
+               if (t != 'n') {
+                       if (shgetc(f) < 0) goto input_fail;
+                       shunget(f);
+               }
+
+               switch (t) {
+               case 'n':
+                       store_int(dest, size, pos);
+                       /* do not increment match count, etc! */
+                       continue;
+               case 'C':
+                       wcs = dest;
+                       st = (mbstate_t){ 0 };
+                       while ((c=shgetc(f)) >= 0) {
+                               if (readwc(c, &wcs, &st) < 0)
+                                       goto input_fail;
+                       }
+                       if (!mbsinit(&st)) goto input_fail;
+                       if (shcnt(f) != width) goto match_fail;
+                       break;
+               case 'c':
+                       if (dest) {
+                               s = dest;
+                               while ((c=shgetc(f)) >= 0) *s++ = c;
+                       } else {
+                               while (shgetc(f)>=0);
+                       }
+                       if (shcnt(f) < width) goto match_fail;
+                       break;
+               case '[':
+                       s = dest;
+                       wcs = dest;
+
+                       if (*++p == '^') p++, invert = 1;
+                       else invert = 0;
+
+                       unsigned char scanset[257];
+                       memset(scanset, invert, sizeof scanset);
+
+                       scanset[0] = 0;
+                       if (*p == '-') p++, scanset[1+'-'] = 1-invert;
+                       if (*p == ']') p++, scanset[1+']'] = 1-invert;
+                       for (; *p && *p != ']'; p++) {
+                               if (*p=='-' && p[1] != ']')
+                                       for (c=p++[-1]; c<*p; c++)
+                                               scanset[1+c] = 1-invert;
+                               scanset[1+*p] = 1-invert;
+                       }
+                       if (!*p) goto fmt_fail;
+
+                       if (size == SIZE_l) {
+                               st = (mbstate_t){0};
+                               while (scanset[(c=shgetc(f))+1]) {
+                                       if (readwc(c, &wcs, &st) < 0)
+                                               goto input_fail;
+                               }
+                               if (!mbsinit(&st)) goto input_fail;
+                               s = 0;
+                       } else if (s) {
+                               while (scanset[(c=shgetc(f))+1])
+                                       *s++ = c;
+                               wcs = 0;
+                       } else {
+                               while (scanset[(c=shgetc(f))+1]);
+                       }
+                       shunget(f);
+                       if (!shcnt(f)) goto match_fail;
+                       if (s) *s = 0;
+                       if (wcs) *wcs = 0;
+                       break;
+               default:
+                       shlim(f, 0);
+                       while (isspace(shgetc(f)));
+                       shunget(f);
+                       pos += shcnt(f);
+                       shlim(f, width);
+                       if (shgetc(f) < 0) goto input_fail;
+                       shunget(f);
+               }
+
+               switch (t) {
+               case 'p':
+               case 'X':
+               case 'x':
+                       base = 16;
+                       goto int_common;
+               case 'o':
+                       base = 8;
+                       goto int_common;
+               case 'd':
+               case 'u':
+                       base = 10;
+                       goto int_common;
+               case 'i':
+                       base = 0;
+               int_common:
+                       x = __intscan(f, base, 0, ULLONG_MAX);
+                       if (!shcnt(f)) goto match_fail;
+                       if (t=='p') *(void **)dest = (void *)(uintptr_t)x;
+                       else store_int(dest, size, x);
+                       break;
+               case 'a': case 'A':
+               case 'e': case 'E':
+               case 'f': case 'F':
+               case 'g': case 'G':
+                       y = __floatscan(f, -1, size, 0);
+                       if (!shcnt(f)) goto match_fail;
+                       if (dest) switch (size) {
+                       case SIZE_def:
+                               *(float *)dest = y;
+                               break;
+                       case SIZE_l:
+                               *(double *)dest = y;
+                               break;
+                       case SIZE_L:
+                               *(long double *)dest = y;
+                               break;
+                       }
+                       break;
+               case 'S':
+                       wcs = dest;
+                       st = (mbstate_t){ 0 };
+                       while (!isspace(c=shgetc(f)) && c!=EOF) {
+                               if (readwc(c, &wcs, &st) < 0)
+                                       goto input_fail;
+                       }
+                       if (!mbsinit(&st)) goto input_fail;
+                       if (dest) *wcs++ = 0;
+                       break;
+               case 's':
+                       if (dest) {
+                               s = dest;
+                               while (!isspace(c=shgetc(f)) && c!=EOF)
+                                       *s++ = c;
+                               *s = 0;
+                       } else {
+                               while (!isspace(c=shgetc(f)) && c!=EOF);
+                       }
+                       shunget(f);
+                       break;
+               }
+
+               pos += shcnt(f);
+               if (dest) matches++;
+       }
+       if (0) {
+fmt_fail:
+input_fail:
+               if (!matches) matches--;
+       }
+match_fail:
        FUNLOCK(f);
-       return result;
+       return matches;
 }
diff --git a/src/stdio/vsscanf.c b/src/stdio/vsscanf.c
index fd48f70..fbc15e6 100644 (file)
@@ -1,21 +1,15 @@
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
+#include "stdio_impl.h"
 
-#include "__scanf.h"
-
-static void s_read(rctx_t *r)
+static size_t do_read(FILE *f, unsigned char *buf, size_t len)
 {
-       unsigned char *s = r->opaque;
-       if (!s[r->l]) r->c = -1;
-       else r->c = s[r->l++];
+       return __string_read(f, buf, len);
 }
 
 int vsscanf(const char *s, const char *fmt, va_list ap)
 {
-       size_t l = strlen(fmt), i;
-       wchar_t fmt2[l+1];
-       rctx_t r = { s_read, (void *)s, 0, isspace };
-       for (i=0; i<=l; i++) fmt2[i] = (unsigned char)fmt[i];
-       return __scanf(&r, fmt2, ap);
+       FILE f = {
+               .buf = (void *)s, .cookie = (void *)s,
+               .read = do_read, .lock = -1
+       };
+       return vfscanf(&f, fmt, ap);
 }