new fnmatch implementation
authorRich Felker <dalias@aerifal.cx>
Sat, 28 Apr 2012 22:05:29 +0000 (18:05 -0400)
committerRich Felker <dalias@aerifal.cx>
Sat, 28 Apr 2012 22:05:29 +0000 (18:05 -0400)
unlike the old one, this one's algorithm does not suffer from
potential stack overflow issues or pathologically bad performance on
certain patterns. instead of backtracking, it uses a matching
algorithm which I have not seen before (unsure whether I invented or
re-invented it) that runs in O(1) space and O(nm) time. it may be
possible to improve the time to O(n), but not without significantly
greater complexity.

src/regex/fnmatch.c

index 81f0c7b..ffd3ea0 100644 (file)
-#include <fnmatch.h>
-#include <wctype.h>
+/*
+ * An implementation of what I call the "Sea of Stars" algorithm for
+ * POSIX fnmatch(). The basic idea is that we factor the pattern into
+ * a head component (which we match first and can reject without ever
+ * measuring the length of the string), an optional tail component
+ * (which only exists if the pattern contains at least one star), and
+ * an optional "sea of stars", a set of star-separated components
+ * between the head and tail. After the head and tail matches have
+ * been removed from the input string, the components in the "sea of
+ * stars" are matched sequentially by searching for their first
+ * occurrence past the end of the previous match.
+ *
+ * - Rich Felker, April 2012
+ */
+
 #include <string.h>
-#include <wchar.h>
+#include <fnmatch.h>
 #include <stdlib.h>
-#include <limits.h>
+#include <wchar.h>
+#include <wctype.h>
 
-static int next(const char **s)
+#define END -1
+#define UNMATCHABLE -2
+#define BRACKET -3
+#define QUESTION -4
+#define STAR -5
+
+static int str_next(const char *str, size_t n, size_t *step)
 {
-       wchar_t c;
-       int l = mbtowc(&c, *s, MB_LEN_MAX);
-       /* hack to allow literal matches of invalid byte sequences */
-       if (l < 0) return (unsigned char)*(*s)++ - 0x100;
-       *s += l;
-       return c;
+       if (!n) {
+               *step = 0;
+               return 0;
+       }
+       if (str[0] >= 128U) {
+               wchar_t wc;
+               int k = mbtowc(&wc, str, n);
+               if (k<0) {
+                       *step = 1;
+                       return -1;
+               }
+               *step = k;
+               return wc;
+       }
+       *step = 1;
+       return str[0];
 }
 
-#define BRACKET_ERROR  -0x100
-#define BRACKET_NOCHAR -0x101
-
-static int bracket_next(const char **s)
+static int pat_next(const char *pat, size_t m, size_t *step, int flags)
 {
-       int c;
-       int type;
-       if (**s == '[') {
-               type = *(*s+1);
-               if (type == '.' || type == '=') {
-                       *s += 2;
-                       c = next(s);
-                       if (c <= 0) return BRACKET_ERROR;
-                       if (**s == type && *(*s+1) == ']') {
-                               *s += 2;
-                               return c;
+       int esc = 0;
+       if (!m || !*pat) {
+               *step = 0;
+               return END;
+       }
+       *step = 1;
+       if (pat[0]=='\\' && !(flags & FNM_NOESCAPE)) {
+               *step = 2;
+               pat++;
+               esc = 1;
+               goto escaped;
+       }
+       if (pat[0]=='[') {
+               size_t k = 1;
+               if (k<m) if (pat[k] == '^' || pat[k] == '!') k++;
+               if (k<m) if (pat[k] == ']') k++;
+               for (; k<m && pat[k] && pat[k]!=']'; k++) {
+                       if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' || pat[k+1]=='.' || pat[k+1]=='=')) {
+                               int z = pat[k+1];
+                               k+=2;
+                               if (k<m && pat[k]) k++;
+                               while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=']')) k++;
+                               if (k==m || !pat[k]) break;
                        }
-                       for (; **s && (**s != type || *(*s+1) != ']'); (*s)++);
-                       if (!**s) return BRACKET_ERROR;
-                       *s += 2;
-                       return BRACKET_NOCHAR;
                }
+               if (k==m || !pat[k]) {
+                       *step = 1;
+                       return '[';
+               }
+               *step = k+1;
+               return BRACKET;
+       }
+       if (pat[0] == '*')
+               return STAR;
+       if (pat[0] == '?')
+               return QUESTION;
+escaped:
+       if (pat[0] >= 128U) {
+               wchar_t wc;
+               int k = mbtowc(&wc, pat, m);
+               if (k<0) {
+                       *step = 0;
+                       return UNMATCHABLE;
+               }
+               *step = k + esc;
+               return wc;
        }
-       c = next(s);
-       if (c <= 0) return BRACKET_ERROR;
-       return c;
+       return pat[0];
 }
 
-#define __FNM_CONT 0x8000
+static int match_bracket(const char *p, int k)
+{
+       wchar_t wc;
+       int inv = 0;
+       p++;
+       if (*p=='^' || *p=='!') {
+               inv = 1;
+               p++;
+       }
+       if (*p==']') {
+               if (k==']') return !inv;
+               p++;
+       } else if (*p=='-') {
+               if (k=='-') return !inv;
+               p++;
+       }
+       wc = p[-1];
+       for (; *p != ']'; p++) {
+               if (p[0]=='-' && p[1]!=']') {
+                       wchar_t wc2;
+                       int l = mbtowc(&wc2, p+1, 4);
+                       if (l < 0) return 0;
+                       if (wc<=wc2 && (unsigned)k-wc <= wc2-wc) return !inv;
+                       p += l-1;
+                       continue;
+               }
+               if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) {
+                       const char *p0 = p+2;
+                       int z = p[1];
+                       p+=3;
+                       while (p[-1]!=z || p[0]!=']') p++;
+                       if (z == ':' && p-1-p0 < 16) {
+                               char buf[16];
+                               memcpy(buf, p0, p-1-p0);
+                               buf[p-1-p0] = 0;
+                               if (iswctype(k, wctype(buf))) return !inv;
+                       }
+                       continue;
+               }
+               if (*p < 128U) {
+                       wc = (unsigned char)*p;
+               } else {
+                       int l = mbtowc(&wc, p, 4);
+                       if (l < 0) return 0;
+                       p += l-1;
+               }
+               if (wc==k) return !inv;
+       }
+       return inv;
+}
 
-int fnmatch(const char *p, const char *s, int flags)
+static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n, int flags)
 {
-       int c, d, k;
-       int not;
-       int match;
-       int first;
-       int no_slash = (flags & FNM_PATHNAME) ? '/' : 0;
-       int no_period = (flags & FNM_PERIOD) && !(flags & __FNM_CONT) ? '.' : 0x100;
-       const char *p1;
-
-       flags |= __FNM_CONT;
-
-       while ((c = *p++)) {
-               switch (c) {
-               case '?':
-                       k = next(&s);
-                       if (!k || k == no_period || k == no_slash)
-                               return FNM_NOMATCH;
+       const char *p, *ptail, *endpat;
+       const char *s, *stail, *endstr;
+       size_t pinc, sinc, tailcnt=0;
+       int c, k;
+
+       if (flags & FNM_PERIOD) {
+               if (*str == '.' && *pat != '.')
+                       return FNM_NOMATCH;
+       }
+       for (;;) {
+               switch ((c = pat_next(pat, m, &pinc, flags))) {
+               case UNMATCHABLE:
+                       return FNM_NOMATCH;
+               case STAR:
+                       pat++;
+                       m--;
                        break;
-               case '\\':
-                       if (!(flags & FNM_NOESCAPE)) {
-                               c = *p++;
-                               goto literal;
+               default:
+                       k = str_next(str, n, &sinc);
+                       if (k <= 0)
+                               return (c==END) ? 0 : FNM_NOMATCH;
+                       str += sinc;
+                       n -= sinc;
+                       if (c == BRACKET) {
+                               if (!match_bracket(pat, k))
+                                       return FNM_NOMATCH;
+                       } else if (c != QUESTION && k != c) {
+                               return FNM_NOMATCH;
                        }
-                       if (*s++ != c) return FNM_NOMATCH;
+                       pat+=pinc;
+                       m-=pinc;
+                       continue;
+               }
+               break;
+       }
+
+       /* Compute real pat length if it was initially unknown/-1 */
+       m = strnlen(pat, m);
+       endpat = pat + m;
+
+       /* Find the last * in pat and count chars needed after it */
+       for (p=ptail=pat; p<endpat; p+=pinc) {
+               switch (pat_next(p, endpat-p, &pinc, flags)) {
+               case UNMATCHABLE:
+                       return FNM_NOMATCH;
+               case STAR:
+                       tailcnt=0;
+                       ptail = p+1;
                        break;
-               case '*':
-                       for (; *p == '*'; p++);
-                       if (*p && !*s) return FNM_NOMATCH;
-                       if (*s == no_period)
+               default:
+                       tailcnt++;
+                       break;
+               }
+       }
+
+       /* Past this point we need not check for UNMATCHABLE in pat,
+        * because all of pat has already been parsed once. */
+
+       /* Compute real str length if it was initially unknown/-1 */
+       n = strnlen(str, n);
+       endstr = str + n;
+       if (n < tailcnt) return FNM_NOMATCH;
+
+       /* Find the final tailcnt chars of str, accounting for UTF-8.
+        * On illegal sequences we may get it wrong, but in that case
+        * we necessarily have a matching failure anyway. */
+       for (s=endstr; s>str && tailcnt; tailcnt--) {
+               if (s[-1] < 128U) s--;
+               else while ((unsigned char)*--s-0x80U<0x40 && s>str);
+       }
+       if (tailcnt) return FNM_NOMATCH;
+       stail = s;
+
+       /* Check that the pat and str tails match */
+       p = ptail;
+       for (;;) {
+               c = pat_next(p, endpat-p, &pinc, flags);
+               p += pinc;
+               if ((k = str_next(s, endstr-s, &sinc)) <= 0) {
+                       if (c != END) return FNM_NOMATCH;
+                       break;
+               }
+               s += sinc;
+               if (c == BRACKET) {
+                       if (!match_bracket(p-pinc, k))
                                return FNM_NOMATCH;
-                       if (!*p && (!no_slash || !strchr(s, no_slash)))
-                               return 0;
-                       for (; *s; s++)
-                               if (!fnmatch(p, s, flags))
-                                       return 0;
-                               else if (*s == no_slash)
-                                       break;
+               } else if (c != QUESTION && k != c) {
                        return FNM_NOMATCH;
-               case '[':
-                       p1 = p-1;
-                       not = (*p == '!' || *p == '^');
-                       if (not) p++;
-                       k = next(&s);
-                       if (!k || k == no_slash || k == no_period)
-                               return FNM_NOMATCH;
-                       match = 0;
-                       first = 1;
-                       for (;;) {
-                               if (!*p) goto literal_bracket;
-                               if (*p == ']' && !first) break;
-                               first = 0;
-                               if (*p == '[' && *(p+1) == ':') {
-                                       const char *z;
-                                       p += 2;
-                                       for (z=p; *z && (*z != ':' || *(z+1) != ']'); z++);
-                                       if (!*z || z-p > 32) { /* FIXME: symbolic const? */
-                                               return FNM_NOMATCH;
-                                       } else {
-                                               char class[33];
-                                               memcpy(class, p, z-p);
-                                               class[z-p] = 0;
-                                               if (iswctype(k, wctype(class)))
-                                                       match = 1;
-                                       }
-                                       p = z+2;
-                                       continue;
-                               }
-                               c = bracket_next(&p);
-                               if (c == BRACKET_ERROR) {
-literal_bracket:
-                                       match = (k=='[');
-                                       p = p1;
-                                       not = 0;
-                                       break;
-                               }
-                               if (c == BRACKET_NOCHAR)
-                                       continue;
-                               if (*p == '-' && *(p+1) != ']') {
-                                       p++;
-                                       d = bracket_next(&p);
-                                       if (d == BRACKET_ERROR)
-                                               goto literal_bracket;
-                                       if (d == BRACKET_NOCHAR)
-                                               continue;
-                                       if (k >= c && k <= d)
-                                               match = 1;
-                                       continue;
-                               }
-                               if (k == c) match = 1;
+               }
+       }
+
+       /* We're all done with the tails now, so throw them out */
+       endstr = stail;
+       endpat = ptail;
+
+       /* Match pattern components until there are none left */
+       while (pat<endpat) {
+               p = pat;
+               s = str;
+               for (;;) {
+                       c = pat_next(p, endpat-p, &pinc, flags);
+                       p += pinc;
+                       /* Encountering * completes/commits a component */
+                       if (c == STAR) {
+                               pat = p;
+                               str = s;
+                               break;
                        }
-                       p++;
-                       if (not == match)
+                       k = str_next(s, endstr-s, &sinc);
+                       if (!k)
                                return FNM_NOMATCH;
-                       break;
-               default:
-               literal:
-                       if (*s++ != c)
-                               return FNM_NOMATCH;
-                       if (c == no_slash && (flags & FNM_PERIOD)) {
-                               no_period = '.';
-                               continue;
+                       if (c == BRACKET) {
+                               if (!match_bracket(p-pinc, k))
+                                       break;
+                       } else if (c != QUESTION && k != c) {
+                               break;
                        }
-                       break;
+                       s += sinc;
                }
-               no_period = 0x100;
+               if (c == STAR) continue;
+               /* If we failed, advance str, by 1 char if it's a valid
+                * char, or past all invalid bytes otherwise. */
+               k = str_next(str, endstr-str, &sinc);
+               if (k > 0) str += sinc;
+               else for (str++; str_next(str, endstr-str, &sinc)<0; str++);
        }
-       if (*s) return FNM_NOMATCH;
+
        return 0;
 }
+
+int fnmatch(const char *pat, const char *str, int flags)
+{
+       const char *s, *p;
+       size_t inc;
+       int c;
+       if (flags & FNM_PATHNAME) for (;;) {
+               for (s=str; *s && *s!='/'; s++);
+               for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=inc);
+               if (*s && *p!=*s) return FNM_NOMATCH;
+               if (fnmatch_internal(pat, p-pat, str, s-str, flags))
+                       return FNM_NOMATCH;
+               if (!*s && c==END) return 0;
+               str = s+1;
+               pat = p+1;
+       }
+       return fnmatch_internal(pat, -1, str, -1, flags);
+}