X-Git-Url: http://nsz.repo.hu/git/?a=blobdiff_plain;f=src%2Fregex%2Fregcomp.c;h=330de46759730b55b90d9080fb7798d37ae5db86;hb=140ad50cbf9244eecc21a0126c743396a34e8106;hp=875f56fd8f8c93f6438696f800d4e33bd7c673e9;hpb=32aea2087a699bb4bd9c34347b6ef8d164ee0d0b;p=musl diff --git a/src/regex/regcomp.c b/src/regex/regcomp.c index 875f56fd..330de467 100644 --- a/src/regex/regcomp.c +++ b/src/regex/regcomp.c @@ -1,35 +1,61 @@ /* regcomp.c - TRE POSIX compatible regex compilation functions. - Copyright (c) 2001-2006 Ville Laurikari - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Copyright (c) 2001-2009 Ville Laurikari + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include -#include #include #include #include #include +#include #include "tre.h" #include +/*********************************************************************** + from tre-compile.h +***********************************************************************/ + +typedef struct { + int position; + int code_min; + int code_max; + int *tags; + int assertions; + tre_ctype_t class; + tre_ctype_t *neg_classes; + int backref; +} tre_pos_and_tags_t; + + /*********************************************************************** from tre-ast.c and tre-ast.h ***********************************************************************/ @@ -54,17 +80,6 @@ typedef enum { #define IS_TAG(x) ((x)->code_min == TAG) #define IS_BACKREF(x) ((x)->code_min == BACKREF) -/* Taken from tre-compile.h */ -typedef struct { - int position; - int code_min; - int code_max; - int *tags; - int assertions; - tre_ctype_t class; - tre_ctype_t *neg_classes; - int backref; -} tre_pos_and_tags_t; /* A generic AST node. All AST nodes consist of this node on the top level with `obj' pointing to the actual content. */ @@ -109,6 +124,10 @@ typedef struct { int min; /* Maximum number of consecutive matches. */ int max; + /* If 0, match as many characters as possible, if 1 match as few as + possible. Note that this does not always mean the same thing as + matching as many/few repetitions as possible. */ + unsigned int minimal:1; } tre_iteration_t; /* An "union" node. These are created for the "|" operator. */ @@ -117,124 +136,166 @@ typedef struct { tre_ast_node_t *right; } tre_union_t; + static tre_ast_node_t * -tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size) +tre_ast_new_node(tre_mem_t mem, int type, void *obj) { - tre_ast_node_t *node; - - node = tre_mem_calloc(mem, sizeof(*node)); - if (!node) - return NULL; - node->obj = tre_mem_calloc(mem, size); - if (!node->obj) - return NULL; - node->type = type; - node->nullable = -1; - node->submatch_id = -1; - - return node; + tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node); + if (!node || !obj) + return 0; + node->obj = obj; + node->type = type; + node->nullable = -1; + node->submatch_id = -1; + return node; } static tre_ast_node_t * tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position) { - tre_ast_node_t *node; - tre_literal_t *lit; - - node = tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t)); - if (!node) - return NULL; - lit = node->obj; - lit->code_min = code_min; - lit->code_max = code_max; - lit->position = position; - - return node; + tre_ast_node_t *node; + tre_literal_t *lit; + + lit = tre_mem_calloc(mem, sizeof *lit); + node = tre_ast_new_node(mem, LITERAL, lit); + if (!node) + return 0; + lit->code_min = code_min; + lit->code_max = code_max; + lit->position = position; + return node; } static tre_ast_node_t * -tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max) +tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal) { - tre_ast_node_t *node; - tre_iteration_t *iter; - - node = tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t)); - if (!node) - return NULL; - iter = node->obj; - iter->arg = arg; - iter->min = min; - iter->max = max; - node->num_submatches = arg->num_submatches; - - return node; + tre_ast_node_t *node; + tre_iteration_t *iter; + + iter = tre_mem_calloc(mem, sizeof *iter); + node = tre_ast_new_node(mem, ITERATION, iter); + if (!node) + return 0; + iter->arg = arg; + iter->min = min; + iter->max = max; + iter->minimal = minimal; + node->num_submatches = arg->num_submatches; + return node; } static tre_ast_node_t * tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right) { - tre_ast_node_t *node; - - node = tre_ast_new_node(mem, UNION, sizeof(tre_union_t)); - if (node == NULL) - return NULL; - ((tre_union_t *)node->obj)->left = left; - ((tre_union_t *)node->obj)->right = right; - node->num_submatches = left->num_submatches + right->num_submatches; - - return node; + tre_ast_node_t *node; + tre_union_t *un; + + if (!left) + return right; + un = tre_mem_calloc(mem, sizeof *un); + node = tre_ast_new_node(mem, UNION, un); + if (!node || !right) + return 0; + un->left = left; + un->right = right; + node->num_submatches = left->num_submatches + right->num_submatches; + return node; } static tre_ast_node_t * -tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, - tre_ast_node_t *right) +tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right) { - tre_ast_node_t *node; - - node = tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t)); - if (node == NULL) - return NULL; - ((tre_catenation_t *)node->obj)->left = left; - ((tre_catenation_t *)node->obj)->right = right; - node->num_submatches = left->num_submatches + right->num_submatches; - - return node; + tre_ast_node_t *node; + tre_catenation_t *cat; + + if (!left) + return right; + cat = tre_mem_calloc(mem, sizeof *cat); + node = tre_ast_new_node(mem, CATENATION, cat); + if (!node) + return 0; + cat->left = left; + cat->right = right; + node->num_submatches = left->num_submatches + right->num_submatches; + return node; } + /*********************************************************************** from tre-stack.c and tre-stack.h ***********************************************************************/ +typedef struct tre_stack_rec tre_stack_t; + +/* Creates a new stack object. `size' is initial size in bytes, `max_size' + is maximum size, and `increment' specifies how much more space will be + allocated with realloc() if all space gets used up. Returns the stack + object or NULL if out of memory. */ +static tre_stack_t * +tre_stack_new(int size, int max_size, int increment); + +/* Frees the stack object. */ +static void +tre_stack_destroy(tre_stack_t *s); + +/* Returns the current number of objects in the stack. */ +static int +tre_stack_num_objects(tre_stack_t *s); + +/* Each tre_stack_push_*(tre_stack_t *s, value) function pushes + `value' on top of stack `s'. Returns REG_ESPACE if out of memory. + This tries to realloc() more space before failing if maximum size + has not yet been reached. Returns REG_OK if successful. */ +#define declare_pushf(typetag, type) \ + static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value) + +declare_pushf(voidptr, void *); +declare_pushf(int, int); + +/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost + element off of stack `s' and returns it. The stack must not be + empty. */ +#define declare_popf(typetag, type) \ + static type tre_stack_pop_ ## typetag(tre_stack_t *s) + +declare_popf(voidptr, void *); +declare_popf(int, int); + /* Just to save some typing. */ -#define STACK_PUSH(s, value) \ +#define STACK_PUSH(s, typetag, value) \ do \ { \ - status = tre_stack_push(s, (void *)(value)); \ + status = tre_stack_push_ ## typetag(s, value); \ } \ - while (0) + while (/*CONSTCOND*/0) -#define STACK_PUSHX(s, value) \ +#define STACK_PUSHX(s, typetag, value) \ { \ - status = tre_stack_push(s, (void *)(value)); \ + status = tre_stack_push_ ## typetag(s, value); \ if (status != REG_OK) \ break; \ } -#define STACK_PUSHR(s, value) \ +#define STACK_PUSHR(s, typetag, value) \ { \ - reg_errcode_t status; \ - status = tre_stack_push(s, (void *)(value)); \ - if (status != REG_OK) \ - return status; \ + reg_errcode_t _status; \ + _status = tre_stack_push_ ## typetag(s, value); \ + if (_status != REG_OK) \ + return _status; \ } -typedef struct tre_stack_rec { +union tre_stack_item { + void *voidptr_value; + int int_value; +}; + +struct tre_stack_rec { int size; int max_size; int increment; int ptr; - void **stack; -} tre_stack_t; + union tre_stack_item *stack; +}; static tre_stack_t * @@ -273,7 +334,7 @@ tre_stack_num_objects(tre_stack_t *s) } static reg_errcode_t -tre_stack_push(tre_stack_t *s, void *value) +tre_stack_push(tre_stack_t *s, union tre_stack_item value) { if (s->ptr < s->size) { @@ -284,24 +345,20 @@ tre_stack_push(tre_stack_t *s, void *value) { if (s->size >= s->max_size) { - DPRINT(("tre_stack_push: stack full\n")); return REG_ESPACE; } else { - void **new_buffer; + union tre_stack_item *new_buffer; int new_size; - DPRINT(("tre_stack_push: trying to realloc more space\n")); new_size = s->size + s->increment; if (new_size > s->max_size) new_size = s->max_size; new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size); if (new_buffer == NULL) { - DPRINT(("tre_stack_push: realloc failed.\n")); return REG_ESPACE; } - DPRINT(("tre_stack_push: realloc succeeded.\n")); assert(new_size > s->size); s->size = new_size; s->stack = new_buffer; @@ -311,12 +368,24 @@ tre_stack_push(tre_stack_t *s, void *value) return REG_OK; } -static void * -tre_stack_pop(tre_stack_t *s) -{ - return s->stack[--s->ptr]; +#define define_pushf(typetag, type) \ + declare_pushf(typetag, type) { \ + union tre_stack_item item; \ + item.typetag ## _value = value; \ + return tre_stack_push(s, item); \ } +define_pushf(int, int) +define_pushf(voidptr, void *) + +#define define_popf(typetag, type) \ + declare_popf(typetag, type) { \ + return s->stack[--s->ptr].typetag ## _value; \ + } + +define_popf(int, int) +define_popf(voidptr, void *) + /*********************************************************************** from tre-parse.c and tre-parse.h @@ -324,1096 +393,651 @@ tre_stack_pop(tre_stack_t *s) /* Parse context. */ typedef struct { - /* Memory allocator. The AST is allocated using this. */ - tre_mem_t mem; - /* Stack used for keeping track of regexp syntax. */ - tre_stack_t *stack; - /* The parse result. */ - tre_ast_node_t *result; - /* The regexp to parse and its length. */ - const tre_char_t *re; - /* The first character of the entire regexp. */ - const tre_char_t *re_start; - /* The first character after the end of the regexp. */ - const tre_char_t *re_end; - int len; - /* Current submatch ID. */ - int submatch_id; - /* Current position (number of literal). */ - int position; - /* The highest back reference or -1 if none seen so far. */ - int max_backref; - /* Compilation flags. */ - int cflags; - /* If this flag is set the top-level submatch is not captured. */ - int nofirstsub; + /* Memory allocator. The AST is allocated using this. */ + tre_mem_t mem; + /* Stack used for keeping track of regexp syntax. */ + tre_stack_t *stack; + /* The parsed node after a parse function returns. */ + tre_ast_node_t *n; + /* Position in the regexp pattern after a parse function returns. */ + const char *s; + /* The first character of the regexp. */ + const char *re; + /* Current submatch ID. */ + int submatch_id; + /* Current position (number of literal). */ + int position; + /* The highest back reference or -1 if none seen so far. */ + int max_backref; + /* Compilation flags. */ + int cflags; } tre_parse_ctx_t; -static reg_errcode_t -tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i, - tre_ast_node_t ***items) +/* Some macros for expanding \w, \s, etc. */ +static const struct { + char c; + const char *expansion; +} tre_macros[] = { + {'t', "\t"}, {'n', "\n"}, {'r', "\r"}, + {'f', "\f"}, {'a', "\a"}, {'e', "\033"}, + {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"}, + {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"}, + { 0, 0 } +}; + +/* Expands a macro delimited by `regex' and `regex_end' to `buf', which + must have at least `len' items. Sets buf[0] to zero if the there + is no match in `tre_macros'. */ +static const char *tre_expand_macro(const char *s) { - reg_errcode_t status; - tre_ast_node_t **array = *items; - /* Allocate more space if necessary. */ - if (*i >= *max_i) - { - tre_ast_node_t **new_items; - DPRINT(("out of array space, i = %d\n", *i)); - /* If the array is already 1024 items large, give up -- there's - probably an error in the regexp (e.g. not a '\0' terminated - string and missing ']') */ - if (*max_i > 1024) - return REG_ESPACE; - *max_i *= 2; - new_items = xrealloc(array, sizeof(*items) * *max_i); - if (new_items == NULL) - return REG_ESPACE; - *items = array = new_items; - } - array[*i] = tre_ast_new_literal(mem, min, max, -1); - status = array[*i] == NULL ? REG_ESPACE : REG_OK; - (*i)++; - return status; + int i; + for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++); + return tre_macros[i].expansion; } - -/* Expands a character class to character ranges. */ -static reg_errcode_t -tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items, - int *i, int *max_i, int cflags) +static int +tre_compare_lit(const void *a, const void *b) { - reg_errcode_t status = REG_OK; - tre_cint_t c; - int j, min = -1, max = 0; - assert(TRE_MB_CUR_MAX == 1); + const tre_literal_t *const *la = a; + const tre_literal_t *const *lb = b; + /* assumes the range of valid code_min is < INT_MAX */ + return la[0]->code_min - lb[0]->code_min; +} - DPRINT((" expanding class to character ranges\n")); - for (j = 0; (j < 256) && (status == REG_OK); j++) - { - c = j; - if (tre_isctype(c, class) - || ((cflags & REG_ICASE) - && (tre_isctype(tre_tolower(c), class) - || tre_isctype(tre_toupper(c), class)))) +struct literals { + tre_mem_t mem; + tre_literal_t **a; + int len; + int cap; +}; + +static tre_literal_t *tre_new_lit(struct literals *p) { - if (min < 0) - min = c; - max = c; + tre_literal_t **a; + if (p->len >= p->cap) { + if (p->cap >= 1<<15) + return 0; + p->cap *= 2; + a = xrealloc(p->a, p->cap * sizeof *p->a); + if (!a) + return 0; + p->a = a; } - else if (min >= 0) - { - DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max)); - status = tre_new_item(mem, min, max, i, max_i, items); - min = -1; - } - } - if (min >= 0 && status == REG_OK) - status = tre_new_item(mem, min, max, i, max_i, items); - return status; + a = p->a + p->len++; + *a = tre_mem_calloc(p->mem, sizeof **a); + return *a; } - -static int -tre_compare_items(const void *a, const void *b) +static int add_icase_literals(struct literals *ls, int min, int max) { - tre_ast_node_t *node_a = *(tre_ast_node_t **)a; - tre_ast_node_t *node_b = *(tre_ast_node_t **)b; - tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj; - int a_min = l_a->code_min, b_min = l_b->code_min; - - if (a_min < b_min) - return -1; - else if (a_min > b_min) - return 1; - else - return 0; + tre_literal_t *lit; + int b, e, c; + for (c=min; c<=max; ) { + /* assumes islower(c) and isupper(c) are exclusive + and toupper(c)!=c if islower(c). + multiple opposite case characters are not supported */ + if (tre_islower(c)) { + b = e = tre_toupper(c); + for (c++, e++; c<=max; c++, e++) + if (tre_toupper(c) != e) break; + } else if (tre_isupper(c)) { + b = e = tre_tolower(c); + for (c++, e++; c<=max; c++, e++) + if (tre_tolower(c) != e) break; + } else { + c++; + continue; + } + lit = tre_new_lit(ls); + if (!lit) + return -1; + lit->code_min = b; + lit->code_max = e-1; + lit->position = -1; + } + return 0; } -/* Maximum number of character classes that can occur in a negated bracket - expression. */ + +/* Maximum number of character classes in a negated bracket expression. */ #define MAX_NEG_CLASSES 64 -/* Maximum length of character class names. */ -#define MAX_CLASS_NAME +struct neg { + int negate; + int len; + tre_ctype_t a[MAX_NEG_CLASSES]; +}; -static reg_errcode_t -tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate, - tre_ctype_t neg_classes[], int *num_neg_classes, - tre_ast_node_t ***items, int *num_items, - int *items_size) -{ - const tre_char_t *re = ctx->re; - reg_errcode_t status = REG_OK; - tre_ctype_t class = (tre_ctype_t)0; - int i = *num_items; - int max_i = *items_size; - int skip; +// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges - /* Build an array of the items in the bracket expression. */ - while (status == REG_OK) - { - skip = 0; - if (re == ctx->re_end) - { - status = REG_EBRACK; - } - else if (*re == ']' && re > ctx->re) - { - DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", - ctx->re_end - re, re)); - re++; - break; - } - else - { - tre_cint_t min = 0, max = 0; +/* +bracket grammar: +Bracket = '[' List ']' | '[^' List ']' +List = Term | List Term +Term = Char | Range | Chclass | Eqclass +Range = Char '-' Char | Char '-' '-' +Char = Coll | coll_single +Meta = ']' | '-' +Coll = '[.' coll_single '.]' | '[.' coll_multi '.]' | '[.' Meta '.]' +Eqclass = '[=' coll_single '=]' | '[=' coll_multi '=]' +Chclass = '[:' class ':]' + +coll_single is a single char collating element but it can be + '-' only at the beginning or end of a List and + ']' only at the beginning of a List and + '^' anywhere except after the openning '[' +*/ - class = (tre_ctype_t)0; - if (re + 2 < ctx->re_end - && *(re + 1) == '-' && *(re + 2) != ']') - { - DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", - ctx->re_end - re, re)); - min = *re; - max = *(re + 2); - re += 3; - /* XXX - Should use collation order instead of encoding values - in character ranges. */ - if (min > max) - status = REG_ERANGE; - } - else if (re + 1 < ctx->re_end - && *re == '[' && *(re + 1) == '.') - status = REG_ECOLLATE; - else if (re + 1 < ctx->re_end - && *re == '[' && *(re + 1) == '=') - status = REG_ECOLLATE; - else if (re + 1 < ctx->re_end - && *re == '[' && *(re + 1) == ':') - { - char tmp_str[64]; - const tre_char_t *endptr = re + 2; - int len; - DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", - ctx->re_end - re, re)); - while (endptr < ctx->re_end && *endptr != ':') - endptr++; - if (endptr != ctx->re_end) - { - len = MIN(endptr - re - 2, 63); -#ifdef TRE_WCHAR - { - tre_char_t tmp_wcs[64]; - wcsncpy(tmp_wcs, re + 2, len); - tmp_wcs[len] = '\0'; -#if defined HAVE_WCSRTOMBS - { - mbstate_t state; - const tre_char_t *src = tmp_wcs; - memset(&state, '\0', sizeof(state)); - len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state); - } -#elif defined HAVE_WCSTOMBS - len = wcstombs(tmp_str, tmp_wcs, 63); -#endif /* defined HAVE_WCSTOMBS */ - } -#else /* !TRE_WCHAR */ - strncpy(tmp_str, re + 2, len); -#endif /* !TRE_WCHAR */ - tmp_str[len] = '\0'; - DPRINT((" class name: %s\n", tmp_str)); - class = tre_ctype(tmp_str); - if (!class) - status = REG_ECTYPE; - /* Optimize character classes for 8 bit character sets. */ - if (status == REG_OK && TRE_MB_CUR_MAX == 1) - { - status = tre_expand_ctype(ctx->mem, class, items, - &i, &max_i, ctx->cflags); - class = (tre_ctype_t)0; - skip = 1; - } - re = endptr + 2; +static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg) +{ + const char *start = s; + tre_ctype_t class; + int min, max; + wchar_t wc; + int len; + + for (;;) { + class = 0; + len = mbtowc(&wc, s, -1); + if (len <= 0) + return *s ? REG_BADPAT : REG_EBRACK; + if (*s == ']' && s != start) { + ctx->s = s+1; + return REG_OK; + } + if (*s == '-' && s != start && s[1] != ']' && + /* extension: [a-z--@] is accepted as [a-z]|[--@] */ + (s[1] != '-' || s[2] == ']')) + return REG_ERANGE; + if (*s == '[' && (s[1] == '.' || s[1] == '=')) + /* collating symbols and equivalence classes are not supported */ + return REG_ECOLLATE; + if (*s == '[' && s[1] == ':') { + char tmp[CHARCLASS_NAME_MAX+1]; + s += 2; + for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) { + if (s[len] == ':') { + memcpy(tmp, s, len); + tmp[len] = 0; + class = tre_ctype(tmp); + break; + } + } + if (!class || s[len+1] != ']') + return REG_ECTYPE; + min = 0; + max = TRE_CHAR_MAX; + s += len+2; + } else { + min = max = wc; + s += len; + if (*s == '-' && s[1] != ']') { + s++; + len = mbtowc(&wc, s, -1); + max = wc; + /* XXX - Should use collation order instead of + encoding values in character ranges. */ + if (len <= 0 || min > max) + return REG_ERANGE; + s += len; + } } - else - status = REG_ECTYPE; - min = 0; - max = TRE_CHAR_MAX; - } - else - { - DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", - ctx->re_end - re, re)); - if (*re == '-' && *(re + 1) != ']' - && ctx->re != re) - /* Two ranges are not allowed to share and endpoint. */ - status = REG_ERANGE; - min = max = *re++; - } - - if (status != REG_OK) - break; - - if (class && negate) - if (*num_neg_classes >= MAX_NEG_CLASSES) - status = REG_ESPACE; - else - neg_classes[(*num_neg_classes)++] = class; - else if (!skip) - { - status = tre_new_item(ctx->mem, min, max, &i, &max_i, items); - if (status != REG_OK) - break; - ((tre_literal_t*)((*items)[i-1])->obj)->class = class; - } - - /* Add opposite-case counterpoints if REG_ICASE is present. - This is broken if there are more than two "same" characters. */ - if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip) - { - int cmin, ccurr; - DPRINT(("adding opposite-case counterpoints\n")); - while (min <= max) - { - if (tre_islower(min)) - { - cmin = ccurr = tre_toupper(min++); - while (tre_islower(min) && tre_toupper(min) == ccurr + 1 - && min <= max) - ccurr = tre_toupper(min++); - status = tre_new_item(ctx->mem, cmin, ccurr, - &i, &max_i, items); - } - else if (tre_isupper(min)) - { - cmin = ccurr = tre_tolower(min++); - while (tre_isupper(min) && tre_tolower(min) == ccurr + 1 - && min <= max) - ccurr = tre_tolower(min++); - status = tre_new_item(ctx->mem, cmin, ccurr, - &i, &max_i, items); - } - else min++; - if (status != REG_OK) - break; + if (class && neg->negate) { + if (neg->len >= MAX_NEG_CLASSES) + return REG_ESPACE; + neg->a[neg->len++] = class; + } else { + tre_literal_t *lit = tre_new_lit(ls); + if (!lit) + return REG_ESPACE; + lit->code_min = min; + lit->code_max = max; + lit->class = class; + lit->position = -1; + + /* Add opposite-case codepoints if REG_ICASE is present. + It seems that POSIX requires that bracket negation + should happen before case-folding, but most practical + implementations do it the other way around. Changing + the order would need efficient representation of + case-fold ranges and bracket range sets even with + simple patterns so this is ok for now. */ + if (ctx->cflags & REG_ICASE && !class) + if (add_icase_literals(ls, min, max)) + return REG_ESPACE; } - if (status != REG_OK) - break; - } } - } - *num_items = i; - *items_size = max_i; - ctx->re = re; - return status; } -static reg_errcode_t -tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s) { - tre_ast_node_t *node = NULL; - int negate = 0; - reg_errcode_t status = REG_OK; - tre_ast_node_t **items, *u, *n; - int i = 0, j, max_i = 32, curr_max, curr_min; - tre_ctype_t neg_classes[MAX_NEG_CLASSES]; - int num_neg_classes = 0; - - /* Start off with an array of `max_i' elements. */ - items = xmalloc(sizeof(*items) * max_i); - if (items == NULL) - return REG_ESPACE; - - if (*ctx->re == '^') - { - DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - negate = 1; - ctx->re++; - } - - status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes, - &items, &i, &max_i); - - if (status != REG_OK) - goto parse_bracket_done; - - /* Sort the array if we need to negate it. */ - if (negate) - qsort(items, i, sizeof(*items), tre_compare_items); - - curr_max = curr_min = 0; - /* Build a union of the items in the array, negated if necessary. */ - for (j = 0; j < i && status == REG_OK; j++) - { - int min, max; - tre_literal_t *l = items[j]->obj; - min = l->code_min; - max = l->code_max; - - DPRINT(("item: %d - %d, class %ld, curr_max = %d\n", - (int)l->code_min, (int)l->code_max, (long)l->class, curr_max)); - - if (negate) - { - if (min < curr_max) - { - /* Overlap. */ - curr_max = MAX(max + 1, curr_max); - DPRINT(("overlap, curr_max = %d\n", curr_max)); - l = NULL; - } - else - { - /* No overlap. */ - curr_max = min - 1; - if (curr_max >= curr_min) - { - DPRINT(("no overlap\n")); - l->code_min = curr_min; - l->code_max = curr_max; + int i, max, min, negmax, negmin; + tre_ast_node_t *node = 0, *n; + tre_ctype_t *nc = 0; + tre_literal_t *lit; + struct literals ls; + struct neg neg; + reg_errcode_t err; + + ls.mem = ctx->mem; + ls.len = 0; + ls.cap = 32; + ls.a = xmalloc(ls.cap * sizeof *ls.a); + if (!ls.a) + return REG_ESPACE; + neg.len = 0; + neg.negate = *s == '^'; + if (neg.negate) + s++; + + err = parse_bracket_terms(ctx, s, &ls, &neg); + if (err != REG_OK) + goto parse_bracket_done; + + if (neg.negate) { + /* Sort the array if we need to negate it. */ + qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit); + /* extra lit for the last negated range */ + lit = tre_new_lit(&ls); + if (!lit) { + err = REG_ESPACE; + goto parse_bracket_done; } - else - { - DPRINT(("no overlap, zero room\n")); - l = NULL; + lit->code_min = TRE_CHAR_MAX+1; + lit->code_max = TRE_CHAR_MAX+1; + lit->position = -1; + /* negated classes */ + if (neg.len) { + nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a); + if (!nc) { + err = REG_ESPACE; + goto parse_bracket_done; + } + memcpy(nc, neg.a, neg.len*sizeof *neg.a); + nc[neg.len] = 0; } - curr_min = curr_max = max + 1; - } } - if (l != NULL) - { - int k; - DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max)); - l->position = ctx->position; - if (num_neg_classes > 0) - { - l->neg_classes = tre_mem_alloc(ctx->mem, - (sizeof(l->neg_classes) - * (num_neg_classes + 1))); - if (l->neg_classes == NULL) - { - status = REG_ESPACE; - break; + /* Build a union of the items in the array, negated if necessary. */ + negmax = negmin = 0; + for (i = 0; i < ls.len; i++) { + lit = ls.a[i]; + min = lit->code_min; + max = lit->code_max; + if (neg.negate) { + if (min <= negmin) { + /* Overlap. */ + negmin = MAX(max + 1, negmin); + continue; + } + negmax = min - 1; + lit->code_min = negmin; + lit->code_max = negmax; + negmin = max + 1; } - for (k = 0; k < num_neg_classes; k++) - l->neg_classes[k] = neg_classes[k]; - l->neg_classes[k] = (tre_ctype_t)0; - } - else - l->neg_classes = NULL; - if (node == NULL) - node = items[j]; - else - { - u = tre_ast_new_union(ctx->mem, node, items[j]); - if (u == NULL) - status = REG_ESPACE; - node = u; - } - } - } - - if (status != REG_OK) - goto parse_bracket_done; - - if (negate) - { - int k; - DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX)); - n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position); - if (n == NULL) - status = REG_ESPACE; - else - { - tre_literal_t *l = n->obj; - if (num_neg_classes > 0) - { - l->neg_classes = tre_mem_alloc(ctx->mem, - (sizeof(l->neg_classes) - * (num_neg_classes + 1))); - if (l->neg_classes == NULL) - { - status = REG_ESPACE; - goto parse_bracket_done; + lit->position = ctx->position; + lit->neg_classes = nc; + n = tre_ast_new_node(ctx->mem, LITERAL, lit); + node = tre_ast_new_union(ctx->mem, node, n); + if (!node) { + err = REG_ESPACE; + break; } - for (k = 0; k < num_neg_classes; k++) - l->neg_classes[k] = neg_classes[k]; - l->neg_classes[k] = (tre_ctype_t)0; - } - else - l->neg_classes = NULL; - if (node == NULL) - node = n; - else - { - u = tre_ast_new_union(ctx->mem, node, n); - if (u == NULL) - status = REG_ESPACE; - node = u; - } } - } - - if (status != REG_OK) - goto parse_bracket_done; - -#ifdef TRE_DEBUG - tre_ast_print(node); -#endif /* TRE_DEBUG */ - parse_bracket_done: - xfree(items); - ctx->position++; - *result = node; - return status; +parse_bracket_done: + xfree(ls.a); + ctx->position++; + ctx->n = node; + return err; } - -/* Parses a positive decimal integer. Returns -1 if the string does not - contain a valid number. */ -static int -tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end) +static const char *parse_dup_count(const char *s, int *n) { - int num = -1; - const tre_char_t *r = *regex; - while (r < regex_end && *r >= '0' && *r <= '9') - { - if (num < 0) - num = 0; - num = num * 10 + *r - '0'; - r++; - } - *regex = r; - return num; + *n = -1; + if (!isdigit(*s)) + return s; + *n = 0; + for (;;) { + *n = 10 * *n + (*s - '0'); + s++; + if (!isdigit(*s) || *n > RE_DUP_MAX) + break; + } + return s; } - -static reg_errcode_t -tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result) +static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s) { - int min, max; - const tre_char_t *r = ctx->re; - const tre_char_t *start; - int counts_set = 0; - - /* Parse number (minimum repetition count). */ - min = -1; - if (r < ctx->re_end && *r >= '0' && *r <= '9') { - DPRINT(("tre_parse: min count: '%.*" STRF "'\n", ctx->re_end - r, r)); - min = tre_parse_int(&r, ctx->re_end); - } - - /* Parse comma and second number (maximum repetition count). */ - max = min; - if (r < ctx->re_end && *r == ',') - { - r++; - DPRINT(("tre_parse: max count: '%.*" STRF "'\n", ctx->re_end - r, r)); - max = tre_parse_int(&r, ctx->re_end); - } - - /* Check that the repeat counts are sane. */ - if ((max >= 0 && min > max) || max > RE_DUP_MAX) - return REG_BADBR; - - - /* - '{' - optionally followed immediately by a number == minimum repcount - optionally followed by , then a number == maximum repcount - */ - - - do { - int done; - start = r; - - /* Parse count limit settings */ - done = 0; - if (!counts_set) - while (r + 1 < ctx->re_end && !done) - { - switch (*r) - { - case ',': - r++; - break; - case ' ': - r++; - break; - case '}': - done = 1; - break; - default: - done = 1; - break; - } - } - } while (start != r); - - /* Missing }. */ - if (r >= ctx->re_end) - return REG_EBRACE; - - /* Empty contents of {}. */ - if (r == ctx->re) - return REG_BADBR; - - /* Parse the ending '}' or '\}'.*/ - if (ctx->cflags & REG_EXTENDED) - { - if (r >= ctx->re_end || *r != '}') - return REG_BADBR; - r++; - } - else - { - if (r + 1 >= ctx->re_end - || *r != '\\' - || *(r + 1) != '}') - return REG_BADBR; - r += 2; - } - - - /* Create the AST node(s). */ - if (min == 0 && max == 0) - { - *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (*result == NULL) - return REG_ESPACE; - } - else - { - if (min < 0 && max < 0) - /* Only approximate parameters set, no repetitions. */ - min = max = 1; - - *result = tre_ast_new_iter(ctx->mem, *result, min, max); - if (!*result) - return REG_ESPACE; - } - - ctx->re = r; - return REG_OK; + int min, max; + + s = parse_dup_count(s, &min); + if (*s == ',') + s = parse_dup_count(s+1, &max); + else + max = min; + + if ( + (max < min && max >= 0) || + max > RE_DUP_MAX || + min > RE_DUP_MAX || + min < 0 || + (!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') || + *s++ != '}' + ) + return REG_BADBR; + + if (min == 0 && max == 0) + ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + else + ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); + if (!ctx->n) + return REG_ESPACE; + ctx->s = s; + return REG_OK; } -typedef enum { - PARSE_RE = 0, - PARSE_ATOM, - PARSE_MARK_FOR_SUBMATCH, - PARSE_BRANCH, - PARSE_PIECE, - PARSE_CATENATION, - PARSE_POST_CATENATION, - PARSE_UNION, - PARSE_POST_UNION, - PARSE_POSTFIX, - PARSE_RESTORE_CFLAGS -} tre_parse_re_stack_symbol_t; - - -static reg_errcode_t -tre_parse(tre_parse_ctx_t *ctx) +static int hexval(unsigned c) { - tre_ast_node_t *result = NULL; - tre_parse_re_stack_symbol_t symbol; - reg_errcode_t status = REG_OK; - tre_stack_t *stack = ctx->stack; - int bottom = tre_stack_num_objects(stack); - int depth = 0; - - DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n", - ctx->len, ctx->re, ctx->len)); - - if (!ctx->nofirstsub) - { - STACK_PUSH(stack, ctx->re); - STACK_PUSH(stack, ctx->submatch_id); - STACK_PUSH(stack, PARSE_MARK_FOR_SUBMATCH); - ctx->submatch_id++; - } - STACK_PUSH(stack, PARSE_RE); - ctx->re_start = ctx->re; - ctx->re_end = ctx->re + ctx->len; - - - /* The following is basically just a recursive descent parser. I use - an explicit stack instead of recursive functions mostly because of - two reasons: compatibility with systems which have an overflowable - call stack, and efficiency (both in lines of code and speed). */ - while (tre_stack_num_objects(stack) > bottom && status == REG_OK) - { - if (status != REG_OK) - break; - symbol = (tre_parse_re_stack_symbol_t)tre_stack_pop(stack); - switch (symbol) - { - case PARSE_RE: - /* Parse a full regexp. A regexp is one or more branches, - separated by the union operator `|'. */ - if (ctx->cflags & REG_EXTENDED) - STACK_PUSHX(stack, PARSE_UNION); - STACK_PUSHX(stack, PARSE_BRANCH); - break; - - case PARSE_BRANCH: - /* Parse a branch. A branch is one or more pieces, concatenated. - A piece is an atom possibly followed by a postfix operator. */ - STACK_PUSHX(stack, PARSE_CATENATION); - STACK_PUSHX(stack, PARSE_PIECE); - break; - - case PARSE_PIECE: - /* Parse a piece. A piece is an atom possibly followed by one - or more postfix operators. */ - STACK_PUSHX(stack, PARSE_POSTFIX); - STACK_PUSHX(stack, PARSE_ATOM); - break; - - case PARSE_CATENATION: - /* If the expression has not ended, parse another piece. */ - { - tre_char_t c; - if (ctx->re >= ctx->re_end) - break; - c = *ctx->re; - if (ctx->cflags & REG_EXTENDED && c == '|') - break; - if ((ctx->cflags & REG_EXTENDED - && c == ')' && depth > 0) - || (!(ctx->cflags & REG_EXTENDED) - && (c == '\\' - && *(ctx->re + 1) == ')'))) - { - if (!(ctx->cflags & REG_EXTENDED) && depth == 0) - status = REG_EPAREN; - DPRINT(("tre_parse: group end: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - depth--; - if (!(ctx->cflags & REG_EXTENDED)) - ctx->re += 2; - break; - } - - /* Left associative concatenation. */ - STACK_PUSHX(stack, PARSE_CATENATION); - STACK_PUSHX(stack, result); - STACK_PUSHX(stack, PARSE_POST_CATENATION); - STACK_PUSHX(stack, PARSE_PIECE); - break; - } - - case PARSE_POST_CATENATION: - { - tre_ast_node_t *tree = tre_stack_pop(stack); - tre_ast_node_t *tmp_node; - tmp_node = tre_ast_new_catenation(ctx->mem, tree, result); - if (!tmp_node) - return REG_ESPACE; - result = tmp_node; - break; - } - - case PARSE_UNION: - if (ctx->re >= ctx->re_end) - break; - switch (*ctx->re) - { - case '|': - DPRINT(("tre_parse: union: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - STACK_PUSHX(stack, PARSE_UNION); - STACK_PUSHX(stack, result); - STACK_PUSHX(stack, PARSE_POST_UNION); - STACK_PUSHX(stack, PARSE_BRANCH); - ctx->re++; - break; - - case ')': - ctx->re++; - break; - - default: - break; - } - break; - - case PARSE_POST_UNION: - { - tre_ast_node_t *tmp_node; - tre_ast_node_t *tree = tre_stack_pop(stack); - tmp_node = tre_ast_new_union(ctx->mem, tree, result); - if (!tmp_node) - return REG_ESPACE; - result = tmp_node; - break; - } - - case PARSE_POSTFIX: - /* Parse postfix operators. */ - if (ctx->re >= ctx->re_end) - break; - switch (*ctx->re) - { - case '+': - case '?': - if (!(ctx->cflags & REG_EXTENDED)) - break; - case '*': - { - tre_ast_node_t *tmp_node; - int rep_min = 0; - int rep_max = -1; - if (*ctx->re == '+') - rep_min = 1; - if (*ctx->re == '?') - rep_max = 1; - - ctx->re++; - tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max); - if (tmp_node == NULL) - return REG_ESPACE; - result = tmp_node; - STACK_PUSHX(stack, PARSE_POSTFIX); - break; - } - - case '\\': - /* "\{" is special without REG_EXTENDED */ - if (!(ctx->cflags & REG_EXTENDED) - && ctx->re + 1 < ctx->re_end - && *(ctx->re + 1) == '{') - { - ctx->re++; - goto parse_brace; - } - else - break; - - case '{': - /* "{" is literal without REG_EXTENDED */ - if (!(ctx->cflags & REG_EXTENDED)) - break; - - parse_brace: - DPRINT(("tre_parse: bound: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - ctx->re++; - - status = tre_parse_bound(ctx, &result); - if (status != REG_OK) - return status; - STACK_PUSHX(stack, PARSE_POSTFIX); - break; - } - break; - - case PARSE_ATOM: - /* Parse an atom. An atom is a regular expression enclosed in `()', - an empty set of `()', a bracket expression, `.', `^', `$', - a `\' followed by a character, or a single character. */ - - /* End of regexp? (empty string). */ - if (ctx->re >= ctx->re_end) - goto parse_literal; - - switch (*ctx->re) - { - case '(': /* parenthesized subexpression */ + if (c-'0'<10) return c-'0'; + c |= 32; + if (c-'a'<6) return c-'a'+10; + return -1; +} - if (ctx->cflags & REG_EXTENDED - || (ctx->re > ctx->re_start - && *(ctx->re - 1) == '\\')) - { - depth++; - { - DPRINT(("tre_parse: group begin: '%.*" STRF - "', submatch %d\n", - ctx->re_end - ctx->re, ctx->re, - ctx->submatch_id)); - ctx->re++; - /* First parse a whole RE, then mark the resulting tree - for submatching. */ - STACK_PUSHX(stack, ctx->submatch_id); - STACK_PUSHX(stack, PARSE_MARK_FOR_SUBMATCH); - STACK_PUSHX(stack, PARSE_RE); - ctx->submatch_id++; - } - } - else - goto parse_literal; - break; +static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid) +{ + if (node->submatch_id >= 0) { + tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + if (!n) + return REG_ESPACE; + n = tre_ast_new_catenation(ctx->mem, n, node); + if (!n) + return REG_ESPACE; + n->num_submatches = node->num_submatches; + node = n; + } + node->submatch_id = subid; + node->num_submatches++; + ctx->n = node; + return REG_OK; +} - case ')': /* end of current subexpression */ - if ((ctx->cflags & REG_EXTENDED && depth > 0) - || (ctx->re > ctx->re_start - && *(ctx->re - 1) == '\\')) - { - DPRINT(("tre_parse: empty: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - /* We were expecting an atom, but instead the current - subexpression was closed. POSIX leaves the meaning of - this to be implementation-defined. We interpret this as - an empty expression (which matches an empty string). */ - result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (result == NULL) - return REG_ESPACE; - if (!(ctx->cflags & REG_EXTENDED)) - ctx->re--; - } - else - goto parse_literal; - break; +/* +BRE grammar: +Regex = Branch | '^' | '$' | '^$' | '^' Branch | Branch '$' | '^' Branch '$' +Branch = Atom | Branch Atom +Atom = char | quoted_char | '.' | Bracket | Atom Dup | '\(' Branch '\)' | back_ref +Dup = '*' | '\{' Count '\}' | '\{' Count ',\}' | '\{' Count ',' Count '\}' - case '[': /* bracket expression */ - DPRINT(("tre_parse: bracket: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - ctx->re++; - status = tre_parse_bracket(ctx, &result); - if (status != REG_OK) - return status; - break; +(leading ^ and trailing $ in a sub expr may be an anchor or literal as well) - case '\\': - /* If this is "\(" or "\)" chew off the backslash and - try again. */ - if (!(ctx->cflags & REG_EXTENDED) - && ctx->re + 1 < ctx->re_end - && (*(ctx->re + 1) == '(' - || *(ctx->re + 1) == ')')) - { - ctx->re++; - STACK_PUSHX(stack, PARSE_ATOM); - break; - } +ERE grammar: +Regex = Branch | Regex '|' Branch +Branch = Atom | Branch Atom +Atom = char | quoted_char | '.' | Bracket | Atom Dup | '(' Regex ')' | '^' | '$' +Dup = '*' | '+' | '?' | '{' Count '}' | '{' Count ',}' | '{' Count ',' Count '}' - if (ctx->re + 1 >= ctx->re_end) - /* Trailing backslash. */ - return REG_EESCAPE; +(a*+?, ^*, $+, \X, {, (|a) are unspecified) +*/ - DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - ctx->re++; - switch (*ctx->re) - { - default: - if (!(ctx->cflags & REG_EXTENDED) && tre_isdigit(*ctx->re)) - { - /* Back reference. */ - int val = *ctx->re - '0'; - DPRINT(("tre_parse: backref: '%.*" STRF "'\n", - ctx->re_end - ctx->re + 1, ctx->re - 1)); - result = tre_ast_new_literal(ctx->mem, BACKREF, val, - ctx->position); - if (result == NULL) - return REG_ESPACE; - ctx->position++; - ctx->max_backref = MAX(val, ctx->max_backref); - ctx->re++; - } - else - { - /* Escaped character. */ - DPRINT(("tre_parse: escaped: '%.*" STRF "'\n", - ctx->re_end - ctx->re + 1, ctx->re - 1)); - result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, - ctx->position); - ctx->position++; - ctx->re++; - } - break; +static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s) +{ + int len, ere = ctx->cflags & REG_EXTENDED; + const char *p; + tre_ast_node_t *node; + wchar_t wc; + switch (*s) { + case '[': + return parse_bracket(ctx, s+1); + case '\\': + p = tre_expand_macro(s+1); + if (p) { + /* assume \X expansion is a single atom */ + reg_errcode_t err = parse_atom(ctx, p); + ctx->s = s+2; + return err; } - if (result == NULL) - return REG_ESPACE; - break; - - case '.': /* the any-symbol */ - DPRINT(("tre_parse: any: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - if (ctx->cflags & REG_NEWLINE) - { - tre_ast_node_t *tmp1; - tre_ast_node_t *tmp2; - tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1, - ctx->position); - if (!tmp1) - return REG_ESPACE; - tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX, - ctx->position + 1); - if (!tmp2) - return REG_ESPACE; - result = tre_ast_new_union(ctx->mem, tmp1, tmp2); - if (!result) - return REG_ESPACE; - ctx->position += 2; + /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */ + switch (*++s) { + case 0: + return REG_EESCAPE; + case 'b': + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1); + break; + case 'B': + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1); + break; + case '<': + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1); + break; + case '>': + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1); + break; + case 'x': + s++; + int i, v = 0, c; + len = 2; + if (*s == '{') { + len = 8; + s++; + } + for (i=0; imem, v, v, ctx->position); + ctx->position++; + s--; + break; + default: + if (!ere && (unsigned)*s-'1' < 9) { + /* back reference */ + int val = *s - '0'; + node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position); + ctx->max_backref = MAX(val, ctx->max_backref); + } else { + /* extension: accept unknown escaped char + as a literal */ + goto parse_literal; + } + ctx->position++; } - else - { - result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, - ctx->position); - if (!result) - return REG_ESPACE; - ctx->position++; + s++; + break; + case '.': + if (ctx->cflags & REG_NEWLINE) { + tre_ast_node_t *tmp1, *tmp2; + tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++); + tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++); + if (tmp1 && tmp2) + node = tre_ast_new_union(ctx->mem, tmp1, tmp2); + else + node = 0; + } else { + node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++); } - ctx->re++; - break; - - case '^': /* beginning of line assertion */ - /* '^' has a special meaning everywhere in EREs, and in the - beginning of the RE and after \( is BREs. */ - if (ctx->cflags & REG_EXTENDED - || (ctx->re - 2 >= ctx->re_start - && *(ctx->re - 2) == '\\' - && *(ctx->re - 1) == '(') - || ctx->re == ctx->re_start) - { - DPRINT(("tre_parse: BOL: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_BOL, -1); - if (result == NULL) - return REG_ESPACE; - ctx->re++; + s++; + break; + case '^': + /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */ + if (!ere && s != ctx->re) + goto parse_literal; + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1); + s++; + break; + case '$': + /* '$' is special everywhere in EREs, and in the end of the string in BREs. */ + if (!ere && s[1]) + goto parse_literal; + node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1); + s++; + break; + case '*': + case '|': + case '{': + case '+': + case '?': + if (!ere) + goto parse_literal; + case 0: + node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + break; + default: +parse_literal: + len = mbtowc(&wc, s, -1); + if (len < 0) + return REG_BADPAT; + if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) { + tre_ast_node_t *tmp1, *tmp2; + /* multiple opposite case characters are not supported */ + tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position); + tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position); + if (tmp1 && tmp2) + node = tre_ast_new_union(ctx->mem, tmp1, tmp2); + else + node = 0; + } else { + node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position); } - else - goto parse_literal; - break; + ctx->position++; + s += len; + break; + } + if (!node) + return REG_ESPACE; + ctx->n = node; + ctx->s = s; + return REG_OK; +} - case '$': /* end of line assertion. */ - /* '$' is special everywhere in EREs, and in the end of the - string and before \) is BREs. */ - if (ctx->cflags & REG_EXTENDED - || (ctx->re + 2 < ctx->re_end - && *(ctx->re + 1) == '\\' - && *(ctx->re + 2) == ')') - || ctx->re + 1 == ctx->re_end) - { - DPRINT(("tre_parse: EOL: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - result = tre_ast_new_literal(ctx->mem, ASSERTION, - ASSERT_AT_EOL, -1); - if (result == NULL) - return REG_ESPACE; - ctx->re++; - } - else - goto parse_literal; - break; +#define PUSHPTR(err, s, v) do { \ + if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \ + return err; \ +} while(0) - default: - parse_literal: - - /* We are expecting an atom. If the subexpression (or the whole - regexp ends here, we interpret it as an empty expression - (which matches an empty string). */ - if ( - (ctx->re >= ctx->re_end - || *ctx->re == '*' - || (ctx->cflags & REG_EXTENDED - && (*ctx->re == '|' - || *ctx->re == '{' - || *ctx->re == '+' - || *ctx->re == '?')) - /* Test for "\)" in BRE mode. */ - || (!(ctx->cflags & REG_EXTENDED) - && ctx->re + 1 < ctx->re_end - && *ctx->re == '\\' - && *(ctx->re + 1) == '{'))) - { - DPRINT(("tre_parse: empty: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (!result) - return REG_ESPACE; - break; - } +#define PUSHINT(err, s, v) do { \ + if ((err = tre_stack_push_int(s, v)) != REG_OK) \ + return err; \ +} while(0) - DPRINT(("tre_parse: literal: '%.*" STRF "'\n", - ctx->re_end - ctx->re, ctx->re)); - /* Note that we can't use an tre_isalpha() test here, since there - may be characters which are alphabetic but neither upper or - lower case. */ - if (ctx->cflags & REG_ICASE - && (tre_isupper(*ctx->re) || tre_islower(*ctx->re))) - { - tre_ast_node_t *tmp1; - tre_ast_node_t *tmp2; - - /* XXX - Can there be more than one opposite-case - counterpoints for some character in some locale? Or - more than two characters which all should be regarded - the same character if case is ignored? If yes, there - does not seem to be a portable way to detect it. I guess - that at least for multi-character collating elements there - could be several opposite-case counterpoints, but they - cannot be supported portably anyway. */ - tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re), - tre_toupper(*ctx->re), - ctx->position); - if (!tmp1) - return REG_ESPACE; - tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re), - tre_tolower(*ctx->re), - ctx->position); - if (!tmp2) - return REG_ESPACE; - result = tre_ast_new_union(ctx->mem, tmp1, tmp2); - if (!result) - return REG_ESPACE; +static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx) +{ + tre_ast_node_t *nbranch=0, *nunion=0; + int ere = ctx->cflags & REG_EXTENDED; + const char *s = ctx->re; + int subid = 0; + int depth = 0; + reg_errcode_t err; + tre_stack_t *stack = ctx->stack; + + PUSHINT(err, stack, subid++); + for (;;) { + if ((!ere && *s == '\\' && s[1] == '(') || + (ere && *s == '(')) { + PUSHPTR(err, stack, nunion); + PUSHPTR(err, stack, nbranch); + PUSHINT(err, stack, subid++); + s++; + if (!ere) + s++; + depth++; + nbranch = nunion = 0; + continue; } - else - { - result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re, - ctx->position); - if (!result) - return REG_ESPACE; + if ((!ere && *s == '\\' && s[1] == ')') || + (ere && *s == ')' && depth)) { + ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); + if (!ctx->n) + return REG_ESPACE; + } else { + err = parse_atom(ctx, s); + if (err != REG_OK) + return err; + s = ctx->s; } - ctx->position++; - ctx->re++; - break; - } - break; - - case PARSE_MARK_FOR_SUBMATCH: - { - int submatch_id = (int)tre_stack_pop(stack); - if (result->submatch_id >= 0) - { - tre_ast_node_t *n, *tmp_node; - n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1); - if (n == NULL) - return REG_ESPACE; - tmp_node = tre_ast_new_catenation(ctx->mem, n, result); - if (tmp_node == NULL) - return REG_ESPACE; - tmp_node->num_submatches = result->num_submatches; - result = tmp_node; - } - result->submatch_id = submatch_id; - result->num_submatches++; - break; - } + parse_iter: + /* extension: repetitions are accepted after an empty node + eg. (+), ^*, a$?, a|{2} */ + switch (*s) { + case '+': + case '?': + if (!ere) + break; + /* fallthrough */ + case '*':; + int min=0, max=-1; + if (*s == '+') + min = 1; + if (*s == '?') + max = 1; + s++; + ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0); + if (!ctx->n) + return REG_ESPACE; + /* extension: multiple consecutive *+?{,} is unspecified, + but (a+)+ has to be supported so accepting a++ makes + sense, note however that the RE_DUP_MAX limit can be + circumvented: (a{255}){255} uses a lot of memory.. */ + goto parse_iter; + case '\\': + if (ere || s[1] != '{') + break; + s++; + goto parse_brace; + case '{': + if (!ere) + break; + parse_brace: + err = parse_dup(ctx, s+1); + if (err != REG_OK) + return err; + s = ctx->s; + goto parse_iter; + } - case PARSE_RESTORE_CFLAGS: - ctx->cflags = (int)tre_stack_pop(stack); - break; + nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n); + if ((ere && *s == '|') || + (ere && *s == ')' && depth) || + (!ere && *s == '\\' && s[1] == ')') || + !*s) { + /* extension: empty branch is unspecified (), (|a), (a|) + here they are not rejected but match on empty string */ + int c = *s; + nunion = tre_ast_new_union(ctx->mem, nunion, nbranch); + nbranch = 0; + if (c != '|') { + if (c == '\\') { + if (!depth) return REG_EPAREN; + s+=2; + } else if (c == ')') + s++; + depth--; + err = marksub(ctx, nunion, tre_stack_pop_int(stack)); + if (err != REG_OK) + return err; + if (!c && depth<0) { + ctx->submatch_id = subid; + return REG_OK; + } + if (!c || depth<0) + return REG_EPAREN; + nbranch = tre_stack_pop_voidptr(stack); + nunion = tre_stack_pop_voidptr(stack); + goto parse_iter; + } + s++; + } } - } - - /* Check for missing closing parentheses. */ - if (depth > 0) - return REG_EPAREN; - - if (status == REG_OK) - ctx->result = result; - - return status; } @@ -1421,6 +1045,13 @@ tre_parse(tre_parse_ctx_t *ctx) from tre-compile.c ***********************************************************************/ + +/* + TODO: + - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive + function calls. +*/ + /* Algorithms to setup tags so that submatch addressing can be done. */ @@ -1429,42 +1060,60 @@ tre_parse(tre_parse_ctx_t *ctx) /* Inserts a catenation node to the root of the tree given in `node'. As the left child a new tag with number `tag_id' to `node' is added, and the right child is the old root. */ -/* OR */ +static reg_errcode_t +tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id) +{ + tre_catenation_t *c; + + c = tre_mem_alloc(mem, sizeof(*c)); + if (c == NULL) + return REG_ESPACE; + c->left = tre_ast_new_literal(mem, TAG, tag_id, -1); + if (c->left == NULL) + return REG_ESPACE; + c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); + if (c->right == NULL) + return REG_ESPACE; + + c->right->obj = node->obj; + c->right->type = node->type; + c->right->nullable = -1; + c->right->submatch_id = -1; + c->right->firstpos = NULL; + c->right->lastpos = NULL; + c->right->num_tags = 0; + node->obj = c; + node->type = CATENATION; + return REG_OK; +} + /* Inserts a catenation node to the root of the tree given in `node'. As the right child a new tag with number `tag_id' to `node' is added, and the left child is the old root. */ static reg_errcode_t -tre_add_tag(tre_mem_t mem, tre_ast_node_t *node, int tag_id, int right) +tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id) { tre_catenation_t *c; - tre_ast_node_t *child_tag, *child_old; - - DPRINT(("add_tag_%s: tag %d\n", right ? "right" : "left", tag_id)); c = tre_mem_alloc(mem, sizeof(*c)); if (c == NULL) return REG_ESPACE; - child_tag = tre_ast_new_literal(mem, TAG, tag_id, -1); - if (child_tag == NULL) + c->right = tre_ast_new_literal(mem, TAG, tag_id, -1); + if (c->right == NULL) return REG_ESPACE; - child_old = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); - if (child_old == NULL) + c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t)); + if (c->left == NULL) return REG_ESPACE; - child_old->obj = node->obj; - child_old->type = node->type; - child_old->nullable = -1; - child_old->submatch_id = -1; - child_old->firstpos = NULL; - child_old->lastpos = NULL; - child_old->num_tags = 0; + c->left->obj = node->obj; + c->left->type = node->type; + c->left->nullable = -1; + c->left->submatch_id = -1; + c->left->firstpos = NULL; + c->left->lastpos = NULL; + c->left->num_tags = 0; node->obj = c; node->type = CATENATION; - - c->right = c->left = child_old; - if (right) c->right = child_tag; - else c->left = child_tag; - return REG_OK; } @@ -1484,6 +1133,27 @@ typedef struct { int next_tag; } tre_tag_states_t; + +/* Go through `regset' and set submatch data for submatches that are + using this tag. */ +static void +tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag) +{ + int i; + + for (i = 0; regset[i] >= 0; i++) + { + int id = regset[i] / 2; + int start = !(regset[i] % 2); + if (start) + tnfa->submatch_data[id].so_tag = tag; + else + tnfa->submatch_data[id].eo_tag = tag; + } + regset[0] = -1; +} + + /* Adds tags to appropriate locations in the parse tree in `tree', so that subexpressions marked for submatch addressing can be traced. */ static reg_errcode_t @@ -1498,15 +1168,20 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, int first_pass = (mem == NULL || tnfa == NULL); int *regset, *orig_regset; int num_tags = 0; /* Total number of tags. */ + int num_minimals = 0; /* Number of special minimal tags. */ int tag = 0; /* The tag that is to be added next. */ int next_tag = 1; /* Next tag to use after this one. */ int *parents; /* Stack of submatches the current submatch is contained in. */ + int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */ tre_tag_states_t *saved_states; tre_tag_direction_t direction = TRE_TAG_MINIMIZE; if (!first_pass) + { tnfa->end_tag = 0; + tnfa->minimal_tags[0] = -1; + } regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2)); if (regset == NULL) @@ -1536,21 +1211,21 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, saved_states[i].tag = -1; } - STACK_PUSH(stack, node); - STACK_PUSH(stack, ADDTAGS_RECURSE); + STACK_PUSH(stack, voidptr, node); + STACK_PUSH(stack, int, ADDTAGS_RECURSE); while (tre_stack_num_objects(stack) > bottom) { if (status != REG_OK) break; - symbol = (tre_addtags_symbol_t)tre_stack_pop(stack); + symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack); switch (symbol) { case ADDTAGS_SET_SUBMATCH_END: { - int id = (int)tre_stack_pop(stack); + int id = tre_stack_pop_int(stack); int i; /* Add end of this submatch to regset. */ @@ -1565,7 +1240,7 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, } case ADDTAGS_RECURSE: - node = tre_stack_pop(stack); + node = tre_stack_pop_voidptr(stack); if (node->submatch_id >= 0) { @@ -1600,8 +1275,8 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, /* Add end of this submatch to regset after processing this node. */ - STACK_PUSHX(stack, node->submatch_id); - STACK_PUSHX(stack, ADDTAGS_SET_SUBMATCH_END); + STACK_PUSHX(stack, int, node->submatch_id); + STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END); } switch (node->type) @@ -1613,38 +1288,30 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, if (!IS_SPECIAL(lit) || IS_BACKREF(lit)) { int i; - DPRINT(("Literal %d-%d\n", - (int)lit->code_min, (int)lit->code_max)); if (regset[0] >= 0) { /* Regset is not empty, so add a tag before the literal or backref. */ if (!first_pass) { - status = tre_add_tag(mem, node, tag, 0 /*left*/); + status = tre_add_tag_left(mem, node, tag); tnfa->tag_directions[tag] = direction; - /* Go through the regset and set submatch data for - submatches that are using this tag. */ - for (i = 0; regset[i] >= 0; i++) + if (minimal_tag >= 0) { - int id = regset[i] >> 1; - int start = !(regset[i] & 1); - DPRINT((" Using tag %d for %s offset of " - "submatch %d\n", tag, - start ? "start" : "end", id)); - if (start) - tnfa->submatch_data[id].so_tag = tag; - else - tnfa->submatch_data[id].eo_tag = tag; + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; } + tre_purge_regset(regset, tnfa, tag); } else { - DPRINT((" num_tags = 1\n")); node->num_tags = 1; } - DPRINT((" num_tags++\n")); regset[0] = -1; tag = next_tag; num_tags++; @@ -1663,82 +1330,75 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, tre_ast_node_t *left = cat->left; tre_ast_node_t *right = cat->right; int reserved_tag = -1; - DPRINT(("Catenation, next_tag = %d\n", next_tag)); /* After processing right child. */ - STACK_PUSHX(stack, node); - STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_RIGHT); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT); /* Process right child. */ - STACK_PUSHX(stack, right); - STACK_PUSHX(stack, ADDTAGS_RECURSE); + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); /* After processing left child. */ - STACK_PUSHX(stack, next_tag + left->num_tags); - DPRINT((" Pushing %d for after left\n", - next_tag + left->num_tags)); + STACK_PUSHX(stack, int, next_tag + left->num_tags); if (left->num_tags > 0 && right->num_tags > 0) { /* Reserve the next tag to the right child. */ - DPRINT((" Reserving next_tag %d to right child\n", - next_tag)); reserved_tag = next_tag; next_tag++; } - STACK_PUSHX(stack, reserved_tag); - STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_LEFT); + STACK_PUSHX(stack, int, reserved_tag); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT); /* Process left child. */ - STACK_PUSHX(stack, left); - STACK_PUSHX(stack, ADDTAGS_RECURSE); + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); } break; case ITERATION: { tre_iteration_t *iter = node->obj; - DPRINT(("Iteration\n")); if (first_pass) { - STACK_PUSHX(stack, regset[0] >= 0); + STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal); } else { - STACK_PUSHX(stack, tag); + STACK_PUSHX(stack, int, tag); + STACK_PUSHX(stack, int, iter->minimal); } - STACK_PUSHX(stack, node); - STACK_PUSHX(stack, ADDTAGS_AFTER_ITERATION); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION); - STACK_PUSHX(stack, iter->arg); - STACK_PUSHX(stack, ADDTAGS_RECURSE); + STACK_PUSHX(stack, voidptr, iter->arg); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); /* Regset is not empty, so add a tag here. */ - if (regset[0] >= 0) + if (regset[0] >= 0 || iter->minimal) { if (!first_pass) { int i; - status = tre_add_tag(mem, node, tag, 0 /*left*/); - tnfa->tag_directions[tag] = direction; - /* Go through the regset and set submatch data for - submatches that are using this tag. */ - for (i = 0; regset[i] >= 0; i++) + status = tre_add_tag_left(mem, node, tag); + if (iter->minimal) + tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; + else + tnfa->tag_directions[tag] = direction; + if (minimal_tag >= 0) { - int id = regset[i] >> 1; - int start = !(regset[i] & 1); - DPRINT((" Using tag %d for %s offset of " - "submatch %d\n", tag, - start ? "start" : "end", id)); - if (start) - tnfa->submatch_data[id].so_tag = tag; - else - tnfa->submatch_data[id].eo_tag = tag; + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; } + tre_purge_regset(regset, tnfa, tag); } - DPRINT((" num_tags++\n")); regset[0] = -1; tag = next_tag; num_tags++; @@ -1766,28 +1426,26 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, right_tag = next_tag; } - DPRINT(("Union\n")); - /* After processing right child. */ - STACK_PUSHX(stack, right_tag); - STACK_PUSHX(stack, left_tag); - STACK_PUSHX(stack, regset); - STACK_PUSHX(stack, regset[0] >= 0); - STACK_PUSHX(stack, node); - STACK_PUSHX(stack, right); - STACK_PUSHX(stack, left); - STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_RIGHT); + STACK_PUSHX(stack, int, right_tag); + STACK_PUSHX(stack, int, left_tag); + STACK_PUSHX(stack, voidptr, regset); + STACK_PUSHX(stack, int, regset[0] >= 0); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT); /* Process right child. */ - STACK_PUSHX(stack, right); - STACK_PUSHX(stack, ADDTAGS_RECURSE); + STACK_PUSHX(stack, voidptr, right); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); /* After processing left child. */ - STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_LEFT); + STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT); /* Process left child. */ - STACK_PUSHX(stack, left); - STACK_PUSHX(stack, ADDTAGS_RECURSE); + STACK_PUSHX(stack, voidptr, left); + STACK_PUSHX(stack, int, ADDTAGS_RECURSE); /* Regset is not empty, so add a tag here. */ if (regset[0] >= 0) @@ -1795,25 +1453,20 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, if (!first_pass) { int i; - status = tre_add_tag(mem, node, tag, 0 /*left*/); + status = tre_add_tag_left(mem, node, tag); tnfa->tag_directions[tag] = direction; - /* Go through the regset and set submatch data for - submatches that are using this tag. */ - for (i = 0; regset[i] >= 0; i++) + if (minimal_tag >= 0) { - int id = regset[i] >> 1; - int start = !(regset[i] & 1); - DPRINT((" Using tag %d for %s offset of " - "submatch %d\n", tag, - start ? "start" : "end", id)); - if (start) - tnfa->submatch_data[id].so_tag = tag; - else - tnfa->submatch_data[id].eo_tag = tag; + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; } + tre_purge_regset(regset, tnfa, tag); } - DPRINT((" num_tags++\n")); regset[0] = -1; tag = next_tag; num_tags++; @@ -1845,43 +1498,52 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, case ADDTAGS_AFTER_ITERATION: { + int minimal = 0; int enter_tag; - node = tre_stack_pop(stack); + node = tre_stack_pop_voidptr(stack); if (first_pass) + { node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags - + (int)tre_stack_pop(stack); + + tre_stack_pop_int(stack); + minimal_tag = -1; + } else - enter_tag = (int)tre_stack_pop(stack); + { + minimal = tre_stack_pop_int(stack); + enter_tag = tre_stack_pop_int(stack); + if (minimal) + minimal_tag = enter_tag; + } - DPRINT(("After iteration\n")); - direction = TRE_TAG_MAXIMIZE; + if (!first_pass) + { + if (minimal) + direction = TRE_TAG_MINIMIZE; + else + direction = TRE_TAG_MAXIMIZE; + } break; } case ADDTAGS_AFTER_CAT_LEFT: { - int new_tag = (int)tre_stack_pop(stack); - next_tag = (int)tre_stack_pop(stack); - DPRINT(("After cat left, tag = %d, next_tag = %d\n", - tag, next_tag)); + int new_tag = tre_stack_pop_int(stack); + next_tag = tre_stack_pop_int(stack); if (new_tag >= 0) { - DPRINT((" Setting tag to %d\n", new_tag)); tag = new_tag; } break; } case ADDTAGS_AFTER_CAT_RIGHT: - DPRINT(("After cat right\n")); - node = tre_stack_pop(stack); + node = tre_stack_pop_voidptr(stack); if (first_pass) node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags + ((tre_catenation_t *)node->obj)->right->num_tags; break; case ADDTAGS_AFTER_UNION_LEFT: - DPRINT(("After union left\n")); /* Lift the bottom of the `regset' array so that when processing the right operand the items currently in the array are invisible. The original bottom was saved at ADDTAGS_UNION and @@ -1893,20 +1555,19 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, case ADDTAGS_AFTER_UNION_RIGHT: { int added_tags, tag_left, tag_right; - tre_ast_node_t *left = tre_stack_pop(stack); - tre_ast_node_t *right = tre_stack_pop(stack); - DPRINT(("After union right\n")); - node = tre_stack_pop(stack); - added_tags = (int)tre_stack_pop(stack); + tre_ast_node_t *left = tre_stack_pop_voidptr(stack); + tre_ast_node_t *right = tre_stack_pop_voidptr(stack); + node = tre_stack_pop_voidptr(stack); + added_tags = tre_stack_pop_int(stack); if (first_pass) { node->num_tags = ((tre_union_t *)node->obj)->left->num_tags + ((tre_union_t *)node->obj)->right->num_tags + added_tags + ((node->num_submatches > 0) ? 2 : 0); } - regset = tre_stack_pop(stack); - tag_left = (int)tre_stack_pop(stack); - tag_right = (int)tre_stack_pop(stack); + regset = tre_stack_pop_voidptr(stack); + tag_left = tre_stack_pop_int(stack); + tag_right = tre_stack_pop_int(stack); /* Add tags after both children, the left child gets a smaller tag than the right child. This guarantees that we prefer @@ -1921,12 +1582,12 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, { if (!first_pass) { - status = tre_add_tag(mem, left, tag_left, 1 /*right*/); - tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; - status = tre_add_tag(mem, right, tag_right, 1 /*right*/); - tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE; + status = tre_add_tag_right(mem, left, tag_left); + tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE; + if (status == REG_OK) + status = tre_add_tag_right(mem, right, tag_right); + tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE; } - DPRINT((" num_tags += 2\n")); num_tags += 2; } direction = TRE_TAG_MAXIMIZE; @@ -1941,30 +1602,23 @@ tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, } /* end while(tre_stack_num_objects(stack) > bottom) */ if (!first_pass) + tre_purge_regset(regset, tnfa, tag); + + if (!first_pass && minimal_tag >= 0) { int i; - /* Go through the regset and set submatch data for - submatches that are using this tag. */ - for (i = 0; regset[i] >= 0; i++) - { - int id = regset[i] >> 1; - int start = !(regset[i] & 1); - DPRINT((" Using tag %d for %s offset of " - "submatch %d\n", num_tags, - start ? "start" : "end", id)); - if (start) - tnfa->submatch_data[id].so_tag = num_tags; - else - tnfa->submatch_data[id].eo_tag = num_tags; - } + for (i = 0; tnfa->minimal_tags[i] >= 0; i++); + tnfa->minimal_tags[i] = tag; + tnfa->minimal_tags[i + 1] = minimal_tag; + tnfa->minimal_tags[i + 2] = -1; + minimal_tag = -1; + num_minimals++; } - DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n", - first_pass? "First pass" : "Second pass", num_tags)); - assert(tree->num_tags == num_tags); tnfa->end_tag = num_tags; tnfa->num_tags = num_tags; + tnfa->num_minimals = num_minimals; xfree(orig_regset); xfree(parents); xfree(saved_states); @@ -1998,8 +1652,8 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, tre_ast_node_t **result = copy; tre_copyast_symbol_t symbol; - STACK_PUSH(stack, ast); - STACK_PUSH(stack, COPY_RECURSE); + STACK_PUSH(stack, voidptr, ast); + STACK_PUSH(stack, int, COPY_RECURSE); while (status == REG_OK && tre_stack_num_objects(stack) > bottom) { @@ -2007,14 +1661,14 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, if (status != REG_OK) break; - symbol = (tre_copyast_symbol_t)tre_stack_pop(stack); + symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack); switch (symbol) { case COPY_SET_RESULT_PTR: - result = tre_stack_pop(stack); + result = tre_stack_pop_voidptr(stack); break; case COPY_RECURSE: - node = tre_stack_pop(stack); + node = tre_stack_pop_voidptr(stack); switch (node->type) { case LITERAL: @@ -2047,6 +1701,11 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, *result = tre_ast_new_literal(mem, min, max, pos); if (*result == NULL) status = REG_ESPACE; + else { + tre_literal_t *p = (*result)->obj; + p->class = lit->class; + p->neg_classes = lit->neg_classes; + } if (pos > *max_pos) *max_pos = pos; @@ -2055,52 +1714,53 @@ tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, case UNION: { tre_union_t *uni = node->obj; - tre_union_t *copy; + tre_union_t *tmp; *result = tre_ast_new_union(mem, uni->left, uni->right); if (*result == NULL) { status = REG_ESPACE; break; } - copy = (*result)->obj; - result = ©->left; - STACK_PUSHX(stack, uni->right); - STACK_PUSHX(stack, COPY_RECURSE); - STACK_PUSHX(stack, ©->right); - STACK_PUSHX(stack, COPY_SET_RESULT_PTR); - STACK_PUSHX(stack, uni->left); - STACK_PUSHX(stack, COPY_RECURSE); + tmp = (*result)->obj; + result = &tmp->left; + STACK_PUSHX(stack, voidptr, uni->right); + STACK_PUSHX(stack, int, COPY_RECURSE); + STACK_PUSHX(stack, voidptr, &tmp->right); + STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR); + STACK_PUSHX(stack, voidptr, uni->left); + STACK_PUSHX(stack, int, COPY_RECURSE); break; } case CATENATION: { tre_catenation_t *cat = node->obj; - tre_catenation_t *copy; + tre_catenation_t *tmp; *result = tre_ast_new_catenation(mem, cat->left, cat->right); if (*result == NULL) { status = REG_ESPACE; break; } - copy = (*result)->obj; - copy->left = NULL; - copy->right = NULL; - result = ©->left; - - STACK_PUSHX(stack, cat->right); - STACK_PUSHX(stack, COPY_RECURSE); - STACK_PUSHX(stack, ©->right); - STACK_PUSHX(stack, COPY_SET_RESULT_PTR); - STACK_PUSHX(stack, cat->left); - STACK_PUSHX(stack, COPY_RECURSE); + tmp = (*result)->obj; + tmp->left = NULL; + tmp->right = NULL; + result = &tmp->left; + + STACK_PUSHX(stack, voidptr, cat->right); + STACK_PUSHX(stack, int, COPY_RECURSE); + STACK_PUSHX(stack, voidptr, &tmp->right); + STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR); + STACK_PUSHX(stack, voidptr, cat->left); + STACK_PUSHX(stack, int, COPY_RECURSE); break; } case ITERATION: { tre_iteration_t *iter = node->obj; - STACK_PUSHX(stack, iter->arg); - STACK_PUSHX(stack, COPY_RECURSE); - *result = tre_ast_new_iter(mem, iter->arg, iter->min, iter->max); + STACK_PUSHX(stack, voidptr, iter->arg); + STACK_PUSHX(stack, int, COPY_RECURSE); + *result = tre_ast_new_iter(mem, iter->arg, iter->min, + iter->max, iter->minimal); if (*result == NULL) { status = REG_ESPACE; @@ -2130,19 +1790,17 @@ typedef enum { iteration count to a catenated sequence of copies of the node. */ static reg_errcode_t tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, - int *position, tre_tag_direction_t *tag_directions, - int *max_depth) + int *position, tre_tag_direction_t *tag_directions) { reg_errcode_t status = REG_OK; int bottom = tre_stack_num_objects(stack); int pos_add = 0; int pos_add_total = 0; int max_pos = 0; - /* Approximate parameter nesting level. */ int iter_depth = 0; - STACK_PUSHR(stack, ast); - STACK_PUSHR(stack, EXPAND_RECURSE); + STACK_PUSHR(stack, voidptr, ast); + STACK_PUSHR(stack, int, EXPAND_RECURSE); while (status == REG_OK && tre_stack_num_objects(stack) > bottom) { tre_ast_node_t *node; @@ -2151,10 +1809,8 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, if (status != REG_OK) break; - DPRINT(("pos_add %d\n", pos_add)); - - symbol = (tre_expand_ast_symbol_t)tre_stack_pop(stack); - node = tre_stack_pop(stack); + symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack); + node = tre_stack_pop_voidptr(stack); switch (symbol) { case EXPAND_RECURSE: @@ -2174,36 +1830,35 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, case UNION: { tre_union_t *uni = node->obj; - STACK_PUSHX(stack, uni->right); - STACK_PUSHX(stack, EXPAND_RECURSE); - STACK_PUSHX(stack, uni->left); - STACK_PUSHX(stack, EXPAND_RECURSE); + STACK_PUSHX(stack, voidptr, uni->right); + STACK_PUSHX(stack, int, EXPAND_RECURSE); + STACK_PUSHX(stack, voidptr, uni->left); + STACK_PUSHX(stack, int, EXPAND_RECURSE); break; } case CATENATION: { tre_catenation_t *cat = node->obj; - STACK_PUSHX(stack, cat->right); - STACK_PUSHX(stack, EXPAND_RECURSE); - STACK_PUSHX(stack, cat->left); - STACK_PUSHX(stack, EXPAND_RECURSE); + STACK_PUSHX(stack, voidptr, cat->right); + STACK_PUSHX(stack, int, EXPAND_RECURSE); + STACK_PUSHX(stack, voidptr, cat->left); + STACK_PUSHX(stack, int, EXPAND_RECURSE); break; } case ITERATION: { tre_iteration_t *iter = node->obj; - STACK_PUSHX(stack, pos_add); - STACK_PUSHX(stack, node); - STACK_PUSHX(stack, EXPAND_AFTER_ITER); - STACK_PUSHX(stack, iter->arg); - STACK_PUSHX(stack, EXPAND_RECURSE); + STACK_PUSHX(stack, int, pos_add); + STACK_PUSHX(stack, voidptr, node); + STACK_PUSHX(stack, int, EXPAND_AFTER_ITER); + STACK_PUSHX(stack, voidptr, iter->arg); + STACK_PUSHX(stack, int, EXPAND_RECURSE); /* If we are going to expand this node at EXPAND_AFTER_ITER then don't increase the `pos' fields of the nodes now, it will get done when expanding. */ if (iter->min > 1 || iter->max > 1) pos_add = 0; iter_depth++; - DPRINT(("iter\n")); break; } default: @@ -2215,23 +1870,22 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, { tre_iteration_t *iter = node->obj; int pos_add_last; - pos_add = (int)tre_stack_pop(stack); + pos_add = tre_stack_pop_int(stack); pos_add_last = pos_add; if (iter->min > 1 || iter->max > 1) { tre_ast_node_t *seq1 = NULL, *seq2 = NULL; - int i; + int j; int pos_add_save = pos_add; /* Create a catenated sequence of copies of the node. */ - for (i = 0; i < iter->min; i++) + for (j = 0; j < iter->min; j++) { tre_ast_node_t *copy; /* Remove tags from all but the last copy. */ - int flags = ((i + 1 < iter->min) + int flags = ((j + 1 < iter->min) ? COPY_REMOVE_TAGS : COPY_MAXIMIZE_FIRST_TAG); - DPRINT((" pos_add %d\n", pos_add)); pos_add_save = pos_add; status = tre_copy_ast(mem, stack, iter->arg, flags, &pos_add, tag_directions, ©, @@ -2254,13 +1908,13 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, &pos_add, NULL, &seq2, &max_pos); if (status != REG_OK) return status; - seq2 = tre_ast_new_iter(mem, seq2, 0, -1); + seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0); if (seq2 == NULL) return REG_ESPACE; } else { - for (i = iter->min; i < iter->max; i++) + for (j = iter->min; j < iter->max; j++) { tre_ast_node_t *tmp, *copy; pos_add_save = pos_add; @@ -2316,12 +1970,6 @@ tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast, if (max_pos > *position) *position = max_pos; -#ifdef TRE_DEBUG - DPRINT(("Expanded AST:\n")); - tre_ast_print(ast); - DPRINT(("*position %d, max_pos %d\n", *position, max_pos)); -#endif - return status; } @@ -2453,12 +2101,12 @@ tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, if (num_tags_seen) *num_tags_seen = 0; - status = tre_stack_push(stack, node); + status = tre_stack_push_voidptr(stack, node); /* Walk through the tree recursively. */ while (status == REG_OK && tre_stack_num_objects(stack) > bottom) { - node = tre_stack_pop(stack); + node = tre_stack_pop_voidptr(stack); switch (node->type) { @@ -2505,9 +2153,9 @@ tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, right subexpression. */ uni = (tre_union_t *)node->obj; if (uni->left->nullable) - STACK_PUSHX(stack, uni->left) + STACK_PUSHX(stack, voidptr, uni->left) else if (uni->right->nullable) - STACK_PUSHX(stack, uni->right) + STACK_PUSHX(stack, voidptr, uni->right) else assert(0); break; @@ -2517,8 +2165,8 @@ tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, cat = (tre_catenation_t *)node->obj; assert(cat->left->nullable); assert(cat->right->nullable); - STACK_PUSHX(stack, cat->left); - STACK_PUSHX(stack, cat->right); + STACK_PUSHX(stack, voidptr, cat->left); + STACK_PUSHX(stack, voidptr, cat->right); break; case ITERATION: @@ -2526,7 +2174,7 @@ tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags, all, so we go through the argument if possible. */ iter = (tre_iteration_t *)node->obj; if (iter->arg->nullable) - STACK_PUSHX(stack, iter->arg); + STACK_PUSHX(stack, voidptr, iter->arg); break; default: @@ -2554,16 +2202,16 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) { int bottom = tre_stack_num_objects(stack); - STACK_PUSHR(stack, tree); - STACK_PUSHR(stack, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, tree); + STACK_PUSHR(stack, int, NFL_RECURSE); while (tre_stack_num_objects(stack) > bottom) { tre_nfl_stack_symbol_t symbol; tre_ast_node_t *node; - symbol = (tre_nfl_stack_symbol_t) tre_stack_pop(stack); - node = tre_stack_pop(stack); + symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack); + node = tre_stack_pop_voidptr(stack); switch (symbol) { case NFL_RECURSE: @@ -2583,13 +2231,13 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) return REG_ESPACE; node->lastpos = tre_set_one(mem, lit->position, 0, TRE_CHAR_MAX, 0, NULL, - lit->code_max); + (int)lit->code_max); if (!node->lastpos) return REG_ESPACE; } else if (lit->code_min < 0) { - /* Tags, empty strings and zero width assertions: + /* Tags, empty strings, params, and zero width assertions: nullable = true, firstpos = {}, and lastpos = {}. */ node->nullable = 1; node->firstpos = tre_set_empty(mem); @@ -2605,12 +2253,13 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) lastpos = {i}. */ node->nullable = 0; node->firstpos = - tre_set_one(mem, lit->position, lit->code_min, - lit->code_max, 0, NULL, -1); + tre_set_one(mem, lit->position, (int)lit->code_min, + (int)lit->code_max, 0, NULL, -1); if (!node->firstpos) return REG_ESPACE; node->lastpos = tre_set_one(mem, lit->position, - lit->code_min, lit->code_max, + (int)lit->code_min, + (int)lit->code_max, lit->class, lit->neg_classes, -1); if (!node->lastpos) @@ -2622,32 +2271,32 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) case UNION: /* Compute the attributes for the two subtrees, and after that for this node. */ - STACK_PUSHR(stack, node); - STACK_PUSHR(stack, NFL_POST_UNION); - STACK_PUSHR(stack, ((tre_union_t *)node->obj)->right); - STACK_PUSHR(stack, NFL_RECURSE); - STACK_PUSHR(stack, ((tre_union_t *)node->obj)->left); - STACK_PUSHR(stack, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, node); + STACK_PUSHR(stack, int, NFL_POST_UNION); + STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right); + STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left); + STACK_PUSHR(stack, int, NFL_RECURSE); break; case CATENATION: /* Compute the attributes for the two subtrees, and after that for this node. */ - STACK_PUSHR(stack, node); - STACK_PUSHR(stack, NFL_POST_CATENATION); - STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->right); - STACK_PUSHR(stack, NFL_RECURSE); - STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->left); - STACK_PUSHR(stack, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, node); + STACK_PUSHR(stack, int, NFL_POST_CATENATION); + STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right); + STACK_PUSHR(stack, int, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left); + STACK_PUSHR(stack, int, NFL_RECURSE); break; case ITERATION: /* Compute the attributes for the subtree, and after that for this node. */ - STACK_PUSHR(stack, node); - STACK_PUSHR(stack, NFL_POST_ITERATION); - STACK_PUSHR(stack, ((tre_iteration_t *)node->obj)->arg); - STACK_PUSHR(stack, NFL_RECURSE); + STACK_PUSHR(stack, voidptr, node); + STACK_PUSHR(stack, int, NFL_POST_ITERATION); + STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg); + STACK_PUSHR(stack, int, NFL_RECURSE); break; } break; /* end case: NFL_RECURSE */ @@ -2691,7 +2340,8 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) if (cat->left->nullable) { /* The left side matches the empty string. Make a first pass - with tre_match_empty() to get the number of tags. */ + with tre_match_empty() to get the number of tags and + parameters. */ status = tre_match_empty(stack, cat->left, NULL, NULL, &num_tags); if (status != REG_OK) @@ -2703,7 +2353,7 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) tags[0] = -1; assertions = 0; /* Second pass with tre_mach_empty() to get the list of - tags. */ + tags and parameters. */ status = tre_match_empty(stack, cat->left, tags, &assertions, NULL); if (status != REG_OK) @@ -2727,7 +2377,8 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) if (cat->right->nullable) { /* The right side matches the empty string. Make a first pass - with tre_match_empty() to get the number of tags. */ + with tre_match_empty() to get the number of tags and + parameters. */ status = tre_match_empty(stack, cat->right, NULL, NULL, &num_tags); if (status != REG_OK) @@ -2739,7 +2390,7 @@ tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree) tags[0] = -1; assertions = 0; /* Second pass with tre_mach_empty() to get the list of - tags. */ + tags and parameters. */ status = tre_match_empty(stack, cat->right, tags, &assertions, NULL); if (status != REG_OK) @@ -2814,7 +2465,6 @@ tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, detect it here. */ if (trans->state_id == p2->position) { - DPRINT(("*")); break; } #endif @@ -2903,39 +2553,6 @@ tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2, trans->tags[l] = -1; } - -#ifdef TRE_DEBUG - { - int *tags; - - DPRINT((" %2d -> %2d on %3d", p1->position, p2->position, - p1->code_min)); - if (p1->code_max != p1->code_min) - DPRINT(("-%3d", p1->code_max)); - tags = trans->tags; - if (tags) - { - DPRINT((", tags [")); - while (*tags >= 0) - { - DPRINT(("%d", *tags)); - tags++; - if (*tags >= 0) - DPRINT((",")); - } - DPRINT(("]")); - } - if (trans->assertions) - DPRINT((", assert %d", trans->assertions)); - if (trans->assertions & ASSERT_BACKREF) - DPRINT((", backref %d", trans->u.backref)); - else if (trans->class) - DPRINT((", class %ld", (long)trans->class)); - if (trans->neg_classes) - DPRINT((", neg_classes %p", trans->neg_classes)); - DPRINT(("\n")); - } -#endif /* TRE_DEBUG */ p2++; } p1++; @@ -3017,63 +2634,18 @@ tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions, } -static void -tre_free(regex_t *preg) -{ - tre_tnfa_t *tnfa; - unsigned int i; - tre_tnfa_transition_t *trans; - - tnfa = (void *)preg->TRE_REGEX_T_FIELD; - if (!tnfa) - return; - - for (i = 0; i < tnfa->num_transitions; i++) - if (tnfa->transitions[i].state) - { - if (tnfa->transitions[i].tags) - xfree(tnfa->transitions[i].tags); - if (tnfa->transitions[i].neg_classes) - xfree(tnfa->transitions[i].neg_classes); - } - if (tnfa->transitions) - xfree(tnfa->transitions); - - if (tnfa->initial) - { - for (trans = tnfa->initial; trans->state; trans++) - { - if (trans->tags) - xfree(trans->tags); - } - xfree(tnfa->initial); - } - - if (tnfa->submatch_data) - { - for (i = 0; i < tnfa->num_submatches; i++) - if (tnfa->submatch_data[i].parents) - xfree(tnfa->submatch_data[i].parents); - xfree(tnfa->submatch_data); - } - - if (tnfa->tag_directions) - xfree(tnfa->tag_directions); - xfree(tnfa); -} - - #define ERROR_EXIT(err) \ do \ { \ errcode = err; \ - if (1) goto error_exit; \ + if (/*CONSTCOND*/1) \ + goto error_exit; \ } \ - while (0) + while (/*CONSTCOND*/0) -static int -tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) +int +regcomp(regex_t *restrict preg, const char *restrict regex, int cflags) { tre_stack_t *stack; tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r; @@ -3108,15 +2680,13 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) parse_ctx.mem = mem; parse_ctx.stack = stack; parse_ctx.re = regex; - parse_ctx.len = n; parse_ctx.cflags = cflags; parse_ctx.max_backref = -1; - DPRINT(("tre_compile: parsing '%.*" STRF "'\n", n, regex)); errcode = tre_parse(&parse_ctx); if (errcode != REG_OK) ERROR_EXIT(errcode); - preg->re_nsub = preg->__nsub2 = parse_ctx.submatch_id - 1; - tree = parse_ctx.result; + preg->re_nsub = parse_ctx.submatch_id - 1; + tree = parse_ctx.n; #ifdef TRE_DEBUG tre_ast_print(tree); @@ -3131,21 +2701,18 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) if (tnfa == NULL) ERROR_EXIT(REG_ESPACE); tnfa->have_backrefs = parse_ctx.max_backref >= 0; + tnfa->have_approx = 0; tnfa->num_submatches = parse_ctx.submatch_id; /* Set up tags for submatch addressing. If REG_NOSUB is set and the regexp does not have back references, this can be skipped. */ if (tnfa->have_backrefs || !(cflags & REG_NOSUB)) { - DPRINT(("tre_compile: setting up tags\n")); /* Figure out how many tags we will need. */ errcode = tre_add_tags(NULL, stack, tree, tnfa); if (errcode != REG_OK) ERROR_EXIT(errcode); -#ifdef TRE_DEBUG - tre_ast_print(tree); -#endif /* TRE_DEBUG */ if (tnfa->num_tags > 0) { @@ -3157,8 +2724,13 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) memset(tag_directions, -1, sizeof(*tag_directions) * (tnfa->num_tags + 1)); } + tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1, + sizeof(*tnfa->minimal_tags)); + if (tnfa->minimal_tags == NULL) + ERROR_EXIT(REG_ESPACE); - submatch_data = xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data)); + submatch_data = xcalloc((unsigned)parse_ctx.submatch_id, + sizeof(*submatch_data)); if (submatch_data == NULL) ERROR_EXIT(REG_ESPACE); tnfa->submatch_data = submatch_data; @@ -3167,20 +2739,11 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) if (errcode != REG_OK) ERROR_EXIT(errcode); -#ifdef TRE_DEBUG - for (i = 0; i < parse_ctx.submatch_id; i++) - DPRINT(("pmatch[%d] = {t%d, t%d}\n", - i, submatch_data[i].so_tag, submatch_data[i].eo_tag)); - for (i = 0; i < tnfa->num_tags; i++) - DPRINT(("t%d is %s\n", i, - tag_directions[i] == TRE_TAG_MINIMIZE ? - "minimized" : "maximized")); -#endif /* TRE_DEBUG */ } /* Expand iteration nodes. */ errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position, - tag_directions, NULL); + tag_directions); if (errcode != REG_OK) ERROR_EXIT(errcode); @@ -3197,11 +2760,6 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) if (tree == NULL) ERROR_EXIT(REG_ESPACE); -#ifdef TRE_DEBUG - tre_ast_print(tree); - DPRINT(("Number of states: %d\n", parse_ctx.position)); -#endif /* TRE_DEBUG */ - errcode = tre_compute_nfl(mem, stack, tree); if (errcode != REG_OK) ERROR_EXIT(errcode); @@ -3225,49 +2783,27 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) add += counts[i] + 1; counts[i] = 0; } - transitions = xcalloc(add + 1, sizeof(*transitions)); + transitions = xcalloc((unsigned)add + 1, sizeof(*transitions)); if (transitions == NULL) ERROR_EXIT(REG_ESPACE); tnfa->transitions = transitions; tnfa->num_transitions = add; - DPRINT(("Converting to TNFA:\n")); errcode = tre_ast_to_tnfa(tree, transitions, counts, offs); if (errcode != REG_OK) ERROR_EXIT(errcode); + tnfa->firstpos_chars = NULL; + p = tree->firstpos; i = 0; while (p->position >= 0) { i++; - -#ifdef TRE_DEBUG - { - int *tags; - DPRINT(("initial: %d", p->position)); - tags = p->tags; - if (tags != NULL) - { - if (*tags >= 0) - DPRINT(("/")); - while (*tags >= 0) - { - DPRINT(("%d", *tags)); - tags++; - if (*tags >= 0) - DPRINT((",")); - } - } - DPRINT((", assert %d", p->assertions)); - DPRINT(("\n")); - } -#endif /* TRE_DEBUG */ - p++; } - initial = xcalloc(i + 1, sizeof(tre_tnfa_transition_t)); + initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t)); if (initial == NULL) ERROR_EXIT(REG_ESPACE); tnfa->initial = initial; @@ -3278,7 +2814,7 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) initial[i].state = transitions + offs[p->position]; initial[i].state_id = p->position; initial[i].tags = NULL; - /* Copy the arrays p->tags, they are allocated + /* Copy the arrays p->tags, and p->params, they are allocated from a tre_mem object. */ if (p->tags) { @@ -3299,8 +2835,6 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) tnfa->num_states = parse_ctx.position; tnfa->cflags = cflags; - DPRINT(("final state %p\n", (void *)tnfa->final)); - tre_mem_destroy(mem); tre_stack_destroy(stack); xfree(counts); @@ -3319,44 +2853,58 @@ tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags) if (offs != NULL) xfree(offs); preg->TRE_REGEX_T_FIELD = (void *)tnfa; - tre_free(preg); + regfree(preg); return errcode; } -/*********************************************************************** - from regcomp.c -***********************************************************************/ -int -regcomp(regex_t *preg, const char *regex, int cflags) + +void +regfree(regex_t *preg) { - int ret; - tre_char_t *wregex; - size_t n = strlen(regex); + tre_tnfa_t *tnfa; + unsigned int i; + tre_tnfa_transition_t *trans; - if (n+1 > SIZE_MAX/sizeof(tre_char_t)) - return REG_ESPACE; - wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); - if (wregex == NULL) - return REG_ESPACE; + tnfa = (void *)preg->TRE_REGEX_T_FIELD; + if (!tnfa) + return; - n = mbstowcs(wregex, regex, n+1); - if (n == (size_t)-1) { - xfree(wregex); - return REG_BADPAT; - } + for (i = 0; i < tnfa->num_transitions; i++) + if (tnfa->transitions[i].state) + { + if (tnfa->transitions[i].tags) + xfree(tnfa->transitions[i].tags); + if (tnfa->transitions[i].neg_classes) + xfree(tnfa->transitions[i].neg_classes); + } + if (tnfa->transitions) + xfree(tnfa->transitions); - ret = tre_compile(preg, wregex, n, cflags); - xfree(wregex); + if (tnfa->initial) + { + for (trans = tnfa->initial; trans->state; trans++) + { + if (trans->tags) + xfree(trans->tags); + } + xfree(tnfa->initial); + } - return ret; -} + if (tnfa->submatch_data) + { + for (i = 0; i < tnfa->num_submatches; i++) + if (tnfa->submatch_data[i].parents) + xfree(tnfa->submatch_data[i].parents); + xfree(tnfa->submatch_data); + } -void -regfree(regex_t *preg) -{ - tre_free(preg); + if (tnfa->tag_directions) + xfree(tnfa->tag_directions); + if (tnfa->firstpos_chars) + xfree(tnfa->firstpos_chars); + if (tnfa->minimal_tags) + xfree(tnfa->minimal_tags); + xfree(tnfa); } - -/* EOF */